#### Fine-tuning a model
Steps:
- identify the model 
- prepare data
- build trainer (the HuggingFace `Trainer` class can be used for fine-tuning models; `TrainingArguments` holds the parameters passed into the `Trainer` object)
- train

In [None]:
## Identifying model + preparing data
# Import modules
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Hub checkpoint that both the model and its tokenizer are loaded from
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
# Load the model
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use tokenizer on text
# NOTE(review): `dataset` is not defined in this cell - it must already be loaded
# (with a "text" column) before this runs.
# .map() applies the function to each row, one row at a time. With a single row,
# padding=True ("pad to the longest in the batch") adds no padding, so rows would
# keep different lengths. The Trainer built later gets no tokenizer or padding
# collator, so every row is padded/truncated to the same fixed length here.
dataset = dataset.map(
    lambda row: tokenizer(
        row["text"],
        padding="max_length",
        max_length=512,
        truncation=True,
    ),
    keep_in_memory=True,
)

In [None]:
## Build trainer + train
# `pipeline` is imported here as well because it is used at the end of this cell
from transformers import Trainer, TrainingArguments, pipeline

# Create training arguments (only the output directory is set explicitly)
training_args = TrainingArguments(output_dir="./results")
# Create the trainer
# NOTE(review): `training_data` / `testing_data` are not defined in the cells
# shown here - presumably train/test splits of the tokenized dataset; confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training_data,
    eval_dataset=testing_data,
)
# Start the trainer
trainer.train()

# save model to local path
local_path = "./fine_tuned_model"
trainer.save_model(local_path)
# The Trainer above was created without a tokenizer, so store the tokenizer
# next to the model explicitly; the pipeline below loads both from local_path.
tokenizer.save_pretrained(local_path)

# use the fine-tuned model with the pipeline
# Create the classifier
classifier = pipeline(task="sentiment-analysis", model=local_path)  # your local path
# Classify the text (the input is passed positionally)
results = classifier(text_example)
print(results)

#### Embeddings
Vector/numerical representations of something
They represent human language in a form computers can work with, enabling applications such as recommendation systems, search, fraud detection, ...

Benefits:
- semantic understanding (captures relationship between words)
- can be used as features within ML models
- improve generalization (unseen or rare words can still be understood because the embedding encodes semantic information)

Challenges:
- require large amount of data
- inherit biases
- blackbox, lower interpretability


In [None]:
from sentence_transformers import SentenceTransformer

# Instantiate the first embedding model from its checkpoint name
embedder1 = SentenceTransformer("all-MiniLM-L6-v2")

# Wrap the sentence in a list so it is encoded as a one-item batch
# NOTE(review): `sentence` is expected to be defined before this cell runs.
sentence_batch = [sentence]
embedding = embedder1.encode(sentence_batch)


#### Semantic Search
- Instead of keyword search, it understands intent and context, and finds content whose meaning matches the query.
- Vectorization: embed the query and the documents, then calculate the cosine similarity between them.

Useful when searching is ambiguous, complex, and unstructured


In [None]:
from sentence_transformers import SentenceTransformer

# Load the embedding model used for both the documents and the query.
# The class is `SentenceTransformer` (singular), and the result must be bound
# to `encoder`, the name the lines below use.
encoder = SentenceTransformer("all-MiniLM-L6-v2")
# Embed the corpus
# NOTE(review): `documents` is expected to be defined before this cell runs.
document_embeddings = encoder.encode(documents)
query = "What are the most recent wildlife articles?"
# Embed the query as a one-item batch
query_embedding = encoder.encode([query])
