In [None]:
!pip install -U transformers
!pip install -U accelerate
!pip install -U datasets
!pip install -U bertviz
!pip install -U umap-learn

In [None]:
import pandas as pd
from datasets import load_dataset

# Load the emotion dataset and switch its output format to pandas so that
# slicing the train split yields a DataFrame.
emotion = load_dataset("emotion")
emotion.set_format(type="pandas")

train_split = emotion["train"]
df = train_split[:]
df.head()



In [None]:
# Names attached to the integer labels, read from the train split's label feature.
label_feature = emotion["train"].features["label"]
classes = label_feature.names
classes

In [None]:
# Add a readable label column by looking each integer label up in `classes`.
df["label_name"] = df["label"].map(lambda label_id: classes[label_id])
df.head()

## Data analysis section


In [None]:
from transformers import AutoTokenizer
# Checkpoint identifier; the same name is used later when loading the model.
model_ckpt = "distilbert-base-uncased"

# Load the tokenizer published with that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
# Tokenize one example sentence and print what the tokenizer returns.
text = "I am new to machine learning! i feel so tensed to do tokenization."
encoded_text = tokenizer(text)
print(encoded_text)

In [None]:
# Convert the ids back into token strings to see how the sentence was split.
input_ids = encoded_text["input_ids"]
tokens = tokenizer.convert_ids_to_tokens(input_ids)
print(tokens)

In [None]:
# Undo the pandas output format applied when the DataFrame was built, so the
# dataset is back to its default format before it is tokenized below.
emotion.reset_format()

In [None]:
def tokenize(batch):
  """Run the notebook's tokenizer over the "text" entry of ``batch``.

  Padding and truncation are both switched on in the tokenizer call, and the
  tokenizer's return value is handed back unchanged.
  """
  return tokenizer(batch["text"], padding=True, truncation=True)

# Quick check on the first two training examples.
print(tokenize(emotion["train"][:2]))

In [None]:
# Apply `tokenize` to every split. With batched=True and batch_size=None each
# split is passed to the function as one batch.
emotion_encoded = emotion.map(
    tokenize,
    batched=True,
    batch_size=None,
)

In [None]:
# Display the result of the map call to inspect the tokenized dataset.
emotion_encoded

In [None]:
# Encode the example sentence again, this time requesting PyTorch tensors.
inputs = tokenizer(text, return_tensors="pt")
print(inputs)

In [None]:
from transformers import AutoModel
import torch

# Load the model for the checkpoint via AutoModel; it is used below to read
# the last hidden state for the example sentence.
model = AutoModel.from_pretrained(model_ckpt)

In [None]:
# Forward pass only: gradient tracking is disabled for this call.
with torch.no_grad():
  outputs = model(**inputs)

# Keep the last hidden state from the model output for inspection.
last_hidden_states = outputs.last_hidden_state

In [None]:
# Display the shape of the hidden-state tensor produced for the example sentence.
last_hidden_states.shape

In [None]:
from transformers import AutoModelForSequenceClassification

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One output per emotion class; note this rebinds `model` to the
# sequence-classification model.
num_labels = len(classes)
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
)
model = model.to(device)

In [None]:
from transformers import TrainingArguments

In [None]:
# The same batch size is used for training and evaluation; the output
# directory is named after the fine-tuned model.
model_name = "distilbert-finetuned-emotion"
batch_size = 64

training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_strategy="epoch",  # evaluate once per epoch
    disable_tqdm=False,
)


In [None]:
from sklearn.metrics import accuracy_score,f1_score

def compute_metrics(pred):
  """Compute accuracy and weighted F1 for a prediction object.

  Reads ``pred.label_ids`` as the reference labels and takes the argmax over
  the last axis of ``pred.predictions`` as the predicted labels.

  Returns a dict with the keys "accuracy" and "f1".
  """
  y_true = pred.label_ids
  y_pred = pred.predictions.argmax(-1)
  return {
      "accuracy": accuracy_score(y_true, y_pred),
      "f1": f1_score(y_true, y_pred, average="weighted"),
  }

In [None]:
from transformers import Trainer

# The tokenizer is passed through `processing_class`, the current Trainer
# argument for it. The older `tokenizer=` keyword is deprecated in recent
# transformers releases, and this notebook installs the latest release via
# `pip install -U`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=emotion_encoded["train"],
    eval_dataset=emotion_encoded["validation"],
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
)

In [None]:
# Run fine-tuning with the Trainer configured above (2 epochs per training_args).
trainer.train()

In [None]:
# Run the trained model over the test split and show the metrics reported for it.
test_split = emotion_encoded["test"]
preds_outputs = trainer.predict(test_split)
preds_outputs.metrics

In [None]:
# Classify a single hand-written sentence with the fine-tuned model.
text = "a man just talked very doubtfully about his wife"

# Switch to evaluation mode explicitly so dropout is disabled for this
# prediction, instead of relying on an earlier cell having left the model
# in that state.
model.eval()

# Inputs must live on the same device as the model.
input_encoded = tokenizer(text, return_tensors="pt").to(device)
with torch.no_grad():
  outputs = model(**input_encoded)
logits = outputs.logits

# Index of the highest-scoring class, shown with its readable name.
pred = torch.argmax(logits, dim=1).item()
pred, classes[pred]

In [None]:
from google.colab import files
import shutil

# Folder to archive, and the archive path without its extension.
# NOTE(review): trainer.save_model() is not called in the visible cells, so the
# folder holds only what Trainer wrote during training — confirm that this is
# what should be downloaded.
folder_path = "/content/distilbert-finetuned-emotion"
archive_base = "/content/distilbert_finetuned_emotion"

# make_archive appends ".zip" itself and returns the path of the file it
# created, so use that return value instead of stripping the suffix from a
# hand-written path with str.replace (which rewrites every ".zip" occurrence).
zip_file_path = shutil.make_archive(base_name=archive_base, format='zip', root_dir=folder_path)

# Trigger download
files.download(zip_file_path)

print(f"Zipped and download initiated for '{zip_file_path}'")
