In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset and the artifacts saved under MyDrive/BERT/ are reachable by path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification
import numpy as np
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import get_scheduler
import torch
from tqdm.auto import tqdm
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

In [None]:
# Read the raw emotion dataset from Drive and preview the first rows.
# NOTE(review): the path contains a doubled slash ("drive//MyDrive"); POSIX
# treats repeated separators as one, so it still resolves to the same file.
df = pd.read_csv("/content/drive//MyDrive/BERT/Emotion_final.csv")
df.head()

In [None]:
# Rows per emotion label in the raw data (shows the class imbalance).
df['Emotion'].value_counts()

In [None]:
# Distinct emotion labels, in order of first appearance in the CSV.
emotions = df.Emotion.unique()

In [None]:
# Balance the classes by keeping at most SAMPLES_PER_CLASS rows per emotion.
# NOTE(review): 879 is presumably the size of the rarest class in the
# value_counts() output of the raw data — confirm before reusing on other data.
SAMPLES_PER_CLASS = 879

# Build every per-class slice first and concatenate once: growing a frame
# with pd.concat inside a loop re-copies all accumulated rows per iteration.
# Row order (classes in `emotions` order, original order within a class) is
# the same as the incremental version produced.
data = pd.concat(
    [df[df['Emotion'] == label].iloc[:SAMPLES_PER_CLASS] for label in emotions]
)
data

In [None]:
# Sanity check: rows per emotion after balancing.
data['Emotion'].value_counts()

In [None]:
# Class id -> emotion name. The ids are the integer targets the model is
# trained on; the same dict is used later to decode predictions.
data_dict = {0:'happy', 1:'anger', 2:'love', 3:'surprise', 4:'fear', 5:'sadness'}
# Inverse lookup: emotion name -> class id.
my_dict = {name: idx for idx, name in data_dict.items()}
# Encode the label column; values without an entry in my_dict pass through
# unchanged (which also makes re-running this cell a no-op).
data['Emotion'] = data['Emotion'].map(lambda label: my_dict.get(label, label))
data.head()

In [None]:
# Shuffle once with a fixed seed, then cut 80% / 20% into train / test.
# Plain positional slicing replaces np.split(DataFrame, ...), which relies on
# the deprecated DataFrame.swapaxes and breaks on recent pandas; the resulting
# rows and their order are identical.
shuffled = data.sample(frac=1, random_state=42)
split_at = int(.8 * len(shuffled))
train, test = shuffled.iloc[:split_at], shuffled.iloc[split_at:]

In [None]:
# Class distribution of the training split (the split is not stratified).
train['Emotion'].value_counts()

In [None]:
# Class distribution of the held-out split.
# NOTE(review): the evaluation cell further down rebinds `test` to a list, so
# this cell only works before that cell has been run.
test['Emotion'].value_counts()

In [None]:
# Persist both splits to Drive as CSV (without the pandas index column) so
# they can be loaded through the `datasets` CSV loader below.
train.to_csv("/content/drive/MyDrive/BERT/train.csv", index=False)
test.to_csv("/content/drive/MyDrive/BERT/test.csv", index=False)

In [None]:
# Split name -> CSV path; the keys become the split names of the loaded dataset.
data_files = {'train': "/content/drive/MyDrive/BERT/train.csv",
              'test': "/content/drive/MyDrive/BERT/test.csv"}

In [None]:
# Load the two CSV files written above as the 'train' and 'test' splits.
dataset = load_dataset('csv', data_files=data_files)

In [None]:
# Show the loaded splits (column names and row counts).
dataset

In [None]:
# Pretrained checkpoint used for both the tokenizer and the classifier, so
# vocabulary and model weights match.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
def tokenize_function(example):
    """Tokenize the ``Text`` field of ``example`` with the notebook's tokenizer.

    Written for ``Dataset.map(..., batched=True)``, where ``example["Text"]``
    is a list of strings. Truncation is enabled; no padding is applied here
    (padding is left to the data collator at batch time).

    Returns the tokenizer's encoding, whose fields become new dataset columns.
    """
    texts = example["Text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
# Tokenize every split; batched=True hands tokenize_function lists of texts.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
# Inspect the columns added by tokenization.
tokenized_datasets

In [None]:
# Prepare the columns for the training loop, which feeds each batch straight
# into model(**batch):
#  - drop the raw text column, which is not a model input,
#  - rename the target column to "labels" so the model computes the loss,
#  - return torch tensors instead of Python lists.
tokenized_datasets = tokenized_datasets.remove_columns(["Text"])
tokenized_datasets = tokenized_datasets.rename_column("Emotion", "labels")
tokenized_datasets.set_format("torch")
# Remaining columns, displayed as a check.
tokenized_datasets["train"].column_names

In [None]:
# Collator that pads the examples of each batch with the tokenizer's padding
# token (dynamic, per-batch padding instead of padding at tokenization time).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Batches of 8 for both splits. Only the training loader shuffles; the test
# loader keeps dataset order.
train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
test_dataloader = DataLoader(
    tokenized_datasets["test"], batch_size=8, collate_fn=data_collator
)

In [None]:
# Classification head sized from the label table instead of a hard-coded 6.
# id2label / label2id are stored in the model config, so a checkpoint written
# with save_pretrained() carries the emotion names along with the weights.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(data_dict),
    id2label=data_dict,
    label2id={name: idx for idx, name in data_dict.items()},
)

In [None]:
# Use PyTorch's AdamW: the AdamW class exported by `transformers` is
# deprecated in favour of torch.optim.AdamW.
# weight_decay=0.0 and eps=1e-6 are passed explicitly to keep the
# hyper-parameters the deprecated class used by default (torch's own defaults
# differ) — NOTE(review): confirm against the transformers version in use.
optimizer = torch.optim.AdamW(
    model.parameters(), lr=5e-5, weight_decay=0.0, eps=1e-6
)

In [None]:
# One scheduler step is taken per optimizer step, so the schedule length is
# (number of epochs) x (batches per epoch).
num_epochs = 5
num_training_steps = num_epochs * len(train_dataloader)
# "linear" schedule with no warm-up phase.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

In [None]:
# Train on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model's parameters to the chosen device.
model.to(device)
# Display the selected device.
device

In [None]:
# Fine-tune the model: one progress-bar tick per optimizer step.
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch to the model's device.
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch includes "labels", so the output carries the loss.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Update the weights, advance the LR schedule, then clear the
        # gradients so they do not accumulate into the next step.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

In [None]:
# Collect ground-truth class ids (`test`) and predicted class ids
# (`test_pred`) over the whole held-out loader.
# NOTE(review): `test` rebinds the name of the held-out DataFrame produced by
# the split cell; the later reporting cells rely on this name, so it is kept,
# but cells that expect the DataFrame cannot be re-run after this one.
test = []
test_pred = []
model.eval()
# Inference only: no autograd graph is needed for any batch.
with torch.no_grad():
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        # Highest-scoring class per example. extend() appends in place rather
        # than rebuilding the whole list on every batch.
        test_pred.extend(outputs.logits.argmax(dim=-1).flatten().tolist())
        test.extend(batch['labels'].tolist())

In [None]:
# Map class ids back to emotion names for readable reports; ids without an
# entry in data_dict are left unchanged.
test = [data_dict.get(i,i) for i in test]
test_pred = [data_dict.get(i,i) for i in test_pred]

In [None]:
# Per-class precision / recall / F1 plus overall accuracy on the held-out set.
cr_test = classification_report(test,test_pred)
test_accuracy = accuracy_score(test,test_pred)
print("Testing accuracy:", test_accuracy)
print(cr_test)

In [None]:
def confusion_ma(y_true, y_pred, class_names):
    """Plot a row-normalised confusion matrix.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, same length as ``y_true``.
        class_names: Labels in the order they should appear on the axes, or
            ``None`` to use the sorted labels found in the data.

    Returns:
        ``None`` (the return value of ``plt.show()``); the figure is displayed
        as a side effect.
    """
    # Pass the label order to confusion_matrix as well as to the display.
    # Without `labels=`, the matrix rows/columns follow sorted label order
    # while the tick labels follow `class_names`, so ticks could be attached
    # to the wrong classes whenever the two orders differ.
    labels = None if class_names is None else list(class_names)
    cm = confusion_matrix(y_true, y_pred, labels=labels, normalize='true')
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot(cmap=plt.cm.Blues)
    return plt.show()

In [None]:
# Confusion matrix of the held-out predictions, with the emotion names taken
# from the raw data as axis labels.
confusion_ma(test,test_pred, emotions)

# User Input

In [None]:
# Save the fine-tuned weights and the tokenizer into the same directory so
# both can be reloaded from one path.
model.save_pretrained("/content/drive/MyDrive/BERT/emotion_model_v1")
tokenizer.save_pretrained("/content/drive/MyDrive/BERT/emotion_model_v1")


In [None]:
# Reload the fine-tuned model and its tokenizer from the directory saved
# above; this rebinds the `model` and `tokenizer` names used during training.
model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/BERT/emotion_model_v1")
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/BERT/emotion_model_v1")
def classify(text):
    """Predict the emotion of a single piece of text.

    Args:
        text: The input string.

    Returns:
        The emotion name looked up in ``data_dict`` for the highest-scoring
        class, or the raw class id when ``data_dict`` has no entry for it.
    """
    # truncation=True mirrors the training-time tokenization and keeps
    # over-long inputs from exceeding the model's maximum sequence length.
    encoded = tokenizer(text.lower(), return_tensors="pt", truncation=True)
    # Run on whichever device the model currently lives on (CPU after a fresh
    # reload, GPU if the in-memory training model is reused).
    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}
    model.eval()
    with torch.no_grad():
        outputs = model(**encoded)
    # A single input yields a single row of logits.
    class_id = outputs.logits.argmax(dim=-1).flatten().tolist()[0]
    return data_dict.get(class_id, class_id)

# Quick manual check of the reloaded model on one sentence.
classify("I will miss you")
