# YouTube Comment Sentiment Analysis

## Import Libraries

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import re
import pandas as pd
import numpy as np
%env CUDA_LAUNCH_BLOCKING=1

## Fine-Tune RoBERTa

For best performance on YouTube comments, we start from a Twitter sentiment model, since tweets and YouTube comments have a similar structure.

In [None]:
# Single source of truth for the pretrained checkpoint so the tokenizer and
# the model can never be loaded from two different checkpoints by accident.
MODEL_NAME = 'cardiffnlp/twitter-roberta-base-sentiment'

# Tokenizer that matches the checkpoint's vocabulary.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Sequence-classification model (with its classification head) for fine-tuning.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import TrainingArguments, Trainer

In [None]:
# Echo the model object so the notebook renders its repr as the cell output.
model

In [None]:
# Move the model to the GPU when one is available; fall back to the CPU so the
# notebook still runs on machines without CUDA instead of raising.
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Sanity check: tokenize two short example comments and display the encoding.
sample = [
    "I love this video",
    "This tutorial sucks!",
]
tokenizer(
    sample,
    max_length=512,
    truncation=True,
    padding=True,
)

In [None]:
# Load the pre-cleaned YouTube comment dataset from the project repository.
dataset = pd.read_csv("https://raw.githubusercontent.com/ArrafMelon/Youtube-Video-Sentiment/main/data/processed_data/CleanYoutubeCommentsDataSet.csv")

# Drop rows with a missing comment or label: a NaN comment is not valid text
# input for the tokenizer and a NaN label cannot serve as a class index.
dataset = dataset.dropna(subset=["Comment", "Sentiment"])

X = list(dataset["Comment"])
# NOTE(review): assumes "Sentiment" is already integer-encoded to match the
# model's output classes — TODO confirm against the CSV.
y = list(dataset["Sentiment"])

# Stratified 80/20 split; a fixed random_state makes the split (and therefore
# the reported validation metrics) reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Tokenize each split, padding to the longest sequence in the split and
# truncating anything longer than 512 tokens.
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

In [None]:
# Show how many rows carry each distinct value of the "Sentiment" column.
dataset["Sentiment"].value_counts()

In [None]:
# Display the number of examples in the training and validation splits.
len(X_train),len(X_val)

### Create torch dataset

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style torch dataset over tokenizer encodings and optional labels.

    ``encodings`` maps each feature name to a per-example sequence and must
    contain an ``"input_ids"`` entry, which defines the dataset length.
    ``labels``, when provided, is indexed in parallel with the encodings.
    """

    def __init__(self, encodings, labels=None):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # One example per row of input_ids.
        return len(self.encodings["input_ids"])

    def __getitem__(self, index):
        # Convert every encoded field of the selected example into a tensor.
        example = {}
        for name, values in self.encodings.items():
            example[name] = torch.tensor(values[index])

        # Unlabeled datasets return the features only.
        if self.labels is None:
            return example

        example["labels"] = torch.tensor(self.labels[index])
        return example

In [None]:
# Wrap each tokenized split together with its labels as a torch dataset.
train_dataset = Dataset(encodings=X_train_tokenized, labels=y_train)
val_dataset = Dataset(encodings=X_val_tokenized, labels=y_val)

In [None]:
def metrics(p):
    """Compute accuracy and macro-averaged recall, precision and F1.

    Intended as the ``compute_metrics`` callback of a ``Trainer``.

    Args:
        p: An object that unpacks into ``(predictions, labels)``.
            Predictions are reduced with ``argmax`` over axis 1, so they are
            assumed to be per-class scores of shape (n_samples, n_classes)
            — TODO confirm for other model heads.

    Returns:
        dict: ``{"accuracy", "recall", "precision", "f1"}`` mapped to floats.
    """
    pred, labels = p
    # Turn per-class scores into predicted class indices.
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    # Macro averaging weights every class equally regardless of its frequency.
    recall = recall_score(y_true=labels, y_pred=pred, average='macro')
    precision = precision_score(y_true=labels, y_pred=pred, average='macro')
    f1 = f1_score(y_true=labels, y_pred=pred, average='macro')

    return {"accuracy": accuracy, "recall": recall, "precision": precision, "f1": f1}

### Train and Test Model

In [None]:
# Training configuration: 3 epochs, batches of 16 per device, checkpoints
# written under "output"; report_to="none" is passed so no logging
# integration is requested.
args = TrainingArguments(
    report_to="none",
    per_device_train_batch_size=16,
    num_train_epochs=3,
    output_dir="output",
)

# Bundle the model, the data splits and the metric callback into one Trainer.
trainer = Trainer(
    compute_metrics=metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
    args=args,
    model=model,
)

In [None]:
# Fine-tune the model on train_dataset using the configured TrainingArguments.
trainer.train()

In [None]:
# Evaluate the model; called without arguments, so this presumably uses the
# eval_dataset and compute_metrics callback given to the Trainer.
trainer.evaluate()

## Save Model

In [None]:
# Persist the fine-tuned model to the 'sentimentPredictor' directory.
trainer.save_model('sentimentPredictor')
# The Trainer was constructed without a tokenizer, so save_model() does not
# write tokenizer files. Save them next to the model so the directory can be
# loaded on its own with from_pretrained().
tokenizer.save_pretrained('sentimentPredictor')

In [None]:
# Shell command (IPython `!`): recursively zip the saved model directory.
!zip -r sentimentPredictor.zip sentimentPredictor

In [None]:
# Hand the zipped model to the google.colab `files.download` helper.
# NOTE(review): presumably only works inside a Google Colab runtime — confirm
# before running this cell elsewhere.
from google.colab import files
files.download('sentimentPredictor.zip')