# YouTube Comment Sentiment Analysis

## Import Libraries

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import re
import pandas as pd
import numpy as np
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 makes CUDA kernel launches synchronous,
# which helps pinpoint device-side errors but slows GPU work. Presumably left
# over from debugging -- confirm whether it is still needed for normal runs.
%env CUDA_LAUNCH_BLOCKING=1

env: CUDA_LAUNCH_BLOCKING=1


## Fine-Tune RoBERTa

For best performance on YouTube comments, we start from a Twitter sentiment model, since tweets and YouTube comments are quite similar in structure.

In [None]:
# Load the tokenizer and the sequence-classification model from the same
# pretrained checkpoint so the vocabulary and the weights always match.
checkpoint = 'cardiffnlp/twitter-roberta-base-sentiment'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import TrainingArguments, Trainer

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [None]:
# Display the module tree of the loaded model to inspect its architecture.
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [None]:
# Use the GPU when one is available; fall back to CPU so the notebook still
# runs (slowly) on a runtime without CUDA instead of raising.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [None]:
# Sanity check: tokenize two toy comments to inspect the encoding format
# (input_ids and attention_mask) before processing the real dataset.
sample = [
    "I love this video",
    "This tutorial sucks!",
]
tokenizer(sample, padding=True, truncation=True, max_length=512)

{'input_ids': [[0, 100, 657, 42, 569, 2], [0, 713, 35950, 29384, 328, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}

In [None]:
# Load the pre-cleaned comments together with their sentiment labels.
dataset = pd.read_csv("https://raw.githubusercontent.com/ArrafMelon/Youtube-Video-Sentiment/main/data/processed_data/CleanYoutubeCommentsDataSet.csv")
X = list(dataset["Comment"])
y = list(dataset["Sentiment"])

# Stratify so both splits keep the label proportions, and pin random_state so
# the split (and therefore the reported validation metrics) is reproducible
# across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Tokenize each split separately: padding=True pads to the longest sequence in
# the split, truncation caps every comment at max_length tokens.
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

In [None]:
# Check the class balance of the sentiment labels before training.
dataset["Sentiment"].value_counts()

Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
2,11054
1,4503
0,2317


In [None]:
# Number of comments in the training and validation splits.
len(X_train),len(X_val)

(14299, 3575)

### Create torch dataset

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style torch dataset over tokenizer output and optional labels.

    Args:
        encodings: Mapping from feature name (must include ``"input_ids"``)
            to a per-example sequence, e.g. the result of calling the
            tokenizer on a list of texts.
        labels: Optional sequence holding one label per example. Leave as
            ``None`` for unlabeled data.

    Raises:
        ValueError: If ``labels`` is given but its length differs from the
            number of encoded examples.
    """

    def __init__(self, encodings, labels=None):
        # Fail fast on misaligned inputs; otherwise the mismatch would only
        # surface as an IndexError mid-training, or extra labels would be
        # silently ignored.
        if labels is not None and len(labels) != len(encodings["input_ids"]):
            raise ValueError(
                "labels has {} entries but encodings has {} examples".format(
                    len(labels), len(encodings["input_ids"])
                )
            )
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, index):
        """Return example ``index`` as a dict of tensors, plus ``"labels"`` when labelled."""
        item = {key: torch.tensor(val[index]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[index])
        return item

    def __len__(self):
        """Return the number of encoded examples."""
        return len(self.encodings["input_ids"])

In [None]:
# Pair each tokenized split with its labels so the Trainer can index examples.
train_dataset, val_dataset = (
    Dataset(X_train_tokenized, y_train),
    Dataset(X_val_tokenized, y_val),
)

In [None]:
def metrics(p):
    """Compute evaluation metrics for the Trainer.

    Args:
        p: Evaluation result passed in by ``Trainer``; it unpacks into
            ``(predictions, labels)``, where predictions holds one row of
            per-class scores for each example.

    Returns:
        dict with ``accuracy`` and the macro-averaged ``recall``,
        ``precision`` and ``f1``.
    """
    pred, labels = p
    # Turn per-class scores into the predicted class id for each example.
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    # Macro averaging weights every class equally regardless of its frequency.
    recall = recall_score(y_true=labels, y_pred=pred, average='macro')
    precision = precision_score(y_true=labels, y_pred=pred, average='macro')
    f1 = f1_score(y_true=labels, y_pred=pred, average='macro')

    return {"accuracy":accuracy, "recall":recall, "precision":precision, "f1":f1}

### Train and Evaluate Model

In [None]:
# Hyperparameters for the fine-tuning run; anything not listed here keeps the
# TrainingArguments default. report_to="none" turns off logging integrations.
training_options = dict(
    output_dir="output",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    report_to="none",
)
args = TrainingArguments(**training_options)

# Wire the model, both dataset splits and the metric callback into the Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=metrics,
)

In [None]:
# Fine-tune the model on the training split using the settings in `args`.
trainer.train()

Step,Training Loss
500,0.3777
1000,0.3225
1500,0.2174
2000,0.1574
2500,0.1096


TrainOutput(global_step=2682, training_loss=0.22695786307588786, metrics={'train_runtime': 4251.9046, 'train_samples_per_second': 10.089, 'train_steps_per_second': 0.631, 'total_flos': 1.1286776280278016e+16, 'train_loss': 0.22695786307588786, 'epoch': 3.0})

In [None]:
# Score the fine-tuned model on the validation split; the reported values come
# from the metrics() callback given to the Trainer.
trainer.evaluate()

<class 'transformers.trainer_utils.EvalPrediction'>


{'eval_loss': 0.44230377674102783,
 'eval_accuracy': 0.8995804195804196,
 'eval_recall': 0.8542368807887984,
 'eval_precision': 0.8753817221106361,
 'eval_f1': 0.8639449225618239,
 'eval_runtime': 97.6738,
 'eval_samples_per_second': 36.601,
 'eval_steps_per_second': 4.576,
 'epoch': 3.0}

## Save Model

In [None]:
# Persist the fine-tuned weights and config, then write the tokenizer files
# into the same directory. The Trainer was never given the tokenizer, so
# save_model() alone produces a folder that cannot be reloaded with
# AutoTokenizer.from_pretrained('sentimentPredictor').
trainer.save_model('sentimentPredictor')
tokenizer.save_pretrained('sentimentPredictor');

In [None]:
# Bundle the saved model directory into a single archive for download.
!zip -r sentimentPredictor.zip sentimentPredictor

  adding: sentimentPredictor/ (stored 0%)
  adding: sentimentPredictor/config.json (deflated 52%)
  adding: sentimentPredictor/training_args.bin (deflated 52%)
  adding: sentimentPredictor/model.safetensors (deflated 7%)


In [None]:
# Trigger a browser download when running inside Google Colab. Outside Colab
# the google.colab package is unavailable, so report where the archive is
# instead of aborting a full notebook run with an ImportError.
try:
    from google.colab import files
except ImportError:
    print("Not running in Colab; archive is available at sentimentPredictor.zip")
else:
    files.download('sentimentPredictor.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>