<a href="https://colab.research.google.com/github/Akomon333/Comments-classifier-V2/blob/main/commentclassificationv2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers datasets torch

In [None]:
# Colab-only: mount Google Drive at /content/drive so the dataset CSV stored
# under it can be read later in the notebook.
from google.colab import drive
drive.mount("/content/drive")
import pandas as pd
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from transformers import EarlyStoppingCallback
from transformers import AutoModelForSequenceClassification, AutoConfig
from datasets import Dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_recall_fscore_support
import numpy as np

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the labelled comments from Drive, hold out 10% of the rows as a test
# split (fixed seed), and rename the "Sentiment" column to "labels".
df = pd.read_csv("/content/drive/MyDrive/Datasets/YoutubeCommentsDataSet.csv")
dataset = (
    Dataset.from_pandas(df)
    .train_test_split(test_size=0.1, seed=42)
    .rename_column("Sentiment", "labels")
)
print(dataset["train"].features)

{'Comment': Value('string'), 'labels': Value('int64')}


In [None]:

def compute_metrics(eval_pred):
    """Compute overall, macro-averaged and per-class classification metrics.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is reduced with
            ``argmax`` over its last axis to obtain the predicted class ids.

    Returns:
        dict: ``accuracy``, ``balanced_accuracy``, ``precision_macro``,
        ``recall_macro`` and ``f1_macro``, followed by ``precision_<name>``,
        ``recall_<name>`` and ``f1_<name>`` for each of ``negative``,
        ``neutral`` and ``positive`` (class ids 0, 1 and 2 respectively).
    """
    class_names = ["negative", "neutral", "positive"]

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    acc = accuracy_score(labels, predictions)
    bal_acc = balanced_accuracy_score(labels, predictions)

    # zero_division=0 yields the same values as the default ("warn") but
    # without emitting a warning when a class is never predicted.
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
        labels, predictions, average="macro", zero_division=0
    )

    # Pin the label set so the per-class arrays always have one entry per
    # class name. Without it, a class missing from both the targets and the
    # predictions shortens the arrays and the indexing below raises IndexError.
    precision_per_class, recall_per_class, f1_per_class, _ = precision_recall_fscore_support(
        labels,
        predictions,
        labels=list(range(len(class_names))),
        average=None,
        zero_division=0,
    )

    metrics = {
        "accuracy": acc,
        "balanced_accuracy": bal_acc,
        "precision_macro": precision_macro,
        "recall_macro": recall_macro,
        "f1_macro": f1_macro,
    }
    for i, class_name in enumerate(class_names):
        metrics[f"precision_{class_name}"] = precision_per_class[i]
        metrics[f"recall_{class_name}"] = recall_per_class[i]
        metrics[f"f1_{class_name}"] = f1_per_class[i]

    return metrics

In [None]:
# Class ids: 0 = negative, 1 = neutral, 2 = positive.
#
# Label counts used for the class weights below:
#   2 (positive): 10642
#   1 (neutral):   3319
#   0 (negative):  2296
# NOTE(review): confirm whether these counts describe the full dataset or only
# the training split.

# roberta-base configuration with both dropout probabilities set to 0.15 and a
# three-way classification head.
config = AutoConfig.from_pretrained(
    "roberta-base",
    hidden_dropout_prob=0.15,
    attention_probs_dropout_prob=0.15,
    num_labels=3
)


tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', config=config)

# Inverse-frequency class weights, rescaled so that they sum to the number of
# classes.
class_counts = torch.tensor([2296, 3319, 10642], dtype=torch.float)
weights = 1.0 / class_counts
weights = weights / weights.sum() * len(class_counts)
# Use the GPU when one is present; an unconditional .to('cuda') raises on a
# CPU-only runtime.
weights = weights.to("cuda" if torch.cuda.is_available() else "cpu")
loss_fn = nn.CrossEntropyLoss(weight=weights)

def focal_loss(logits, labels, gamma=2, weight=None):
    """Class-weighted focal loss, averaged over the batch.

    Each example contributes ``w[y] * (1 - p_t) ** gamma * (-log p_t)``, where
    ``p_t`` is the softmax probability assigned to the true class ``y`` and
    ``w`` is the optional per-class ``weight``.

    Args:
        logits: Float tensor of shape ``(N, C)`` holding unnormalised scores.
        labels: Long tensor of shape ``(N,)`` holding target class indices.
        gamma: Focusing exponent; ``0`` disables the focusing term.
        weight: Optional tensor of ``C`` per-class weights.

    Returns:
        Scalar tensor: the plain (unnormalised-by-weight) mean of the
        per-example losses.
    """
    # p_t must be derived from the *unweighted* cross-entropy. Taking
    # exp(-weighted_ce) would give p_t ** w[y] instead of p_t, so the focusing
    # term would depend on the class weight rather than on model confidence.
    ce_unweighted = F.cross_entropy(logits, labels, reduction='none')
    pt = torch.exp(-ce_unweighted)

    if weight is None:
        ce_loss = ce_unweighted
    else:
        # The class weight only scales each example's loss contribution.
        ce_loss = F.cross_entropy(logits, labels, weight=weight, reduction='none')

    return ((1 - pt) ** gamma * ce_loss).mean()

class WeightedTrainer(Trainer):
    """Trainer whose training loss is the class-weighted focal loss."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run a forward pass and score it with ``focal_loss``.

        Args:
            model: The model to call; it may be a wrapper around the
                underlying network.
            inputs: Batch mapping passed to the model as keyword arguments;
                must contain ``"labels"``.
            return_outputs: When true, also return the model outputs.
            **kwargs: Extra arguments supplied by the caller; ignored.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Read the class dimension from the logits rather than from
        # model.config: a wrapped model (e.g. nn.DataParallel) has no
        # `.config` attribute.
        num_classes = logits.size(-1)
        loss = focal_loss(
            logits.view(-1, num_classes),
            labels.view(-1),
            gamma=1.6,
            # Keep the class weights on the same device as the logits.
            weight=weights.to(logits.device),
        )
        return (loss, outputs) if return_outputs else loss

def tokenize_function(examples):
    """Tokenize a batch of comments to a fixed length of 200 tokens.

    Args:
        examples: Batch mapping with a ``"Comment"`` column (the texts) and a
            ``"labels"`` column.

    Returns:
        The tokenizer output for ``examples["Comment"]`` (padded and truncated
        to 200 tokens) with the batch's ``"labels"`` attached under the same
        key.
    """
    encoded = tokenizer(
        examples["Comment"],
        max_length=200,
        truncation=True,
        padding="max_length",
    )
    encoded["labels"] = examples["labels"]
    return encoded

# Training split: tokenize in batches over 4 worker processes, then expose only
# the listed columns as torch tensors.
tokenized_train = dataset['train'].map(tokenize_function, batched=True, num_proc=4)
tokenized_train.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Test split: same treatment.
tokenized_test = dataset['test'].map(tokenize_function, batched=True, num_proc=4)
tokenized_test.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Show one formatted training example as a sanity check.
print(tokenized_train[0])

In [None]:
# Fine-tuning hyper-parameters: 4 epochs, batch size 8, linear schedule with 5%
# warm-up, fp16, evaluation after every epoch.
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.1,
    eval_strategy="epoch",
    # Checkpoint on the same schedule as evaluation and restore the best
    # checkpoint when training ends. Without these settings,
    # metric_for_best_model / greater_is_better below have no effect and the
    # last-epoch weights are kept regardless of their score.
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to="none",
    max_grad_norm=1.0,
    warmup_ratio=0.05,
    lr_scheduler_type='linear',
    fp16=True,
    metric_for_best_model="balanced_accuracy",
    greater_is_better=True
)

# Build the focal-loss trainer: model and run configuration first, then the
# train/eval splits, then the metric callback. The collator is left as None.
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    data_collator=None,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,Balanced Accuracy,Precision Macro,Recall Macro,F1 Macro,Precision Negative,Recall Negative,F1 Negative,Precision Neutral,Recall Neutral,F1 Neutral,Precision Positive,Recall Positive,F1 Positive
1,0.1247,0.177669,0.876999,0.817481,0.831896,0.817481,0.816223,0.75,0.889952,0.814004,0.830645,0.598837,0.695946,0.915044,0.963653,0.93872
2,0.092,0.171792,0.884994,0.849925,0.821524,0.849925,0.834026,0.75,0.875598,0.807947,0.756677,0.741279,0.748899,0.957895,0.932898,0.945231
3,0.0575,0.241748,0.900369,0.85127,0.860712,0.85127,0.855209,0.824074,0.851675,0.837647,0.819936,0.741279,0.778626,0.938126,0.960857,0.949355
4,0.0318,0.296236,0.899754,0.84375,0.861366,0.84375,0.850922,0.819444,0.84689,0.832941,0.831081,0.715116,0.76875,0.933573,0.969245,0.951075


TrainOutput(global_step=7316, training_loss=0.07631562341426226, metrics={'train_runtime': 1176.9706, 'train_samples_per_second': 49.724, 'train_steps_per_second': 6.216, 'total_flos': 6015019398091200.0, 'train_loss': 0.07631562341426226, 'epoch': 4.0})

In [None]:
# Run an evaluation pass with the trainer and display the returned metrics.
# NOTE(review): presumably this uses the eval_dataset given to the trainer — confirm.
trainer.evaluate()

In [None]:
def predict_sentiment(text):
    """Classify a single comment with the notebook's model.

    Args:
        text: The comment to classify (one string).

    Returns:
        str: ``"Negative"`` when class 0 is predicted, ``"Neutral"`` for
        class 1, and ``"Good"`` for any other class (class 2, positive).
    """
    # Inference only: make sure dropout is disabled.
    model.eval()

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # The tokenizer returns CPU tensors; move them to wherever the model lives
    # (the GPU after training) to avoid a device-mismatch error.
    inputs = inputs.to(model.device)

    # No gradients are needed for prediction.
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=-1).item()

    if prediction == 0:
        return "Negative"
    elif prediction == 1:
        return "Neutral"
    else:
        # NOTE(review): "Good" is inconsistent with the "positive" naming used
        # elsewhere in the notebook; kept so existing callers see the same value.
        return "Good"