Imports and Environment Setup

In [None]:
import os
import re
import emoji
import contractions
from langdetect import detect
from googletrans import Translator
import nltk
from nltk.corpus import stopwords
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset
from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    AutoTokenizer
)
from sklearn.model_selection import train_test_split
from datasets import Dataset  

# Set the Weights & Biases mode switch to "disabled" for this process.
os.environ.update({"WANDB_MODE": "disabled"})

# Download the NLTK stopword corpus and keep the English entries as a set,
# which is what the cleaning function tests membership against.
nltk.download('stopwords')
stop_words = {word for word in stopwords.words('english')}

# Single googletrans client, reused by the cleaning function.
translator = Translator()


: 

Cleaning Functions

In [None]:
def clean_comment(text):
    """Normalize a raw comment into lowercase, stopword-free text.

    Steps, in order: best-effort translation to English, lowercasing, URL and
    timestamp removal, mention replacement with ``<USER>``, emoji-to-words
    conversion, contraction expansion, squeezing of characters repeated three
    or more times, removal of everything except ``a-z`` and whitespace (the
    ``<USER>`` placeholder is kept), promotional-phrase removal and stopword
    filtering.

    Args:
        text: Raw comment. Any value that is not a ``str`` (for example a
            missing value read from a CSV) is treated as an empty comment.

    Returns:
        str: The cleaned comment; may be an empty string.
    """
    if not isinstance(text, str):
        return ''

    # Translate first: the character filter further down deletes every
    # non a-z character, so language detection has to see the original text.
    try:
        if text.strip() and detect(text) != 'en':
            translated = translator.translate(text, dest='en').text
            if isinstance(translated, str):
                text = translated
    except Exception:
        # Best effort only: if detection or translation fails for any reason,
        # continue with the untranslated text instead of aborting the clean.
        pass

    # Lowercase text
    text = text.lower()
    # Remove URLs (the dot after "www" is escaped so it must be a literal dot)
    text = re.sub(r'http\S+|www\.\S+', '', text)
    # Remove timestamps such as 1:23 while the digits are still present
    text = re.sub(r'\b\d{1,2}:\d{2}\b', '', text)
    # Replace mentions with <USER>, padded so it stays a separate token
    text = re.sub(r'@\w+', ' <USER> ', text)
    # Convert emojis to words: space delimiters and underscore replacement
    # keep each word of the emoji name separate instead of fusing them
    text = emoji.demojize(text, delimiters=(' ', ' ')).replace('_', ' ')
    # Expand contractions
    text = contractions.fix(text)
    # Reduce characters repeated three or more times to a single occurrence
    text = re.sub(r'(.)\1{2,}', r'\1', text)
    # Keep only a-z and whitespace, filtering around the <USER> placeholder
    # so the placeholder itself is not erased by the filter
    text = '<USER>'.join(
        re.sub(r'[^a-z\s]', '', piece) for piece in text.split('<USER>')
    )
    # Collapse whitespace so the multi-word promotional phrase can match
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove promotional phrases before stopword filtering; "out" and "my"
    # are filtered against stop_words below, which would break the phrase
    text = re.sub(r'\b(?:subscribe|check out my channel)\b', '', text)
    # Remove stopwords; split/join also normalizes the remaining whitespace
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text


Data Loading and Preparation

In [None]:
# Location of the labelled comments CSV. The COMMENTS_CSV_PATH environment
# variable overrides the default so the notebook can run on other machines.
csv_path = os.environ.get(
    'COMMENTS_CSV_PATH',
    r'D:\College\Projects\Comment Analyzer\data\youtube_comments_cleaned.csv',
)
df = pd.read_csv(csv_path)

# Explicit copy: the column assignments below then write to an independent
# frame rather than to a selection of the frame returned by read_csv.
df = df[['VideoID', 'CommentText', 'Sentiment', 'Likes', 'Replies']].copy()

# Rows without text or label cannot be cleaned or mapped to a class id.
df = df.dropna(subset=['CommentText', 'Sentiment'])

# Non-numeric engagement counts become 0; negative counts are clipped to 0 so
# the derived sample weight below is always at least 1.
df['Likes'] = pd.to_numeric(df['Likes'], errors='coerce').fillna(0).clip(lower=0)
df['Replies'] = pd.to_numeric(df['Replies'], errors='coerce').fillna(0).clip(lower=0)

print(df.head())

# Relative contribution of likes and replies to the per-comment weight.
alpha = 1.0
beta = 0.5

# The +1 gives comments with no engagement a weight of 1.
df['sample_weight'] = (alpha * df['Likes']) + (beta * df['Replies']) + 1
df['cleaned_comment'] = df['CommentText'].apply(clean_comment)

Custom PyTorch Dataset

In [None]:
# Sentiment label -> class index. Lowercase and capitalized spellings of each
# label map to the same index.
label2id = dict(
    negative=0,
    neutral=1,
    positive=2,
    Neutral=1,
    Positive=2,
    Negative=0,
)

# Tokenizer loaded from the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# The base class is spelled out in full: in this notebook the bare name
# `Dataset` is rebound by the later `from datasets import Dataset`, so it no
# longer refers to the PyTorch dataset class.
class YouTubeCommentsDataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset over a DataFrame of cleaned comments.

    Each item is a dict holding the tokenizer outputs for one comment plus
    ``labels`` (class index) and ``sample_weight`` (float tensor).

    Args:
        dataframe: Frame with ``cleaned_comment``, ``Sentiment`` and
            ``sample_weight`` columns.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=...,
            padding=..., max_length=..., return_tensors='pt')`` that returns a
            mapping of tensors with a leading batch axis of size 1.
        max_length: Length every encoded comment is padded/truncated to.
        label_map: Optional mapping from sentiment label to class index.
            Defaults to the notebook-level ``label2id``.
    """

    def __init__(self, dataframe, tokenizer, max_length=128, label_map=None):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Fall back to the notebook-level mapping when none is supplied.
        self.label_map = label2id if label_map is None else label_map

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.dataframe)

    def _label_id(self, raw_label):
        """Resolve a label to its class index, tolerating case and padding."""
        if raw_label in self.label_map:
            return self.label_map[raw_label]
        normalized = str(raw_label).strip().lower()
        if normalized in self.label_map:
            return self.label_map[normalized]
        raise KeyError(f"Unknown sentiment label: {raw_label!r}")

    def __getitem__(self, index):
        """Encode the row at positional ``index`` and attach label and weight."""
        row = self.dataframe.iloc[index]
        comment = row['cleaned_comment']
        if not isinstance(comment, str):
            # Missing text is encoded as an empty comment.
            comment = ''
        encoding = self.tokenizer(
            comment,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        # Drop only the batch axis; a bare squeeze() would also drop the
        # sequence axis when max_length is 1.
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(self._label_id(row['Sentiment']), dtype=torch.long)
        item['sample_weight'] = torch.tensor(float(row['sample_weight']), dtype=torch.float)
        return item

# Create an instance of the custom dataset
custom_dataset = YouTubeCommentsDataset(df, tokenizer)

# Print up to the first 5 samples; the bound keeps this cell from raising
# IndexError when the dataset holds fewer than 5 rows.
for i in range(min(5, len(custom_dataset))):
    print(f"Custom Dataset Sample {i}:", custom_dataset[i])


Feature Mapping for Hugging Face Trainer

In [None]:
# Sentiment label -> class index, listed as (label, index) pairs.
label2id = dict([
    ('negative', 0),
    ('neutral', 1),
    ('positive', 2),
    ('Neutral', 1),
    ('Positive', 2),
    ('Negative', 0),
])

# Checkpoint name used for the tokenizer here and for the model later on.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def create_input_features(example):
    """Tokenize one dataset row and attach its label and loss weight.

    Args:
        example: Mapping for a single row. Must contain ``Sentiment``, a text
            column (``cleaned_comment`` is preferred, ``CommentText`` is the
            fallback) and a weight column (``Weightage`` is preferred,
            ``sample_weight`` is the fallback).

    Returns:
        The tokenizer output with ``labels`` (class index) and
        ``sample_weight`` (float) added.

    Raises:
        KeyError: If the normalized sentiment label is not in ``label2id`` or
            a required column is missing.
    """
    # Prefer the cleaned text: predict_sentiment() cleans a comment before
    # tokenizing it, so training on raw text would not match inference.
    text = example['cleaned_comment'] if 'cleaned_comment' in example else example['CommentText']
    if not isinstance(text, str):
        # Missing text is encoded as an empty comment.
        text = ''
    encodings = tokenizer(text, padding='max_length', truncation=True, max_length=128)
    encodings['labels'] = label2id[str(example['Sentiment']).strip().lower()]
    weight = example['Weightage'] if 'Weightage' in example else example['sample_weight']
    encodings['sample_weight'] = float(weight)
    return encodings

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
train_df, eval_df = train_test_split(df, test_size=0.2, random_state=42)

# assign() returns new frames, so the objects produced by the split are never
# written to in place (avoids pandas' chained-assignment warning).
train_df = train_df.assign(
    Weightage=(alpha * train_df['Likes']) + (beta * train_df['Replies']) + 1
)
eval_df = eval_df.assign(
    Weightage=(alpha * eval_df['Likes']) + (beta * eval_df['Replies']) + 1
)

# Convert each frame to a Hugging Face dataset and tokenize row by row.
train_dataset = Dataset.from_pandas(train_df).map(create_input_features)
eval_dataset = Dataset.from_pandas(eval_df).map(create_input_features)


Define the Custom Model with Weighted Loss

In [None]:
class WeightedBertForSequenceClassification(BertForSequenceClassification):
    """BERT sequence classifier whose loss is weighted per example.

    The weighting is implemented in ``forward`` because the Hugging Face
    ``Trainer`` obtains the loss by calling the model itself; it does not call
    a ``compute_loss`` method defined on the model. Declaring ``sample_weight``
    as a ``forward`` parameter also matters for the data pipeline: with the
    default ``remove_unused_columns=True`` the Trainer drops dataset columns
    that ``forward`` does not accept.

    NOTE(review): weights are applied unnormalized, as in the original
    formula, so the loss scale grows with likes/replies. Confirm this is
    intended.
    """

    @staticmethod
    def _weighted_loss(logits, labels, sample_weights=None):
        """Mean of the per-example cross-entropy, scaled by the given weights."""
        per_example = F.cross_entropy(logits, labels, reduction='none')
        if sample_weights is None:
            return per_example.mean()
        weights = sample_weights.to(device=per_example.device, dtype=per_example.dtype)
        return (per_example * weights.view(-1)).mean()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                position_ids=None, head_mask=None, inputs_embeds=None,
                labels=None, output_attentions=None, output_hidden_states=None,
                return_dict=None, sample_weight=None):
        """Run the parent forward pass, then apply the weights to the loss.

        Args:
            sample_weight: Optional tensor with one weight per example. When it
                and ``labels`` are both given, the returned loss is the
                weighted cross-entropy; otherwise the parent's output is
                returned unchanged.
            Other arguments are forwarded to the parent ``forward``.

        Returns:
            Whatever the parent ``forward`` returns, with the loss replaced by
            the weighted loss when weights apply.
        """
        candidate_args = {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids,
            'position_ids': position_ids,
            'head_mask': head_mask,
            'inputs_embeds': inputs_embeds,
            'labels': labels,
            'output_attentions': output_attentions,
            'output_hidden_states': output_hidden_states,
            'return_dict': return_dict,
        }
        # Forward only what the caller actually supplied, so arguments left at
        # their None default are never pushed into the parent signature.
        supplied = {name: value for name, value in candidate_args.items() if value is not None}
        outputs = super().forward(**supplied)

        if labels is None or sample_weight is None:
            return outputs

        if isinstance(outputs, tuple):
            # Tuple outputs with labels are laid out as (loss, logits, ...).
            logits = outputs[1]
            loss = self._weighted_loss(
                logits.view(-1, logits.size(-1)), labels.view(-1), sample_weight
            )
            return (loss,) + outputs[1:]

        logits = outputs.logits
        # The parent already populated "loss" (labels were passed), so this
        # overwrites that entry with the weighted value.
        outputs['loss'] = self._weighted_loss(
            logits.view(-1, logits.size(-1)), labels.view(-1), sample_weight
        )
        return outputs

    def compute_loss(self, model, inputs, return_outputs=False):
        """Compute the weighted loss for a batch by hand.

        Kept for backward compatibility with code that calls it directly.

        Args:
            model: Model to run on the remaining inputs.
            inputs: Mapping containing ``labels`` and ``sample_weight`` plus
                the model inputs. It is copied, not modified.
            return_outputs: When True, return ``(loss, outputs)``.

        Returns:
            The weighted loss, or ``(loss, outputs)`` if ``return_outputs``.
        """
        # Work on a shallow copy so the caller's batch keeps its entries.
        inputs = dict(inputs)
        labels = inputs.pop("labels")
        sample_weights = inputs.pop("sample_weight")
        outputs = model(**inputs)
        weighted_loss = self._weighted_loss(outputs.logits, labels, sample_weights)
        return (weighted_loss, outputs) if return_outputs else weighted_loss

model = WeightedBertForSequenceClassification.from_pretrained(model_name, num_labels=3)


Set Up Training and Trainer

In [None]:
# Training hyperparameters, spelled out as a mapping and expanded into
# TrainingArguments. Evaluation and checkpointing both run once per epoch,
# and the best checkpoint is reloaded when training finishes.
training_args = TrainingArguments(**{
    'output_dir': './results',
    'logging_dir': './logs',
    'logging_steps': 100,
    'num_train_epochs': 8,
    'per_device_train_batch_size': 16,
    'learning_rate': 2e-5,
    'weight_decay': 0.01,
    'eval_strategy': "epoch",
    'save_strategy': "epoch",
    'load_best_model_at_end': True,
})

def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each row
            is the argmax of its logits over the last axis.

    Returns:
        dict: ``{'accuracy': fraction of rows whose prediction equals the label}``.
    """
    logits, labels = eval_pred
    predicted_ids = logits.argmax(axis=-1)
    is_correct = predicted_ids == labels
    return dict(accuracy=is_correct.mean())

# Wire the model, arguments, datasets and metric function into the Trainer.
trainer = Trainer(**dict(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
))


Train, Save, and Use the Model

In [None]:
# Run fine-tuning.
trainer.train()

# Write the model and its tokenizer to the same directory.
model_save_path = "/content/fine_tuned_bert_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_save_path)

def predict_sentiment(comment, likes=0, replies=0, alpha=1.0, beta=0.5):
    """Predict the sentiment class of a single comment.

    The comment is cleaned with ``clean_comment``, tokenized to 128 tokens and
    passed through the notebook-level ``model`` with gradients disabled and
    the model temporarily in evaluation mode.

    Args:
        comment: Raw comment text.
        likes, replies, alpha, beta: Accepted for backward compatibility. They
            used to scale all class probabilities by one scalar weight, which
            cannot change which class has the highest probability, so they do
            not influence the result.

    Returns:
        tuple: ``(predicted_class, probabilities)`` where ``predicted_class``
        is the integer index of the most probable class and ``probabilities``
        is the softmax of the model logits, moved to the CPU.
    """
    clean_text = clean_comment(comment)
    inputs = tokenizer(clean_text, return_tensors="pt", truncation=True, max_length=128, padding='max_length')
    # The model may live on an accelerator after training; the inputs must be
    # on the same device or the forward pass raises a device-mismatch error.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # Evaluation mode switches dropout off so repeated calls give the same
    # probabilities; the previous mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        model.train(was_training)

    probabilities = F.softmax(outputs.logits, dim=-1).cpu()
    predicted_class = probabilities.argmax().item()
    return predicted_class, probabilities

# Smoke test: classify one comment together with its engagement counts.
test_comment = "This video is amazing! 😍"
pred_class, probs = predict_sentiment(test_comment, likes=10, replies=2)
for caption, value in (
    ("Predicted sentiment class:", pred_class),
    ("Probabilities:", probs),
):
    print(caption, value)
