In [None]:
import pandas as pd

# Load both splits. The CSV files are read without a header row, so the
# column names are supplied explicitly and shared between the two reads.
COLUMN_NAMES = ['id', 'game', 'sentiment', 'text']

train_df = pd.read_csv(
    r'C:\Users\bhuva\Desktop\projects_2025\LLM\dataset\twitter_training.csv',
    sep=',',
    header=None,
    names=COLUMN_NAMES,
)
valid_df = pd.read_csv(
    r'C:\Users\bhuva\Desktop\projects_2025\LLM\dataset\twitter_validation.csv',
    sep=',',
    header=None,
    names=COLUMN_NAMES,
)

# Peek at the first rows of each split
print("Train Data Sample:")
print(train_df.head())
print("\nValidation Data Sample:")
print(valid_df.head())

# Class balance per split
train_label_counts = train_df['sentiment'].value_counts()
valid_label_counts = valid_df['sentiment'].value_counts()
print("\nLabel distribution in training:", train_label_counts)
print("Label distribution in validation:", valid_label_counts)

# Missing values per column, before any cleaning
print("\nMissing values in training:", train_df.isna().sum())
print("Missing values in validation:", valid_df.isna().sum())

# Integer id for every sentiment class
label2id = {'Positive': 1, 'Negative': 0, 'Neutral': 2, 'Irrelevant': 3}

for frame in (train_df, valid_df):
    # Encode the sentiment strings as integer labels
    frame['label'] = frame['sentiment'].map(label2id)
    # Replace missing tweet text with a fixed placeholder string
    frame['text'] = frame['text'].fillna("no text provided")

# Re-check missing values now that the text column has been filled
print("\nMissing values after handling:")
print("Training:", train_df.isna().sum())
print("Validation:", valid_df.isna().sum())

Train Data Sample:
     id         game sentiment  \
0  2401  Borderlands  Positive   
1  2401  Borderlands  Positive   
2  2401  Borderlands  Positive   
3  2401  Borderlands  Positive   
4  2401  Borderlands  Positive   

                                                text  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  

Validation Data Sample:
     id       game   sentiment  \
0  3364   Facebook  Irrelevant   
1   352     Amazon     Neutral   
2  8312  Microsoft    Negative   
3  4371      CS-GO    Negative   
4  4433     Google     Neutral   

                                                text  
0  I mentioned on Facebook that I was struggling ...  
1  BBC News - Amazon boss Jeff Bezos rejects clai...  
2  @Microsoft Why do I pay for WORD when it funct..

In [2]:
from datasets import Dataset

# Wrap each pandas DataFrame in a Hugging Face Dataset object
train_dataset, valid_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, valid_df)
)

# Show one converted record
first_train_example = train_dataset[0]
print("\nExample from train_dataset:")
print(first_train_example)

# Report how many rows each split holds
n_train, n_valid = len(train_dataset), len(valid_dataset)
print("\nDataset sizes:")
print("Training samples:", n_train)
print("Validation samples:", n_valid)


Example from train_dataset:
{'id': 2401, 'game': 'Borderlands', 'sentiment': 'Positive', 'text': 'im getting on borderlands and i will murder you all ,', 'label': 1}

Dataset sizes:
Training samples: 74682
Validation samples: 1000


In [None]:
from datasets import Value

# Store the label column as 64-bit integers in both splits
int64_label = Value("int64")
train_dataset = train_dataset.cast_column("label", int64_label)
valid_dataset = valid_dataset.cast_column("label", int64_label)

Casting the dataset:   0%|          | 0/74682 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [4]:
import re
import string

# Compiled once instead of on every call.
# URLs are matched case-insensitively because this step runs before the text
# is lower-cased; otherwise "HTTP://..." / "WWW...." links would survive and be
# turned into junk tokens by the punctuation-stripping step.
_URL_RE = re.compile(r'http\S+|www\.\S+', re.IGNORECASE)
_HANDLE_RE = re.compile(r'@\w+')
_DIGITS_RE = re.compile(r'\d+')
_HTML_ENTITY_RE = re.compile(r'&\w+;')
_WHITESPACE_RE = re.compile(r'\s+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_tweet(text):
    """Normalise a raw tweet into lower-case text without markup or punctuation.

    Steps, in order: remove URLs, remove @handles, drop '#' characters,
    remove digits, remove HTML entities such as ``&amp;``, lower-case,
    strip ASCII punctuation, collapse whitespace runs and trim the ends.

    Parameters:
        text: The raw tweet. ``None`` yields an empty string; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string (possibly empty).
    """
    if text is None:
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = _URL_RE.sub('', text)
    text = _HANDLE_RE.sub('', text)
    # Keep the hashtag word, drop only the '#' marker
    text = text.replace('#', '')
    text = _DIGITS_RE.sub('', text)
    text = _HTML_ENTITY_RE.sub('', text)
    text = text.lower()
    text = text.translate(_PUNCTUATION_TABLE)
    # Earlier removals leave gaps behind; collapse them to single spaces
    return _WHITESPACE_RE.sub(' ', text).strip()

In [5]:
import re
import string
from transformers import AutoTokenizer

# Load the pre-trained tokenizer from its checkpoint name
tokenizer_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

def preprocess_function(examples):
    """Clean and tokenize one batch of examples (used with ``Dataset.map(..., batched=True)``).

    Parameters:
        examples: Mapping from column name to the list of values in the batch.
            Reads "text" and, when present, "label".

    Returns:
        The tokenizer output for the cleaned texts, padded/truncated to 128
        tokens. When the batch has a "label" column the output also carries a
        "label" list in which missing labels are replaced by -100.
    """
    cleaned_text = [preprocess_tweet(str(text)) for text in examples.get("text", [])]
    encoded = tokenizer(cleaned_text, padding="max_length", truncation=True, max_length=128)

    if "label" in examples:
        # Return the sanitised labels explicitly instead of only assigning them
        # to the input batch, so the result does not depend on whether the
        # caller picks up in-place changes made to `examples`.
        encoded["label"] = [
            -100 if pd.isna(label) else int(label) for label in examples["label"]
        ]
    return encoded

# Clean and tokenize both splits, one batch at a time
train_dataset = train_dataset.map(preprocess_function, batched=True)
valid_dataset = valid_dataset.map(preprocess_function, batched=True)

# Decode the first three training rows back to text as a visual check
print("\nExample of processed training data:")
for example_number, row_index in enumerate(range(3), start=1):
    processed_row = train_dataset[row_index]
    print(f"\nExample {example_number}:")
    print("Text:", tokenizer.decode(processed_row['input_ids']))
    print("Label:", processed_row['label'])

Map:   0%|          | 0/74682 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]


Example of processed training data:

Example 1:
Text: [CLS] im getting on borderlands and i will murder you all [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
Label: 1

Example 2:
Text: [CLS] i am coming to the borders and i will kill you all [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]

In [6]:
# Drop the raw columns that are not fed to the model
columns_to_remove = ["id", "game", "text", "sentiment"]
train_dataset = train_dataset.remove_columns(columns_to_remove)
valid_dataset = valid_dataset.remove_columns(columns_to_remove)

# Have both splits hand back PyTorch tensors
for split in (train_dataset, valid_dataset):
    split.set_format("torch")

# Show one fully processed record
formatted_sample = train_dataset[0]
print("\nTokenized and formatted training sample:")
print(formatted_sample)


Tokenized and formatted training sample:
{'label': tensor(1), 'input_ids': tensor([  101, 10047,  2893,  2006,  3675,  8653,  1998,  1045,  2097,  4028,
         2017,  2035,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,   

In [7]:
from transformers import AutoModelForSequenceClassification
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
import torch
from torch import nn

# Balanced per-class weights computed from the training labels
labels = train_df['label'].to_numpy()
present_classes = np.unique(labels)
class_weights = torch.FloatTensor(
    compute_class_weight('balanced', classes=present_classes, y=labels)
)

# Initialize the model with one output per entry in label2id
n_classes = len(label2id)
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=n_classes,
)



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from transformers import TrainingArguments

# All training hyper-parameters collected in one place, then unpacked
training_config = {
    # Where checkpoints and logs are written
    "output_dir": r'C:\Users\bhuva\Desktop\projects_2025\LLM\output',
    "logging_dir": r'C:\Users\bhuva\Desktop\projects_2025\LLM\logging',
    "logging_steps": 50,
    "report_to": ["tensorboard"],
    # Evaluate and checkpoint once per epoch; keep at most two checkpoints
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "save_total_limit": 2,
    # Reload the checkpoint with the best "f1" once training finishes
    "load_best_model_at_end": True,
    "metric_for_best_model": "f1",
    # Optimisation settings
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "num_train_epochs": 3,
    "weight_decay": 0.01,
    "max_grad_norm": 1.0,
    "warmup_ratio": 0.1,
}

training_args = TrainingArguments(**training_config)


In [None]:
from transformers import Trainer, EarlyStoppingCallback
import evaluate
import numpy as np
import torch.nn as nn

from functools import lru_cache


@lru_cache(maxsize=None)
def _load_metric(name):
    """Load an `evaluate` metric by name once and reuse it on later calls."""
    return evaluate.load(name)


def compute_metrics(eval_pred):
    """Compute evaluation metrics from model logits and reference labels.

    Parameters:
        eval_pred: A ``(logits, labels)`` pair; predictions are the argmax of
            the logits over the last axis.

    Returns:
        A dict with the keys "accuracy", "f1" (weighted average), "rouge1",
        "rouge2", "rougeL" and "bleu".

    ROUGE and BLEU are computed on the class *names* (the keys of
    ``label2id``) that the predicted and reference ids map to.
    NOTE(review): the logged runs in this notebook show rouge2 and bleu at 0.0
    and rouge1 equal to accuracy, so they seem to add nothing beyond accuracy
    for this task; confirm whether they are still wanted.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Classification metrics on the integer ids.
    # Metric objects come from the cache so they are not re-loaded on every
    # evaluation pass.
    accuracy = _load_metric("accuracy").compute(predictions=predictions, references=labels)
    f1 = _load_metric("f1").compute(predictions=predictions, references=labels, average='weighted')

    # Convert numeric labels back to text for ROUGE and BLEU
    id2label = {v: k for k, v in label2id.items()}
    pred_texts = [id2label[int(p)] for p in predictions]
    label_texts = [id2label[int(l)] for l in labels]

    rouge = _load_metric("rouge").compute(predictions=pred_texts, references=label_texts)
    # BLEU takes a list of reference lists, hence the extra nesting
    bleu = _load_metric("bleu").compute(predictions=pred_texts, references=[[t] for t in label_texts])

    return {
        "accuracy": accuracy["accuracy"],
        "f1": f1["f1"],
        "rouge1": rouge["rouge1"],
        "rouge2": rouge["rouge2"],
        "rougeL": rouge["rougeL"],
        "bleu": bleu["bleu"]
    }

class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy with optional per-class weights.

    Parameters:
        class_weights: Optional 1-D tensor with one weight per class. When it
            is None the cross-entropy loss is unweighted.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.

    Only ``compute_loss`` is overridden. ``training_step`` is deliberately
    inherited, so the backward pass goes through the base Trainer's own
    implementation rather than a bare ``loss.backward()``.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model without labels and compute the (weighted) cross-entropy.

        Parameters:
            model: The model to call.
            inputs: Mapping of model inputs; "labels" is read from it and
                withheld from the forward call.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted for signature compatibility; unused.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
            The loss is None when the batch carries no "labels".
        """
        labels = inputs.get("labels")
        # Labels are withheld so the model does not compute its own loss
        model_inputs = {k: v for k, v in inputs.items() if k != "labels"}

        outputs = model(**model_inputs)
        logits = outputs.logits

        loss = None
        if labels is not None:
            # class_weights defaults to None; fall back to an unweighted loss
            # instead of failing on `None.to(...)`.
            weight = None
            if self.class_weights is not None:
                weight = self.class_weights.to(logits.device)
            loss_fct = nn.CrossEntropyLoss(weight=weight)
            # The class count is taken from the logits themselves
            loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

        return (loss, outputs) if return_outputs else loss
    
def validate_datasets():
    """Sanity-check the processed datasets before training.

    Verifies that the first training example exposes 'input_ids',
    'attention_mask' and 'label', then prints the per-class label counts of
    the training and validation splits.

    Returns:
        True when the check passes.

    Raises:
        ValueError: If a required key is missing from the first training example.
    """
    required_keys = ['input_ids', 'attention_mask', 'label']

    # Check if datasets are properly formatted
    sample = train_dataset[0]
    for key in required_keys:
        if key not in sample:
            raise ValueError(f"Missing required key {key} in dataset")

    def label_counts(dataset):
        # Read only the label column. Iterating over whole examples would also
        # build the input_ids / attention_mask of every row just to discard them.
        return np.bincount(np.asarray(dataset.with_format("numpy")["label"]))

    # Verify label distribution
    print("\nLabel distribution in processed datasets:")
    print("Training:", label_counts(train_dataset))
    print("Validation:", label_counts(valid_dataset))

    return True


# Assemble the trainer with the class-weighted loss and early stopping
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)
trainer = CustomTrainer(
    class_weights=class_weights,
    callbacks=[early_stopping],
    compute_metrics=compute_metrics,
    eval_dataset=valid_dataset,
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)

# Train only once the dataset checks have passed
datasets_ok = validate_datasets()
if datasets_ok:
    print("\nDatasets validated successfully. Starting training...")
    trainer.train()





Label distribution in processed datasets:
Training: [22542 20832 18318 12990]
Validation: [266 277 285 172]

Datasets validated successfully. Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rouge1,Rouge2,Rougel,Bleu
1,0.6495,0.381315,0.871,0.870973,0.871,0.0,0.872,0.0
2,0.3105,0.167275,0.945,0.944956,0.945,0.0,0.945,0.0
3,0.2093,0.184608,0.952,0.951991,0.952,0.0,0.952,0.0


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]


Label distribution in processed datasets:
Training: [22542 20832 18318 12990]
Validation: [266 277 285 172]

Datasets validated successfully. Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rouge1,Rouge2,Rougel,Bleu
0,0.1977,0.258356,0.946,0.946085,0.946,0.0,0.946,0.0


KeyboardInterrupt: 

In [12]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Persist the in-memory model together with its tokenizer
model_path = r"C:\Users\bhuva\Desktop\projects_2025\LLM\best_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_path)
print(f"Model and tokenizer saved to {model_path}")

# Evaluate through the trainer and print the raw metrics dict
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results, sep="\n")


def predict(texts):
    """Predict a sentiment label name for each raw tweet.

    Parameters:
        texts: An iterable of tweet strings. A single string is treated as
            one tweet rather than being iterated character by character.

    Returns:
        A list of label names (keys of ``label2id``), one per input text, in
        input order. Empty input yields an empty list.

    Side effect: puts ``model`` into eval mode.
    """
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        # Nothing to classify; skip the tokenizer/model round-trip
        return []

    cleaned_texts = [preprocess_tweet(text) for text in texts]
    inputs = tokenizer(cleaned_texts, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}  # move inputs to the device used by the model

    # Inference only: switch off dropout and skip building the autograd graph
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    predictions = torch.argmax(outputs.logits, dim=-1)
    id2label = {v: k for k, v in label2id.items()}
    return [id2label[int(pred)] for pred in predictions]

# Example usage: classify two hand-written tweets
new_tweets = [
    "This game is absolutely amazing!",
    "I didn't like how the match turned out."
]
predictions = predict(new_tweets)

print("Predictions on new tweets:")
for index, tweet in enumerate(new_tweets):
    print(f"Tweet: {tweet} --> Predicted Sentiment: {predictions[index]}")

Model and tokenizer saved to C:\Users\bhuva\Desktop\projects_2025\LLM\best_model
Evaluation Results:
{'eval_loss': 0.2583555579185486, 'eval_accuracy': 0.946, 'eval_f1': 0.946084634519709, 'eval_rouge1': 0.946, 'eval_rouge2': 0.0, 'eval_rougeL': 0.946, 'eval_bleu': 0.0}
Predictions on new tweets:
Tweet: This game is absolutely amazing! --> Predicted Sentiment: Positive
Tweet: I didn't like how the match turned out. --> Predicted Sentiment: Neutral


In [14]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _load_metric(name):
    """Load an `evaluate` metric by name once and reuse it on later calls."""
    import evaluate

    return evaluate.load(name)


def compute_metrics(eval_pred):
    """
    Compute evaluation metrics for the model predictions.
    This includes accuracy, F1, ROUGE, and BLEU scores.

    Parameters:
        eval_pred: A tuple containing the model logits and the true labels.

    Returns:
        A dictionary with keys: "accuracy", "f1", "rouge1", "rouge2", "rougeL", and "bleu".

    NOTE(review): this re-defines a function of the same name from an earlier
    cell. The `trainer` built there was handed the earlier function object, so
    this definition only affects code that looks up `compute_metrics` afterwards.
    """
    import numpy as np

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Compute classification metrics.
    # Metric objects come from the cache so they are not re-loaded on every call.
    accuracy = _load_metric("accuracy").compute(predictions=predictions, references=labels)
    f1 = _load_metric("f1").compute(predictions=predictions, references=labels, average='weighted')

    # Map numeric labels back to text since ROUGE and BLEU are text-oriented
    id2label = {v: k for k, v in label2id.items()}
    pred_texts = [id2label[int(p)] for p in predictions]
    label_texts = [id2label[int(l)] for l in labels]

    # Compute ROUGE scores (this computes multiple ROUGE metrics)
    rouge_results = _load_metric("rouge").compute(predictions=pred_texts, references=label_texts)

    # Compute BLEU score; it takes a list of reference lists, hence the nesting
    bleu_results = _load_metric("bleu").compute(
        predictions=pred_texts,
        references=[[ref] for ref in label_texts]
    )

    return {
        "accuracy": accuracy["accuracy"],
        "f1": f1["f1"],
        "rouge1": rouge_results["rouge1"],
        "rouge2": rouge_results["rouge2"],
        "rougeL": rouge_results["rougeL"],
        "bleu": bleu_results["bleu"]
    }

In [15]:
import numpy as np
from sklearn.metrics import classification_report


banner = "=" * 30

# Aggregate metrics from the trainer's evaluation loop
eval_results = trainer.evaluate()

print(banner, "Evaluation Scores", banner)
for metric, score in eval_results.items():
    print(f"{metric:15s}: {score:,.4f}")

# Per-example predictions on the validation split
predictions_output = trainer.predict(valid_dataset)
logits, labels = predictions_output.predictions, predictions_output.label_ids
preds = np.argmax(logits, axis=1)

# Translate ids back to class names so the report is readable
id2label = {index: name for name, index in label2id.items()}
true_labels_text = [id2label[label] for label in labels]
pred_labels_text = [id2label[pred] for pred in preds]

print("\n" + banner + " Classification Report " + banner)
print(classification_report(true_labels_text, pred_labels_text))

eval_loss      : 0.2584
eval_accuracy  : 0.9460
eval_f1        : 0.9461
eval_rouge1    : 0.9460
eval_rouge2    : 0.0000
eval_rougeL    : 0.9460
eval_bleu      : 0.0000

              precision    recall  f1-score   support

  Irrelevant       0.96      0.91      0.93       172
    Negative       0.98      0.94      0.96       266
     Neutral       0.91      0.96      0.94       285
    Positive       0.94      0.96      0.95       277

    accuracy                           0.95      1000
   macro avg       0.95      0.94      0.94      1000
weighted avg       0.95      0.95      0.95      1000

