In [101]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer,BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from datasets import Dataset
from sklearn.metrics import accuracy_score

In [102]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cpu


In [103]:
train_df = pd.read_csv('twitter_training.csv')
val_df = pd.read_csv('twitter_validation.csv')

In [104]:
train_df.head()

Unnamed: 0,Tweet_Id,Entity,labels,texts
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [105]:
val_df.head()

Unnamed: 0,Tweet_Id,Entity,labels,texts
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [106]:
train_df = train_df.iloc[:,2:]
val_df = val_df.iloc[:,2:]

In [107]:
train_df.shape,val_df.shape

((74682, 2), (1000, 2))

In [108]:
train_df.drop_duplicates(inplace=True)
train_df.drop_duplicates(inplace=True)

In [109]:
label_map = {"Negative":0,"Neutral":1,"Positive":2}

In [110]:
train_df = train_df[train_df['labels'].isin(label_map.keys())] # Remove irrelevant labels
train_df['labels'] = train_df['labels'].map(label_map)
val_df = val_df[val_df['labels'].isin(label_map.keys())] # Remove irrelevant labels
val_df['labels'] = val_df['labels'].map(label_map)

In [111]:
train_df.shape,val_df.shape

((57486, 2), (828, 2))

In [112]:
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

In [113]:
train_dataset,val_dataset

(Dataset({
     features: ['labels', 'texts', '__index_level_0__'],
     num_rows: 57486
 }),
 Dataset({
     features: ['labels', 'texts', '__index_level_0__'],
     num_rows: 828
 }))

In [114]:
train_dataset.set_format(type="torch", columns=["texts", "labels"])
val_dataset.set_format(type="torch", columns=["texts", "labels"])

In [115]:
train_dataset['labels'].to(torch.float64)
val_dataset['labels'].to(torch.float64)

tensor([1., 0., 0., 1., 0., 2., 2., 2., 0., 2., 2., 0., 1., 0., 2., 2., 0., 2.,
        0., 0., 1., 0., 1., 1., 0., 0., 2., 2., 0., 2., 0., 1., 1., 2., 1., 2.,
        1., 1., 1., 2., 1., 0., 0., 0., 1., 2., 0., 0., 2., 2., 2., 2., 2., 0.,
        0., 2., 2., 0., 1., 0., 1., 0., 2., 0., 0., 2., 2., 2., 1., 1., 1., 2.,
        2., 1., 2., 1., 0., 0., 1., 1., 0., 2., 0., 0., 0., 1., 2., 1., 0., 2.,
        2., 1., 2., 1., 2., 0., 1., 1., 1., 0., 1., 0., 1., 1., 2., 2., 1., 0.,
        0., 2., 0., 1., 0., 2., 1., 0., 1., 2., 1., 2., 2., 1., 1., 1., 1., 2.,
        1., 2., 2., 0., 1., 1., 1., 1., 0., 1., 2., 0., 1., 0., 1., 0., 0., 0.,
        2., 2., 2., 1., 1., 2., 1., 1., 1., 2., 1., 0., 0., 1., 2., 2., 1., 2.,
        2., 1., 1., 0., 0., 0., 0., 2., 1., 1., 2., 2., 2., 2., 0., 2., 2., 1.,
        0., 0., 0., 2., 2., 0., 0., 2., 2., 0., 2., 2., 0., 2., 1., 0., 1., 1.,
        2., 0., 2., 2., 1., 2., 0., 0., 2., 2., 2., 2., 1., 1., 2., 0., 1., 2.,
        1., 0., 1., 1., 0., 2., 2., 0., 

In [116]:
train_dataset[0]

{'labels': tensor(2),
 'texts': 'im getting on borderlands and i will murder you all ,'}

In [117]:
MODEL_NAME = 'openai-community/gpt2'

In [118]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

1

In [119]:
print(tokenizer.pad_token)

<|endoftext|>


In [120]:
def tokenize_function(example):
    return tokenizer(example["texts"], padding="max_length", truncation=True,max_length=128)

In [121]:
train_dataset = train_dataset.map(lambda x: {"texts": str(x["texts"])})
val_dataset = val_dataset.map(lambda x: {"texts": str(x["texts"])})

Map:   0%|          | 0/57486 [00:00<?, ? examples/s]

Map:   0%|          | 0/828 [00:00<?, ? examples/s]

In [122]:
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/57486 [00:00<?, ? examples/s]

Map:   0%|          | 0/828 [00:00<?, ? examples/s]

In [123]:
train_dataset[0]

{'labels': tensor(2),
 'texts': 'im getting on borderlands and i will murder you all ,',
 'input_ids': tensor([  320,  1972,   319,  4865,  4447,   290,  1312,   481,  5123,   345,
           477,   837, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256,

In [124]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)
model.to(device)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at openai-community/gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=3, bias=False)
)

In [125]:
model.config.pad_token_id = tokenizer.pad_token_id

In [126]:
peft_config = LoraConfig(
    r=4,  
    lora_alpha=16,  
    lora_dropout=0.1,  
    task_type="SEQ_CLS",
    target_modules=["c_attn", "c_proj"]  # LoRA applied to attention layers
)

In [127]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()  

trainable params: 407,808 || all params: 124,849,920 || trainable%: 0.3266




In [143]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    save_strategy="steps",
    per_device_train_batch_size=8,  # Reduce for Colab Free
    per_device_eval_batch_size=2,
    num_train_epochs=1,  # Increase if Colab Pro
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=1000,
    save_steps=1000,
    eval_steps=1000,
    load_best_model_at_end=True,
    fp16=True,  # Mixed Precision for Memory Efficiency
    metric_for_best_model="accuracy",
    report_to="none"
)



In [144]:
def compute_accuracy(p):
    preds, labels = p
    preds = np.argmax(preds, axis=1)
    return {"accuracy": accuracy_score(labels, preds)}

In [145]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [146]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_accuracy,
    data_collator=data_collator
)

  trainer = Trainer(


In [147]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
1000,1.048,0.948839,0.560386
2000,0.8658,0.818894,0.63285
3000,0.8025,0.78282,0.665459
4000,0.798,0.786335,0.665459
5000,0.7889,0.774365,0.676329
6000,0.7744,0.76239,0.675121
7000,0.7848,0.767137,0.679952


TrainOutput(global_step=7186, training_loss=0.83616247477055, metrics={'train_runtime': 28164.0601, 'train_samples_per_second': 2.041, 'train_steps_per_second': 0.255, 'total_flos': 3773264574283776.0, 'train_loss': 0.83616247477055, 'epoch': 1.0})

In [148]:
trainer.model.save_pretrained('saved_model/gpt_sentiment_model')
tokenizer.save_pretrained('saved_model/gpt_sentiment_tokenizer')

('saved_model/gpt_sentiment_tokenizer\\tokenizer_config.json',
 'saved_model/gpt_sentiment_tokenizer\\special_tokens_map.json',
 'saved_model/gpt_sentiment_tokenizer\\vocab.json',
 'saved_model/gpt_sentiment_tokenizer\\merges.txt',
 'saved_model/gpt_sentiment_tokenizer\\added_tokens.json',
 'saved_model/gpt_sentiment_tokenizer\\tokenizer.json')

In [149]:
trainer.save_model('saved_model/gpt_sent_trainer')

In [150]:
torch.save(model.state_dict(),'saved_model/gpt_sentiment_weights.pth')

In [182]:
# Evaluate Model with Example Texts
def evaluate_example_texts(texts, true_labels):
    model.eval()
    model.to(device)
    correct, total = 0, 0
    label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}

    with torch.no_grad():
        for text, true_label in zip(texts, true_labels):
            inputs = tokenizer(text, truncation=True, return_tensors='pt')
            # input_ids, attention_mask = inputs["input_ids"].to(device), inputs["attention_mask"].to(device)
            outputs = model(**inputs)

            pred_label =  torch.argmax(outputs.logits,dim=1) # Placeholder logic for sentiment extraction from Llama output
            print(f"Text: {text}\nPredicted Sentiment: {label_map[pred_label.item()]} (Actual: {label_map[true_label]})\n")
            if pred_label == true_label:
                correct += 1
            total += 1

    accuracy = correct / total
    print(f"Evaluation Accuracy: {accuracy:.4f}")

In [174]:
inputs = tokenizer("This is the worst service I've ever received.",truncation=True,return_tensors='pt')
true_label=[0]

In [161]:
outputs = model(**inputs)

In [176]:
preds = torch.argmax(outputs.logits,dim=1)

In [184]:
# Test Example Texts
test_texts = [
    "I love this product! It works perfectly.",
    "This is the worst service I've ever received.",
    "The movie was okay, nothing special but not bad either."
]
test_labels = [2, 0, 1]  # Expected sentiments: Positive, Negative, Neutral
evaluate_example_texts(test_texts, test_labels)

Text: I love this product! It works perfectly.
Predicted Sentiment: Positive (Actual: Positive)

Text: This is the worst service I've ever received.
Predicted Sentiment: Negative (Actual: Negative)

Text: The movie was okay, nothing special but not bad either.
Predicted Sentiment: Positive (Actual: Neutral)

Evaluation Accuracy: 0.6667
