In [None]:
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset,DataLoader
from transformers import BertTokenizer,BertModel
import pandas as pd
import matplotlib.pyplot as plt
import tqdm
import numpy as np

In [None]:
# Read the training and validation CSVs (paths are relative to the working directory).
training, validation = (
    pd.read_csv(csv_path)
    for csv_path in ('twitter_training.csv', 'twitter_validation.csv')
)

In [None]:
# Plain-text preview of the first rows of each split, training first.
for frame in (training, validation):
  print(frame.head())

   Tweet_Id       Entity    labels  \
0      2401  Borderlands  Positive   
1      2401  Borderlands  Positive   
2      2401  Borderlands  Positive   
3      2401  Borderlands  Positive   
4      2401  Borderlands  Positive   

                                               texts  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  
   Tweet_Id     Entity      labels  \
0      3364   Facebook  Irrelevant   
1       352     Amazon     Neutral   
2      8312  Microsoft    Negative   
3      4371      CS-GO    Negative   
4      4433     Google     Neutral   

                                               texts  
0  I mentioned on Facebook that I was struggling ...  
1  BBC News - Amazon boss Jeff Bezos rejects clai...  
2  @Microsoft Why do I pay for WORD when it fun

In [None]:
# (rows, columns) of the raw training and validation frames.
tuple(frame.shape for frame in (training, validation))

((74682, 4), (1000, 4))

In [None]:
# Remove exact duplicate rows from both splits (the names are rebound rather than mutated in place).
training = training.drop_duplicates()
validation = validation.drop_duplicates()

In [None]:
# Drop the identifier/topic columns; only the label and text columns are used further down.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this cell
# re-runnable: a second execution no longer raises KeyError for the already-removed columns.
training = training.drop(columns=["Tweet_Id",'Entity'],errors='ignore')
validation = validation.drop(columns=["Tweet_Id",'Entity'],errors='ignore')

In [None]:
# (rows, columns) of both frames after de-duplication and column removal.
tuple(frame.shape for frame in (training, validation))

((71981, 2), (1000, 2))

In [None]:
# Sentiment classes kept for training, numbered in this fixed order starting at 0.
label_map = {name: class_id for class_id, name in enumerate(("Negative", "Neutral", "Positive"))}

In [None]:
# Keep only rows whose sentiment is a key of label_map (anything else, e.g. "Irrelevant",
# is removed) and replace the string label with its integer id.
# .loc[...] followed by .assign(...) builds a new frame instead of writing a column into a
# filtered slice, which is what triggered pandas' SettingWithCopyWarning in the chained form.
training = (
    training.loc[training["labels"].isin(label_map.keys())]
    .assign(labels=lambda frame: frame["labels"].map(label_map))
)
validation = (
    validation.loc[validation["labels"].isin(label_map.keys())]
    .assign(labels=lambda frame: frame["labels"].map(label_map))
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training["labels"] = training["labels"].map(label_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation["labels"] = validation["labels"].map(label_map)


In [None]:
# (rows, columns) of both frames once only the mapped sentiment classes remain.
tuple(frame.shape for frame in (training, validation))

((59397, 2), (828, 2))

In [None]:
# First five training rows with integer-encoded labels.
training.head(n=5)

Unnamed: 0,labels,texts
0,2,im getting on borderlands and i will murder yo...
1,2,I am coming to the borders and I will kill you...
2,2,im getting on borderlands and i will kill you ...
3,2,im coming on borderlands and i will murder you...
4,2,im getting on borderlands 2 and i will murder ...


In [None]:
# First five validation rows with integer-encoded labels.
validation.head(n=5)

Unnamed: 0,labels,texts
1,1,BBC News - Amazon boss Jeff Bezos rejects clai...
2,0,@Microsoft Why do I pay for WORD when it funct...
3,0,"CSGO matchmaking is so full of closet hacking,..."
4,1,Now the President is slapping Americans in the...
5,0,Hi @EAHelp I’ve had Madeleine McCann in my cel...


In [None]:
# Pretrained checkpoint name; passed to BertTokenizer.from_pretrained and used as the
# default model_name of BertSentimentModel.
MODEL_NAME = 'bert-base-uncased'

In [None]:
# Load the pretrained tokenizer for the MODEL_NAME checkpoint.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Dataset that tokenizes one tweet per lookup
class Twitter_Data(Dataset):
  """Map-style dataset pairing each text with its label.

  Texts are coerced to ``str`` once at construction; tokenization happens
  lazily in ``__getitem__`` so nothing is pre-encoded up front.
  """

  def __init__(self,texts,labels,tokenizer,max_length=128):
    """Store the samples and the tokenizer settings.

    Args:
      texts: iterable of raw texts; every element is converted with ``str``.
      labels: indexable collection of class ids, aligned with ``texts``.
      tokenizer: callable invoked as ``tokenizer(text, padding=..., max_length=...,
        truncation=..., return_tensors=...)`` returning a mapping with
        ``'input_ids'`` and ``'attention_mask'`` tensors.
      max_length: length every sample is padded/truncated to.
    """
    self.texts = list(map(str, texts))
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    """Number of stored texts."""
    return len(self.texts)

  def __getitem__(self,idx):
    """Tokenize sample ``idx`` and return its tensors plus the label as a long tensor."""
    encoded = self.tokenizer(
        self.texts[idx],
        padding='max_length',
        max_length=self.max_length,
        truncation=True,
        return_tensors='pt',
    )
    # squeeze(0) removes the leading dimension of the tokenizer output when it has size 1
    sample = {field: encoded[field].squeeze(0) for field in ('input_ids', 'attention_mask')}
    sample['labels'] = torch.tensor(self.labels[idx],dtype=torch.long)
    return sample

In [None]:
# Wrap each split's text/label columns (as plain lists) in a Twitter_Data dataset
# that shares the same tokenizer.
train_dataset, val_dataset = (
    Twitter_Data(texts=frame['texts'].tolist(),labels=frame['labels'].tolist(),tokenizer=tokenizer)
    for frame in (training, validation)
)

In [None]:
# Inspect the first encoded training sample (a dict with input_ids, attention_mask and labels).
train_dataset[0]

{'input_ids': tensor([  101, 10047,  2893,  2006,  3675,  8653,  1998,  1045,  2097,  4028,
          2017,  2035,  1010,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [None]:
# Batch both datasets with the same batch size; only the training loader shuffles.
BATCH_SIZE = 8
train_loader = DataLoader(dataset=train_dataset,batch_size=BATCH_SIZE,shuffle=True)
val_loader = DataLoader(dataset=val_dataset,batch_size=BATCH_SIZE,shuffle=False)

In [None]:
class BertSentimentModel(nn.Module):
  """Pretrained BERT encoder followed by dropout and a linear classification layer."""

  def __init__(self,model_name=MODEL_NAME,num_labels=3):
    """Load the encoder named ``model_name`` and size the output layer to ``num_labels``."""
    super().__init__()
    self.bert = BertModel.from_pretrained(model_name)
    self.dropout = nn.Dropout(p=0.3)
    # The linear layer's input width follows the encoder's configured hidden size.
    hidden_size = self.bert.config.hidden_size
    self.fc = nn.Linear(hidden_size,num_labels)

  def forward(self,input_ids,attention_mask):
    """Return the linear layer's output computed from the encoder's ``pooler_output``."""
    encoded = self.bert(input_ids=input_ids,attention_mask=attention_mask)
    pooled = self.dropout(encoded.pooler_output)
    logits = self.fc(pooled)
    return logits

In [None]:
# Use the GPU when CUDA reports one as available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
# Show which device was selected.
device

device(type='cuda')

In [None]:
# Build the classifier with its default arguments, then move it to the selected device.
model = BertSentimentModel()
model = model.to(device)

In [None]:
# AdamW over every parameter of the model (encoder and linear layer) with one learning rate.
optimizer = AdamW(params=model.parameters(),lr=2e-5)

In [None]:
# Cross-entropy over raw logits and integer class ids, averaged over the batch.
loss_fn = nn.CrossEntropyLoss(reduction='mean')

In [None]:
# Number of full passes over train_loader performed by the training loop.
epochs = 3

In [None]:
# Fine-tuning loop: every epoch is one pass over train_loader only. The loss and accuracy
# printed here are running training metrics; val_loader is not consulted in this cell.
for epoch in range(epochs):
  model.train()  # put the model in training mode for this pass
  total_loss = 0
  n_correct = 0
  n_seen = 0
  for batch in tqdm.tqdm(train_loader):
    optimizer.zero_grad()
    # Move the three tensors of the batch to the compute device.
    input_ids,attention_mask,labels = (
        batch[field].to(device) for field in ('input_ids','attention_mask','labels')
    )

    outputs = model(input_ids,attention_mask)
    loss = loss_fn(outputs,labels)
    loss.backward()
    optimizer.step()

    # Accumulate the epoch statistics from this batch.
    total_loss += loss.item()
    preds = outputs.argmax(dim=1)
    n_correct += (preds==labels).sum().item()
    n_seen += labels.shape[0]

  train_acc = n_correct/n_seen
  print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader)}, Accuracy: {train_acc:.4f}')

100%|██████████| 7425/7425 [24:24<00:00,  5.07it/s]


Epoch 1/3, Loss: 0.5423884959508875, Accuracy: 0.7792


100%|██████████| 7425/7425 [24:21<00:00,  5.08it/s]


Epoch 2/3, Loss: 0.19010775117123632, Accuracy: 0.9254


100%|██████████| 7425/7425 [24:20<00:00,  5.08it/s]

Epoch 3/3, Loss: 0.11007289655183627, Accuracy: 0.9538





In [None]:
# Persist the whole module object with torch.save (not just its parameters).
full_model_path = 'bert_sentiment_model.pth'
torch.save(model,full_model_path)
print(f'Model saved Successfully.')

Model saved Successfully.


In [None]:
# Persist only the parameter/buffer tensors returned by state_dict().
weights_path = 'bert_sentiment_weights.pth'
torch.save(model.state_dict(),weights_path)
print(f'Model Weights Saved Successfully')

Model Weights Saved Successfully


In [None]:
# Evaluate model
def evaluate_text(texts,true_labels):
  """Classify a batch of raw strings and print each prediction plus the batch accuracy.

  Uses the notebook-level ``model``, ``tokenizer`` and ``device``. All texts are
  tokenized and scored as one batch. Returns nothing; results are printed.

  Args:
    texts: list of strings to classify.
    true_labels: integer class ids (0 Negative, 1 Neutral, 2 Positive), one per text.

  Raises:
    ValueError: if ``texts`` is empty or its length differs from ``true_labels``.
  """
  # Validate before touching the model: an empty batch would otherwise end in a
  # division by zero, and zip() would silently ignore unmatched texts/labels.
  if len(texts) == 0:
    raise ValueError('texts must contain at least one item')
  if len(texts) != len(true_labels):
    raise ValueError(
        f'texts and true_labels must have the same length, got {len(texts)} and {len(true_labels)}'
    )

  id_to_label = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}
  model.eval()  # disable dropout for inference
  with torch.no_grad():
    inputs = tokenizer(texts,padding='max_length',max_length=128,truncation=True,return_tensors='pt')
    input_ids,attention_mask = inputs['input_ids'].to(device),inputs['attention_mask'].to(device)
    outputs = model(input_ids,attention_mask)
    # Plain Python ints, so the comparison below is int-to-int rather than tensor-to-int.
    preds = torch.argmax(outputs,dim=1).tolist()

  correct = 0
  for text,pred_label,true_label in zip(texts,preds,true_labels):
    print(f'Text: {text}, Predicted Label: {id_to_label[pred_label]}, True Label: {id_to_label[true_label]}')
    if pred_label == true_label:
      correct += 1

  accuracy = correct/len(texts)
  print(f'Evaluation Accuracy: {accuracy:.4f}')

In [None]:
# Hand-written sample sentences passed to evaluate_text as a quick check.
test_texts = [
    'I Love this product! It works perfactly',
    'Yesterday i had a fight with my friend. I got upset and angry too. I feel like i am gonna kill him if i get chance to do that',
    'This is the worst service i have ever received',
    'The movie was ok, nothing special but not bad at all either.'
]

In [None]:
# Expected class id for each entry of test_texts (0 = Negative, 1 = Neutral, 2 = Positive).
test_labels = [2,0,0,1]

In [None]:
# Print the per-sentence predictions and the accuracy over the hand-written samples.
evaluate_text(test_texts,test_labels)

Text: I Love this product! It works perfactly, Predicted Label: Positive, True Label: Positive
Text: Yesterday i had a fight with my friend. I got upset and angry too. I feel like i am gonna kill him if i get chance to do that, Predicted Label: Negative, True Label: Negative
Text: This is the worst service i have ever received, Predicted Label: Negative, True Label: Negative
Text: The movie was ok, nothing special but not bad at all either., Predicted Label: Neutral, True Label: Neutral
Evaluation Accuracy: 1.0000


In [None]:
# Reload the fully pickled module. Unpickling requires the BertSentimentModel class to be
# defined in this session, and weights_only=False allows arbitrary pickled objects (and thus
# code execution) -- only use it on checkpoint files you created yourself.
# map_location=device lets a checkpoint written from a GPU session load on a CPU-only host.
new_model = torch.load('bert_sentiment_model.pth',map_location=device,weights_only=False)

In [None]:
# Print the module hierarchy of the reloaded model. The module itself is printed, not the
# result of calling it: new_model() would invoke forward() without its required
# input_ids/attention_mask arguments and raise TypeError.
print(new_model)

BertSentimentModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [None]:
# Single-sentence inference with the fine-tuned `model`.
model.eval()  # make sure dropout is off even if no evaluation cell ran before this one
# Named `encoded` rather than `input` so the builtin input() is not shadowed.
encoded = tokenizer('I know him since too long. he is a very good person.' ,padding='max_length',max_length=128,truncation=True,return_tensors='pt')
input_id,attention_mask = encoded['input_ids'].to(device),encoded['attention_mask'].to(device)
with torch.no_grad():  # inference only: no autograd graph needed
  output = model(input_id,attention_mask)
preds = torch.argmax(output,dim=1)
# NOTE: this rebinds the notebook-level label_map (name -> id earlier in the notebook) to the
# inverse id -> name mapping; re-run the earlier definition before re-running label encoding.
label_map = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}
# One sentence was encoded, so preds holds a single class id.
preds_label = label_map[preds.item()]

In [None]:
# Show the sentiment name chosen for the single-sentence prediction.
print(preds_label)

Positive


In [None]:
# Restore the saved parameters into new_model.
# map_location=device lets weights written from a GPU session load on a CPU-only host;
# weights_only=True restricts unpickling to tensors and plain containers, which is all a
# state_dict holds.
new_model.load_state_dict(torch.load('bert_sentiment_weights.pth',map_location=device,weights_only=True))

<All keys matched successfully>

In [None]:
# Print the module hierarchy of new_model after the saved state_dict was loaded into it.
print(new_model)

BertSentimentModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen