In [None]:
## Dataset Used: https://www.kaggle.com/datasets/jp797498e/twitter-entity-sentiment-analysis
## Base Model: RoBERTa 
## Task: Sentiment Analysis

In [1]:
# Load the pretrained RoBERTa tokenizer and encoder from the Hugging Face Hub.
import torch
from transformers import AutoTokenizer, AutoModel

# A tokenizer holds no weights to place on a device, so no device_map is passed to it.
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")

# Use the first GPU when one is available and fall back to the CPU otherwise,
# matching the CUDA-availability check the later cells use to pick `device`.
roberta_device_map = 'cuda:0' if torch.cuda.is_available() else 'cpu'
roberta_model = AutoModel.from_pretrained("FacebookAI/roberta-base", device_map=roberta_device_map)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
import pandas as pd

# Read the training and validation splits of the Twitter entity-sentiment dataset.
# NOTE(review): read_csv is called with its default header handling, and the
# preview displayed below shows what looks like a tweet record (id, entity,
# sentiment, text) in the header position. The files presumably have no header
# row, so the first record of each split is lost to the column names. Later
# cells index the frames by those header strings, so switching to
# header=None / names=[...] has to be done together with them.
train_df = pd.read_csv('../datasets/twitter-entity-sentiment-analysis/twitter_training.csv')
val_df = pd.read_csv('../datasets/twitter-entity-sentiment-analysis/twitter_validation.csv')
train_df.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [3]:
# Every tweet is padded or truncated to exactly this many tokens.
MAX_TOKENS=128
def get_input_ids(text):
    """Tokenize ``text`` and return its token ids as a PyTorch tensor.

    Args:
        text: The tweet text. A value that is not text (for example ``None``
            or the float NaN pandas stores for an empty cell) is replaced by
            the placeholder string ``"None"``, the same fallback the previous
            catch-all ``except`` produced for such inputs.

    Returns:
        The ``input_ids`` of the tokenizer output, requested with
        ``return_tensors='pt'`` and padded/truncated to ``MAX_TOKENS`` tokens.

    Any error raised by the tokenizer for a valid text input now propagates
    instead of being silently replaced by the encoding of ``"None"``.
    """
    # str is a single example; list/tuple are the tokenizer's batched inputs.
    if not isinstance(text, (str, list, tuple)):
        text = "None"
    return tokenizer(text,
                     return_tensors='pt',
                     max_length=MAX_TOKENS,
                     padding='max_length',
                     truncation=True).input_ids

def get_attention_mask(text):
    """Tokenize ``text`` and return its attention mask as a PyTorch tensor.

    Args:
        text: The tweet text. A value that is not text (for example ``None``
            or the float NaN pandas stores for an empty cell) is replaced by
            the placeholder string ``"None"``, the same fallback the previous
            catch-all ``except`` produced for such inputs.

    Returns:
        The ``attention_mask`` of the tokenizer output, requested with
        ``return_tensors='pt'`` and padded/truncated to ``MAX_TOKENS`` tokens.

    Any error raised by the tokenizer for a valid text input now propagates
    instead of being silently replaced by the encoding of ``"None"``.
    """
    # str is a single example; list/tuple are the tokenizer's batched inputs.
    if not isinstance(text, (str, list, tuple)):
        text = "None"
    return tokenizer(text,
                     return_tensors='pt',
                     max_length=MAX_TOKENS,
                     padding='max_length',
                     truncation=True).attention_mask

def get_labels(label):
    """Translate a sentiment name into its integer code.

    'Positive' -> 1, 'Negative' -> 2, 'Neutral' -> 3; every other value
    (including 'Irrelevant') -> 4.
    """
    known_codes = (('Positive', 1), ('Negative', 2), ('Neutral', 3))
    for sentiment, code in known_codes:
        if label == sentiment:
            return code
    # Fallback bucket for anything that is not one of the three names above.
    return 4

In [4]:
# List the distinct sentiment values; the label column is keyed by 'Positive',
# the value sitting in the header position of the train_df.head() preview.
train_df['Positive'].unique()

array(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype=object)

In [5]:
# text = train_df['text'][0]
# tokenizer(text,return_tensors='pt').input_ids

In [6]:
# Preview the validation frame to read off its column keys.
val_df.head()

Unnamed: 0,3364,Facebook,Irrelevant,"I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣"
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...


In [7]:
# The tweet-text column is keyed by the text shown in the header position of
# the train_df.head() preview; alias it to a readable 'text' column.
train_df['text'] = train_df['im getting on borderlands and i will murder you all ,']
# Each helper calls the tokenizer on its own, so every tweet is encoded twice
# (once for the ids, once for the mask).
train_df['input_ids'] = train_df['text'].apply(get_input_ids)
train_df['attention_mask'] = train_df['text'].apply(get_attention_mask)
# Convert the sentiment names (column keyed by 'Positive') to integer codes 1-4.
train_df['labels'] = train_df['Positive'].apply(get_labels)

In [8]:
# Same preparation as for the training frame. Here the tweet-text column is
# keyed by the long sentence shown in the header position of val_df.head().
val_df['text'] = val_df['I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣']
# Each helper calls the tokenizer on its own, so every tweet is encoded twice.
val_df['input_ids'] = val_df['text'].apply(get_input_ids)
val_df['attention_mask'] = val_df['text'].apply(get_attention_mask)
# The validation label column is keyed by 'Irrelevant' (header position of val_df.head()).
val_df['labels'] = val_df['Irrelevant'].apply(get_labels)

In [9]:
# Keep only the three columns the Dataset reads.
# NOTE(review): this rebinds train_df / val_df to the reduced frames, so the
# feature-preparation cells above cannot be re-run afterwards without
# reloading the CSVs (the source columns are gone).
train_df = train_df[['input_ids','attention_mask','labels']]
val_df = val_df[['input_ids','attention_mask','labels']]

In [10]:
# train_df = train_df.sample(7000)

In [11]:
# Number of rows in the training and validation frames.
len(train_df),len(val_df)

(74681, 999)

In [12]:
import torch
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
import torch.nn.functional as F

class CustomDataset(Dataset):
    """Map-style dataset over a frame of pre-tokenized examples.

    The wrapped frame must provide the columns 'input_ids', 'attention_mask'
    and 'labels'. The first two are expected to hold tensors (``flatten`` is
    called on them); 'labels' holds the integer codes, which are shifted down
    by one when an item is fetched.
    """

    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return the example at position ``idx`` as a dict of tensors."""
        frame = self.df
        token_ids = frame['input_ids'].iloc[idx]
        mask = frame['attention_mask'].iloc[idx]
        # Stored codes start at 1; subtracting 1 yields zero-based class indices.
        class_index = torch.tensor(frame['labels'].iloc[idx] - 1)
        return {
            'input_ids': token_ids.flatten(),
            'attention_mask': mask.flatten(),
            'labels': class_index,
        }

In [13]:
import numpy as np
def get_subset_random_sampler(dataset_size, percentage):
    """Build a sampler over a random fraction of the dataset positions.

    Args:
        dataset_size: Total number of examples; positions 0..dataset_size-1
            are the candidates.
        percentage: Fraction of the positions to keep (e.g. 0.1 for 10%).
            The kept count is floor(percentage * dataset_size).

    Returns:
        A SubsetRandomSampler restricted to the chosen positions.
    """
    shuffled_positions = [position for position in range(dataset_size)]
    # Shuffles in place using NumPy's global random state.
    np.random.shuffle(shuffled_positions)
    keep_count = int(np.floor(percentage * dataset_size))
    return SubsetRandomSampler(shuffled_positions[:keep_count])

In [14]:
# Wrap the prepared frames so a DataLoader can index them.
train_dataset = CustomDataset(train_df)
val_dataset = CustomDataset(val_df)

In [15]:
# Training loader over a random 10% of the training examples.
# The training cell later rebuilds train_dataloader with a new subset each epoch.
sampler = get_subset_random_sampler(len(train_dataset), percentage=0.1)
BATCH_SIZE=8
train_dataloader = DataLoader(train_dataset,
                             batch_size=BATCH_SIZE,
                             sampler=sampler)
# Validation loader covers the whole validation split without shuffling.
val_dataloader = DataLoader(val_dataset,
                           batch_size=BATCH_SIZE,
                           shuffle=False)

In [16]:
# Number of batches the subset training loader yields.
len(train_dataloader)

934

In [19]:
# Check which device the pretrained encoder was loaded onto.
roberta_model.device

device(type='cuda', index=0)

In [20]:
import torch
from torch import nn
# Pick the device used for the model and the batches: GPU when CUDA is available, else CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

class RoBERTaClassifier(nn.Module):
    """Encoder backbone followed by a two-layer classification head.

    The hidden state of the first token of every sequence is passed through
    Linear(hidden_size, 512) -> ReLU -> Linear(512, num_classes).

    Args:
        num_classes: Number of logits produced per example.
        backbone: Optional encoder module. It must expose
            ``config.hidden_size`` and, when called with ``input_ids`` and
            ``attention_mask``, return an object with a ``last_hidden_state``
            attribute. When omitted, the notebook-level ``roberta_model`` is
            used, as before.
    """

    def __init__(self, num_classes, backbone=None):
        super().__init__()
        # NOTE: the default is the shared notebook-level ``roberta_model``
        # object, not a copy. Every classifier built without an explicit
        # ``backbone`` wraps (and fine-tunes) the same encoder weights.
        self.roberta = roberta_model if backbone is None else backbone
        self.linear1 = nn.Linear(self.roberta.config.hidden_size, 512)  # First linear layer
        self.activation = nn.ReLU()  # Activation function
        self.linear2 = nn.Linear(512, num_classes)  # Second linear layer for classification

    def forward(self, input_ids, attention_mask):
        """Return the class logits for a batch of token ids and attention masks."""
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at sequence position 0 as the sequence summary.
        first_token_state = outputs.last_hidden_state[:, 0, :]
        hidden = self.activation(self.linear1(first_token_state))
        # The logits are returned on whatever device the module computed them
        # on; no global ``device`` is consulted here.
        return self.linear2(hidden)

cuda


In [21]:
# Smoke test: run one training batch through a freshly built classifier and
# display the raw logits (one row per example, one column per class).
inpt = next(iter(train_dataloader))
model = RoBERTaClassifier(num_classes=4)
model.to(device)
model(inpt['input_ids'].to(device),inpt['attention_mask'].to(device))

tensor([[ 0.0589, -0.0291, -0.1679, -0.0498],
        [ 0.0560, -0.0181, -0.1545, -0.0632],
        [ 0.0605, -0.0296, -0.1634, -0.0694],
        [ 0.0606, -0.0135, -0.1510, -0.0571],
        [ 0.0577, -0.0262, -0.1464, -0.0637],
        [ 0.0479, -0.0269, -0.1472, -0.0739],
        [ 0.0518, -0.0245, -0.1464, -0.0429],
        [ 0.0688, -0.0120, -0.1702, -0.0400]], device='cuda:0',
       grad_fn=<AddmmBackward0>)

In [22]:
from torch.optim import AdamW

# Training configuration.
epochs = 20
num_classes=4
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # Check if GPU is available
# NOTE(review): RoBERTaClassifier wraps the shared notebook-level roberta_model,
# so this new classifier reuses the same encoder object as the smoke-test cell.
model = RoBERTaClassifier(num_classes=num_classes)
optimizer = AdamW(model.parameters(), lr=2e-6)
# CrossEntropyLoss takes raw logits and integer class indices
# (CustomDataset yields the stored label code minus one).
loss_fn = nn.CrossEntropyLoss()
# Move model to the appropriate device
model.to(device)

from tqdm.auto import tqdm

for epoch in tqdm(range(epochs)):
    model.train()
    # Running sums of the per-batch losses for this epoch.
    train_loss, val_loss = 0.0,0.0
    # Draw a new random 15% subset of the training set for every epoch.
    sampler = get_subset_random_sampler(len(train_dataset), percentage=0.15)
    train_dataloader = DataLoader(train_dataset,
                             batch_size=BATCH_SIZE,
                             sampler=sampler)
    for batch in tqdm(train_dataloader):
        # Move batch to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        # Step 1: Zero gradients
        optimizer.zero_grad()
        
        # Step 2: Forward pass
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        
        # Step 3: Compute loss
        loss = loss_fn(logits, labels)
        train_loss += loss.item()
        
        # Step 4: Backward pass
        loss.backward()
        
        # Step 5: Update parameters
        optimizer.step()
    # Validation pass: evaluation mode, no gradient tracking, full validation split.
    model.eval()
    with torch.inference_mode():
        for batch in tqdm(val_dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(logits, labels)
            val_loss += loss.item()
    
    # Calculate average loss for the epoch (mean over batches, not over examples)
    avg_train_loss = train_loss / len(train_dataloader)
    avg_val_loss = val_loss/len(val_dataloader)
    print(f'Epoch [{epoch+1}/{epochs}] | Train Loss: {avg_train_loss:.4f} | Validation Loss: {avg_val_loss:.4f}')

print('Training complete.')

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/1401 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [1/20] | Train Loss: 1.1729 | Validation Loss: 1.0215


  0%|          | 0/1401 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [2/20] | Train Loss: 0.9965 | Validation Loss: 0.9045


  0%|          | 0/1401 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [3/20] | Train Loss: 0.9546 | Validation Loss: 0.8572


  0%|          | 0/1401 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [4/20] | Train Loss: 0.9101 | Validation Loss: 0.8208


  0%|          | 0/1401 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [5/20] | Train Loss: 0.8858 | Validation Loss: 0.7799


  0%|          | 0/1401 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [6/20] | Train Loss: 0.8489 | Validation Loss: 0.7578


  0%|          | 0/1401 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [7/20] | Train Loss: 0.8308 | Validation Loss: 0.7570


  0%|          | 0/1401 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [8/20] | Train Loss: 0.8133 | Validation Loss: 0.6671


  0%|          | 0/1401 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [9/20] | Train Loss: 0.7847 | Validation Loss: 0.6563


  0%|          | 0/1401 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [10/20] | Train Loss: 0.7488 | Validation Loss: 0.6298


  0%|          | 0/1401 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [11/20] | Train Loss: 0.7400 | Validation Loss: 0.5941


  0%|          | 0/1401 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [12/20] | Train Loss: 0.7035 | Validation Loss: 0.5578


  0%|          | 0/1401 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [13/20] | Train Loss: 0.6842 | Validation Loss: 0.4905


  0%|          | 0/1401 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [14/20] | Train Loss: 0.6525 | Validation Loss: 0.4846


  0%|          | 0/1401 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [15/20] | Train Loss: 0.6416 | Validation Loss: 0.4585


  0%|          | 0/1401 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [16/20] | Train Loss: 0.5974 | Validation Loss: 0.4228


  0%|          | 0/1401 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [17/20] | Train Loss: 0.5785 | Validation Loss: 0.4040


  0%|          | 0/1401 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [18/20] | Train Loss: 0.5536 | Validation Loss: 0.3955


  0%|          | 0/1401 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [19/20] | Train Loss: 0.5498 | Validation Loss: 0.3636


  0%|          | 0/1401 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [20/20] | Train Loss: 0.5080 | Validation Loss: 0.3334
Training complete.


In [23]:
# Accuracy of the trained classifier on the validation split.
model.eval()
acc = 0
with torch.inference_mode():
    for batch in tqdm(val_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        # The predicted class is the position of the largest logit in each row.
        predicted = torch.argmax(logits, dim=1)
        acc = acc + (predicted == labels).sum()
    # Divide the count of correct predictions by the number of validation examples.
    acc = acc / len(val_dataset)
    print(acc)

  0%|          | 0/125 [00:00<?, ?it/s]

tensor(0.8799, device='cuda:0')


In [24]:
# Number of batches in the validation loader.
len(val_dataloader)

125

In [25]:
import os

import torch

# Save the model's state dictionary (the parameters only, not the module object)
model_save_path = '../model/model.pth'
# torch.save does not create missing folders, so make sure the target
# directory exists before writing the checkpoint.
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
torch.save(model.state_dict(), model_save_path)