In [1]:
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import torch
import os

os.environ["HSA_OVERRIDE_GFX_VERSION"] = "10.3.0"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base')

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
df = pd.read_csv('../data/tweets_main_sentiment.csv')
X = df['content']
y = df['sentiment']

In [4]:
df.head()

Unnamed: 0,id,username,content,sentiment
0,1663696515326517248,The_NewDispatch,Ukraine Shoots Down 52 Of 54 Iranian Made Dron...,negative
1,1663696124392136705,KabalexChild,"Legion ""Freedom of Russia"" recruits thousands ...",negative
2,1663695373485973504,GuardianAus,Russia-Ukraine war at a glance: what we know o...,neutral
3,1663695317819351041,VSNRadio,Russia-Ukraine war at a glance: what we know o...,neutral
4,1663694952424169475,tdhoanh,Drone strikes in capital bring Ukraine war to ...,negative


In [5]:
le = LabelEncoder()
y = le.fit_transform(y)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
class TwitterDataset(Dataset):
    def __init__(self, tweets, labels, tokenizer, max_len):
        self.tweets = tweets
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.tweets)

    def __getitem__(self, item):
        tweet = str(self.tweets.iloc[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            tweet,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',  # changed from pad_to_max_length=True
            truncation=True,  # added this line
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'tweet_text': tweet,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [7]:
def create_data_loader(X, y, tokenizer, max_len, batch_size):
    ds = TwitterDataset(
        tweets=X,
        labels=y,
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=4
    )

In [8]:
BATCH_SIZE = 16
MAX_LEN = 128

train_data_loader = create_data_loader(X_train, y_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(X_val, y_val, tokenizer, MAX_LEN, BATCH_SIZE)

device = "cuda:0"
model = model.to(device)

EPOCHS = 10

In [9]:
print(torch.cuda.get_device_properties("cuda:0"))

_CudaDeviceProperties(name='AMD Radeon RX 6650M', major=10, minor=3, gcnArchName='gfx1030', total_memory=8176MB, multi_processor_count=14)


In [10]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss().to(device)
for epoch in range(EPOCHS):
    for data in train_data_loader:
    
        input_ids = data["input_ids"].to(device)
        attention_mask = data["attention_mask"].to(device)
        labels = data["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs[0]
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

In [11]:
correct_predictions = 0
total_predictions = 0

with torch.no_grad():
    for data in val_data_loader:
        input_ids = data["input_ids"].to(device)
        attention_mask = data["attention_mask"].to(device)
        labels = data["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        _, preds = torch.max(outputs[0], dim=1)
        correct_predictions += torch.sum(preds == labels)
        total_predictions += labels.shape[0]

print(f'Accuracy: {correct_predictions.double() / total_predictions}')

Accuracy: 0.8091353996737357


In [14]:
model_save_path = "../models/roberta_fine_Tuned_model.pt"
torch.save(model.state_dict(), model_save_path)