In [1]:
! pip install kaggle

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
!cp /content/drive/MyDrive/Kagglejson/kaggle.json ~/.kaggle/kaggle.json

In [10]:
!kaggle datasets download -d datatattle/covid-19-nlp-text-classification

Downloading covid-19-nlp-text-classification.zip to /content
  0% 0.00/4.38M [00:00<?, ?B/s]
100% 4.38M/4.38M [00:00<00:00, 212MB/s]


In [11]:
!unzip covid-19-nlp-text-classification.zip

Archive:  covid-19-nlp-text-classification.zip
  inflating: Corona_NLP_test.csv     
  inflating: Corona_NLP_train.csv    


In [12]:
! pip install transformers

Installing collected packages: tokenizers, safetensors, huggingface-hub, transformers
Successfully installed huggingface-hub-0.16.4 safetensors-0.3.3 tokenizers-0.13.3 transformers-4.32.1


In [127]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split

In [128]:
from sklearn.preprocessing import LabelEncoder

In [129]:
train_df = pd.read_csv("/content/Corona_NLP_train.csv", encoding='latin1')
test_df = pd.read_csv("/content/Corona_NLP_test.csv", encoding='latin1')

In [130]:
train_df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [131]:
train_df = train_df[['OriginalTweet','Sentiment']]
train_df.columns = ['tweet', 'sentiment']

test_df = test_df[['OriginalTweet','Sentiment']]
test_df.columns = ['tweet', 'sentiment']

In [132]:
train_df.head()

Unnamed: 0,tweet,sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive
3,My food stock is not the only one which is emp...,Positive
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [133]:
train_df['sentiment'].value_counts()

Positive              11422
Negative               9917
Neutral                7713
Extremely Positive     6624
Extremely Negative     5481
Name: sentiment, dtype: int64

In [134]:
# train_df['sentiment'] = train_df['sentiment'].astype('category').cat.codes

In [135]:
le = LabelEncoder()

train_df['sentiment'] = le.fit_transform(train_df['sentiment'])

In [136]:
le.classes_

array(['Extremely Negative', 'Extremely Positive', 'Negative', 'Neutral',
       'Positive'], dtype=object)

In [137]:
# train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42, shuffle = False)

In [138]:
val_df.head()

Unnamed: 0,tweet,sentiment
32925,"New Offers, Lower Prices, Financing, COVID-19 ...",2
32926,"""Vampire Pizza, a new at-home interactive food...",2
32927,Wholesale US egg prices triple in March https:...,3
32928,Delivering Food to Zimbabwean Doorsteps during...,1
32929,SOCIAL DISTANCING? ?\r\r\nReduce you risk o...,1


In [139]:
train_df.head()

Unnamed: 0,tweet,sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,3
1,advice Talk to your neighbours family to excha...,4
2,Coronavirus Australia: Woolworths to give elde...,4
3,My food stock is not the only one which is emp...,4
4,"Me, ready to go at supermarket during the #COV...",0


In [140]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length, target_col, text_col):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.target_col = target_col
        self.text_col = text_col
        self.X = self.data[text_col]
        self.y = self.data[target_col]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.X[index]

        inputs = self.tokenizer(text,
                                add_special_tokens=True,
                                max_length=self.max_length,
                                padding='max_length',
                                return_attention_mask=True,
                                return_token_type_ids=True,
                                truncation=True)
        target = self.y[index]

        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
            'target': torch.tensor(target, dtype=torch.long)
        }


In [141]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels = len(le.classes_))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [142]:
max_length = 128
batch_size = 32

train_dataset = CustomDataset(train_df, tokenizer, max_length, target_col='sentiment', text_col='tweet')

In [143]:
from torch.utils.data import random_split

train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size

train_dataset, val_dataset = random_split(train_dataset, [train_size, val_size])

In [144]:

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [145]:
# train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True,num_workers = 2)

In [146]:
val_dataset[0]

{'input_ids': tensor([  101,  2074, 24185,  2102,  4921,  2063,  2042, 24341,  1012,  1012,
          2017,  2342,  2000,  9378,  2115,  5909,  1998, 11546,  2007,  7815,
          1024,  6819, 13153, 22522, 23283,  2008,  2522, 17258,  1011,  2539,
          2064,  5788,  2006,  4840, 17006,  3965,  2074,  2066,  2151,  2060,
          3302,  1012,  1012,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [147]:
train_df.iloc[7557]

tweet        Regional Australian supermarket hires security...
sentiment                                                    4
Name: 7557, dtype: object

In [148]:
data_batch = next(iter(train_loader))
input_ids_dtype = data_batch['input_ids'].dtype
print(input_ids_dtype)

torch.int64


In [149]:
train_df['sentiment'].value_counts()

4    11422
2     9917
3     7713
1     6624
0     5481
Name: sentiment, dtype: int64

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)

num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0  # Initialize total loss for the epoch
    num_batches = len(train_loader)

    for batch_idx, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        targets = batch['target'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask,
                        labels=targets)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()  # Accumulate loss for the epoch

        # Print training loss after every few batches
        if (batch_idx + 1) % 10 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}] - Batch [{batch_idx+1}/{num_batches}] - Loss: {loss.item():.4f}")

    average_epoch_loss = total_loss / num_batches
    print(f"Epoch [{epoch+1}/{num_epochs}] - Average Training Loss: {average_epoch_loss:.4f}")


    # Validation
    model.eval()
    val_accuracy = 0
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['target'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=targets)  # Remove token_type_ids
            val_loss += outputs.loss.item()

            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            val_accuracy += (preds == targets).sum().item()

    val_accuracy /= len(val_dataset)
    val_loss /= len(val_loader)

    print(f"Epoch {epoch+1}/{num_epochs} - Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}")


In [151]:
def classify_sentiment(text):
    inputs = tokenizer(text,
                       add_special_tokens=True,
                       max_length=512,
                       padding='max_length',
                       return_attention_mask=True,
                       return_token_type_ids=True,
                       truncation=True)
    input_ids = torch.tensor(inputs['input_ids']).to(device)
    attention_mask = torch.tensor(inputs['attention_mask']).to(device)
    token_type_ids = torch.tensor(inputs['token_type_ids']).to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()

    return predicted_class

# Example inference
text = "I really hated the movie. It was horrible!"
predicted_sentiment = classify_sentiment(text)
print(f"Predicted Sentiment: {predicted_sentiment}")


Predicted Sentiment: 0


In [155]:
train_df.head()

Unnamed: 0,tweet,sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,3
1,advice Talk to your neighbours family to excha...,4
2,Coronavirus Australia: Woolworths to give elde...,4
3,My food stock is not the only one which is emp...,4
4,"Me, ready to go at supermarket during the #COV...",0


In [156]:
test_df.head()

Unnamed: 0,tweet,sentiment
0,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,When I couldn't find hand sanitizer at Fred Me...,Positive
2,Find out how you can protect yourself and love...,Extremely Positive
3,#Panic buying hits #NewYork City as anxious sh...,Negative
4,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


In [159]:
test_df['encoded_sentiment'] = le.transform(test_df['sentiment'])

In [160]:
test_df.head()

Unnamed: 0,tweet,sentiment,encoded_sentiment
0,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative,0
1,When I couldn't find hand sanitizer at Fred Me...,Positive,4
2,Find out how you can protect yourself and love...,Extremely Positive,1
3,#Panic buying hits #NewYork City as anxious sh...,Negative,2
4,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral,3


In [161]:
test_df['predictions'] = test_df['tweet'].apply(classify_sentiment)

In [167]:
test_df['sentiment_pred'] = le.inverse_transform(test_df['predictions'])

In [163]:
from sklearn.metrics import classification_report

In [168]:
print(classification_report(test_df['sentiment'], test_df['sentiment_pred']))

                    precision    recall  f1-score   support

Extremely Negative       0.87      0.89      0.88       592
Extremely Positive       0.81      0.92      0.86       599
          Negative       0.83      0.85      0.84      1041
           Neutral       0.93      0.83      0.88       619
          Positive       0.81      0.77      0.79       947

          accuracy                           0.84      3798
         macro avg       0.85      0.85      0.85      3798
      weighted avg       0.85      0.84      0.84      3798

