In [8]:
from pathlib import Path

import numpy as np
import pandas as pd

In [9]:
# Single place to configure where the dataset lives; both splits are read from
# this directory so the location only has to be changed once.
DATA_DIR = Path("/workspaces/NLP---Text-Classification-of-Coronavirus-Tweets/dataset")

# The CSVs are read as latin1 (kept from the original cell).
sample_train_data = pd.read_csv(DATA_DIR / "Corona_NLP_train.csv", encoding="latin1")
sample_test_data = pd.read_csv(DATA_DIR / "Corona_NLP_test.csv", encoding="latin1")

In [10]:
# Preview the first five rows of the training split.
sample_train_data.head(n=5)

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [11]:
# Preview the first five rows of the test split.
sample_test_data.head(n=5)

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


In [12]:
# Number of training tweets per sentiment class, most frequent first.
sample_train_data.loc[:, "Sentiment"].value_counts()

Sentiment
Positive              11422
Negative               9917
Neutral                7713
Extremely Positive     6624
Extremely Negative     5481
Name: count, dtype: int64

In [13]:
# Number of test tweets per sentiment class, most frequent first.
sample_test_data.loc[:, "Sentiment"].value_counts()

Sentiment
Negative              1041
Positive               947
Neutral                619
Extremely Positive     599
Extremely Negative     592
Name: count, dtype: int64

It is interesting to note that the dataset is more or less balanced in both the training and the test splits, so there is no severe class imbalance. We can therefore start with the usual modelling methodology and move to class weighting only if necessary.

In [14]:
# Column holding the raw tweet text and column holding the sentiment label.
TEXT_COLUMN, LABEL_COLUMN = "OriginalTweet", "Sentiment"

# Split each frame into its model input (text) and target (label) series.
training_tweets, training_labels = sample_train_data[TEXT_COLUMN], sample_train_data[LABEL_COLUMN]
testing_tweets, testing_labels = sample_test_data[TEXT_COLUMN], sample_test_data[LABEL_COLUMN]

In [17]:
# Ordinal encoding of the five sentiment classes, from most negative (0) to
# most positive (4).
class_map = {"Extremely Negative": 0, "Negative": 1, "Neutral": 2, "Positive": 3, "Extremely Positive": 4}

# Keep the encoded labels instead of discarding them, so later cells can use
# integer class ids without re-running the mapping.
training_label_ids = training_labels.map(class_map)

# Series.map yields NaN for any value that is not a key of class_map; fail
# loudly here rather than letting NaN labels propagate.
assert training_label_ids.notna().all(), "unmapped sentiment label(s) in training data"

training_label_ids

0        2
1        3
2        3
3        3
4        0
        ..
41152    2
41153    0
41154    3
41155    2
41156    1
Name: Sentiment, Length: 41157, dtype: int64

In [19]:
# Lower-case every tweet with the vectorised string accessor. Unlike
# `map(lambda x: x.lower())`, `.str.lower()` leaves missing values as NaN
# instead of raising AttributeError on a float. The result is stored so the
# normalised text can be reused by later cells.
training_tweets_lower = training_tweets.str.lower()
training_tweets_lower

0        @menyrbie @phil_gahan @chrisitv https://t.co/i...
1        advice talk to your neighbours family to excha...
2        coronavirus australia: woolworths to give elde...
3        my food stock is not the only one which is emp...
4        me, ready to go at supermarket during the #cov...
                               ...                        
41152    airline pilots offering to stock supermarket s...
41153    response to complaint not provided citing covi...
41154    you know itâs getting tough when @kameronwild...
41155    is it wrong that the smell of hand sanitizer i...
41156    @tartiicat well new/used rift s are going for ...
Name: OriginalTweet, Length: 41157, dtype: object

In [20]:
import torch
from transformers import AutoTokenizer
from transformers import BertModel

# Tokenisation settings reused by the cells below: every tweet is padded or
# truncated to exactly `max_length` token ids.
max_length = 128
padding = "max_length"
truncation = True

# One checkpoint id shared by the tokenizer and the model so the two can never
# drift apart (the original cell spelled the same checkpoint two different ways).
MODEL_NAME = "google-bert/bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# NOTE(review): the weights are loaded in float16. Half precision is usually
# chosen for GPU inference; confirm it is intended if this runs on CPU.
model = BertModel.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, attn_implementation="sdpa")

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
# Display the module tree of the loaded BertModel (embeddings + encoder layers).
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [22]:
# Show only a handful of vocabulary entries: printing the full token -> id
# mapping would flood the cell output with tens of thousands of items.
vocab_preview = dict(list(tokenizer.vocab.items())[:10])
print(f"How the tokenizer vocabulary looks like: {vocab_preview}")
print(f"How many unique tokens are in the tokenizer vocabulary : {tokenizer.vocab_size}")

How many unique tokens are in the tokenizer vocabulary : 30522


In [26]:
# Encode one training tweet with the shared padding/truncation settings and
# report how many token ids come back.
encoded_sample_ids = tokenizer.encode(
    training_tweets[4],
    max_length=max_length,
    truncation=truncation,
    padding=padding,
)
len(encoded_sample_ids)

128

In [27]:
# Tokenise one training tweet into PyTorch tensors, using the shared
# padding/truncation settings.
tokenizer_kwargs = dict(
    padding=padding,
    truncation=truncation,
    max_length=max_length,
    return_tensors="pt",
)
sample_tokens = tokenizer(training_tweets[4], **tokenizer_kwargs)

# Run only the embedding sub-module of the model on the token ids (the encoder
# layers are not applied here).
sample_input_ids = sample_tokens["input_ids"]
sample_embeddings = model.embeddings(sample_input_ids)

In [28]:
# Dimensions of the embedding tensor produced for the sample tweet.
sample_embeddings.size()

torch.Size([1, 128, 768])

In [2]:
import torch

# Scratch tensors: a 3x5 matrix of standard-normal draws, and a length-3 float
# vector whose entries are random integers drawn from {0, ..., 4}.
gaussian_sample = torch.randn(3, 5)
random_class_ids = torch.empty(3).random_(5)
gaussian_sample, random_class_ids

(tensor([[-1.1312,  0.1094, -1.3003, -2.1178,  0.5072],
         [-0.4946, -0.0192,  0.0202,  2.2409, -0.1149],
         [ 0.4625, -0.1682, -1.1113,  2.2179,  1.1092]]),
 tensor([2., 3., 0.]))

In [4]:
from torch import nn

# Dimensions of the toy experiment, named so the tensor shapes below are
# self-explanatory.
batch_size, seq_len, embed_dim = 4, 64, 768
hidden_dim, n_layers, n_directions = 64, 2, 2  # bidirectional -> 2 directions

rnn = nn.LSTM(
    input_size=embed_dim,
    hidden_size=hidden_dim,
    num_layers=n_layers,
    batch_first=True,
    bidirectional=True,
)

# Batch-first input: (N, L, dim).
input = torch.randn(batch_size, seq_len, embed_dim)

# Initial hidden and cell states: (D * num_layers, N, hidden).
h0 = torch.randn(n_directions * n_layers, batch_size, hidden_dim)
c0 = torch.randn(n_directions * n_layers, batch_size, hidden_dim)

initial_state = (h0, c0)
output, final_state = rnn(input, initial_state)
hn, cn = final_state

In [None]:
print(output.shape, hn.shape, cn.shape)

# Per-sequence features taken at the first and at the last time step.
# NOTE(review): for a bidirectional RNN the backward direction's final state
# sits at time step 0 and the forward direction's at the last step; confirm
# which halves of these slices are actually wanted for pooling.
first_token, last_token = output[:, 0], output[:, -1]

# Compare two ways of pooling them: element-wise sum vs. feature concatenation.
token_sum = first_token + last_token
token_concat = torch.cat((first_token, last_token), dim=-1)
token_sum.shape, token_concat.shape

In [3]:
import torch
from torch import nn

# Dimensions of the toy experiment, named so the tensor shapes below are
# self-explanatory.
batch_size, seq_len, embed_dim = 4, 64, 768
hidden_dim, n_layers, n_directions = 64, 2, 2  # bidirectional -> 2 directions

rnn = nn.GRU(
    input_size=embed_dim,
    hidden_size=hidden_dim,
    num_layers=n_layers,
    batch_first=True,
    bidirectional=True,
)

# Batch-first input: (N, L, dim).
input = torch.randn(batch_size, seq_len, embed_dim)

# Initial hidden state: (D * num_layers, N, hidden).
h0 = torch.randn(n_directions * n_layers, batch_size, hidden_dim)

output, hn = rnn(input, h0)

In [4]:
print(output.shape, hn.shape)

# Per-sequence features taken at the first and at the last time step.
# NOTE(review): for a bidirectional RNN the backward direction's final state
# sits at time step 0 and the forward direction's at the last step; confirm
# which halves of these slices are actually wanted for pooling.
first_token, last_token = output[:, 0], output[:, -1]

# Compare two ways of pooling them: element-wise sum vs. feature concatenation.
token_sum = first_token + last_token
token_concat = torch.cat((first_token, last_token), dim=-1)
token_sum.shape, token_concat.shape

torch.Size([4, 64, 128]) torch.Size([4, 4, 64])


(torch.Size([4, 128]), torch.Size([4, 256]))

(torch.Size([64, 40]), torch.Size([64, 80]))