1. Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import re
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import time

import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

2024-04-05 19:27:46.021964: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-05 19:27:46.022077: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-05 19:27:46.144831: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


2. Importing the dataset

In [2]:
data = pd.read_csv("/kaggle/input/sentiment-analysis-dataset/train.csv", encoding='ISO-8859-1')

data = data[['text', 'sentiment']]
data['sentiment'] = data['sentiment'].map({'positive': 2, 'neutral': 1, 'negative': 0})

In [3]:
data['sentiment'].value_counts()

sentiment
1    11118
2     8582
0     7781
Name: count, dtype: int64

In [4]:
data.dropna(inplace = True)

3. Creating train-test split 

In [5]:
X = data.text.values
y = data.sentiment.values

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=1)

In [6]:
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 2 GPU(s) available.
Device name: Tesla T4


4. Preprocessing the text

In [7]:
def text_preprocessing(s):
    s = s.lower()
    s = re.sub(r'(@.*?)[\s]', ' ', s)
    s = re.sub(r'([\'\"\.\(\)\!\?\\\/\,])', r' \1 ', s)
    s = re.sub(r'[^\w\s\?]', ' ', s)
    s = re.sub(r'([\;\:\|•«\n])', ' ', s)
    s = " ".join([word for word in s.split()
                  if word not in stopwords.words('english')
                  or word in ['not', 'can']])
    s = re.sub(r'\s+', ' ', s).strip()
    
    return s

5. TF-IDF vectorization

In [8]:
X_train_preprocessed = np.array([text_preprocessing(text) for text in X_train])
X_val_preprocessed = np.array([text_preprocessing(text) for text in X_val])

tf_idf = TfidfVectorizer(ngram_range=(1, 3), binary=True, smooth_idf=False)
X_train_tfidf = tf_idf.fit_transform(X_train_preprocessed)
X_val_tfidf = tf_idf.transform(X_val_preprocessed)

6. Tokenizing the text

In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def preprocessing_for_bert(data):
    input_ids = []
    attention_masks = []
    for sent in data:
        encoded_sent = tokenizer.encode_plus(
            text=text_preprocessing(sent),
            add_special_tokens=True,     
            max_length=MAX_LEN,      
            pad_to_max_length=True,        
            return_attention_mask=True  
            )
        
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
encoded_tweets = [tokenizer.encode(sent, add_special_tokens=True) for sent in data.text.values]
max_len = max([len(sent) for sent in encoded_tweets])
print('Max length: ', max_len)

Max length:  110


In [11]:
MAX_LEN = 110
train_inputs, train_masks = preprocessing_for_bert(X_train)
val_inputs, val_masks = preprocessing_for_bert(X_val)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


7. Creating the dataloader

In [12]:
train_labels = torch.tensor(y_train)
val_labels = torch.tensor(y_val)

batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

8. Defining the model for fine-tuning and adding additional layers

In [13]:
class BertClassifier(nn.Module):
    def __init__(self, freeze_bert=False):
        super(BertClassifier, self).__init__()
        D_in, H, D_out = 768, 50, 3
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            nn.Linear(H, D_out)
        )

        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        
        last_hidden_state_cls = outputs[0][:, 0, :]
        logits = self.classifier(last_hidden_state_cls)
        return logits

In [14]:
def initialize_model(epochs=4):
    bert_classifier = BertClassifier(freeze_bert=False)
    bert_classifier.to(device)
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=5e-5,  
                      eps=1e-8 
                      )
    
    total_steps = len(train_dataloader) * epochs

    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

9. Defining the training loop

In [15]:
loss_fn = nn.CrossEntropyLoss()

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    for epoch_i in range(epochs):
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        t0_epoch, t0_batch = time.time(), time.time()
        total_loss, batch_loss, batch_counts = 0, 0, 0

        model.train()

        for step, batch in enumerate(train_dataloader):
            batch_counts +=1
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            model.zero_grad()

            logits = model(b_input_ids, b_attn_mask)
            
            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                time_elapsed = time.time() - t0_batch

                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*70)

        if evaluation == True:

            val_loss, val_accuracy, _ = evaluate(model, val_dataloader)
            time_elapsed = time.time() - t0_epoch
            
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")
    
    print("Training complete!")
    
    
def evaluate(model, val_dataloader):
    model.eval()

    val_accuracy = []
    val_loss = []
    pred_list = []
    for batch in val_dataloader:

        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        preds = torch.argmax(logits, dim=1).flatten()
        pred_list.append(preds)

        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy, pred_list


In [16]:
bert_classifier, optimizer, scheduler = initialize_model(epochs=10)
train(bert_classifier, train_dataloader, val_dataloader, epochs=10, evaluation=True)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]



 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   1    |   20    |   1.048253   |     -      |     -     |   11.34  
   1    |   40    |   0.811053   |     -      |     -     |   9.87   
   1    |   60    |   0.706635   |     -      |     -     |   10.00  
   1    |   80    |   0.699727   |     -      |     -     |   10.08  
   1    |   100   |   0.694775   |     -      |     -     |   10.17  
   1    |   120   |   0.651119   |     -      |     -     |   10.27  
   1    |   140   |   0.721370   |     -      |     -     |   10.41  
   1    |   160   |   0.614837   |     -      |     -     |   10.54  
   1    |   180   |   0.692098   |     -      |     -     |   10.63  
   1    |   200   |   0.669405   |     -      |     -     |   10.78  
   1    |   220   |   0.664895   |     -      |     -     |   10.92  
   1    |   240   |   0.667189   |     -      |     -     |   11.02  
   1    |   260   |

In [17]:
_1, _2, pred_list = evaluate(bert_classifier, val_dataloader)

In [18]:
actual_lab = []
for step, batch in enumerate(val_dataloader):
    _1, _2, _ = batch
    actual_lab.append(_)

10. Reporting the classification report

In [19]:
from sklearn.metrics import classification_report
print(classification_report(torch.stack(pred_list[:-1]).view(-1).cpu(), torch.stack(actual_lab[:-1]).view(-1).cpu()))

              precision    recall  f1-score   support

           0       0.76      0.81      0.79       775
           1       0.79      0.73      0.76      1106
           2       0.83      0.86      0.85       839

    accuracy                           0.80      2720
   macro avg       0.80      0.80      0.80      2720
weighted avg       0.79      0.80      0.80      2720

