# In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F
from transformers import BertTokenizer, BertConfig, AdamW, BertForSequenceClassification, get_linear_schedule_with_warmup
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, matthews_corrcoef

from tqdm.notebook import trange, tqdm

import random
import os
import io

print('All imports are working good')

# Check for train devices: use the GPU when CUDA is available and fall back
# to the CPU otherwise, so the script runs unchanged on CPU-only machines.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
n_gpu = torch.cuda.device_count() if use_cuda else 0

# Seed the Python, NumPy and PyTorch random number generators with the same
# fixed value so repeated runs draw the same random numbers.
SEED = 19

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if n_gpu > 0:
    # Seed the generators of every visible GPU as well.
    torch.cuda.manual_seed_all(SEED)

# Read data
df_train = pd.read_csv("Twitter_Data.csv")
# Per-column count of missing values. A bare expression is only displayed in
# a notebook cell, so print it explicitly when running as a script.
print(df_train.isnull().sum())

# Observation
print(df_train.head())

# Target distribution
print(df_train['category'].unique())
print(df_train['category'].value_counts())

# Cleaning: drop rows that lack either the target or the text. The explicit
# copy makes df_train an independent frame, so the column assignment below
# does not trigger pandas' SettingWithCopyWarning.
df_train = df_train.dropna(subset=['category', 'clean_text']).copy()
print('Data cleaning done')

# Target encoding: map the raw category values to consecutive integer ids.
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
df_train['category_1'] = labelencoder.fit_transform(df_train['category'])
print('Text encoding done')

# Show which encoded id each original category value was mapped to.
print(df_train[['category', 'category_1']].drop_duplicates(keep='first'))
df_train = df_train.rename(columns={'category_1': 'label'})
print('Column renaming done')

# Data preparation for BERT model
sentences = df_train.clean_text.values

# Distribution of data based on labels
print("Distribution of data based on labels: ", df_train.label.value_counts())

# Set the maximum sequence length
MAX_LEN = 256

# Import BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
print('Data preparation for BERT is done')

# Encode every sentence to exactly MAX_LEN token ids: longer inputs are
# truncated and shorter ones are padded. `padding='max_length'` replaces the
# deprecated `pad_to_max_length=True` flag.
input_ids = [
    tokenizer.encode(
        sent,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
    )
    for sent in sentences
]
print('Tokenization is done')

labels = df_train.label.values

print("Actual sentence before tokenization: ", sentences[2])
print("Encoded Input from dataset: ", input_ids[2])

# Create attention mask: 1.0 for real tokens, 0.0 for padding positions.
pad_id = tokenizer.pad_token_id
attention_masks = [[float(tok != pad_id) for tok in seq] for seq in input_ids]
print(attention_masks[2])
print('Attention masks done')

# Split ids, labels and masks in ONE call so the three stay row-aligned by
# construction instead of relying on two calls sharing a random_state.
(
    train_inputs, validation_inputs,
    train_labels, validation_labels,
    train_masks, validation_masks,
) = train_test_split(input_ids, labels, attention_masks, random_state=41, test_size=0.1)
print('Data splitting done')

# Convert all data into torch tensors
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

# Select a batch size for training
batch_size = 32

# Create DataLoader
# Training batches are drawn in random order each epoch.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation is read in a fixed order: shuffling brings nothing to evaluation
# and a sequential pass keeps the batches identical from epoch to epoch.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

# Sanity check: one (input_ids, attention_mask, label) training example.
print(train_data[0])

# Fine-tuning hyper-parameters
epochs = 3
lr = 2e-5
adam_epsilon = 1e-8
num_warmup_steps = 0
# The scheduler is stepped once per training batch, so the schedule has to
# span every batch of every epoch.
num_training_steps = epochs * len(train_dataloader)

# Pre-trained BERT with a 3-class classification head, placed on `device`
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
)
model = model.to(device)

# Optimizer over all model parameters, plus its learning-rate schedule
optimizer = AdamW(
    model.parameters(),
    lr=lr,
    eps=adam_epsilon,
    correct_bias=False,
)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

# Training and Inference
# Each epoch runs one full pass over the training batches followed by one
# full pass over the validation batches.
for epoch in trange(1, epochs+1, desc='Epoch'):
    print(f"<{'='*22} Epoch {epoch} {'='*22}>")
    batch_loss = 0

    model.train()
    for batch in tqdm(train_dataloader, desc="Training", leave=False):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Passing labels makes the model return the loss as its first output.
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]

        loss.backward()
        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        batch_loss += loss.item()

    avg_train_loss = batch_loss / len(train_dataloader)
    print(f"\n\tAverage Training loss: {avg_train_loss}")

    model.eval()
    # Collect predictions and labels from every validation batch so the
    # metrics (and df_metrics) describe the whole validation set rather than
    # a single batch.
    epoch_labels, epoch_preds = [], []

    for batch in tqdm(validation_dataloader, desc="Validation", leave=False):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        # Without labels the first output holds the logits.
        logits = outputs[0].cpu().numpy()
        epoch_preds.append(np.argmax(logits, axis=1).flatten())
        epoch_labels.append(b_labels.cpu().numpy().flatten())

    labels_flat = np.concatenate(epoch_labels)
    pred_flat = np.concatenate(epoch_preds)

    # One row per validation example for this epoch; after the loop ends,
    # df_metrics holds the final epoch's predictions.
    df_metrics = pd.DataFrame({'Epoch': epoch, 'Actual_class': labels_flat, 'Predicted_class': pred_flat})

    # Computed once over all examples: averaging per-batch scores would
    # over-weight a smaller final batch and is not meaningful for MCC.
    eval_accuracy = accuracy_score(labels_flat, pred_flat)
    eval_mcc_accuracy = matthews_corrcoef(labels_flat, pred_flat)

    print(f"\n\tValidation Accuracy: {eval_accuracy}")
    print(f"\n\tValidation MCC Accuracy: {eval_mcc_accuracy}")

# Plotting Confusion Matrix
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """Print a confusion matrix and draw it on the current matplotlib figure.

    The function draws with ``plt`` but does not call ``plt.show()``.

    Args:
        cm: 2-D confusion matrix (array-like). Rows are labelled as true
            classes and columns as predicted classes on the plot.
        classes: Class names used as tick labels on both axes.
        normalize: When True, divide every row by its row sum before
            printing and plotting. Rows whose sum is zero are left as zeros
            instead of producing NaN.
        title: Title of the plot.
        cmap: Matplotlib colormap used for the cell colours.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row total is non-zero; a class with no true
        # samples would otherwise yield 0/0 = NaN and a RuntimeWarning.
        cm = np.divide(
            cm.astype('float'),
            row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0,
        )
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Two decimals for ratios, plain integers for raw counts.
    fmt = '.2f' if normalize else 'd'
    # Cells above half the maximum get white text, the rest black text.
    thresh = cm.max() / 2.
    for i, j in np.ndindex(cm.shape):
        plt.text(
            j, i, format(cm[i, j], fmt),
            horizontalalignment="center",
            color="white" if cm[i, j] > thresh else "black",
        )

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

# Sentiment labels: display name -> integer class id.
# NOTE(review): assumes the encoded ids 0/1/2 correspond to
# Negative/Neutral/Positive - confirm against the category -> label mapping.
label2int = {
  "Negative": 0,
  "Neutral": 1,
  "Positive": 2
}

# Passing `labels` explicitly keeps the report aligned with `target_names`
# even when one of the classes does not occur in df_metrics; without it
# classification_report raises on the class-count mismatch.
print(classification_report(
    df_metrics['Actual_class'].values,
    df_metrics['Predicted_class'].values,
    labels=list(label2int.values()),
    target_names=list(label2int.keys()),
    digits=3,  # decimal places in the report, unrelated to the class count
))

# Save model and tokenizer
model_save_folder = 'model/'
tokenizer_save_folder = 'tokenizer/'

# Build the output paths with os.path.join so that separators are not
# doubled when a folder name already ends with '/'.
output_root = '/kaggle/working'
path_model = os.path.join(output_root, model_save_folder)
path_tokenizer = os.path.join(output_root, tokenizer_save_folder)

os.makedirs(path_model, exist_ok=True)
os.makedirs(path_tokenizer, exist_ok=True)

# Hugging Face format: written by the objects' own save_pretrained methods.
model.save_pretrained(path_model)
tokenizer.save_pretrained(path_tokenizer)

# Additionally store the raw state dict as a single file in the model folder.
model_save_name = 'fineTuneModel.pt'
path = os.path.join(path_model, model_save_name)
torch.save(model.state_dict(), path)
