In [42]:
import torch
import pandas as pd
from tqdm import tqdm

In [2]:

df = pd.read_csv(
    'data/Merged_Tweets_Sentiments.csv')

In [3]:
df.head()

Unnamed: 0,text,sentiment,polarity,subjectivity,Preprocess
0,This is the best summary you will find of Seas...,Pos,0.25,0.366667,this is the best summary you will find of seas...
1,More tragic than the red wedding # got # gameo...,Neg,-0.083333,0.416667,more tragic than the red wedding got gameofthr...
2,Check out this awesome @ GameOfThrones Final S...,Pos,0.5,0.833333,check out this awesome gameofthrones final sea...
3,If u saw this just know i wrote it just to see...,Pos,1.0,0.7,if saw this just know wrote it just to see the...
4,Now they just need 3 people,Neu,0.0,0.0,now they just need 3 people


In [4]:
df.sentiment.value_counts()

Neu    1863
Neg     570
Pos     557
Name: sentiment, dtype: int64

In [5]:
label_dict = {l:i for i,l in enumerate(df.sentiment.unique())}
num2label = {v:k for k, v in label_dict.items()}

In [6]:
num2label

{0: 'Pos', 1: 'Neg', 2: 'Neu'}

In [7]:

label_dict

{'Pos': 0, 'Neg': 1, 'Neu': 2}

In [8]:
df['label'] = df.sentiment.replace(label_dict)
df.head()

Unnamed: 0,text,sentiment,polarity,subjectivity,Preprocess,label
0,This is the best summary you will find of Seas...,Pos,0.25,0.366667,this is the best summary you will find of seas...,0
1,More tragic than the red wedding # got # gameo...,Neg,-0.083333,0.416667,more tragic than the red wedding got gameofthr...,1
2,Check out this awesome @ GameOfThrones Final S...,Pos,0.5,0.833333,check out this awesome gameofthrones final sea...,0
3,If u saw this just know i wrote it just to see...,Pos,1.0,0.7,if saw this just know wrote it just to see the...,0
4,Now they just need 3 people,Neu,0.0,0.0,now they just need 3 people,2


In [9]:
from sklearn.model_selection import train_test_split

In [10]:
SEED = 42069

In [11]:

Xt, Xv, yt, yv = train_test_split(
    df.index.values,
    df.label.values,
    test_size=0.15,
    random_state=SEED,
    stratify=df.label.values
)

In [12]:
df['type'] = ['tmp'] * df.shape[0]

In [13]:
df.head()


Unnamed: 0,text,sentiment,polarity,subjectivity,Preprocess,label,type
0,This is the best summary you will find of Seas...,Pos,0.25,0.366667,this is the best summary you will find of seas...,0,tmp
1,More tragic than the red wedding # got # gameo...,Neg,-0.083333,0.416667,more tragic than the red wedding got gameofthr...,1,tmp
2,Check out this awesome @ GameOfThrones Final S...,Pos,0.5,0.833333,check out this awesome gameofthrones final sea...,0,tmp
3,If u saw this just know i wrote it just to see...,Pos,1.0,0.7,if saw this just know wrote it just to see the...,0,tmp
4,Now they just need 3 people,Neu,0.0,0.0,now they just need 3 people,2,tmp


In [14]:
df.loc[Xt, 'type'] = 'train'
df.loc[Xv, 'type'] = 'val'

In [15]:
df.groupby(['sentiment', 'label', 'type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text,polarity,subjectivity,Preprocess
sentiment,label,type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Neg,1,train,485,485,485,485
Neg,1,val,85,85,85,85
Neu,2,train,1583,1583,1583,1583
Neu,2,val,280,280,280,280
Pos,0,train,473,473,473,473
Pos,0,val,84,84,84,84


In [17]:

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [18]:
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [19]:

encoded_data_train = tokenizer.batch_encode_plus(
    df[df.type=='train'].text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.type=='val'].text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [20]:
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.type=='train'].label.values)


In [21]:
input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.type=='val'].label.values)

In [22]:
train_ds = TensorDataset(input_ids_train, attention_masks_train, labels_train)
val_ds = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [23]:
print(len(train_ds), len(val_ds))

2541 449


In [24]:
from transformers import BertForSequenceClassification

In [25]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [26]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [27]:
BATCH_SIZE = 32
EPOCHS = 10

In [28]:
train_dl = DataLoader(
    train_ds,
    sampler=RandomSampler(train_ds),
    batch_size=BATCH_SIZE
)

val_dl = DataLoader(
    val_ds,
    sampler=RandomSampler(val_ds),
    batch_size=BATCH_SIZE
)

In [29]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [30]:
optim = AdamW(
    model.parameters(),
    lr=1e-5,
    eps=1e-8
)

In [31]:
scheduler = get_linear_schedule_with_warmup(
    optim,
    num_warmup_steps=0,
    num_training_steps=len(train_dl)*EPOCHS
)

In [32]:
# Defining Performance Metrics
import numpy as np
from sklearn.metrics import f1_score

In [33]:

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

In [34]:
def print_acc_per_class(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    for label in np.unique(labels_flat):
        total_true = preds_flat[labels_flat == label]
        true_positives = total_true[total_true == label]
        print(f'Class: {num2label[label]}')
        print(f'Accuracy: {len(true_positives)}/{len(total_true)}')

In [35]:
import random

In [36]:

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [37]:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(device)

cpu


In [38]:
def evaluate(model, val_dl):
    model.eval()
    val_loss = 0
    predictions, true_vals = [], []
    for batch in tqdm(val_dl):
        batch = tuple(b.to(device) for b in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }
        with torch.no_grad():
            outputs = model(**inputs)
        loss = outputs[0]
        logits = outputs[1]
        val_loss += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    val_loss_avg = val_loss/len(val_dl)
    prediction = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
    return val_loss_avg, predictions, true_vals

In [39]:
!mkdir Models

In [43]:
def fit(model, epochs, train_dl, val_dl, optimizer):
    for epoch in range(epochs):
        model.train()
        training_loss = 0
        tqdm_progress = tqdm(
            train_dl,
            desc=f'Epoch {epoch+1}',
            leave=False,
            disable=False
            )
        for batch in tqdm_progress:
            model.zero_grad()
            batch = tuple(b.to(device) for b in batch)
            inputs = {
                'input_ids' : batch[0],
                'attention_mask' : batch[1],
                'labels' : batch[2]
            }
            outputs = model(**inputs)
            loss = outputs[0]
            training_loss += loss.item()
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            tqdm_progress.set_postfix({'training_loss': f'{loss.item()/len(batch):.3f}'})
        
        torch.save(model.state_dict(), f'Models/BERT_ft_epoch{epoch}.model')
        tqdm.write(f'Epoch {epoch}')
        training_loss_avg = training_loss/len(train_dl)
        tqdm.write(f'Training Loss: {training_loss_avg}')

        val_loss, val_preds, val_true = evaluate(model, val_dl)
        val_preds = np.vstack(np.array(val_preds))
        val_f1 = f1_score_func(val_preds, val_true)
        tqdm.write(f'Val Loss: {val_loss}')
        tqdm.write(f'F1 Score (weighted): {val_f1}')

In [44]:
fit(model, EPOCHS, train_dl, val_dl, optim)

  0%|          | 0/15 [00:00<?, ?it/s]                                        

Epoch 0
Training Loss: 0.8412546463310718


100%|██████████| 15/15 [03:49<00:00, 15.30s/it]
Epoch 2:   0%|          | 0/80 [00:00<?, ?it/s]

Val Loss: 0.7327196756998698
F1 Score (weighted): 0.7029652900691704


  0%|          | 0/15 [00:00<?, ?it/s]                                          

Epoch 1
Training Loss: 0.5767748348414898


100%|██████████| 15/15 [04:16<00:00, 17.07s/it]
Epoch 3:   0%|          | 0/80 [00:00<?, ?it/s]

Val Loss: 0.523770926396052
F1 Score (weighted): 0.8425171404864841


  0%|          | 0/15 [00:00<?, ?it/s]                                         

Epoch 2
Training Loss: 0.4126587310805917


100%|██████████| 15/15 [04:08<00:00, 16.55s/it]
Epoch 4:   0%|          | 0/80 [00:00<?, ?it/s]

Val Loss: 0.3175016118834416
F1 Score (weighted): 0.8882272960683701


  0%|          | 0/15 [00:00<?, ?it/s]                                         

Epoch 3
Training Loss: 0.2992943444289267


100%|██████████| 15/15 [03:45<00:00, 15.03s/it]
Epoch 5:   0%|          | 0/80 [00:00<?, ?it/s]

Val Loss: 0.3804077486197154
F1 Score (weighted): 0.8761306939070062


  0%|          | 0/15 [00:00<?, ?it/s]                                         

Epoch 4
Training Loss: 0.2252683797851205


100%|██████████| 15/15 [03:46<00:00, 15.10s/it]
Epoch 6:   0%|          | 0/80 [00:00<?, ?it/s]

Val Loss: 0.28549180378516514
F1 Score (weighted): 0.9148398836304942


  0%|          | 0/15 [00:00<?, ?it/s]                                        

Epoch 5
Training Loss: 0.17654089415445923


100%|██████████| 15/15 [03:45<00:00, 15.04s/it]
Epoch 7:   0%|          | 0/80 [00:00<?, ?it/s]

Val Loss: 0.2674065591146549
F1 Score (weighted): 0.9080754364627016


  0%|          | 0/15 [00:00<?, ?it/s]                                       

Epoch 6
Training Loss: 0.12879519644193352


100%|██████████| 15/15 [03:45<00:00, 15.00s/it]
Epoch 8:   0%|          | 0/80 [00:00<?, ?it/s]

Val Loss: 0.24745209577182928
F1 Score (weighted): 0.9282608108116945


  0%|          | 0/15 [00:00<?, ?it/s]                                        

Epoch 7
Training Loss: 0.10068791122175753


100%|██████████| 15/15 [03:49<00:00, 15.27s/it]
Epoch 9:   0%|          | 0/80 [00:00<?, ?it/s]

Val Loss: 0.26045269121726355
F1 Score (weighted): 0.917608355040572


  0%|          | 0/15 [00:00<?, ?it/s]                                         

Epoch 8
Training Loss: 0.08625127605628222


100%|██████████| 15/15 [03:50<00:00, 15.37s/it]
Epoch 10:   0%|          | 0/80 [00:00<?, ?it/s]

Val Loss: 0.26640767020483813
F1 Score (weighted): 0.924273486257531


  0%|          | 0/15 [00:00<?, ?it/s]                                          

Epoch 9
Training Loss: 0.07630580805707723


100%|██████████| 15/15 [03:44<00:00, 14.98s/it]

Val Loss: 0.2612705962111553
F1 Score (weighted): 0.9240964038729145





In [47]:

_, predictions, true_vals = evaluate(model, val_dl)

100%|██████████| 15/15 [03:35<00:00, 14.37s/it]


In [48]:
print_acc_per_class(np.vstack(predictions), true_vals)

Class: Pos
Accuracy: 69/84
Class: Neg
Accuracy: 77/85
Class: Neu
Accuracy: 269/280
