In [None]:
pip install -q torch

In [28]:
import os
import shutil
import tensorflow as tf
import tensorflow_datasets as tfds
from tqdm.notebook import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
import torch
from transformers import BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import numpy as np
from sklearn.metrics import f1_score


In [29]:
def _load_labelled(csv_path, category, n_skip):
    """Read a CSV of texts, discard its first ``n_skip`` rows and tag the rest.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file; its data column is named ``text``.
    category : str
        Value written to the ``category`` column of every remaining row.
    n_skip : int
        Number of leading rows to discard.

    Returns
    -------
    pandas.DataFrame
        Frame with columns ``text`` and ``category``.
    """
    frame = pd.read_csv(csv_path, names=['text'])
    # Non-inplace drop: re-running the cell always starts from the file contents.
    frame = frame.drop(frame.head(n_skip).index)
    frame['category'] = category
    return frame


df_t1 = _load_labelled('train/tr1.csv', 'positive', 39000)
df_t0 = _load_labelled('train/tr0.csv', 'negative', 54000)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
df = pd.concat([df_t1, df_t0], ignore_index=True)

# df_te1 = pd.read_csv('test/te1.csv',names=['text'])
# df_te1['category'] = 'positive'
# df_te0 = pd.read_csv('test/te0.csv',names=['text'])
# df_te0['category'] = 'negative'

# test = df_te1.append(df_te0,ignore_index=True)


# df_v1 = pd.read_csv('d1.csv',names=['text'])
# df_v1['category'] = 'positive'
# df_v0 = pd.read_csv('d0.csv',names=['text'])
# df_v0['category'] = 'negative'

# val = df_v1.append(df_v0,ignore_index=True)




In [30]:
# .size is the number of cells (rows x columns), not the number of rows.
df.size

21510

In [31]:
# Distinct category strings present in the combined frame.
possible_labels = df.category.unique()

# Fixed category -> integer id mapping used as classifier targets.
label_dict = {'negative': 0, 'positive': 1}


In [32]:
# Translate each category string into its integer id.
# .map is used instead of .replace: replacing strings with ints relies on the
# silent object -> int downcasting that pandas 2.2 deprecates.
df['label'] = df.category.map(label_dict)


In [33]:
# Preview the first rows to sanity-check text, category and label columns.
df.head()


Unnamed: 0,text,category,label
0,I'd be too shy to talk to you :) Those eyes ar...,positive,1
1,"Beautiful handwriting, beautiful skin.",positive,1
2,"Your weight isn't excessive, but your beauty s...",positive,1
3,"Gorgeous eyes, amazing hair, great lips, cute ...",positive,1
4,you have movie star quality eyes and lips.,positive,1


In [34]:
# Split row indices (not the texts themselves) 85/15, keeping the class ratio
# identical in both parts via stratification on the label.
X_train, X_val, y_train, y_val = train_test_split(
    df.index.values,
    df.label.values,
    test_size=0.15,
    random_state=17,
    stratify=df.label.values,
)

In [35]:
# Placeholder value for every row; overwritten once the split is assigned.
df['data_type'] = 'not_set'

In [36]:
# Tag each row with the split its index was assigned to.
for split_name, split_index in (('train', X_train), ('val', X_val)):
    df.loc[split_index, 'data_type'] = split_name

In [37]:
# Row counts per (category, label, split) to verify the stratified split.
df.groupby(['category', 'label', 'data_type']).count()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text
category,label,data_type,Unnamed: 3_level_1
negative,0,train,5371
negative,0,val,948
positive,1,train,3770
positive,1,val,666


In [38]:
# WordPiece tokenizer matching the 'bert-base-uncased' checkpoint; input is lower-cased.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)

In [39]:
def _encode_split(split_name):
    """Tokenize the texts of one ``data_type`` split and build its label tensor.

    Parameters
    ----------
    split_name : str
        Value of ``df.data_type`` selecting the rows to encode ('train' or 'val').

    Returns
    -------
    tuple
        ``(encoded, labels)`` where ``encoded`` is the tokenizer output holding
        ``input_ids`` and ``attention_mask`` tensors and ``labels`` is a tensor
        built from the split's ``label`` column.
    """
    split = df[df.data_type == split_name]
    encoded = tokenizer.batch_encode_plus(
        split.text.values,
        add_special_tokens=True,
        return_attention_mask=True,
        # `pad_to_max_length` is deprecated; `padding='max_length'` is its replacement.
        padding='max_length',
        # Explicit truncation: the tokenizer otherwise warns and falls back to
        # the same 'longest_first' strategy implicitly.
        truncation=True,
        max_length=256,
        return_tensors='pt',
    )
    return encoded, torch.tensor(split.label.values)


encoded_data_train, labels_train = _encode_split('train')
encoded_data_val, labels_val = _encode_split('val')

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [40]:
# Each dataset item is the triple (input_ids, attention_mask, label);
# the training loop and `evaluate` index batches in exactly this order.
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [41]:
# Number of training examples.
len(dataset_train)

9141

In [42]:
# Number of validation examples.
len(dataset_val)

1614

In [43]:
# Pre-trained BERT encoder with a new classification head sized to the number
# of labels; attentions and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [44]:
batch_size = 32

# Training batches are drawn in random order.
train_sampler = RandomSampler(dataset_train)
dataloader_train = DataLoader(dataset_train,
                              sampler=train_sampler,
                              batch_size=batch_size)

# Validation batches keep dataset order so predictions line up with labels.
val_sampler = SequentialSampler(dataset_val)
dataloader_validation = DataLoader(dataset_val,
                                   sampler=val_sampler,
                                   batch_size=batch_size)

In [45]:
# Use PyTorch's AdamW: the implementation shipped with transformers is deprecated
# in favour of it. weight_decay=0.0 is passed explicitly because torch defaults
# to 0.01, whereas the transformers version defaulted to no decay.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=1e-5,
                              eps=1e-8,
                              weight_decay=0.0)

In [46]:
epochs = 3

# One scheduler step per optimizer step: batches per epoch times epochs.
total_training_steps = len(dataloader_train) * epochs

# Learning rate decays linearly to zero over training, with no warm-up phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

In [47]:
def f1_score_func(preds, labels):
    """Return the weighted F1 score of arg-max predictions against ``labels``.

    ``preds`` holds one row of per-class scores per example; the predicted class
    is the column with the highest score. ``labels`` holds the true class ids.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    return f1_score(true_classes, predicted_classes, average='weighted')

In [48]:
def accuracy_per_class(preds, labels):
    """Print ``correct/total`` counts for every class id present in ``labels``.

    ``preds`` holds one row of per-class scores per example; ``labels`` holds the
    true class ids. Class names are looked up through the global ``label_dict``.
    """
    id_to_name = {class_id: name for name, class_id in label_dict.items()}

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    for class_id in np.unique(actual):
        # Restrict both arrays to the examples whose true class is `class_id`.
        in_class = actual == class_id
        class_preds = predicted[in_class]
        class_true = actual[in_class]
        n_correct = len(class_preds[class_preds == class_id])
        print(f'Class: {id_to_name[class_id]}')
        print(f'Accuracy: {n_correct}/{len(class_true)}\n')

In [49]:
import random

# Seed every random number generator used below (Python, NumPy, PyTorch CPU and CUDA).
# NOTE(review): in notebook order this runs after the model and the dataloaders
# were created, so anything randomised at their construction is not covered
# by this seed — consider moving this cell up.
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [50]:
# Prefer the GPU when one is visible to PyTorch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model.to(device)

print(device)

cpu


In [51]:
def evaluate(dataloader_val):
    """Run the global ``model`` over ``dataloader_val`` without gradient tracking.

    Each batch is expected to be ``(input_ids, attention_mask, labels)`` and is
    moved to the global ``device`` before the forward pass.

    Returns
    -------
    tuple
        ``(loss_val_avg, predictions, true_vals)``: the mean of the per-batch
        losses, the logits of all batches concatenated along axis 0, and the
        labels of all batches concatenated along axis 0 (both as NumPy arrays).
    """
    model.eval()

    running_loss = 0
    logit_chunks = []
    label_chunks = []

    for batch in dataloader_val:
        moved = tuple(tensor.to(device) for tensor in batch)
        input_ids, attention_mask, labels = moved[:3]

        with torch.no_grad():
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            labels=labels)

        # With labels supplied, the model output starts with (loss, logits).
        running_loss += outputs[0].item()
        logit_chunks.append(outputs[1].detach().cpu().numpy())
        label_chunks.append(labels.cpu().numpy())

    loss_val_avg = running_loss / len(dataloader_val)
    predictions = np.concatenate(logit_chunks, axis=0)
    true_vals = np.concatenate(label_chunks, axis=0)

    return loss_val_avg, predictions, true_vals

In [52]:
# Fine-tune for `epochs` passes over the training data. After every epoch the
# weights are checkpointed and the model is scored on the validation set.
for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        # Clear gradients left over from the previous step.
        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = model(**inputs)

        # With labels supplied, the first output element is the loss.
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        # Cap the gradient norm at 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the batch loss as-is. It was previously divided by len(batch),
        # but `batch` is the tuple of three tensors, so that divided by 3
        # rather than by the number of examples.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})

    # Checkpoint the weights reached at the end of this epoch.
    torch.save(model.state_dict(), f'finetuned_BERT_epoch_{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/286 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.23356960045004432
Validation loss: 0.16540065878892646
F1 Score (Weighted): 0.942998760842627


Epoch 2:   0%|          | 0/286 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.09324750645001503
Validation loss: 0.15172837926622698
F1 Score (Weighted): 0.9467511258832713


Epoch 3:   0%|          | 0/286 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.06319637602305861
Validation loss: 0.17385537897273168
F1 Score (Weighted): 0.9473416700930045


In [53]:
# Build a fresh model of the same architecture; the fine-tuned weights are
# loaded into it from a checkpoint in the next cell.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [54]:
# The freshly built model must live on the same device that `evaluate` moves
# its batches to, otherwise evaluation fails whenever `device` is a GPU.
model.to(device)
# Restore the weights saved after the third epoch, mapped onto that device.
model.load_state_dict(torch.load('finetuned_BERT_epoch_3.model', map_location=device))


<All keys matched successfully>

In [55]:
# Score the reloaded model on the validation set; the average loss is not needed here.
_, predictions, true_vals = evaluate(dataloader_validation)

In [56]:
# Correct/total counts per class for the reloaded model.
accuracy_per_class(predictions, true_vals)

Class: negative
Accuracy: 905/948

Class: positive
Accuracy: 624/666



In [61]:
# Classify one ad-hoc sentence with the fine-tuned model.
# Set eval mode explicitly instead of relying on an earlier `evaluate` call.
model.eval()
# Put the encoded inputs on whichever device the model currently lives on.
inputs = tokenizer(" beautiful day", return_tensors="pt").to(model.device)
# Inference only: no gradient graph is needed.
with torch.no_grad():
    outputs = model(**inputs)
print(outputs)
# .cpu() makes the NumPy conversion work when the logits are on a GPU.
x = outputs.logits.detach().cpu().numpy()[0]
preds_flat = np.argmax([x], axis=1).flatten()
print(preds_flat)


SequenceClassifierOutput(loss=None, logits=tensor([[ 0.5570, -0.9029]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
[0]


In [None]:
# Predicted class id for the most recent `outputs`.
# .cpu() is a no-op for CPU tensors and is required before .numpy() for GPU tensors.
x = outputs.logits.detach().cpu().numpy()[0]
preds_flat = np.argmax([x], axis=1).flatten()
print(preds_flat)

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)
# Put the encoded inputs on whichever device the model currently lives on.
inputs = tokenizer("hellow", return_tensors="pt").to(model.device)
# The model has no `predict()` method; inference is a plain forward pass
# in eval mode with gradient tracking disabled.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
# No labels were passed, so the returned loss is None.
loss = outputs.loss
logits = outputs.logits
# Show the class scores (the previous bare print() emitted only a blank line).
print(logits)

In [None]:

# Encode a single sentence the same way as the training data.
# NOTE: this rebinds the *_val names that previously held the validation split.
encoded_data_val = tokenizer(
    "thats where you got your hair color",
    add_special_tokens=True,
    return_attention_mask=True,
    # `pad_to_max_length` is deprecated; `padding='max_length'` is its replacement.
    padding='max_length',
    # Explicit truncation avoids the tokenizer's implicit-truncation warning.
    truncation=True,
    max_length=256,
    return_tensors='pt'
)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
# Label 0 ('negative') is supplied so `evaluate` can compute a loss for this example.
labels_val = torch.tensor([0])


In [None]:
# One-example dataset in the (input_ids, attention_mask, label) layout `evaluate` expects.
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [None]:
# Rebinds `dataloader_validation` to the one-example dataset and runs the model on it.
dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=1)
_, predictions, true_vals = evaluate(dataloader_validation)


In [None]:
# Predicted class id: index of the highest logit in each row.
preds_flat = np.argmax(predictions, axis=1).flatten()
print(preds_flat)

In [None]:
from transformers import TextClassificationPipeline


# Wrap the model and tokenizer in a pipeline that takes raw strings directly.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)
# NOTE(review): `return_all_scores` is reportedly deprecated in newer transformers
# releases in favour of `top_k=None` — confirm against the installed version.
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)
pipe("you look beaut.")