In [1]:
from pathlib import Path
from sklearn.model_selection import train_test_split
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from torch.utils.data import Dataset
import pandas as pd


In [2]:
task_df = pd.read_csv('../res/preprocessed/task1/task1.csv')
task_df = task_df.iloc[:,1:]
task_df['labels'] = (task_df['task1'] == 'HOF').astype(int)
task_df = task_df.dropna()

In [3]:
all_texts, all_labels = list(task_df['text']), list(task_df['labels'])
train_texts, test_texts, train_labels, test_labels = train_test_split(all_texts, all_labels,test_size=0.2)
# print(len(train_texts))
# print(len(test_texts))

In [4]:
labels = ['NOT', 'HOF']
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}

print(id2label)
print(label2id)


{0: 'NOT', 1: 'HOF'}
{'NOT': 0, 'HOF': 1}


In [5]:
class TwitterDataset(Dataset):
   def __init__(self, encodings, labels):
      self.encodings = encodings
      self.labels = labels

   def __getitem__(self, idx):
      item = {key: torch.tensor(val[idx])
              for key, val in self.encodings.items()}
      item['labels'] = torch.tensor(self.labels[idx])
      return item

   def __len__(self):
      return len(self.labels)

In [6]:
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

train_encodings = tokenizer(train_texts, truncation=True,max_length=512, padding=True)
test_encodings = tokenizer(test_texts, truncation=True,max_length=512, padding=True)

train_dataset = TwitterDataset(train_encodings, train_labels)
test_dataset = TwitterDataset(test_encodings, test_labels)

In [7]:
# train_dataset[0]

In [8]:
from torch.utils.data import DataLoader
from transformers import AdamW
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(labels), id2label=id2label, label2id=label2id)

device = torch.device(
    'cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifi

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [9]:
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)


In [10]:
# iter(test_loader).next()

In [37]:
optim = AdamW(model.parameters(), lr=5e-5)

num_train_epochs = 10
for epoch in range(num_train_epochs):
   for batch in train_loader:

      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      
      labels = batch['labels'].to(device)
      
      outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

      loss = outputs[0]
      optim.zero_grad()

      loss.backward()
      optim.step()


In [38]:
import pickle
filename = './saved_models/task1_model.pkl'
outfile = open(filename, 'wb')
pickle.dump(model, outfile)
outfile.close()

In [39]:
import pickle
from transformers import DistilBertTokenizerFast

pickle_in = open('./saved_models/task1_model.pkl', 'rb')
model = pickle.load(pickle_in)

model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)


In [42]:
import torch.nn.functional as F

with torch.no_grad():
   n_correct = 0
   n_samples = 0

   for item in test_loader:
      
      test_batch_text = item['input_ids']
      test_batch_attention_mask = item['attention_mask']
      test_batch_label_id = item['labels']
      test_batch_text = test_batch_text.to(device)
      test_batch_attention_mask = test_batch_attention_mask.to(device)
      test_batch_label_id = test_batch_label_id.to(device)

      outputs = model(test_batch_text)
      prediction_batch = F.softmax(outputs.logits, dim=1)
      prediction_batch_label_id = torch.argmax(prediction_batch, dim=1)
      # prediction_labels = [model.config.id2label[label_id] for label_id in labels.tolist()]
      
      n_samples += len(test_batch_text)
      n_correct += (prediction_batch_label_id == test_batch_label_id).sum().item()

      for i in range(len(test_batch_text)):
         if n_samples %100==0:
            sample_test_text = tokenizer.decode(test_batch_text[i], skip_special_tokens=True)
            sample_test_label = model.config.id2label[int(
                test_batch_label_id[i])]
            sample_prediction_label = model.config.id2label[int(prediction_batch_label_id[i])]

            print(f'Prediction :{sample_prediction_label} | Actual: {sample_test_label} ')

   acc = 100.0 * n_correct/n_samples
   print(f'Accuracy: {acc}')



Prediction :NOT | Actual: NOT 
Prediction :HOF | Actual: HOF 
Prediction :HOF | Actual: HOF 
Prediction :HOF | Actual: HOF 
Prediction :HOF | Actual: HOF 
Prediction :NOT | Actual: NOT 
Prediction :NOT | Actual: HOF 
Prediction :NOT | Actual: NOT 
Prediction :NOT | Actual: HOF 
Prediction :NOT | Actual: NOT 
Prediction :NOT | Actual: NOT 
Prediction :NOT | Actual: NOT 
Prediction :HOF | Actual: HOF 
Prediction :NOT | Actual: NOT 
Prediction :NOT | Actual: NOT 
Prediction :NOT | Actual: NOT 
Prediction :NOT | Actual: NOT 
Prediction :NOT | Actual: HOF 
Prediction :NOT | Actual: HOF 
Prediction :NOT | Actual: NOT 
Prediction :HOF | Actual: HOF 
Prediction :HOF | Actual: HOF 
Prediction :HOF | Actual: HOF 
Prediction :NOT | Actual: NOT 
Accuracy: 82.34501347708895
