In [69]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [70]:
import torch
import torch.nn as nn
from transformers import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel

In [135]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split

In [150]:
from torch.utils.data import DataLoader, Dataset

In [71]:
# use gpu for torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [72]:
model = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model, device = device)
model = GPT2LMHeadModel.from_pretrained(model)
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [73]:
print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


In [74]:
inputs = "Lorem ipsum dolor sit"
input_ids = tokenizer(inputs, return_tensors='pt')
input_ids = input_ids.to(device)
input_ids

{'input_ids': tensor([[   43, 29625,   220,  2419,   388,   288, 45621,  1650]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [75]:
# output = model(**input_ids, output_hidden_states=True, return_dict=True)
output = model(**input_ids, output_hidden_states=True, return_dict=True)

In [76]:
output.keys()

odict_keys(['logits', 'past_key_values', 'hidden_states'])

In [77]:
len(output['logits'])

1

In [78]:
output['logits'].shape

torch.Size([1, 8, 50257])

In [79]:
# predicting the next word
to_proba = nn.Softmax(dim=-1)
proba = to_proba(output['logits'][0][-1])
proba = proba.detach().cpu().numpy()
# find argmax of proba
argmax = np.argmax(proba)
# find the word corresponding to argmax
tokenizer.decode([argmax])

' am'

In [93]:
model.lm_head.out_features

50257

In [108]:
model.transformer.h[-1].ln_2.normalized_shape[0]

768

In [114]:
to_proba(output.hidden_states[-4][:,-1,:]).shape

torch.Size([1, 768])

###  Extending Model to use the Extra Binary Head

In [80]:
model.parameters()

<generator object Module.parameters at 0x7fd24e429660>

In [81]:
# freezing the model layers, just to train the classifier head
for param in model.parameters():
    param.requires_grad = False

In [126]:
# plan 
# freeze the model weights 
# add a new layer on top of the model
# train the new layer on the new dataset

class BinaryHeadModel(nn.Module):
    def __init__(self, layer_to_use = -4):
        super().__init__()
        self.model = model
        self.freeze()
        self.layer_to_use = layer_to_use
        self.input_dim = self.model.transformer.h[self.layer_to_use].ln_2.normalized_shape[0]
        # self.input_dim = self.model.lm_head.out_features
        self.linear = nn.Linear(self.input_dim, 1)
        self.softmax = nn.Softmax(dim=-1)
        self.sigmoid = nn.Sigmoid()
        self.loss = nn.BCELoss()

    def forward(self, x):
        x = self.model(**x, output_hidden_states=True, return_dict=True)
        # x = self.softmax(x['logits'][0][-1])
        x = x.hidden_states[self.layer_to_use][:,-1,:]
        x = self.linear(x)
        x = self.sigmoid(x)
        return x

    def freeze(self):
        for param in self.model.parameters():
            param.requires_grad = False
    
    def loss(self, y_pred, y_true):
        return self.loss(y_pred, y_true)

In [127]:
binary_head_model = BinaryHeadModel()

In [128]:
print(binary_head_model)

BinaryHeadModel(
  (model): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(50257, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-11): 12 x GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2Attention(
            (c_attn): Conv1D()
            (c_proj): Conv1D()
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D()
            (c_proj): Conv1D()
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    (lm_head): Linear(in_features=768, out_features=50257, bias=False)
  )
  (linear): Linear(in_features=7

In [129]:
df = pd.read_csv('data/cities_sample.csv')
df.head()

Unnamed: 0,statement,label,embeddings
0,Al Fqih Ben Calah is a name of a country.,0,"[-1.1005859375, 0.466064453125, 0.86474609375,..."
1,Londrina is a city in Indonesia.,0,"[-1.4140625, -0.00347900390625, 1.37890625, -1..."
2,Klang is a city in Japan.,0,"[-1.3759765625, 0.34814453125, 0.8681640625, -..."
3,Luhansk is a name of a country.,0,"[-0.75146484375, 0.7978515625, 1.0927734375, -..."
4,Nagasaki is a city in Turkey.,0,"[-1.046875, -0.38671875, 1.259765625, -0.72314..."


In [130]:
test_input = df['statement'].iloc[0]
test_input

'Al Fqih Ben Calah is a name of a country.'

In [132]:
test_input = tokenizer(test_input, return_tensors="pt")
test_input = test_input.to(device)
test_input

ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [134]:
binary_head_model.to(device)
binary_head_model(test_input)

tensor([[0.4918]], device='cuda:0', grad_fn=<SigmoidBackward0>)

In [138]:
def preprocess_input(x):
    return tokenizer(x,  return_tensors="pt")


In [144]:
# apply preprocessing to the statement data
X = df['statement'].apply((lambda x: preprocess_input(x))).values
y = df['label'].values

In [145]:
# train test split X, y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [146]:
print(f'X_train: {X_train.shape}')
print(f'X_test: {X_test.shape}')
print(f'y_train: {y_train.shape}')
print(f'y_test: {y_test.shape}')

X_train: (800,)
X_test: (200,)
y_train: (800,)
y_test: (200,)


In [148]:
# # write a pytorch training loop for the model
# epochs = 10
# for epoch in range(epochs):
#     for batch in
train_dataloader = DataLoader((X_train, y_train), batch_size=16, shuffle=True)

In [163]:
batch_size = 32
epochs = 1
for epoch in range(epochs):
    for i in range(0, len(X_train), batch_size):
        x = X_train[i:i+batch_size]
        y = y_train[i:i+batch_size]
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        print(f"Batch {i}: loss {loss.item()}")

0
32
64
96
128
160
192
224
256
288
320
352
384
416
448
480
512
544
576
608
640
672
704
736
768


In [162]:
X_train[].shape

(67,)