In [1]:
import numpy as np  
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch import optim

from transformers import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# use gpu for torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"{device = }")

device = device(type='cuda')


In [3]:
# load data 
df = pd.read_csv('data/cities_sample.csv')
df.head()

Unnamed: 0,statement,label,embeddings
0,Al Fqih Ben Calah is a name of a country.,0,"[-1.1005859375, 0.466064453125, 0.86474609375,..."
1,Londrina is a city in Indonesia.,0,"[-1.4140625, -0.00347900390625, 1.37890625, -1..."
2,Klang is a city in Japan.,0,"[-1.3759765625, 0.34814453125, 0.8681640625, -..."
3,Luhansk is a name of a country.,0,"[-0.75146484375, 0.7978515625, 1.0927734375, -..."
4,Nagasaki is a city in Turkey.,0,"[-1.046875, -0.38671875, 1.259765625, -0.72314..."


In [8]:
# load gpt-2 model
model = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model, device = device)
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model)
model.to(device)


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [25]:
# Extend the model for binary classification
class BinaryHeadModel(nn.Module):
    def __init__(self, layer_to_use = -4):
        super().__init__()
        self.model = model
        self.freeze()
        self.layer_to_use = layer_to_use
        self.input_dim = self.model.transformer.h[self.layer_to_use].ln_2.normalized_shape[0]
        # self.input_dim = self.model.lm_head.out_features
        self.linear_1 = nn.Linear(self.input_dim, 256)
        self.linear_2 = nn.Linear(256, 1)
        self.sigmoid = nn.Sigmoid()
        self.loss = nn.BCELoss()

    def forward(self, x):
        x = self.model(**x, output_hidden_states=True, return_dict=True)
        # x = self.softmax(x['logits'][0][-1])
        x = x.hidden_states[self.layer_to_use][:,-1,:]
        # print(x.shape)
        x = self.linear_1(x)
        # print(x.shape)
        x = self.linear_2(x)
        # print(x.shape)
        x = self.sigmoid(x)
        # print(x.shape)
        return x

    def freeze(self):
        for param in self.model.parameters():
            param.requires_grad = False
    
    def loss_fn(self, y_pred, y_true):
        return self.loss(y_pred, y_true)
    
custom_model = BinaryHeadModel()
custom_model.to(device)

BinaryHeadModel(
  (model): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(50257, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-11): 12 x GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2Attention(
            (c_attn): Conv1D()
            (c_proj): Conv1D()
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D()
            (c_proj): Conv1D()
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    (lm_head): Linear(in_features=768, out_features=50257, bias=False)
  )
  (linear_1): Linear(in_features

In [26]:
# test model
test_input = df['statement'].iloc[0:5]
print(f'Statement: {test_input}')
test_input = tokenizer(list(test_input), padding=True, return_tensors="pt")
test_input = test_input.to(device)
print(test_input)
test_output = custom_model(test_input)
print(test_output)

Statement: 0    Al Fqih Ben Calah is a name of a country.
1             Londrina is a city in Indonesia.
2                    Klang is a city in Japan.
3              Luhansk is a name of a country.
4                Nagasaki is a city in Turkey.
Name: statement, dtype: object
{'input_ids': tensor([[ 2348,   376,    80,  4449,  3932,  2199,   993,   318,   257,  1438,
           286,   257,  1499,    13],
        [   43,   623, 22267,   318,   257,  1748,   287, 16256,    13, 50256,
         50256, 50256, 50256, 50256],
        [   42, 17204,   318,   257,  1748,   287,  2869,    13, 50256, 50256,
         50256, 50256, 50256, 50256],
        [   43,  7456, 34738,   318,   257,  1438,   286,   257,  1499,    13,
         50256, 50256, 50256, 50256],
        [   45,   363, 33846,   318,   257,  1748,   287,  7137,    13, 50256,
         50256, 50256, 50256, 50256]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1,

In [27]:
# get X and y variable
X = df['statement'].values
y = df['label'].values
# split the data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print('X_train shape: ', X_train.shape)
print('X_test shape: ', X_test.shape)
print('y_train shape: ', y_train.shape)
print('y_test shape: ', y_test.shape)

X_train shape:  (800,)
X_test shape:  (200,)
y_train shape:  (800,)
y_test shape:  (200,)


In [28]:
class dataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.len = len(X)

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        batch_X = self.X[index]
        batch_y = self.y[index]
        return batch_X, batch_y    

    def preprocess(self, X):
        return tokenizer(X, padding=True, return_tensors="pt")

    def collate(self, batch):
        X = [item[0] for item in batch]
        X = self.preprocess(X).to(device)
        y = torch.tensor([item[1] for item in batch], dtype = torch.float).to(device)
        return X, y

In [39]:
# create train and test dataset
train_dataset = dataset(X_train, y_train)
test_dataset = dataset(X_test, y_test)

print(f"{train_dataset[0] = }")
print(f"{test_dataset[0] = }")

train_dataset[0] = ('United States is a name of a city.', 0)
test_dataset[0] = ('Japan is a name of a country.', tensor(1., device='cuda:0'))


In [40]:
# train the model
batch_size = 16
epochs = 3
optimizer = optim.AdamW(custom_model.parameters(), lr=5e-5, eps=1e-8)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=train_dataset.collate)
for epoch in range(epochs):
    for i, batch in tqdm(enumerate(train_loader)):
        optimizer.zero_grad()
        x = batch[0]
        y = batch[1]
        y_pred = custom_model(x)
        loss = custom_model.loss_fn(torch.squeeze(y_pred, dim = -1), y)
        loss.backward()
        optimizer.step()
        if i % 10 == 0:
            print(f"Epoch: {epoch}, Batch: {i}, Loss: {loss.item()}")
            with torch.no_grad():
                test_dataloder = DataLoader(test_dataset, 
                                          batch_size=len(test_dataset), 
                                          shuffle=False, 
                                          collate_fn=test_dataset.collate)
                x_test, y_test = next(iter(test_dataloder))
                y_pred = custom_model(x_test).detach().cpu().numpy()
                y_pred = np.argmax(y_pred, axis=1)
                print(f"Test Accuracy: {accuracy_score(y_test.detach().cpu().numpy(), y_pred)}")

0it [00:00, ?it/s]

Epoch: 0, Batch: 0, Loss: 0.9540480375289917


3it [00:01,  2.89it/s]

Test Accuracy: 0.45


8it [00:01,  8.02it/s]

Epoch: 0, Batch: 10, Loss: 0.8070427775382996


14it [00:02,  6.92it/s]

Test Accuracy: 0.45


20it [00:02, 11.39it/s]

Epoch: 0, Batch: 20, Loss: 0.6789127588272095


24it [00:03,  6.88it/s]

Test Accuracy: 0.45


28it [00:04,  9.73it/s]

Epoch: 0, Batch: 30, Loss: 0.7118586301803589


34it [00:05,  7.25it/s]

Test Accuracy: 0.45


38it [00:05, 10.06it/s]

Epoch: 0, Batch: 40, Loss: 0.6950407028198242


43it [00:06,  6.72it/s]

Test Accuracy: 0.45


50it [00:06,  7.27it/s]
0it [00:00, ?it/s]

Epoch: 1, Batch: 0, Loss: 0.8260173201560974


4it [00:01,  4.97it/s]

Test Accuracy: 0.45


8it [00:01,  9.60it/s]

Epoch: 1, Batch: 10, Loss: 0.7586684226989746


14it [00:02,  7.08it/s]

Test Accuracy: 0.45


19it [00:02, 10.62it/s]

Epoch: 1, Batch: 20, Loss: 0.7162606716156006


23it [00:03,  6.51it/s]

Test Accuracy: 0.45


29it [00:03, 11.07it/s]

Epoch: 1, Batch: 30, Loss: 0.6944011449813843


33it [00:04,  6.44it/s]

Test Accuracy: 0.45


38it [00:05, 10.43it/s]

Epoch: 1, Batch: 40, Loss: 0.7130075693130493


44it [00:06,  7.64it/s]

Test Accuracy: 0.45


50it [00:06,  7.62it/s]
0it [00:00, ?it/s]

Epoch: 2, Batch: 0, Loss: 0.6595249176025391


4it [00:01,  4.92it/s]

Test Accuracy: 0.45


10it [00:01, 11.53it/s]

Epoch: 2, Batch: 10, Loss: 0.6845293641090393


15it [00:02,  6.93it/s]

Test Accuracy: 0.45


18it [00:02,  9.26it/s]

Epoch: 2, Batch: 20, Loss: 0.6926208138465881


23it [00:03,  6.65it/s]

Test Accuracy: 0.45


28it [00:03,  9.92it/s]

Epoch: 2, Batch: 30, Loss: 0.6696198582649231


34it [00:04,  7.49it/s]

Test Accuracy: 0.45


39it [00:05, 10.69it/s]

Epoch: 2, Batch: 40, Loss: 0.7456796169281006


43it [00:06,  6.68it/s]

Test Accuracy: 0.45


50it [00:06,  7.59it/s]


In [32]:
torch.squeeze(y_pred, dim = -1)

tensor([0.9529, 0.9631, 0.9685, 0.9492, 0.9691, 0.6772, 0.9659, 0.9665, 0.9732,
        0.9686, 0.9479, 0.9657, 0.9526, 0.9712, 0.9713, 0.9688],
       device='cuda:0', grad_fn=<SqueezeBackward1>)

In [33]:
y

tensor([1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0.],
       device='cuda:0')