# Finetuning for classification

Download dataset

In [70]:
import urllib.request
import zipfile
import os
from pathlib import Path

# Source archive of the SMS spam collection hosted by the UCI repository.
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
# Local file name the downloaded archive is written to.
zip_path = "sms_spam_collection.zip"
# Directory the archive is unpacked into.
extract_path = "sms_spam_collection"
# Final location of the data file (the extracted file is renamed to get a .tsv suffix).
data_file_path = Path(extract_path) / "SMSSpamCollection.tsv"

def download_and_unzip_spam_data(
        url, zip_path, extrated_path, data_file_path
):
    """Download the SMS spam archive, unpack it and rename the data file.

    Parameters
    ----------
    url : str
        Location of the zip archive.
    zip_path : str or Path
        Where the downloaded archive is stored locally.
    extrated_path : str or Path
        Directory the archive is extracted into. The parameter keeps its
        original spelling so existing keyword callers keep working.
    data_file_path : str or Path
        Destination of the extracted ``SMSSpamCollection`` file. If this path
        already exists the function prints a notice and returns without
        downloading anything.

    Returns
    -------
    None
    """
    # Accept plain strings as well as Path objects.
    data_file_path = Path(data_file_path)
    if data_file_path.exists():
        print(f"Data file already exists at {data_file_path}")
        return

    with urllib.request.urlopen(url) as response:
        with open(zip_path, "wb") as out_file:
            out_file.write(response.read())

    # Unpack into the directory the caller passed in, not a notebook global.
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extrated_path)

    # The archive ships the data without an extension; move it to the target name.
    original_file_path = Path(extrated_path) / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f"File downloaded and saved as {data_file_path}   ")

# Fetch and unpack the dataset; the function returns early when the TSV already exists.
download_and_unzip_spam_data(url, zip_path, extract_path, data_file_path)

Data file already exists at sms_spam_collection\SMSSpamCollection.tsv


Load dataset

In [71]:
import pandas as pd

# The file is tab-separated and has no header row, so the two columns are
# named here: the class label first, the message text second.
df = pd.read_csv(
    data_file_path,
    sep="\t",
    header=None,
    names=["Label", "Text"],
)
df


Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [72]:
# Number of rows per class in the full dataset.
print(df["Label"].value_counts())


Label
ham     4825
spam     747
Name: count, dtype: int64


Create a balanced dataset

In [73]:
def create_balanced_dataset(df):
    """Undersample the "ham" rows so both classes have equally many rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "Label" column holding the strings "ham" and "spam".

    Returns
    -------
    pandas.DataFrame
        The sampled "ham" rows followed by every "spam" row. Original index
        values are kept. Sampling uses a fixed seed (123).
    """
    spam_rows = df[df["Label"] == "spam"]
    ham_rows = df[df["Label"] == "ham"]
    # Draw exactly as many ham rows as there are spam rows.
    ham_subset = ham_rows.sample(len(spam_rows), random_state=123)
    return pd.concat([ham_subset, spam_rows])
# Build the balanced frame and print the per-class counts to confirm they match.
balanced_dataset = create_balanced_dataset(df)
print(balanced_dataset["Label"].value_counts())


Label
ham     747
spam    747
Name: count, dtype: int64


Convert labels into integer class IDs

In [74]:
# Encode the string labels as integer class ids (ham -> 0, spam -> 1).
# Values that are not keys of the mapping (e.g. labels that were already
# encoded by an earlier run of this cell) pass through unchanged. A plain
# dict ``map`` would turn them into NaN, so re-running the cell would wipe
# out the whole column.
label_to_id = {"ham": 0, "spam": 1}
balanced_dataset["Label"] = balanced_dataset["Label"].map(
    lambda label: label_to_id.get(label, label)
)

Split dataset into train-70%, val-10%, test-20%

In [75]:
def random_split(df, train_frac, val_frac):
    """Shuffle ``df`` and cut it into train, validation and test parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; it is not modified.
    train_frac : float
        Fraction of rows for the training part.
    val_frac : float
        Fraction of rows for the validation part.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, val_df, test_df)``. The test part is whatever remains
        after the first two. Rows are shuffled with a fixed seed (123) and
        the index is renumbered from 0 before slicing.
    """
    shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True)

    n_rows = len(shuffled)
    train_end = int(n_rows * train_frac)
    val_end = train_end + int(n_rows * val_frac)

    return (
        shuffled.iloc[:train_end],
        shuffled.iloc[train_end:val_end],
        shuffled.iloc[val_end:],
    )

In [76]:
# 70% of the rows for training, 10% for validation; the remaining rows form the test split.
train_df, validation_df, test_df = random_split(balanced_dataset, 0.7, 0.1)

In [77]:
# Write each split to its own CSV file, leaving out the row index.
train_df.to_csv("train.csv", index=False)
validation_df.to_csv("validation.csv", index=False)
test_df.to_csv("test.csv", index=False)

In [78]:
import tiktoken
# Tokenizer for the "gpt2" encoding.
tokenizer = tiktoken.get_encoding("gpt2")
# Print the id of the <|endoftext|> special token; it is passed in allowed_special so it can be encoded.
print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))

[50256]


Create the datasets and dataloaders

In [79]:
import torch
from torch.utils.data import Dataset, DataLoader

class SpamDataset(Dataset):
    """Dataset of tokenized messages padded (or truncated) to one fixed length.

    The CSV file must provide a "Text" column, which is tokenized, and a
    "Label" column, which is returned as the target.

    Parameters
    ----------
    csv_file : str or path-like
        CSV file read with ``pandas.read_csv``.
    tokenizer : object
        Anything with an ``encode(text)`` method returning a list of token ids.
    max_length : int, optional
        Length every sequence is brought to. When ``None`` the length of the
        longest encoded text in this file is used and nothing is truncated.
    pad_token_id : int, default 50256
        Token id appended to sequences shorter than ``max_length``.
    """

    def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256):
        self.data = pd.read_csv(csv_file)

        # Tokenize every message once, up front.
        self.encoded_texts = [
            tokenizer.encode(message) for message in self.data["Text"]
        ]

        if max_length is not None:
            # Caller-supplied length: cut off anything longer.
            self.max_length = max_length
            self.encoded_texts = [
                token_ids[:self.max_length] for token_ids in self.encoded_texts
            ]
        else:
            self.max_length = self._longest_encoded_length()

        # Right-pad the shorter sequences so all share the same length.
        padded_texts = []
        for token_ids in self.encoded_texts:
            n_missing = self.max_length - len(token_ids)
            padded_texts.append(token_ids + [pad_token_id] * n_missing)
        self.encoded_texts = padded_texts

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` for row ``index`` as two long tensors."""
        token_ids = torch.tensor(self.encoded_texts[index], dtype=torch.long)
        class_id = torch.tensor(self.data.iloc[index]["Label"], dtype=torch.long)
        return (token_ids, class_id)

    def __len__(self):
        """Number of rows in the underlying CSV file."""
        return len(self.data)

    def _longest_encoded_length(self):
        """Length of the longest sequence in ``encoded_texts`` (0 if there are none)."""
        return max((len(token_ids) for token_ids in self.encoded_texts), default=0)

In [80]:
# max_length=None lets the dataset pad to its own longest tokenized message.
train_dataset = SpamDataset(
    csv_file="train.csv",
    tokenizer=tokenizer,
    max_length=None
)

In [81]:
# Sequence length derived from the training set; the other splits reuse it below.
print(train_dataset.max_length)

120


In [82]:
# Validation and test data use the training set's sequence length, so all
# splits share one length; longer messages in these files are truncated to it.
validation_dataset = SpamDataset(
    csv_file="validation.csv",
    tokenizer=tokenizer,
    max_length=train_dataset.max_length
)
test_dataset = SpamDataset(
    csv_file="test.csv",
    tokenizer=tokenizer,
    max_length=train_dataset.max_length
)

In [83]:
num_workers = 0
batch_size = 8
# Fix the seed so the shuffling order of the training loader is reproducible.
torch.manual_seed(123)

# Training: shuffled batches, and an incomplete final batch is dropped.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True,
)

# Evaluation loaders: no shuffling, and the final partial batch is kept.
validation_loader = DataLoader(
    validation_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)

test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)

In [84]:
# Iterate once over the whole training loader as a smoke test. After the
# loop the two variables hold the last batch, whose shapes are printed.
# They are named *_batch so the built-in ``input`` is not shadowed.
for input_batch, target_batch in train_loader:
    pass
print(f"Input: {input_batch.shape}")
print(f"Target: {target_batch.shape}")

Input: torch.Size([8, 120])
Target: torch.Size([8])


Initialize model with pretrained weights

In [85]:
CHOOSE_MODEL = "gpt2-small (124M)"
INPUT_PROMT = "Every effort moves"

# Settings that are the same for every model size listed below.
BASE_CONFIG = dict(
    vocab_size=50257,
    context_length=1024,
    drop_rate=0.0,
    qkv_bias=True,
)

# Size-specific settings, keyed by model name.
model_configs = {
    "gpt2-small (124M)": dict(emb_dim=768, n_layers=12, n_heads=12),
    "gpt2-medium (355M)": dict(emb_dim=1024, n_layers=24, n_heads=16),
    "gpt2-large (774M)": dict(emb_dim=1280, n_layers=36, n_heads=20),
    "gpt2-xl (1558M)": dict(emb_dim=1600, n_layers=48, n_heads=25),
}

# Merge the chosen size's settings into the shared configuration.
BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

In [100]:
import numpy as np

def load_weights_into_gpt(gpt, params):
    """Copy pretrained weights from ``params`` into the model ``gpt``.

    ``params`` is read through the keys "wpe", "wte", "g", "b" and "blocks";
    every entry of "blocks" supplies "attn", "mlp", "ln_1" and "ln_2"
    sub-dictionaries. Each target tensor on ``gpt`` is replaced by a new
    ``torch.nn.Parameter`` after a shape check (see ``assign``).

    The fused "c_attn" weight and bias are split into three equal parts along
    the last axis for query, key and value. Weight matrices are transposed
    before assignment; biases and layer-norm values are copied as they are.
    The output head receives the token-embedding matrix ("wte").

    Parameters
    ----------
    gpt : object
        Model exposing ``pos_emb``, ``tok_emb``, ``trf_block`` (indexable),
        ``final_norm`` and ``out_head``.
    params : dict
        Weight arrays laid out as described above.

    Raises
    ------
    ValueError
        If a source array's shape differs from its destination tensor's.
    """
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params["wpe"])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params["wte"])

    for b, block_params in enumerate(params["blocks"]):
        block = gpt.trf_block[b]
        attn_params = block_params["attn"]
        mlp_params = block_params["mlp"]

        # Fused QKV projection -> separate query / key / value layers.
        q_w, k_w, v_w = np.split(attn_params["c_attn"]["w"], 3, axis=-1)
        block.att.W_query.weight = assign(block.att.W_query.weight, q_w.T)
        block.att.W_key.weight = assign(block.att.W_key.weight, k_w.T)
        block.att.W_value.weight = assign(block.att.W_value.weight, v_w.T)

        q_b, k_b, v_b = np.split(attn_params["c_attn"]["b"], 3, axis=-1)
        block.att.W_query.bias = assign(block.att.W_query.bias, q_b)
        block.att.W_key.bias = assign(block.att.W_key.bias, k_b)
        block.att.W_value.bias = assign(block.att.W_value.bias, v_b)

        # Attention output projection ("out_ptoj" is the model's attribute name).
        block.att.out_ptoj.weight = assign(
            block.att.out_ptoj.weight, attn_params["c_proj"]["w"].T
        )
        block.att.out_ptoj.bias = assign(
            block.att.out_ptoj.bias, attn_params["c_proj"]["b"]
        )

        # Feed-forward network: layers[0] and layers[2] are the two linear layers.
        block.ff.layers[0].weight = assign(
            block.ff.layers[0].weight, mlp_params["c_fc"]["w"].T
        )
        block.ff.layers[0].bias = assign(
            block.ff.layers[0].bias, mlp_params["c_fc"]["b"]
        )
        block.ff.layers[2].weight = assign(
            block.ff.layers[2].weight, mlp_params["c_proj"]["w"].T
        )
        block.ff.layers[2].bias = assign(
            block.ff.layers[2].bias, mlp_params["c_proj"]["b"]
        )

        # Layer norms: "g" is the scale and "b" the shift.
        block.norm1.scale = assign(block.norm1.scale, block_params["ln_1"]["g"])
        block.norm1.shift = assign(block.norm1.shift, block_params["ln_1"]["b"])
        block.norm2.scale = assign(block.norm2.scale, block_params["ln_2"]["g"])
        block.norm2.shift = assign(block.norm2.shift, block_params["ln_2"]["b"])

    # Model-level weights are loaded once, after the per-block loop, so they
    # are set even when there are no blocks and are not rebuilt per iteration.
    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])


def assign(left, right):
    """Return ``right`` as a new ``torch.nn.Parameter`` after a shape check.

    Parameters
    ----------
    left : tensor-like
        Destination; only its ``shape`` is inspected.
    right : array-like
        Source values, converted with ``torch.tensor``.

    Raises
    ------
    ValueError
        If ``left.shape`` and ``right.shape`` differ.
    """
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, "
                         f"Right: {right.shape}")
    return torch.nn.Parameter(torch.tensor(right))

In [101]:
from gpt_download import download_and_load_gpt2
from gpt import GPTModel

# Pull the size tag out of the model name, e.g. "gpt2-small (124M)" -> "124M".
model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")
# download_and_load_gpt2 is imported from the local gpt_download module and
# returns a (settings, params) pair; only params is used in this cell.
settings, params = download_and_load_gpt2(
    model_size=model_size, models_dir="gpt2"
)
# Build the model from BASE_CONFIG, then copy the pretrained weights into it.
model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model,params)
# Switch to evaluation mode; as the cell's last expression this also displays the model.
model.eval()

File already exists and is up-to-date: gpt2\124M\checkpoint
File already exists and is up-to-date: gpt2\124M\encoder.json
File already exists and is up-to-date: gpt2\124M\hparams.json
File already exists and is up-to-date: gpt2\124M\model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2\124M\model.ckpt.index
File already exists and is up-to-date: gpt2\124M\model.ckpt.meta
File already exists and is up-to-date: gpt2\124M\vocab.bpe


GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_block): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_ptoj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=76

In [102]:
from generate import generate_text_sample, text_to_token_ids, token_ids_to_text

# Presumably a sanity check of the loaded weights: continue a short prompt
# by 15 new tokens and print the decoded text.
# NOTE(review): the helpers come from the local `generate` module; their
# exact behavior is not visible in this notebook.
text_1 = "Every effort moves you"
token_ids = generate_text_sample(
    model=model,
    idx=text_to_token_ids(text_1,tokenizer),
    max_new_tokens=15,
    context_size=BASE_CONFIG["context_length"]
)
print(token_ids_to_text(token_ids, tokenizer))

Every effort moves you to the first time of the "c" and "up" to the


Freeze the model: make all layers non-trainable

In [103]:
# Turn off gradient tracking for every parameter currently in the model.
for param in model.parameters():
    param.requires_grad = False

Replace the output head with a new 2-class linear layer. A newly created layer has requires_grad=True by default, so it is trainable.

In [104]:
# Seed first so the new layer's random initial weights are reproducible.
torch.manual_seed(123)
num_classes=2
# Swap the output head for a fresh linear layer mapping emb_dim -> num_classes.
model.out_head=torch.nn.Linear(
    in_features=BASE_CONFIG["emb_dim"],
    out_features=num_classes
)

Make the last transformer block and the final normalization layer trainable

In [105]:
# Re-enable gradients for the last transformer block ...
for param in model.trf_block[-1].parameters():
    param.requires_grad = True
# ... and for the final normalization layer.
for param in model.final_norm.parameters():
    param.requires_grad = True

In [106]:
# Encode a short example text and add a leading batch dimension of size 1.
# NOTE(review): the name `input` shadows the Python built-in; the next cell reads it.
input = tokenizer.encode("Do you have time")
input = torch.tensor(input).unsqueeze(0)
print(f"Input: {input}")
print(f"Input shape: {input.shape}")

Input: tensor([[5211,  345,  423,  640]])
Input shape: torch.Size([1, 4])


In [107]:
# Forward pass without gradient tracking; the result has one row of
# num_classes values per input token.
with torch.no_grad():
    outputs = model(input)
print(f"logits: {outputs}")
print(f"Output shape: {outputs.shape}")

logits: tensor([[[-0.6075,  0.4976],
         [-2.1403,  4.6263],
         [-1.8642,  4.4269],
         [-3.1595,  4.4539]]])
Output shape: torch.Size([1, 4, 2])


In [108]:
# Keep only the values at the last token position of each sequence.
print(f"Last output token: {outputs[:,-1,:]}")

Last output token: tensor([[-3.1595,  4.4539]])
