Classification Fine-Tuning (text messages --> Spam / Not Spam)

In [None]:
import urllib.request
import zipfile
import os 
from pathlib import Path

## Remote archive of the SMS Spam Collection dataset and the local paths used while fetching it.
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"  ## file the downloaded archive is written to
extracted_path = "sms_spam_collection"  ## directory the archive is extracted into
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"  ## final name of the extracted data file

def download_and_unzip_dataset(url,zip_path,extracted_path,data_file_path):
    """Fetch the zipped dataset, unpack it and give the data file its final name.

    Nothing is downloaded when ``data_file_path`` already exists; a message is
    printed in either case.

    Args:
        url: address of the zip archive.
        zip_path: local file the archive is written to.
        extracted_path: directory the archive is extracted into.
        data_file_path: ``pathlib.Path`` the extracted "SMSSpamCollection"
            file is renamed to.
    """
    if not data_file_path.exists():
        ## The whole response body is read at once and written to zip_path.
        with urllib.request.urlopen(url) as response, open(zip_path, "wb") as archive:
            archive.write(response.read())
        with zipfile.ZipFile(zip_path, "r") as bundle:
            bundle.extractall(extracted_path)
        ## The archive member has no extension; rename it to the requested path.
        unpacked_file = Path(extracted_path) / "SMSSpamCollection"
        os.rename(unpacked_file, data_file_path)
        print(f"File Downloaded and saved as {data_file_path}")
    else:
        print(f'{data_file_path} already exists. Skipping download and extraction.')

In [None]:
## Download and unpack the dataset; the function returns early when the .tsv file is already on disk.
download_and_unzip_dataset(url,zip_path,extracted_path,data_file_path)

In [None]:
import pandas as pd
## Load the tab-separated file (no header row) into two columns: Label and Text.
## The path defined for the download step is reused instead of repeating the literal,
## so the two cells cannot drift apart.
df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"])
print(df["Label"].value_counts())
df.head()

Dataset Preparation

In [None]:
## Balancing the dataset : 747 spam , 4825 ham
def balance_dataset(df) : 
    """Return a frame holding as many "ham" rows as there are "spam" rows.

    Every spam row is kept. The ham rows are randomly down-sampled to the spam
    count with a fixed seed, and the ham sample is placed before the spam rows
    in the result.
    """
    spam_rows = df[df["Label"] == "spam"]
    ham_rows = df[df["Label"] == "ham"]
    ham_sample = ham_rows.sample(len(spam_rows), random_state=123)
    return pd.concat([ham_sample, spam_rows])

balanced_df = balance_dataset(df) ## balanced dataset: all spam rows plus an equally sized random ham sample
print(balanced_df["Label"].value_counts())

In [None]:
## Encode the string labels as integers: "ham" -> 0, "spam" -> 1.
## NOTE(review): Series.map yields NaN for values missing from the dict, so re-running this cell on
## already-encoded labels would presumably blank the column -- confirm before re-executing.
balanced_df["Label"] = balanced_df["Label"].map({"ham":0,"spam":1})

In [None]:
## splitting the dataset
def split_dataset(df,train_frac,val_frac) : 
    """Shuffle ``df`` and cut it into train, validation and test frames.

    The rows are shuffled with a fixed seed and re-indexed from 0. The first
    ``train_frac`` share goes to training, the next ``val_frac`` share to
    validation, and whatever remains to the test frame.

    Returns:
        Tuple ``(train_df, val_df, test_df)``.
    """
    shuffled = df.sample(frac=1, random_state=123)
    shuffled = shuffled.reset_index(drop=True)
    n_rows = len(shuffled)
    first_cut = int(n_rows * train_frac)
    second_cut = first_cut + int(n_rows * val_frac)
    return (
        shuffled.iloc[:first_cut],
        shuffled.iloc[first_cut:second_cut],
        shuffled.iloc[second_cut:],
    )
## 70% of the rows for training, 10% for validation; the remaining rows become the test set.
train_df , val_df , test_df = split_dataset(balanced_df , 0.7,0.1)

In [None]:
## Persist the three splits as CSV files; the same file names are passed to SpamDataset further down.
train_df.to_csv("train.csv" , index=None)
val_df.to_csv("val.csv" , index=None)
test_df.to_csv("test.csv" , index=None)

In [None]:
import torch 
from torch.utils.data import Dataset
import tiktoken
## tiktoken's "gpt2" encoding; this tokenizer object is shared by the datasets and the inference helpers below.
tokenizer = tiktoken.get_encoding("gpt2")

class SpamDataset(Dataset):
    """Tokenized view of a CSV file that has "Text" and "Label" columns.

    Every text is encoded with ``tokenizer``, cut to ``max_length`` tokens when
    a limit is given, and right-padded with ``pad_token_id`` so that all
    sequences share one length. Without a limit, the longest encoded text in
    the file determines that length.
    """

    def __init__(self,csv_file,tokenizer,max_length=None , pad_token_id=50256) : 
        self.data = pd.read_csv(csv_file)
        token_lists = [tokenizer.encode(text) for text in self.data["Text"]]
        ## _longest_encoded_text reads this attribute, so set it before asking for the length
        self.texts_encoded = token_lists
        if max_length is not None:
            self.max_length = max_length
            ## truncate texts that are longer than the requested length
            token_lists = [ids[:max_length] for ids in token_lists]
        else:
            self.max_length = self._longest_encoded_text()
        ## pad every sequence on the right up to max_length
        self.texts_encoded = [
            ids + [pad_token_id] * (self.max_length - len(ids)) for ids in token_lists
        ]

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` for row ``index`` as two long tensors."""
        token_ids = self.texts_encoded[index]
        row = self.data.iloc[index]
        return (
            torch.tensor(token_ids, dtype=torch.long),
            torch.tensor(row["Label"], dtype=torch.long),
        )

    def __len__(self):
        """Number of rows in the underlying CSV file."""
        return len(self.data)

    def _longest_encoded_text(self) : 
        """Length of the longest entry in ``texts_encoded`` (0 when there is none)."""
        return max((len(ids) for ids in self.texts_encoded), default=0)

In [None]:
## No max_length is passed, so the training sequences are padded to the longest encoded training text.
train_dataset = SpamDataset("train.csv" , tokenizer)

In [None]:
## Token length that every training sequence was padded to.
print(train_dataset.max_length)

In [None]:
## Validation and test sets reuse the training max_length: longer texts are truncated, shorter ones padded to it.
val_dataset = SpamDataset("val.csv" , tokenizer,max_length=train_dataset.max_length)
test_dataset = SpamDataset("test.csv" , tokenizer,max_length=train_dataset.max_length)
print(val_dataset.max_length)
print(test_dataset.max_length)

In [None]:
from torch.utils.data import DataLoader
## Shared loader settings; the seed is fixed before the loaders are created.
num_workers = 0
batch_size = 8
torch.manual_seed(123)

## Training loader: shuffled, and the final incomplete batch is dropped.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True,
)

## Validation loader: no shuffle argument, and a smaller final batch is kept.
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)

## Test loader: same settings as the validation loader.
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)

In [None]:
## Exhaust the validation loader so the loop variables end up holding its last batch,
## then print that batch's shapes (the last batch may be smaller since drop_last=False).
for input_batch , target_batch in val_loader : 
    pass
print(f"Input batch shape  : {input_batch.shape}")
print(f"Target batch shape : {target_batch.shape}")

In [None]:
## Number of batches each loader yields per pass.
print(f"{len(train_loader)} : training batches")
print(f"{len(val_loader)}  : validation batches")
print(f"{len(test_loader)}  : test batches")

Model Preparation

In [None]:
## Key into models_config that selects the architecture to load.
chosen_model = "gpt2_small (124M)"
input_prompt = "Every effort moves you"  ## prompt used for the text-generation sanity check below
## Settings common to every model size; the size-specific entries are merged in below.
base_config = {
    "vocab_size":50257,
    "context_length":1024,
    "drop_rate":0.0,
    "qkv_bias":True
}

## Size-specific settings (embedding width, number of layers, number of attention heads).
models_config = {
    "gpt2_small (124M)" : {"emb_dim":768 , "n_layers":12 ,"n_heads":12} , 
    "gpt2_medium (355M)" : {"emb_dim":1024 , "n_layers":24 ,"n_heads":16} , 
    "gpt2_large (774M)" : {"emb_dim":1280 , "n_layers":36,"n_heads":20} , 
    "gpt2_xl (1558M)" : {"emb_dim":1600 , "n_layers":48 ,"n_heads":25} , 
}
## Merge the chosen model's settings into base_config in place.
base_config.update(models_config[chosen_model])
print(base_config)


In [None]:
from gpt_download import download_and_load_gpt2
from modules import  GPTModel , load_weights_into_gpt
## Extract the size tag from the model name, e.g. "gpt2_small (124M)" -> "124M".
model_size = chosen_model.split(" ")[-1].lstrip("(").rstrip(")")
print(f"model size : ",model_size)
## download_and_load_gpt2 comes from the local gpt_download module.
## NOTE(review): presumably it fetches the pretrained weights into models_dir and returns them -- verify there.
settings , params = download_and_load_gpt2(
    model_size=model_size,
    models_dir="gpt2"
)

In [None]:
## Build the model from base_config, pass it together with the downloaded params to
## load_weights_into_gpt (from the local modules file), and switch it to eval mode.
gpt = GPTModel(base_config)
load_weights_into_gpt(gpt,params)
gpt.eval()

In [None]:
## Reload the local modules file so that edits made to it are picked up without restarting the kernel,
## then import the text-generation helpers from the reloaded module.
import modules
import importlib
importlib.reload(modules)
from modules import generate , text_to_token_ids , ids_token_to_text , generate_next_token

In [None]:
## Sanity check of the pretrained weights: continue input_prompt by up to 20 new tokens.
## NOTE(review): temp / top_k are presumably temperature and top-k sampling settings -- confirm in modules.generate.
ids = generate(
    model=gpt,
    idx = text_to_token_ids(input_prompt,tokenizer),
    context_size=base_config["context_length"],
    max_new_tokens=20,
    temp=1.4,
    top_k=15
)

In [None]:
## Decode the generated token ids back to text and show them.
print(f"Model output : \n{ids_token_to_text(ids,tokenizer)}")

In [None]:
""" 
    before fine-tuning the model, let's test its capacity to follow instructions
    ==> the result shows that the model struggles with the input prompt it was given because it lacks the ability to understand and 
    follow instructions, an ability that is acquired via fine-tuning, as we are going to do next.
"""
## Instruction-style prompt asking the pretrained model for a yes/no spam verdict.
text_2 = (
"Is the following text 'spam'? Answer with 'yes' or 'no':"
" 'You are a winner you have been specially"
" selected to receive $1000 cash or a $2000 award.'"
)
## Let the model extend the prompt by up to 50 tokens and print the decoded result.
token_ids = generate_next_token(
model=gpt,
idx=text_to_token_ids(text_2, tokenizer),
max_new_tokens=50,
context_size=base_config["context_length"]
)
print(ids_token_to_text(token_ids, tokenizer))

In [None]:
## Inspect the model's current output head before it is replaced.
print(gpt.out_head)

In [None]:
""" 
    for our classification task we need the output to be the probabilities of two classes 0:ham , 1:spam
    to do that the out_head layer must map from dim=768 to dim=2 (instead of to 50257)
    this is called selective fine-tuning since we are fine-tuning only the last layers (near the output)
    in order to do this we first need to "freeze" the model : make all existing layers non-trainable
"""
n_classes = 2
## Freeze the pretrained model first: without this step every parameter keeps requires_grad=True,
## so the whole network would be fine-tuned instead of only the layers selected afterwards.
for param in gpt.parameters() : 
    param.requires_grad = False

torch.manual_seed(123)
## The replacement head is created after the freeze, so it keeps the default requires_grad=True
## and stays trainable. It maps emb_dim features to the two classes.
gpt.out_head = torch.nn.Linear(
    in_features=base_config["emb_dim"],
    out_features=n_classes
)
print(gpt.out_head)

In [None]:
## The last transformer block and the final normalization layer are also marked trainable (this may improve the classifier).
## NOTE(review): setting requires_grad=True only changes something if these parameters were frozen beforehand --
## confirm that the model's parameters have been frozen before this cell runs.

for param in gpt.trf_blocks[-1].parameters() : 
    param.requires_grad = True
for param in gpt.final_norm.parameters() : 
    param.requires_grad = True

In [None]:
## Encode a short sample text and add a leading batch dimension with unsqueeze(0).
inputs = tokenizer.encode("Do you have time")
inputs = torch.tensor(inputs).unsqueeze(0)
print(f"Inputs : {inputs}")
print(inputs.shape)

In [None]:
## `device` is only defined in a later cell, so referencing it here raised NameError when the
## notebook was run top to bottom. The inputs are instead moved to whatever device the model's
## parameters currently live on, which also keeps the cell working after gpt.to(device).
with torch.no_grad() : 
    outputs = gpt(inputs.to(next(gpt.parameters()).device))
print(f"Output : {outputs}")
print(f"Output shape : {outputs.shape}") ## with the original output head a similar input would have [1,4,50257] as output shape
print(f"Last output token : {outputs[:,-1,:]}")

In [None]:
## Turn the logits of the last position into class probabilities and pick the most likely class.
probs = torch.softmax(outputs[:,-1,:] , dim=-1)
print(f"probs : {probs}")
label = torch.argmax(probs)
print(f"class : {label.item()}")

In [None]:
def calc_accuracy(data_loader , model , device ,  num_batches = None) : 
    """Return the fraction of correctly classified examples in ``data_loader``.

    The predicted class of a sequence is the argmax over the logits at its
    last position. The model is switched to eval mode and left in that mode.

    Args:
        data_loader: sized iterable yielding ``(input_batch, target_batch)``
            tensor pairs.
        model: module whose output is indexed as ``[batch, position, class]``.
        device: device the batches are moved to before the forward pass.
        num_batches: maximum number of batches to evaluate; ``None`` means
            all batches of the loader.

    Returns:
        Accuracy as a float in [0, 1], or NaN when no example was evaluated
        (empty loader or ``num_batches`` of 0). This matches
        ``calc_loss_loader``, which reports NaN for an empty loader, instead
        of failing with ZeroDivisionError.
    """
    model.eval()
    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(len(data_loader), num_batches)

    correct_preds, num_examples = 0, 0
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i >= num_batches:
            break
        input_batch = input_batch.to(device)
        target_batch = target_batch.to(device)
        with torch.no_grad():
            ## only the last position's logits are used for classification
            logits = model(input_batch)[:, -1, :]
        preds = torch.argmax(logits, dim=-1)
        num_examples += preds.shape[0]
        correct_preds += (preds == target_batch).sum().item()

    if num_examples == 0:
        return float("nan")
    return correct_preds / num_examples

In [None]:
import torch

In [None]:
## Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
## Baseline accuracy before any fine-tuning, estimated on at most 10 batches per split.
torch.manual_seed(123)
gpt.to(device)
train_accuracy = calc_accuracy(train_loader , gpt,device,num_batches=10)
val_accuracy = calc_accuracy(val_loader , gpt,device,num_batches=10)
test_accuracy = calc_accuracy(test_loader , gpt,device,num_batches=10)

print(f"Training accuracy : {train_accuracy}")
print(f"Validation accuracy : {val_accuracy}")  
print(f"Test accuracy : {test_accuracy}")

In [None]:
### Calculating the Loss over all the input batch :
def calc_loss_batch(input_batch,target_batch,model,device) : 
    """Cross-entropy loss of one batch, computed from the logits at the last position.

    Both tensors are moved to ``device`` before the forward pass; the loss is
    returned as a tensor so the caller can backpropagate through it.
    """
    inputs_on_device = input_batch.to(device)
    targets_on_device = target_batch.to(device)
    all_logits = model(inputs_on_device)
    last_position_logits = all_logits[:, -1, :]
    return torch.nn.functional.cross_entropy(last_position_logits, targets_on_device)

### Calculating the Loss in the data loader :
def calc_loss_loader(data_loader,model,device,num_batches=None) :
    """Average the per-batch loss over the first ``num_batches`` batches of a loader.

    Args:
        data_loader: sized iterable yielding ``(input_batch, target_batch)`` pairs.
        model: model handed to ``calc_loss_batch``.
        device: device handed to ``calc_loss_batch``.
        num_batches: maximum number of batches to use; ``None`` means all
            batches of the loader.

    Returns:
        Mean batch loss as a float, or NaN when there is nothing to average
        (empty loader, or ``num_batches`` of 0 or less, which previously
        failed with ZeroDivisionError).
    """
    if len(data_loader) == 0:
        return float("nan")
    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))
    if num_batches <= 0:
        return float("nan")

    total_loss = 0.
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i >= num_batches:
            break
        total_loss += calc_loss_batch(input_batch, target_batch, model, device).item()
    return total_loss / num_batches
        


In [None]:
## Baseline loss estimated on at most 5 batches per split; gradients are disabled for the evaluation.
with torch.no_grad() : 
    train_loss = calc_loss_loader(train_loader,gpt,device,num_batches=5)
    val_loss   = calc_loss_loader(val_loader,gpt,device,num_batches=5)
    test_loss  = calc_loss_loader(test_loader,gpt,device,num_batches=5)
print(f"Training Loss   : {train_loss:.3f}")
print(f"Validation Loss : {val_loss:.3f}")  
print(f"Test Loss       : {test_loss:.3f}") 

In [None]:
def evaluate_model(model,train_loader,val_loader,device,eval_iter):
    """Estimate training and validation loss on at most ``eval_iter`` batches each.

    The model is put in eval mode with gradients disabled for the measurement
    and switched back to train mode afterwards.

    Returns:
        Tuple ``(train_loss, val_loss)``.
    """
    model.eval()
    with torch.no_grad():
        ## tuple() consumes the generator right here, i.e. inside the no_grad context
        losses = tuple(
            calc_loss_loader(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        )
    model.train()
    return losses

In [None]:
def train_classifier_simple(model,train_loader,val_loader,optimizer,device,num_epochs,eval_freq,eval_iter):
    """Run a plain training loop and collect loss and accuracy histories.

    Every ``eval_freq`` optimization steps (starting with the very first one)
    the train/validation loss is estimated on ``eval_iter`` batches and
    printed. At the end of each epoch the train/validation accuracy is
    estimated on ``eval_iter`` batches and printed.

    Returns:
        ``(train_losses, val_losses, train_accs, val_accs, ex_seen)`` where the
        first four are lists of recorded values and ``ex_seen`` is the number
        of training examples processed.
    """
    loss_history_train, loss_history_val = [], []
    acc_history_train, acc_history_val = [], []
    examples_seen = 0
    step = 0  ## index of the optimization step just taken; the first step is 0

    for epoch_idx in range(num_epochs):
        model.train()
        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()
            batch_loss = calc_loss_batch(input_batch, target_batch, model, device)
            batch_loss.backward()
            optimizer.step()
            examples_seen += input_batch.shape[0]

            if step % eval_freq == 0:
                train_loss, eval_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter
                )
                loss_history_val.append(eval_loss)
                loss_history_train.append(train_loss)
                print(f"Ep {epoch_idx+1} (step {step:06d}) :"
                      f"Train Loss {train_loss:.3f}   "
                      f"Validation Loss {eval_loss:.3f}")
            step += 1

        ## end-of-epoch accuracy estimate (calc_accuracy leaves the model in eval mode;
        ## the next epoch starts by switching back to train mode)
        epoch_train_acc = calc_accuracy(train_loader, model, device, num_batches=eval_iter)
        epoch_val_acc = calc_accuracy(val_loader, model, device, num_batches=eval_iter)
        print(f"Training accuracy: {epoch_train_acc*100:.2f}% | ", end="")
        print(f"Validation accuracy: {epoch_val_acc*100:.2f}%")
        acc_history_train.append(epoch_train_acc)
        acc_history_val.append(epoch_val_acc)

    return loss_history_train, loss_history_val, acc_history_train, acc_history_val, examples_seen

In [None]:
import time 
## Fine-tune for 5 epochs with AdamW and report the wall-clock training time in minutes.
start_time = time.time()
torch.manual_seed(123)
optimizer = torch.optim.AdamW(gpt.parameters(),lr=5e-5,weight_decay=0.1)
num_epochs=5

## Losses are logged every 50 steps, each estimate using at most 5 batches per loader.
train_losses , val_losses , train_accs , val_accs , ex_seen = \
    train_classifier_simple(gpt,train_loader,val_loader,optimizer,device
                            ,num_epochs,eval_freq=50,eval_iter=5)

end_time = time.time()
exec_time = (end_time - start_time ) / 60
print(f"Training time : {exec_time:.2f} minutes")


In [None]:
import matplotlib.pyplot as plt
def plot_values(epochs_seen,examples_seen,train_values,val_values,label="loss") : 
    """Plot training and validation curves against epochs, with a second x-axis in examples seen.

    The figure is written to ``"{label}-plot.pdf"`` and then shown.
    """
    figure, epoch_axis = plt.subplots(figsize=(5, 3))
    curves = (
        (train_values, {"label": f"Training {label}"}),
        (val_values, {"linestyle": "-.", "label": f"validation {label}"}),
    )
    for values, plot_options in curves:
        epoch_axis.plot(epochs_seen, values, **plot_options)
    epoch_axis.set_xlabel("Epochs")
    epoch_axis.set_ylabel(label.capitalize())
    epoch_axis.legend()

    ## Fully transparent curve on a twin axis: it is drawn only so the top x-axis gets a scale.
    examples_axis = epoch_axis.twiny()
    examples_axis.plot(examples_seen, train_values, alpha=0)
    examples_axis.set_xlabel("examples seen")

    figure.tight_layout()
    plt.savefig(f"{label}-plot.pdf")
    plt.show()
    

In [None]:
## x-axes for the loss curves: evenly spaced points from 0 to num_epochs (and 0 to ex_seen),
## one per recorded loss value.
epochs_tensor = torch.linspace(0,num_epochs,len(train_losses))
examples_seen_tensor = torch.linspace(0,ex_seen,len(train_losses))
plot_values(epochs_tensor,examples_seen_tensor,train_losses,val_losses)


In [None]:
## Same plot for the accuracies, with one point per recorded accuracy value (one per epoch).
epochs_tensor = torch.linspace(0,num_epochs,len(train_accs))
examples_seen_tensor = torch.linspace(0,ex_seen,len(train_accs))
plot_values(epochs_tensor,examples_seen_tensor,train_accs,val_accs,label="Accuracy")

In [None]:
## Final accuracy over the complete train / validation / test loaders (no batch limit).
train_accuracy = calc_accuracy(train_loader,gpt,device)
val_accuracy = calc_accuracy(val_loader,gpt,device)
test_accuracy = calc_accuracy(test_loader,gpt,device)
print(f"Training Accuracy   : {train_accuracy:.2f}")
print(f"Validation Accuracy : {val_accuracy:.2f}")
print(f"Test Accuracy       : {test_accuracy:.2f}")

In [None]:
def classify_review(text,model,tokenizer,device,max_length = None ,pad_token_id=50256):
    """Classify ``text`` as "spam" or "not spam" with the fine-tuned model.

    The text is tokenized, truncated to the usable sequence length and, when
    ``max_length`` is given, right-padded with ``pad_token_id`` up to that
    length. The class is the argmax over the logits at the last position.

    Args:
        text: message to classify.
        model: classifier exposing a ``pos_emb`` embedding layer.
            Assumes ``pos_emb`` is an ``nn.Embedding`` with one row per
            supported position -- TODO confirm against the model definition.
        tokenizer: object whose ``encode(text)`` returns a list of token ids.
        device: device the input tensor is created on.
        max_length: sequence length used during training. Longer texts are
            truncated to it, shorter ones padded. It is capped at the model's
            context length. With ``None`` the text is only truncated to the
            context length and not padded.
        pad_token_id: token id used for padding.

    Returns:
        ``"spam"`` when the predicted label is 1, otherwise ``"not spam"``.

    Raises:
        ValueError: if there is no token at all to feed to the model.
    """
    model.eval()
    token_ids = tokenizer.encode(text)
    ## nn.Embedding weights are (num_embeddings, embedding_dim): the number of
    ## positions is dimension 0; dimension 1 is the embedding width.
    supported_context_length = model.pos_emb.weight.shape[0]
    if max_length is None:
        sequence_length = supported_context_length
    else:
        sequence_length = min(max_length, supported_context_length)

    ## truncate, so that inference sees at most the length the model was trained on
    input_ids = list(token_ids[:sequence_length])
    if max_length is not None:
        ## pad on the right up to the target length
        input_ids += [pad_token_id] * (sequence_length - len(input_ids))
    if not input_ids:
        raise ValueError("text produced no tokens to classify")

    input_tensor = torch.tensor(input_ids, device=device).unsqueeze(0) ## adds batch dimension
    with torch.no_grad():
        logits = model(input_tensor)[:, -1, :]
    predicted_label = torch.argmax(logits, dim=-1).item()
    return "spam" if predicted_label == 1 else "not spam"
    

In [None]:
## Example message, classified with the same sequence length that was used for training.
text_1 = (
"You are a winner you have been specially"
" selected to receive $1000 cash or a $2000 award."
)
print(classify_review(text_1,gpt,tokenizer,device,max_length=train_dataset.max_length))

In [None]:
## Second example message; note that this rebinds text_2, which was used for a prompt earlier in the notebook.
text_2 = (
"Hey, just wanted to check if we're still on"
" for dinner tonight? Let me know!"
)
print(classify_review(text_2,gpt,tokenizer,device,max_length=train_dataset.max_length))

In [None]:
## Save the model's parameters (its state_dict) to disk.
torch.save(gpt.state_dict() , "review_classifier.pth")