In [None]:
## --LIBRARY IMPORTS--

import os 

import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from model_architecture import GPTModel
import LoRA_parameterizer as lp

# --DEVICE & DATA--
# Run on the GPU when torch reports one, otherwise stay on the CPU.

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# One CSV per DailyMail split, all read from the dataset directory next to this notebook.
train_df = pd.read_csv(os.path.join('../dataset/', 'dailymail/train.csv'))
val_df = pd.read_csv(os.path.join('../dataset/', 'dailymail/validation.csv'))
test_df = pd.read_csv(os.path.join('../dataset/', 'dailymail/test.csv'))

# Report the chosen device, then the shape of each split, one value per line.
print(device, train_df.shape, val_df.shape, test_df.shape, sep = '\n')

# --MODEL--
# Instantiate the architecture, then restore its weights from the checkpoint on disk.

model = GPTModel()

# NOTE(review): torch.load unpickles the file, so only point it at checkpoints you trust.
model_state = torch.load(
    os.path.join('../models/subset19/', 'mp_rank_00_model_states.pt'),
    map_location = device,
)

# The state dict lives under the checkpoint's 'module' entry. The load is non-strict, and the
# two values it hands back are printed below so any key mismatch is visible.
missing, unexpected = model.load_state_dict(model_state['module'], strict = False)

print("Missing keys:", missing)
print("Unexpected keys:", unexpected)

# Move the restored model onto the selected device.
model = model.to(device)

In [2]:
class DailyMailSet(Dataset):
    """Dataset wrapper around a dataframe with 'text' and 'highlights' columns.

    Every sample is a dict carrying those two values for a single row.
    """

    def __init__(self, df):
        # df is kept as-is; it is only used through len() and positional .iloc lookups
        self.df = df

    def __len__(self):
        """Number of rows in the wrapped dataframe."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the row at position `idx` as {'text': ..., 'highlights': ...}."""
        row = self.df.iloc[idx]
        return {field: row[field] for field in ('text', 'highlights')}

In [3]:
# One shuffling DataLoader per split: batches of 32 for train / validation, 16 for test.
# NOTE(review): validation and test are shuffled too — confirm that is intended.

train_dl = DataLoader(dataset = DailyMailSet(train_df), shuffle = True, batch_size = 32)

val_dl = DataLoader(dataset = DailyMailSet(val_df), shuffle = True, batch_size = 32)

test_dl = DataLoader(dataset = DailyMailSet(test_df), shuffle = True, batch_size = 16)

In [4]:
# Handles that the loaded model already carries.
optimizer, tokenizer = model.optimizer, model.config.tokenizer

# End-of-sequence and padding ids are stored on the model itself.
eos_token, pad_token = model.eos_token_idx, model.pad_token_idx

# The start-of-text id is looked up through the tokenizer by its token string.
sos_token = tokenizer.convert_tokens_to_ids('<|startoftext|>')

# No dedicated context token was added to the tokenizer, so the id of the
# '<|unknown|>' token is reused as the context marker.
context_token = tokenizer.convert_tokens_to_ids('<|unknown|>')

print("e, p, s, c : ", eos_token, pad_token, sos_token, context_token)

e, p, s, c :  50256 50258 50257 50259


In [None]:
# Snapshot each layer's weight tensor before fine-tuning so that, after SFT, the
# originals can be compared against the model to make sure they did not get lost.
#
# Keys are the module names from model.named_modules(); values are independent copies
# of that module's `.weight`. Only weights are captured (no biases).
# NOTE(review): the copies live on the same device as the model, so this roughly
# duplicates the weight memory — move them to CPU if that is a problem.

original_weights = {}

for name, module in model.named_modules():

    # named_modules() yields modules, not parameters; containers have no `.weight`,
    # and some layers may expose `.weight` as None, so check for an actual tensor.
    weight = getattr(module, "weight", None)

    if isinstance(weight, torch.Tensor):

        # detach + clone gives a copy outside autograd that later updates to the model cannot change
        original_weights[name] = weight.detach().clone()
        print("copied layer: ", name,"of size: ", weight.shape)


In [None]:
# Hand the loaded model to lp.apply_lora_to_model and rebind `model` to whatever it returns.
# presumably the result is the same network with LoRA adapters attached — confirm in LoRA_parameterizer
# NOTE(review): this cell feeds `model` back into itself, so running it a second time passes in
# the already-converted model — confirm apply_lora_to_model tolerates that
model = lp.apply_lora_to_model(model)