In [1]:
print("Importing packages...")
import numpy as np
import torch
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from tensorflow.keras import preprocessing as k_preproc
from GPT2_utils import *

print("Loading tokenizer...")
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
print("Loading pretrained model...")
model = GPT2LMHeadModel.from_pretrained('gpt2')



Importing packages...
Loading tokenizer...
Loading pretrained model...


## Parameters

In [2]:
# Do not change this parameter, it is used for testing
TEST_BATCH_SIZE = 1

VOCAB_SIZE = tokenizer.vocab_size
# Max epochs number, in case you want to stop at a certain epoch
NUM_EPOCHS = 70
MAX_LEN = 500
TRAIN_BATCH_SIZE = 5
VAL_BATCH_SIZE = 5
# Set to false if you want to generate new data
LOAD_DATA = True

# Path to data used whenever LOAD_DATA == False to generate new splits
EXPERIMENT_2_DATA_PATH = "Data/GPT2_fixed_data/encode_geo_aggregated.csv"

# This parameter indicates the ID of the training, used for directory for results and models saved
TRAIN_NUMBER = 4
    
#LOAD_MODEL_PATH = f"Saved_models/4th_train_GPT2/GPT2_{start_epochs - 1}epochs.pth"
VAL_LOSS_INCREASED = False
SPLIT = 0.9


In [3]:
tokenizer.add_special_tokens({'pad_token': '<pad>'})
model.resize_token_embeddings(len(tokenizer))

print("Activating GPU...")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device used: {}".format(device))

Activating GPU...
Device used: cuda:0


In [4]:
if(LOAD_DATA):
    #model.load_state_dict(torch.load(LOAD_MODEL_PATH))
    train = pd.read_csv("Data/GPT2_fixed_data/trainset.csv")
    test = pd.read_csv("Data/GPT2_fixed_data/testset.csv")
    val = pd.read_csv("Data/GPT2_fixed_data/validationset.csv")
    
    
else:
    train_df = pd.read_csv(EXPERIMENT_2_DATA_PATH)
    print("Splitting in train/test/val with split = {}".format(str(SPLIT)))
    train, test = train_test_split(train_df, train_size = SPLIT)
    train, val = train_test_split(train, train_size = SPLIT)
    print("Saving data...")
    test.to_csv(f"Data/GPT2_fixed_data/testset.csv", index = False)
    val.to_csv(f"Data/GPT2_fixed_data/validationset.csv", index = False)
    train.to_csv(f"Data/GPT2_fixed_data/trainset.csv", index = False)


### Train and validation data will be using "Professor forcing" so they will be composed of single sequences in the form "Input = output"

### Test data will just contain input sequences and - separately - target data

In [5]:
print("Generating test data...")
test_inp, test_out = generate_test_data(test)
test_tokens_X, test_tokens_y = tokenize_test_sequences(test_inp, test_out, tokenizer, MAX_LEN)
X_tensors, Y_tensors = create_tensors(test_tokens_X), create_tensors(test_tokens_y)
test_data = []
for x,y in zip(X_tensors, Y_tensors):
    tmp = [x,y]
    test_data.append(tmp)

testloader = torch.utils.data.DataLoader(test_data, batch_size=TEST_BATCH_SIZE,shuffle=False)

print("Generating professor forcing train/val data...")
train_data = generate_professor_forcing(train)
val_data = generate_professor_forcing(val)
print("Tokenizing train/val data...")
train_tokenized_data = tokenize_sequences(train_data, tokenizer, MAX_LEN)
val_tokenized_data = tokenize_sequences(val_data, tokenizer, MAX_LEN)
print("Padding train/val data...")
train_padded_data = pad_sequences(train_tokenized_data, value = int(tokenizer.pad_token_id), maxlen = MAX_LEN)
val_padded_data = pad_sequences(val_tokenized_data, value = int(tokenizer.pad_token_id), maxlen = MAX_LEN)
print("Converting train/val sequences to tensors...")
tr_data = create_tensors(train_padded_data)
vl_data = create_tensors(val_padded_data)



trainloader = torch.utils.data.DataLoader(tr_data, batch_size = TRAIN_BATCH_SIZE,shuffle=True)
valloader = torch.utils.data.DataLoader(vl_data, batch_size = VAL_BATCH_SIZE, shuffle = False)


Generating test data...
241 values were excluded because exceed a MAX LENGTH of: 500
Generating professor forcing train/val data...
Tokenizing train/val data...
856 values were excluded because exceed a MAX LENGTH of: 500
83 values were excluded because exceed a MAX LENGTH of: 500
Padding train/val data...
Converting train/val sequences to tensors...


## Activating loss, model and optimizer

In [6]:
loss = GPT2Loss()
running_loss = 0.0
# model = model.to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
validation_losses = []

## Training

In [7]:
epochs = NUM_EPOCHS
for epoch in range(epochs):
    iterations = tqdm(trainloader)
    result_loss = 0.0
    val_loss = 0.0
    for i, batch in enumerate(iterations):
        batch = batch.to(device)
        # zero the parameter gradients
        optimizer.zero_grad()

        outputs = model(batch)[0]

        result_loss = loss(outputs,batch)
        running_loss += result_loss.item()
        training_loss = running_loss / (i + 1)
        result_loss.backward()
        optimizer.step()
        iterations.set_description('Training Epoch : {}/{} - Training Loss: {}'.format(epoch, epochs, training_loss))
        del batch
        
    ## VALIDATION
    val_iterations = tqdm(valloader)
    running_loss = 0.0
    with torch.no_grad():
        for i, batch in enumerate(val_iterations):
            batch = batch.to(device)
            outputs = model(batch)[0]
            result_loss = loss(outputs, batch)
            running_loss += result_loss.item()
            val_loss = running_loss / (i + 1)
            val_iterations.set_description('Validation Epoch : {}/{} - Validation Loss: {}'.format(epoch, epochs, val_loss))
            del batch
            
    ## SAVING AND EVALUATING EARLY STOPPING
    print("Saving model...")
    torch.save(model.state_dict(), f"Saved_models/{TRAIN_NUMBER}th_train_GPT2/GPT2_{epoch}epochs.pth")
    print(f"VALIDATION LOSS = {val_loss} FOR EPOCH {epoch}")
    #APPENDING VALIDATION LOSSES
    validation_losses.append([epoch, val_loss])
    if(epoch == 0):
        old_val_loss = 999
    else:
        old_val_loss = validation_losses[-2][1]
    if(epoch > 3 and val_loss > old_val_loss and VAL_LOSS_INCREASED):
        print("Stopping because validation loss overcame old validation loss for second time in a row...")
        break
    elif(epoch > 3 and val_loss > old_val_loss and not VAL_LOSS_INCREASED):
        print("Validation loss increased! Waiting next epoch to see if stopping is necessary...")
        VAL_LOSS_INCREASED = True
    else:
        VAL_LOSS_INCREASED = False

  0%|          | 0/2238 [00:00<?, ?it/s]

RuntimeError: Expected object of device type cuda but got device type cpu for argument #1 'self' in call to _th_index_select

## Loading best model

In [None]:
torch.cuda.empty_cache()
model.load_state_dict(torch.load(f"Saved_models/{TRAIN_NUMBER}th_train_GPT2/GPT2_{epoch - 2}epochs.pth"))
model = model.to(device)

## Testing

In [None]:
results = []
with torch.no_grad():
    for it, elem in enumerate(tqdm(testloader)):
        inputs = elem[0]
        labels = elem[1]
        target_sequence = inputs[0,0].tolist() + labels[0,0].tolist()
        inputs = inputs.to(device)
        predicted_token = 0
        generated_sequence = inputs[0,0].tolist()
        # Computing output tensors
        # Maximizing last tensor (predicted_token_tensor)
        # Concatenating to the input sequence
        # Appending [prediction, target] to results
        while(predicted_token != tokenizer.encode("_") and predicted_token != tokenizer.pad_token_id 
              and len(generated_sequence) < MAX_LEN):
            out = model(inputs)[0][0,0]
            last_tensor = out[-1]
            predicted_token_tensor = torch.argmax(last_tensor)
            predicted_token = predicted_token_tensor.item()
            inputs = torch.cat((inputs,predicted_token_tensor.view(1,1,1)), dim=-1)
            generated_sequence.append(predicted_token)
        tmp = [generated_sequence, target_sequence]
        results.append(tmp)
        del inputs

print("Aggregating inputs and outputs...")
text_results = []
for elem in results:
    pred = tokenizer.decode(elem[0])
    targ = tokenizer.decode(elem[1])
    tmp = [pred,targ]
    text_results.append(tmp)

print("Saving result dataframe...")
text_test_df = pd.DataFrame(text_results, columns=["Predicted", "Target"])
text_test_df.to_csv(f"Results/GPT2_{epoch-2}epochs_early_stopping_{TRAIN_NUMBER}th_train.csv", index = False)
