In [1]:
import pandas as pd
import numpy as np
import os
import gc

import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# set a seed value
torch.manual_seed(555)

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score, accuracy_score

import transformers
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from transformers import AdamW

import warnings
warnings.filterwarnings("ignore")


In [2]:
# load train test data
# Read the competition's train and test splits from the working directory.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
# Shuffle the training rows. A fixed random_state keeps fold membership
# reproducible across kernel restarts (the global seeds are only set in a
# later cell, so an unseeded shuffle here would differ on every run).
df = shuffle(df_train, random_state=1024)

# initialize kfold
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1024)

# for stratification
y = df['label']

# Put the folds into a list. This is a list of (train_positions, val_positions)
# tuples. The arrays hold *positional* row numbers into df, not index labels.
fold_list = list(kf.split(df, y))

train_df_list = []
val_df_list = []

for train_positions, val_positions in fold_list:

    # df's index was permuted by the shuffle above, so the positions must be
    # resolved with iloc. Matching them against index labels (df.index.isin)
    # would select different rows than the ones the stratifier balanced.
    train_df_list.append(df.iloc[train_positions])
    val_df_list.append(df.iloc[val_positions])


print(len(train_df_list))
print(len(val_df_list))


5
5


In [None]:
# Pretrained checkpoint used for both the tokenizer and the classifier.
MODEL_TYPE = 'xlm-roberta-base'

# Cross-validation layout.
NUM_FOLDS, NUM_FOLDS_TO_TRAIN = 5, 3

# Optimisation schedule.
NUM_EPOCHS, BATCH_SIZE = 3, 32
L_RATE = 1e-5

# Token length every encoded premise/hypothesis pair is padded to.
MAX_LEN = 256

# Worker processes handed to the DataLoaders.
NUM_CORES = os.cpu_count()

NUM_CORES

In [5]:
# Load the tokenizer that matches MODEL_TYPE (fetched on first use).
# It is kept as a module-level name because CompDataset.__getitem__ reads
# `tokenizer` as a global rather than receiving it as an argument.
tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_TYPE)

In [6]:
# Dataset that tokenizes premise/hypothesis pairs on the fly.
class CompDataset(Dataset):
    """Map-style dataset over a dataframe of premise/hypothesis pairs.

    Each item is tokenized with the module-level ``tokenizer`` and padded to
    the module-level ``MAX_LEN``.

    Args:
        df: Dataframe with ``premise`` and ``hypothesis`` columns, plus a
            ``label`` column when ``train_data`` is True. Rows are looked up
            with ``.loc[index]`` for ``index`` in ``range(len(df))``, so the
            frame needs a 0..n-1 index (call ``reset_index(drop=True)``).
        train_data: True for labelled data (train/validation); each sample is
            then ``(input_ids, attention_mask, label)``. False for unlabelled
            data (test); each sample is ``(input_ids, attention_mask)``.
    """

    def __init__(self, df, train_data=True):
        self.df_data = df
        self.train_data = train_data

    def __getitem__(self, index):
        """Return the encoded tensors (and label, if labelled) for one row."""

        # get the sentence pair from the dataframe
        sentence1 = self.df_data.loc[index, 'premise']
        sentence2 = self.df_data.loc[index, 'hypothesis']

        # Process the sentence pair
        encoded_dict = tokenizer.encode_plus(
                    sentence1, sentence2,           # Sentences to encode.
                    add_special_tokens = True,      # Add the special tokens.
                    max_length = MAX_LEN,           # Pad & truncate all sentences.
                    pad_to_max_length = True,
                    return_attention_mask = True,   # Construct attn. masks.
                    return_tensors = 'pt',          # Return pytorch tensors.
               )

        # return_tensors='pt' adds a leading batch axis of size 1; [0] drops it
        # so the DataLoader can stack samples into a batch itself.
        padded_token_list = encoded_dict['input_ids'][0]
        att_mask = encoded_dict['attention_mask'][0]

        sample = (padded_token_list, att_mask)

        # Only labelled data carries a target. The unlabelled test frame is
        # built with train_data=False and must not touch the 'label' column.
        if self.train_data:
            target = (torch.tensor(self.df_data.loc[index, 'label']),)
            sample += target

        return sample

    def __len__(self):
        """Number of rows in the wrapped dataframe."""
        return len(self.df_data)

In [7]:
# Seed every random number generator the notebook touches with one value.
seed_val = 101

# Order: Python stdlib, NumPy, PyTorch CPU, then all CUDA devices.
for seed_fn in (random.seed,
                np.random.seed,
                torch.manual_seed,
                torch.cuda.manual_seed_all):
    seed_fn(seed_val)

In [8]:
# Per-fold history of validation accuracies: one inner list per fold,
# to which the training loop appends one score per epoch.
fold_val_acc_list = [[] for _ in range(NUM_FOLDS)]


In [None]:
# Train one model per fold, interleaved by epoch.
#
# For every epoch, each fold model is trained for one pass over its training
# split and scored on its validation split. The weights are written to
# 'model_<fold>.bin' after the first epoch and afterwards only when the
# validation accuracy beats the fold's previous best; later epochs resume from
# that checkpoint. Per-fold accuracies are appended to fold_val_acc_list and
# the mean over folds is printed as the epoch's CV accuracy.
for epoch in range(NUM_EPOCHS):

    print("\nNum folds used for training:", NUM_FOLDS)
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, NUM_EPOCHS))

    # For this epoch, store the val acc scores for each fold in this list.
    # We will use this list to calculate the cv at the end of the epoch.
    epoch_acc_scores_list = []

    for fold_index in range(NUM_FOLDS):

        print('\n== Fold Model', fold_index)

        # Checkpoint file that belongs to this fold.
        model_name = 'model_' + str(fold_index) + '.bin'

        if epoch == 0:
            # Start every fold from the pretrained weights.
            model = XLMRobertaForSequenceClassification.from_pretrained(MODEL_TYPE, num_labels = 3,)
        else:
            # Resume from this fold's best checkpoint so far.
            model.load_state_dict(torch.load(model_name))

        # Build a fresh optimizer for every fold pass. Reusing one optimizer
        # across folds would carry Adam's moment estimates from a different
        # fold's model into this one.
        optimizer = AdamW(model.parameters(), lr = L_RATE, eps = 1e-8)

        # Initialize the fold dataframes.
        # Reset the indices: CompDataset looks rows up by 0..n-1 labels.
        df_train = train_df_list[fold_index].reset_index(drop=True)
        df_val = val_df_list[fold_index].reset_index(drop=True)

        # Create the dataloaders
        train_data = CompDataset(df_train)
        val_data = CompDataset(df_val)

        train_dataloader = torch.utils.data.DataLoader(train_data,
                                                batch_size=BATCH_SIZE,
                                                shuffle=True,
                                                num_workers=NUM_CORES)

        # Validation order does not affect the metrics, so keep it fixed.
        val_dataloader = torch.utils.data.DataLoader(val_data,
                                                batch_size=BATCH_SIZE,
                                                shuffle=False,
                                                num_workers=NUM_CORES)

        # TRAINING
        print('Training...')

        # put the model into train mode
        model.train()

        # Make sure gradients are on (an earlier cell run may have left them off).
        torch.set_grad_enabled(True)

        # Reset the total loss for this epoch.
        total_train_loss = 0

        for i, batch in enumerate(train_dataloader):

            train_status = 'Batch ' + str(i+1) + ' of ' + str(len(train_dataloader))

            print(train_status, end='\r')

            b_input_ids = batch[0]
            b_input_mask = batch[1]
            b_labels = batch[2]

            # Zero the gradients left over from the previous step.
            optimizer.zero_grad()

            outputs = model(b_input_ids,
                        attention_mask=b_input_mask,
                        labels=b_labels)

            # Get the loss from the outputs tuple: (loss, logits)
            loss = outputs[0]

            # Convert the loss from a torch tensor to a number.
            # Calculate the total loss.
            total_train_loss = total_train_loss + loss.item()

            # Perform a backward pass to calculate the gradients.
            loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Use the optimizer to update Weights
            optimizer.step()

        print('Train loss:' ,total_train_loss)

        # VALIDATION
        print('\nValidation...')

        # Put the model in evaluation mode.
        model.eval()

        # Reset the total loss for this epoch.
        total_val_loss = 0

        targets_list = []
        val_preds_list = []

        # Gradients are not needed for scoring.
        with torch.no_grad():

            for j, val_batch in enumerate(val_dataloader):

                val_status = 'Batch ' + str(j+1) + ' of ' + str(len(val_dataloader))

                print(val_status, end='\r')

                b_input_ids = val_batch[0]
                b_input_mask = val_batch[1]
                b_labels = val_batch[2]

                outputs = model(b_input_ids,
                        attention_mask=b_input_mask,
                        labels=b_labels)

                # Get the loss from the outputs tuple: (loss, logits)
                loss = outputs[0]
                total_val_loss = total_val_loss + loss.item()

                # Collect the logits and the labels as numpy arrays on the CPU.
                val_preds_list.append(outputs[1].detach().cpu().numpy())
                targets_list.extend(b_labels.to('cpu').numpy())

        # Stack once at the end instead of re-copying the array every batch.
        stacked_val_preds = np.vstack(val_preds_list)

        # Calculate the validation accuracy for this fold
        y_true = targets_list
        y_pred = np.argmax(stacked_val_preds, axis=1)

        val_acc = accuracy_score(y_true, y_pred)

        epoch_acc_scores_list.append(val_acc)

        print('Val loss:' ,total_val_loss)
        print('Val acc: ', val_acc)

        # Save the best model: always after the first epoch, afterwards only
        # when this epoch beats the fold's best accuracy so far.
        val_acc_list = fold_val_acc_list[fold_index]

        if epoch == 0:
            torch.save(model.state_dict(), model_name)
            print('Saved model as ', model_name)

        elif val_acc > max(val_acc_list):
            torch.save(model.state_dict(), model_name)
            print('Val acc improved. Saved model as ', model_name)

        # Save the val_acc for this fold model
        val_acc_list.append(val_acc)

        # Use the garbage collector to save memory.
        gc.collect()


    # Calculate the CV accuracy score over all folds in this epoch.
    # Divide by the number of scores actually collected so the mean stays
    # correct whichever fold-count constant drives the loop above.
    cv_acc = sum(epoch_acc_scores_list)/len(epoch_acc_scores_list)
    print("\nCV Acc:", cv_acc)


In [None]:
# Display the validation accuracy history: one inner list per fold model,
# holding the scores the training loop appended for that fold.
fold_val_acc_list

In [None]:
# Wrap the test frame in a dataset and a loader.
# shuffle=False keeps the batches in the row order of df_test.
test_data = CompDataset(df_test, train_data=False)

loader_kwargs = dict(batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_CORES)
test_dataloader = DataLoader(test_data, **loader_kwargs)

# Number of batches in the test loader.
print(len(test_dataloader))

In [None]:
# Test Set
# Run every fold checkpoint over the test loader and collect one array of
# raw model outputs per fold in model_preds_list.
print('\nTest Set...')

model_preds_list = []

print('Total batches:', len(test_dataloader))

# The training loop writes a checkpoint for each of the NUM_FOLDS folds, so
# iterate over all of them. Looping over NUM_FOLDS_TO_TRAIN would leave
# trained fold models out of the ensemble.
for fold_index in range(NUM_FOLDS):

    print('\nFold Model', fold_index)

    # Load the fold model's weights into the existing `model` object
    # (left in scope by the training cell).
    path_model = 'model_' + str(fold_index) + '.bin'
    model.load_state_dict(torch.load(path_model))

    # Put the model in evaluation mode.
    model.eval()

    fold_batch_preds = []

    # Gradients are not needed for inference.
    with torch.no_grad():

        for j, h_batch in enumerate(test_dataloader):

            inference_status = 'Batch ' + str(j + 1)

            print(inference_status, end='\r')

            b_input_ids = h_batch[0]
            b_input_mask = h_batch[1]

            outputs = model(b_input_ids,
                    attention_mask=b_input_mask)

            # No labels are passed here, so the predictions are taken from
            # the first element of the outputs.
            preds = outputs[0]

            # Move preds to the CPU
            fold_batch_preds.append(preds.detach().cpu().numpy())

    # Stack the batch predictions once per fold instead of re-copying the
    # growing array on every batch.
    stacked_val_preds = np.vstack(fold_batch_preds)

    model_preds_list.append(stacked_val_preds)

print('\nPrediction complete.')

In [None]:
# Accumulate the per-fold prediction matrices element-wise, starting from
# the first fold and adding the remaining ones in order.
preds = model_preds_list[0]
for fold_preds in model_preds_list[1:]:
    preds = fold_preds + preds

# Mean over the fold models.
num_models = len(model_preds_list)
avg_preds = preds / num_models

# Predicted class per test row = column with the largest averaged value.
test_preds = np.argmax(avg_preds, axis=1)

In [None]:
# Start from the sample submission and overwrite (or add) its
# 'prediction' column with the ensembled class predictions.
df_sample = pd.read_csv('sample_submission.csv').assign(prediction=test_preds)

# Write the submission file without the dataframe index.
df_sample.to_csv('idv_submission.csv', index=False)