## Set up

In [1]:
# !pip install transformers==3.0.2

In [2]:
# Importing the libraries needed

import os
import numpy as np
import pandas as pd
import random

from transformers import AutoConfig, AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup, logging

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, TensorDataset, SequentialSampler, RandomSampler, DataLoader

from tqdm.notebook import tqdm

import gc; gc.enable()
from IPython.display import clear_output

from sklearn.model_selection import StratifiedKFold

# `logging` is the transformers logging module imported above, which exposes
# verbosity helpers rather than the stdlib `basicConfig`; only show errors.
logging.set_verbosity_error()

# Setting up the device for GPU usage

from torch import cuda
# Release cached GPU memory left over from earlier runs in the same kernel
torch.cuda.empty_cache()



In [14]:
# Definitions
# The model is fine-tuned starting from roberta-large
MODEL_DIR = 'roberta-large'
HIDDEN_SIZE = 1024
NUM_HIDDEN_LAYERS = 24

# Directory holding test.csv and sample_submission.csv (the prediction cells
# join their paths against it). train.csv is read from the working directory,
# so default to that; change this if the CSV files live elsewhere.
INPUT_DIR = '.'

# Tokenizer pads/truncates every excerpt to this many tokens
MAX_LENGTH = 300
# Optimizer learning rate and epsilon
LR = 2e-5
EPS = 1e-8

# Seed used for the K-fold split
SEED = 42

NUM_FOLDS = 5

# One full cross-validation run is trained per seed
SEEDS = [66, 17]

EPOCHS = 5
TRAIN_BATCH_SIZE = 8
VAL_BATCH_SIZE = 32

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Prepare Dataset and Dataloader

In [4]:
# Set random seed
def seed_everything(seed):
    """Seed Python, NumPy and PyTorch (CPU and all GPUs) for repeatable runs.

    Args:
        seed: integer seed applied to every random number generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick a different algorithm from run to run,
    # which defeats the deterministic flag above, so keep it switched off.
    torch.backends.cudnn.benchmark = False

In [5]:
# K-Fold
class ContinuousStratifiedKFold(StratifiedKFold):
    """Stratified K-fold for a continuous target.

    The target is cut into equal-width bins and the bin index is used as the
    stratification label, so each fold sees a similar spread of target values.
    """

    def split(self, x, y, groups=None):
        """Yield (train_indices, val_indices) pairs stratified on binned `y`.

        Args:
            x: samples to split; forwarded unchanged to `StratifiedKFold.split`.
            y: continuous target values, one per sample.
            groups: forwarded unchanged to `StratifiedKFold.split`.
        """
        # Number of bins grows with log2 of the sample count: floor(1 + log2(n))
        num_bins = int(np.floor(1 + np.log2(len(y))))
        # labels=False returns integer bin indices instead of interval labels
        bins = pd.cut(y, bins=num_bins, labels=False)
        return super().split(x, bins, groups)

### Dataloader
- A `DataLoader` feeds the training and validation data to the neural network in batches. This is needed because the whole dataset cannot be loaded into memory at once, so the amount of data loaded into memory and passed to the network at each step has to be controlled.
- This control is achieved using the parameters such as `batch_size` and `max_len`.
- Training and Validation dataloaders are used in the training and validation part of the flow respectively

In [6]:
def get_data_loaders(data, fold):
    """Build the training and validation loaders for one cross-validation fold.

    Rows whose `fold` column equals `fold` form the validation set; all other
    rows form the training set.

    Args:
        data: DataFrame with `excerpt`, `target` and `fold` columns.
        fold: fold number to hold out for validation.

    Returns:
        (dataloader_train, dataloader_val). Training batches are drawn in
        random order, validation batches in dataset order.
    """
    held_out = data.fold == fold
    train_rows = data.loc[~held_out]
    val_rows = data.loc[held_out]

    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

    # Every excerpt is padded/truncated to MAX_LENGTH so tensors stack cleanly
    encode_options = dict(
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors='pt',
    )

    def build_dataset(texts, targets):
        # Tokenize the texts and bundle ids, mask and targets together
        encoded = tokenizer.batch_encode_plus(texts, **encode_options)
        return TensorDataset(
            encoded['input_ids'],
            encoded['attention_mask'],
            torch.tensor(targets),
        )

    train_set = build_dataset(train_rows['excerpt'].tolist(), train_rows['target'].values)
    val_set = build_dataset(val_rows['excerpt'].tolist(), val_rows['target'].values)

    train_loader = DataLoader(
        train_set,
        sampler=RandomSampler(train_set),
        batch_size=TRAIN_BATCH_SIZE,
    )
    val_loader = DataLoader(
        val_set,
        sampler=SequentialSampler(val_set),
        batch_size=VAL_BATCH_SIZE,
    )
    return train_loader, val_loader

## Create Neural Network

### Neural Network
 - We will be creating a neural network with the `MeanPoolingModel`.
 - Last hidden state `[batch, maxlen, hidden_state]`  is the sequence of hidden-states at the output of the last layer of the model.
 - In this model, we use the mean of the token embeddings in the last hidden state of the RoBERTa model (ignoring padding tokens) as the representation of the whole excerpt.
 - Detailed Explanation in https://www.kaggle.com/rhtsingh/utilizing-transformer-representations-efficiently


In [7]:
# Create the Mean Pooling Model
class MeanPoolingModel(nn.Module):
    """Transformer backbone with masked mean pooling and a linear regression head."""

    def __init__(self, model_name):
        """Load the pretrained backbone and create the regression head.

        Args:
            model_name: model name or path accepted by `AutoConfig` / `AutoModel`.
        """
        super().__init__()

        config = AutoConfig.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name, config=config)
        # Size the head from the loaded config so it always matches the backbone
        self.linear = nn.Linear(config.hidden_size, 1)
        self.loss = nn.MSELoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Return the MSE loss when `labels` is given, otherwise the predictions.

        Args:
            input_ids: token ids, [batch_size, max_len].
            attention_mask: 1 for real tokens and 0 for padding, [batch_size, max_len].
            labels: optional regression targets, one per sample.

        Returns:
            A scalar loss tensor if `labels` is not None. Otherwise a tensor of
            predictions of shape [batch_size]; note that for a batch of one the
            second squeeze below reduces it to a 0-dim tensor.
        """
        outputs = self.model(input_ids, attention_mask=attention_mask)
        # Get the last hidden state
        last_hidden_state = outputs[0]
        # Expand Attention Mask from [batch_size, max_len] to [batch_size, max_len, hidden_size]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        # Sum Embeddings along max_len axis so now we have [batch_size, hidden_size]
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        # Sum Mask along max_len axis. This is done so that we can ignore padding tokens.
        sum_mask = input_mask_expanded.sum(1)
        # Guard against division by zero for a fully masked row
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        # Take Average
        mean_embeddings = sum_embeddings / sum_mask
        # regression head
        logits = self.linear(mean_embeddings)
        preds = logits.squeeze(-1).squeeze(-1)

        if labels is not None:
            # Flatten and cast both sides to float32 so shapes and dtypes agree
            loss = self.loss(preds.view(-1).float(), labels.view(-1).float())
            return loss
        else:
            return preds

### Read Data

In [8]:
# Read data
data = pd.read_csv('train.csv')

# Create stratified folds; use NUM_FOLDS so the split always matches the
# number of folds the training and prediction loops iterate over
kf = ContinuousStratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED)

# Start from an integer sentinel so the column never passes through float NaN
data['fold'] = -1

for f, (_, v_) in enumerate(kf.split(data, data.target)):
    # split() yields positional indices; translate them to index labels for .loc
    data.loc[data.index[v_], 'fold'] = f

data['fold'] = data['fold'].astype(int)

# Every row must have been assigned to exactly one validation fold
assert data['fold'].between(0, NUM_FOLDS - 1).all(), "some rows were not assigned a fold"

### Evaluator

In [9]:
# Set up the evaluator
def evaluate(model, val_dataloader):
    """Compute the average validation loss of `model`.

    The model is switched to eval mode for the pass and put back into whatever
    mode it was in before returning, so calling this in the middle of training
    does not leave dropout disabled for the following epochs.

    Args:
        model: module that returns a scalar loss when called with
            `input_ids`, `attention_mask` and `labels`.
        val_dataloader: loader yielding (input_ids, attention_mask, labels) batches.

    Returns:
        Mean of the per-batch losses as a Python float.
    """
    was_training = model.training
    model.eval()

    loss_val_total = 0

    try:
        # No gradients are needed for evaluation
        with torch.no_grad():
            for batch in val_dataloader:

                batch = tuple(b.to(DEVICE) for b in batch)

                inputs = {'input_ids':      batch[0],
                          'attention_mask': batch[1],
                          'labels':         batch[2],
                         }

                loss = model(**inputs)
                loss_val_total += loss.item()
    finally:
        # Restore the caller's train/eval mode even if a batch raised
        model.train(was_training)

    loss_val_avg = loss_val_total/len(val_dataloader)

    return loss_val_avg


## Fine Tuning

In [15]:
# Defining the training function

def train(model, train_dataloader, val_dataloader):
    """Fine-tune `model` for EPOCHS epochs and keep the best-validating weights.

    Args:
        model: module returning a scalar loss when called with
            `input_ids`, `attention_mask` and `labels`; already on DEVICE.
        train_dataloader: loader yielding (input_ids, attention_mask, labels) batches.
        val_dataloader: loader passed to `evaluate` after every epoch.

    Returns:
        (best_val_loss, best_model): the lowest validation loss seen and the
        model carrying the weights from that epoch.
    """
    optimizer = AdamW(model.parameters(), lr = LR, eps = EPS)

    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
                                                num_training_steps=len(train_dataloader) * EPOCHS)
    # Start at infinity so the first epoch always counts as an improvement,
    # whatever the scale of the loss
    best_val_loss = float('inf')
    best_state = None

    for epoch in range(EPOCHS):

        # Re-enter train mode every epoch: evaluation may have left the model
        # in eval mode, which would silently disable dropout from epoch 2 on
        model.train()

        loss_train_total = 0
        for batch in tqdm(train_dataloader):

            model.zero_grad()
            batch = tuple(b.to(DEVICE) for b in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[2]
            }

            loss = model(**inputs)
            loss_train_total += loss.item()

            loss.backward()
            optimizer.step()
            scheduler.step()

        loss_train_avg = loss_train_total / len(train_dataloader)
        loss_val_avg = evaluate(model, val_dataloader)
        print(f'epoch:{epoch+1}/{EPOCHS} train loss={loss_train_avg}  val loss={loss_val_avg}')

        if loss_val_avg < best_val_loss:
            best_val_loss = loss_val_avg
            # Snapshot the weights on the CPU. Keeping a reference to `model`
            # would just alias the live model, which later epochs overwrite.
            best_state = {name: tensor.detach().cpu().clone()
                          for name, tensor in model.state_dict().items()}

    # Roll the model back to the best epoch before handing it to the caller
    if best_state is not None:
        model.load_state_dict(best_state)

    return best_val_loss, model

In [None]:
# Training is opt-in: set this to True to fine-tune and write the checkpoints
TRAINING = False

if TRAINING:
    # One complete cross-validation run per entry in SEEDS
    for i, seed in enumerate(SEEDS):

        print(f'********* seed({i}) = {seed} ***********')

        for fold in range(NUM_FOLDS):
            print(f'*** fold = {fold} ***')

            # Re-seed before building the loaders and the model for each fold
            seed_everything(seed)
            train_dataloader, val_dataloader = get_data_loaders(data, fold)

            # Module.to() returns the module itself, so this can be chained
            model = MeanPoolingModel(MODEL_DIR).to(DEVICE)

            loss, best_model = train(model, train_dataloader, val_dataloader)

            # The file name uses seed + 1 and fold + 1
            model_path = f"model_{seed + 1}_{fold + 1}.pth"
            torch.save(best_model.state_dict(), model_path)

            # Drop the references before the next fold builds a new model
            del model, best_model
            gc.collect()

## Predictions

In [None]:
# Load the test set; INPUT_DIR must name the directory that holds test.csv.
test = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'))

In [None]:
# Number of excerpts per inference batch
TEST_BATCH_SIZE = 1

def get_test_loader(data):
    """Tokenize the excerpts in `data` and wrap them in a sequential loader.

    Args:
        data: DataFrame with an `excerpt` column.

    Returns:
        DataLoader yielding (input_ids, attention_mask) batches in dataset order.
    """
    texts = data.excerpt.tolist()

    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

    # Pad/truncate every excerpt to MAX_LENGTH tokens
    encoded = tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors='pt',
    )

    test_set = TensorDataset(encoded['input_ids'], encoded['attention_mask'])

    # Sequential sampling keeps predictions aligned with the input row order
    return DataLoader(
        test_set,
        sampler=SequentialSampler(test_set),
        batch_size=TEST_BATCH_SIZE,
    )

test_dataloader = get_test_loader(test)

In [None]:
# Average the predictions of every fold model within a seed, then across seeds
all_predictions = []
for seed in SEEDS:

    fold_predictions = []

    for fold in tqdm(range(NUM_FOLDS)):

        model_path = f"model_{seed + 1}_{fold + 1}.pth"
        print(f"\nUsing {model_path}")

        model = MeanPoolingModel(MODEL_DIR)
        # map_location lets checkpoints saved on a GPU load on a CPU-only machine
        model.load_state_dict(torch.load(model_path, map_location=DEVICE))
        model.to(DEVICE)
        model.eval()

        predictions = []
        # Inference only: skip building the autograd graph
        with torch.no_grad():
            for batch in test_dataloader:

                batch = tuple(b.to(DEVICE) for b in batch)

                inputs = {'input_ids':      batch[0],
                          'attention_mask': batch[1],
                          'labels':         None,
                         }

                preds = model(**inputs)
                # Flatten so this works for any batch size, including the
                # 0-dim tensor the model returns for a batch of one
                predictions.extend(preds.reshape(-1).cpu().tolist())

        del model
        gc.collect()

        fold_predictions.append(predictions)
    all_predictions.append(np.mean(fold_predictions, axis=0).tolist())

model_predictions = np.mean(all_predictions,axis=0)

In [None]:
# Fill the sample submission with the ensembled predictions
submit = pd.read_csv(os.path.join(INPUT_DIR, 'sample_submission.csv'))
# Item assignment always writes the column; attribute assignment would only
# set a plain Python attribute if the column were missing
submit['target'] = model_predictions
submit


In [None]:
# Write the submission file without the DataFrame index column
submit.to_csv('submission.csv',index=False)