# Import Dependencies


In [4]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler
import torch
from torch import nn
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler,TensorDataset
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import random
import gc
from pathlib import Path
from sklearn.model_selection import StratifiedKFold,StratifiedShuffleSplit

# Functions and constants

In [5]:
def seed_everything(seed=2021):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

SEED = 2021 #
BINS = [float('inf'), 1.5, 1, 0.5, 0, -0.5, -1, -1.5, -2, -2.5, -3, -3.5, float('-inf')] # map the raw score to readability level from 1 to 12(easy to hard)
MAX_LENGTH = 256 # the maximum length of the texts feed to the model
CORPORA_LIST = ['simplewiki','wiki','bookcorpus']


TRAIN_FILE_ORIG=os.path.join(Path(os.getcwd()).parent,'data',"training","original","train.csv")
TRAIN_FILE_SPLIT=os.path.join(Path(os.getcwd()).parent,'data',"training","original","train_split.csv")
VAL_FILE_SPLIT=os.path.join(Path(os.getcwd()).parent,'data',"training","original","val_split.csv")
TRAIN_FILE_EXTENDED=os.path.join(Path(os.getcwd()).parent,'data','training','extended','train_augmented.csv')

BASELINE_MODEL_DIR=os.path.join(Path(os.getcwd()).parent,"models","base")
PRETRAIN_MODEL_DIR=os.path.join(Path(os.getcwd()).parent,"models","pretrain")
FINETUNE_MODEL_DIR=os.path.join(Path(os.getcwd()).parent,"models","finetune")
FINAL_MODEL_DIR=os.path.join(Path(os.getcwd()).parent,"models","final")

EMBEDDINGS_DIR=os.path.join(Path(os.getcwd()).parent,'data','embeddings')
EXTENDED_DATA_DIR=os.path.join(Path(os.getcwd()).parent,'data','training','extended')

In [3]:
def train(model_name, training_file, out_dir, model_path=None, pretrain=False, val_file=None, num_epochs=3):
    #Initialize the model and tokenizer   
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    config = AutoConfig.from_pretrained(model_name, num_labels=1)
    if model_path:
        model = AutoModelForSequenceClassification.from_pretrained(model_path, config=config)
    else:
        model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)
    #Load the training data, tokenize the text and covert into training dataloader
    training_df = pd.read_csv(training_file)
    training_embeddings=tokenizer(training_df['excerpt'].to_list(), padding=True, truncation=True, max_length=MAX_LENGTH, return_tensors='pt')
    training_dataset=TensorDataset(training_embeddings['input_ids'],training_embeddings['attention_mask'],torch.tensor(training_df['target'].astype('float32')))
    training_sampler = RandomSampler(training_dataset)
    training_dataloader = DataLoader(training_dataset,sampler=training_sampler,batch_size=8)
    #Load the validation data(if it is provided), tokenize it and convert into validation dataloader
    if val_file:
        val_df = pd.read_csv(val_file)
        val_embeddings=tokenizer(val_df['excerpt'].to_list(), padding=True, truncation=True, max_length=MAX_LENGTH, return_tensors='pt')
        val_dataset=TensorDataset(val_embeddings['input_ids'],val_embeddings['attention_mask'],torch.tensor(val_df['target'].astype('float32')))
        val_sampler = SequentialSampler(val_dataset)
        val_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=8)
    #Set up training optimizer and learning rate
    num_training_steps = num_epochs * len(training_dataloader)
    if pretrain: 
        lr=1e-6 # lower learning rate for pretraining
    else:
        lr=3e-5 # higher learning rate for fine-tuning
    optimizer = AdamW(model.parameters(), lr=lr)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps
    )
    #if you have GPU on you computer, specify the device as""cuda", or else you can comment out next 2 lines
    device = torch.device("cuda")
    model.to(device)
    #training 
    for epoch in range(num_epochs):
        #Set the model into train mode, then torch could do the backpropagation over the loss to update the weights of model 
        model.train()
        print('epoch=',epoch+1)
        epoch_iterator = tqdm(training_dataloader, desc="Iteration")
        tr_loss=0
        for batch in epoch_iterator:
            batch = tuple(t.to(device) for t in batch)
            inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "labels": batch[2],
                    }
            outputs = model(**inputs)
            loss = outputs.loss
            epoch_iterator.set_description('(loss=%g)' % loss)
            tr_loss+=loss.item()
            loss.backward() #compute gradient of loss over parameters
            optimizer.step() # update weights/parameters
            lr_scheduler.step() #update learning rate
            optimizer.zero_grad() # clear the gradient from this step
        tr_loss/=len(training_dataloader)
        print('tr_loss=',tr_loss)
        #If there is validation data, evaluate the model at each epoch
        if val_file:
            model.eval()
            val_loss=0
            for batch in val_dataloader:
                #only need evalute the model(no mini-batch Stochastic Gradient Descent needed), set the torch with no_grad to speed up 
                with torch.no_grad():
                    batch = tuple(t.to(device) for t in batch)
                    inputs = {
                            "input_ids": batch[0],
                            "attention_mask": batch[1],
                                }
                    labels=batch[2]
                    outputs = model(**inputs).logits
                    loss_fct = torch.nn.MSELoss()
                    eval_loss = loss_fct(outputs.view(-1), labels.view(-1)).item()
                    val_loss+=eval_loss
            val_loss=val_loss/len(val_dataloader)
            print('eval_loss=',val_loss)
    #Save the model after finish the training
    model.save_pretrained(out_dir)
    #Clean the memory
    del model
    torch.cuda.empty_cache()
    gc.collect() 

In [7]:
def predict(model_name, model_dir,excerpt):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    config = AutoConfig.from_pretrained(model_dir, num_labels=1)
    model = AutoModelForSequenceClassification.from_pretrained(model_dir, config=config)
    embeddings=tokenizer(excerpt, padding=True, truncation=True, max_length=MAX_LENGTH, return_tensors='pt')
    model.eval()
    inputs = {"input_ids": embeddings['input_ids'],"attention_mask": embeddings['attention_mask']}
    with torch.no_grad():
        outputs = model(**inputs).logits
        score=outputs.view(-1).item()
    return score

In [5]:
#split the original training dataset into train and validate dataset
seed_everything(seed=SEED)
BINS = [float('inf'), 1.5, 1, 0.5, 0, -0.5, -1, -1.5, -2, -2.5, -3, -3.5, float('-inf')] # map the raw score to readability level from 1 to 12(easy to hard)
train_df_orig=pd.read_csv(TRAIN_FILE_ORIG)
train_df_orig['level']=train_df_orig['target'].apply(lambda x: np.digitize(x,BINS))
skf = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=67)
splits=skf.split(train_df_orig,train_df_orig.level)
for train_idx, test_idx in splits:
    train_df=train_df_orig.loc[train_idx]
    val_df=train_df_orig.loc[test_idx]
train_df.reset_index(inplace=True,drop=True)
val_df.reset_index(inplace=True,drop=True)
train_df.to_csv(TRAIN_FILE_SPLIT)
val_df.to_csv(VAL_FILE_SPLIT)

In [6]:
train(model_name="roberta-base",training_file=TRAIN_FILE_SPLIT, out_dir=BASELINE_MODEL_DIR, val_file=VAL_FILE_SPLIT, num_epochs=3)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

epoch= 1


(loss=0.335987): 100%|██████████| 319/319 [01:40<00:00,  3.18it/s]


tr_loss= 0.5338854886427942
eval_loss= 0.3738563994152678
epoch= 2


(loss=0.0996832): 100%|██████████| 319/319 [01:31<00:00,  3.48it/s]


tr_loss= 0.23649449868355424
eval_loss= 0.3873924497101042
epoch= 3


(loss=0.269642): 100%|██████████| 319/319 [01:30<00:00,  3.52it/s]


tr_loss= 0.13304433722755518
eval_loss= 0.32840617270105416


In [8]:
val_df=pd.read_csv(VAL_FILE_SPLIT)
sample_excerpt=val_df.iloc[[16]]
text=sample_excerpt['excerpt'].to_list()[0]
prediction=predict('roberta-base',BASELINE_MODEL_DIR,text)
print(text,"\n")
print(f"the prediction is {prediction}, the ground truth target is {sample_excerpt['target'].to_list()[0]}")

As a statesman, it was the good fortune of Mr. Gladstone that his career was not associated with war. The reforms which he effected, the triumphs which he achieved, were not won by the supreme arbitrament of the sword. The reforms which he effected and the triumphs which he achieved were the result of his power of persuasion over his fellow-men. The reforms which he achieved in many ways amounted to a revolution. They changed, in many particulars, the face of the realm. After Sir Robert Peel had adopted the great principle which eventually carried England from protection to free trade, it was Mr. Gladstone who created the financial system which has been admitted ever since by all students of finance, as the secret of Great Britain's commercial success. He enforced the extension of the suffrage to the masses of the nation, and practically thereby made the government of monarchical England as democratic as that of any republic. 

the prediction is -1.4239850044250488, the ground truth ta