# Preparation

In [1]:
import numpy as np

import pandas as pd
from pandas import DataFrame

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag

import re

import os
import random
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
torch.cuda.empty_cache()

import sys
sys.path = [
    '../input/readability-package',
] + sys.path
import readability
import spacy

from sklearn import model_selection

import transformers
import torch
import pytorch_lightning as pl
from transformers import BertModel, BertTokenizer, BertForSequenceClassification
from transformers import RobertaTokenizer, RobertaModel, RobertaForSequenceClassification
from torch.utils.data import DataLoader, Dataset

import random

from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import KFold

import lightgbm as lgb

from fastprogress.fastprogress import  progress_bar

In [2]:
train_df = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
#train_df=train_df.head(10)
test_df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")

train_df['excerpt'] = train_df['excerpt'].apply(lambda e: e.replace('\n', ''))
test_df['excerpt'] = test_df['excerpt'].apply(lambda e: e.replace('\n', ''))

# Cleaning Texts Function

In [3]:
train_df['excerpt_preprocessed'] =  train_df['excerpt']
train_df['excerpt_preprocessed'] = train_df['excerpt_preprocessed'].replace('\s+', ' ', regex=True)

test_df["excerpt_preprocessed"] = train_df['excerpt']
test_df['excerpt_preprocessed'] = test_df['excerpt_preprocessed'].replace('\s+', ' ', regex=True)

In [4]:
pd.set_option('display.max_rows', 500)
#pd.set_option('display.max_colwidth', None)
train_df['excerpt_preprocessed'].head(3)

0    When the young people returned to the ballroom...
1    All through dinner time, Mrs. Fayre was somewh...
2    As Roger had predicted, the snow departed as q...
Name: excerpt_preprocessed, dtype: object

# Fetch some features

In [5]:
#source: https://www.kaggle.com/ravishah1/readability-feature-engineering-non-nn-baseline/data

def readability_measurements(passage: str):
    """
    This function uses the readability library for feature engineering.
    It includes textual statistics, readability scales and metric, and some pos stats
    """
    results = readability.getmeasures(passage, lang='en')
    
    chars_per_word = results['sentence info']['characters_per_word']
    syll_per_word = results['sentence info']['syll_per_word']
    words_per_sent = results['sentence info']['words_per_sentence']
    
    
    tobeverb = results['word usage']['tobeverb']
    auxverb = results['word usage']['auxverb']
    conjunction = results['word usage']['conjunction']
    pronoun = results['word usage']['pronoun']
    preposition = results['word usage']['preposition']
    nominalization = results['word usage']['nominalization']
    
    pronoun_b = results['sentence beginnings']['pronoun']
    interrogative = results['sentence beginnings']['interrogative']
    article = results['sentence beginnings']['article']
    subordination = results['sentence beginnings']['subordination']
    conjunction_b = results['sentence beginnings']['conjunction']
    preposition_b = results['sentence beginnings']['preposition']

    
    return [chars_per_word, syll_per_word, words_per_sent,
            tobeverb, auxverb, conjunction, pronoun, preposition, nominalization,
            pronoun_b, interrogative, article, subordination, conjunction_b, preposition_b]

In [6]:
def spacy_features(df: pd.DataFrame):
    """
    This function generates features using spacy en_core_wb_lg
    I learned about this from these resources:
    https://www.kaggle.com/konradb/linear-baseline-with-cv
    https://www.kaggle.com/anaverageengineer/comlrp-baseline-for-complete-beginners
    """
    
    nlp = spacy.load('en_core_web_lg')
    with nlp.disable_pipes():
        vectors = np.array([nlp(text).vector for text in df.excerpt])
        
    return vectors

def get_spacy_col_names():
    names = list()
    for i in range(300):
        names.append(f"spacy_{i}")
        
    return names

In [7]:
def pos_tag_features(passage: str):
    """
    This function counts the number of times different parts of speech occur in an excerpt
    """
    pos_tags = ["CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", 
                "NN", "NNS", "NNP", "NNPS", "PDT", "POS", "PRP", "RB", "RBR", "RBS", "RP", "TO", "UH",
                "VB", "VBD", "VBG", "VBZ", "WDT", "WP", "WRB"]
    
    tags = pos_tag(word_tokenize(passage))
    tag_list= list()
    
    for tag in pos_tags:
        tag_list.append(len([i[0] for i in tags if i[1] == tag]))
    
    return tag_list

In [8]:
def generate_other_features(passage: str):
    """
    This function is where I test miscellaneous features
    This is experimental
    """
    # punctuation count
    hyphens = passage.count("-")
    periods = passage.count(".")
    commas = passage.count(",")
    semis = passage.count(";")
    exclaims = passage.count("!")
    questions = passage.count("?")
    
    # Some other stats
    num_char = len(passage)
    num_words = len(passage.split(" "))
    unique_words = len(set(passage.split(" ") ))
    word_diversity = unique_words/num_words
    
    word_len = [len(w) for w in passage.split(" ")]
    longest_word = np.max(word_len)
    avg_len_word = np.mean(word_len)
    
    return [hyphens,periods, commas, semis, exclaims, questions,
            num_char, num_words, unique_words, word_diversity,
            longest_word, avg_len_word]

In [9]:
##Just testing

just_testingdf=pd.DataFrame(train_df["excerpt_preprocessed"].head().apply(lambda p : generate_other_features(p)).tolist(),
                                columns=["hyphens","periods", "commas", "semis", "exclaims", "questions",
                                         "num_char", "num_words", "unique_words", "word_diversity",
                                         "longest_word", "avg_len_word"])
just_testingdf

Unnamed: 0,hyphens,periods,commas,semis,exclaims,questions,num_char,num_words,unique_words,word_diversity,longest_word,avg_len_word
0,2,11,14,0,0,0,987,174,112,0.643678,13,4.678161
1,0,10,24,0,5,2,932,164,123,0.75,14,4.689024
2,2,11,17,2,1,0,904,162,124,0.765432,14,4.58642
3,3,5,23,2,0,0,908,163,117,0.717791,13,4.576687
4,4,5,13,10,0,0,723,147,51,0.346939,12,3.92517


In [10]:
!pip install ../input/pyphen011whl/pyphen-0.11.0-py3-none-any.whl
!pip install ../input/textstatwhl/textstat-0.7.1-py3-none-any.whl

Processing /kaggle/input/pyphen011whl/pyphen-0.11.0-py3-none-any.whl
Installing collected packages: pyphen
Successfully installed pyphen-0.11.0
Processing /kaggle/input/textstatwhl/textstat-0.7.1-py3-none-any.whl
Installing collected packages: textstat
Successfully installed textstat-0.7.1


In [11]:
from nltk.corpus import stopwords
import spacy
import timeit
import re
import textstat


nlp = spacy.load('en_core_web_sm')
punct=";|!|:|;|,|-|'"
stop=set(stopwords.words('english'))

def preprocess_dataframe(df):
    #Set a unique Numbering for each exerpt
    df=df.reset_index()  
    #Average excerpt length
    train_df['excerpt_length']=train_df['excerpt'].str.len()
    avg_excerpt_len=round(train_df['excerpt_length'].mean(),0) #Avg. excerpt length
    #Convert all text to lowecase
    df['excerpt_preprocess']=df['excerpt_preprocessed'].str.lower()         
    #FEATURE ENGINEERING: Get the legth of each excerpt
    df['excerpt_actual_length']=df['excerpt_preprocess'].str.len()
    #Remove common words from excerpt
    df['excerpt_preprocess']=df['excerpt_preprocess'].apply(lambda x: ' '.join([item for item in x.split() if item not in stop]))
    #FEATURE ENGINEERING: Get the legth of the preprocessed excerpt
    df['excerpt_preprocessed_length']=df['excerpt_preprocess'].str.len()
    #FEATURE ENGINEERING: Percent frequent words
    df['excerpt_stopword_freq']=(df['excerpt_actual_length']-df['excerpt_preprocessed_length'])/df['excerpt_actual_length']
    #FEATURE ENGINEERING: Get count of punctuations in the excerpt
    df['excerpt_punct_count']=df['excerpt'].apply(lambda x: len(re.findall(punct, x)))
    #Convert excerpt into setences
    df['excerpt_sentence'] = df['excerpt_preprocess'].apply(lambda x: list(nlp(x).sents))
    #Convert each setence of the exerpt into a pandas row
    df=df.explode('excerpt_sentence')
    #Convert spacy object to string object
    df['excerpt_sentence']=df['excerpt_sentence'].apply(lambda x: x.text)    
    ##FEATURE ENGINEERING: Get sentence length
    df['sentence_length']=df['excerpt_sentence'].str.len()
    ##FEATURE ENGINEERING: Get word count
    df['totalwords'] = df['excerpt_sentence'].str.split().map(len)
    ##FEATURE ENGINEERING: Get normalized word count
    df['normalized_word_count'] = round(df['sentence_length']/df['totalwords'],2)
    ##FEATURE ENGINEERING: Get normalized stopword frequency
    df['normalized_stopword_freq']=round(df['excerpt_stopword_freq']*avg_excerpt_len,1)
    ##FEATURE ENGINEERING: Get average senetence length
    df['avg sent length']=df[['sentence_length', 'index']].groupby(['index']).agg(['median'])
    ##FEATURE ENGINEERING: Get average senetence length
    df=df[['index','excerpt','avg sent length','normalized_word_count','normalized_stopword_freq']].drop_duplicates(subset ='index').set_index('index')
    
    #Features from textstat
    df['flesch_reading_ease']=df['excerpt'].apply(lambda x: textstat.flesch_reading_ease(x))
    df['smog_index']=df['excerpt'].apply(lambda x: textstat.smog_index(x))
    df['flesch_kincaid_grade']=df['excerpt'].apply(lambda x: textstat.flesch_kincaid_grade(x))
    df['coleman_liau_index']=df['excerpt'].apply(lambda x: textstat.coleman_liau_index(x))
    df['automated_readability_index']=df['excerpt'].apply(lambda x: textstat.automated_readability_index(x))
    df['dale_chall_readability_score']=df['excerpt'].apply(lambda x: textstat.dale_chall_readability_score(x))
    df['difficult_words']=df['excerpt'].apply(lambda x: textstat.difficult_words(x))
    df['linsear_write_formula']=df['excerpt'].apply(lambda x: textstat.linsear_write_formula(x))
    df['gunning_fog']=df['excerpt'].apply(lambda x: textstat.gunning_fog(x))
    df['text_standard']=df['excerpt'].apply(lambda x: textstat.text_standard(x, float_output=True))
    df['fernandez_huerta']=df['excerpt'].apply(lambda x: textstat.fernandez_huerta(x))
    df['szigriszt_pazos']=df['excerpt'].apply(lambda x: textstat.szigriszt_pazos(x))
    df['gutierrez_polini']=df['excerpt'].apply(lambda x: textstat.gutierrez_polini(x))
    df['crawford']=df['excerpt'].apply(lambda x: textstat.crawford(x))
    return df.drop(columns=['excerpt'])

In [12]:
pd.set_option('display.max_colwidth', None)
train_df['excerpt_preprocessed'].head(2)

0    When the young people returned to the ballroom, it presented a decidedly changed appearance. Instead of an interior scene, it was a winter landscape.The floor was covered with snow-white canvas, not laid on smoothly, but rumpled over bumps and hillocks, like a real snow field. The numerous palms and evergreens that had decorated the room, were powdered with flour and strewn with tufts of cotton, like snow. Also diamond dust had been lightly sprinkled on them, and glittering crystal icicles hung from the branches.At each end of the room, on the wall, hung a beautiful bear-skin rug.These rugs were for prizes, one for the girls and one for the boys. And this was the game.The girls were gathered at one end of the room and the boys at the other, and one end was called the North Pole, and the other the South Pole. Each player was given a small flag which they were to plant on reaching the Pole.This would have been an easy matter, but each traveller was obliged to wear snowshoes.
1      

In [13]:
preprocess_dataframe(train_df.head(3))

Unnamed: 0_level_0,avg sent length,normalized_word_count,normalized_stopword_freq,flesch_reading_ease,smog_index,flesch_kincaid_grade,coleman_liau_index,automated_readability_index,dale_chall_readability_score,difficult_words,linsear_write_formula,gunning_fog,text_standard,fernandez_huerta,szigriszt_pazos,gutierrez_polini,crawford
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,71.0,8.88,322.4,67.42,10.5,11.1,9.53,15.1,7.34,25,15.0,13.44,11.0,99.3,96.2,41.18,2.3
1,24.5,7.5,292.5,71.75,9.1,7.3,7.65,8.9,6.09,17,8.833333,8.51,9.0,106.1,105.34,47.29,2.0
2,45.0,6.86,312.2,73.41,10.9,8.8,7.96,11.8,6.44,17,20.0,11.46,11.0,105.3,102.55,45.43,2.1


In [14]:
pd.merge(just_testingdf, preprocess_dataframe(train_df.head()), left_index=True, right_index=True)

Unnamed: 0,hyphens,periods,commas,semis,exclaims,questions,num_char,num_words,unique_words,word_diversity,...,automated_readability_index,dale_chall_readability_score,difficult_words,linsear_write_formula,gunning_fog,text_standard,fernandez_huerta,szigriszt_pazos,gutierrez_polini,crawford
0,2,11,14,0,0,0,987,174,112,0.643678,...,15.1,7.34,25,15.0,13.44,11.0,99.3,96.2,41.18,2.3
1,0,10,24,0,5,2,932,164,123,0.75,...,8.9,6.09,17,8.833333,8.51,9.0,106.1,105.34,47.29,2.0
2,2,11,17,2,1,0,904,162,124,0.765432,...,11.8,6.44,17,20.0,11.46,11.0,105.3,102.55,45.43,2.1
3,3,5,23,2,0,0,908,163,117,0.717791,...,20.5,7.02,14,12.5,16.81,8.0,93.2,91.17,38.51,2.0
4,4,5,13,10,0,0,723,147,51,0.346939,...,11.8,1.57,1,13.5,11.76,12.0,104.9,105.39,48.95,1.6


In [15]:
def create_folds(data: pd.DataFrame, num_splits: int):
    """ 
    This function creates a kfold cross validation system based on this reference: 
    https://www.kaggle.com/abhishek/step-1-create-folds
    """
    # we create a new column called kfold and fill it with -1
    data["kfold"] = -1
    
    # the next step is to randomize the rows of the data
    data = data.sample(frac=1, random_state=42).reset_index(drop=True)

    # calculate number of bins by Sturge's rule
    # I take the floor of the value, you can also
    # just round it
    num_bins = int(np.floor(1 + np.log2(len(data))))
    
    # bin targets
    data.loc[:, "bins"] = pd.cut(
        data["target"], bins=num_bins, labels=False
    )
    
    # initiate the kfold class from model_selection module
    kf = model_selection.StratifiedKFold(n_splits=num_splits)
    
    # fill the new kfold column
    # note that, instead of targets, we use bins!
    for f, (t_, v_) in enumerate(kf.split(X=data, y=data.bins.values)):
        data.loc[v_, 'kfold'] = f
    
    # drop the bins column
    data = data.drop("bins", axis=1)

    # return dataframe with folds
    return data

In [16]:
class CLRDataset:
    """
    This is my CommonLit Readability Dataset.
    By calling the get_df method on an object of this class,
    you will have a fully feature engineered dataframe
    """
    def __init__(self, df: pd.DataFrame, train: bool, n_folds=2):
        self.df = df
        self.excerpts = df["excerpt_preprocessed"]
        
        self._extract_features()
        
        if train:
            self.df = create_folds(self.df, n_folds)
        
    def _extract_features(self):
        scores_df = pd.DataFrame(self.df["excerpt_preprocessed"].apply(lambda p : readability_measurements(p)).tolist(), 
                                 columns=["chars_per_word", "syll_per_word", "words_per_sent",
                                          "tobeverb", "auxverb", "conjunction", "pronoun", "preposition", "nominalization",
                                          "pronoun_b", "interrogative", "article", "subordination", "conjunction_b", "preposition_b"])
        self.df = pd.merge(self.df, scores_df, left_index=True, right_index=True)
        
        spacy_df = pd.DataFrame(spacy_features(self.df), columns=get_spacy_col_names())
        self.df = pd.merge(self.df, spacy_df, left_index=True, right_index=True)
        
        pos_df = pd.DataFrame(self.df["excerpt_preprocessed"].apply(lambda p : pos_tag_features(p)).tolist(),
                              columns=["CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", 
                                       "NN", "NNS", "NNP", "NNPS", "PDT", "POS", "PRP", "RB", "RBR", "RBS", "RP", "TO", "UH",
                                       "VB", "VBD", "VBG", "VBZ", "WDT", "WP", "WRB"])
        self.df = pd.merge(self.df, pos_df, left_index=True, right_index=True)
        
        other_df = pd.DataFrame(self.df["excerpt_preprocessed"].apply(lambda p : generate_other_features(p)).tolist(),
                                columns=["hyphens","periods", "commas", "semis", "exclaims", "questions",
                                         "num_char", "num_words", "unique_words", "word_diversity",
                                         "longest_word", "avg_len_word"])
        
        self.df = pd.merge(self.df, other_df, left_index=True, right_index=True)
        
        self.df = pd.merge(self.df, preprocess_dataframe(self.df), left_index=True, right_index=True)
        
    def get_df(self):
        return self.df
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx: int):
        pass

In [17]:
dataset = CLRDataset(train_df, train=True)
train_df = dataset.get_df()

train_df.head(3)

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error,excerpt_preprocessed,excerpt_length,chars_per_word,syll_per_word,...,dale_chall_readability_score,difficult_words,linsear_write_formula,gunning_fog,text_standard,fernandez_huerta,szigriszt_pazos,gutierrez_polini,crawford,kfold
0,b51730f9c,,,"Alice looked at the jury-box, and saw that, in her haste, she had put the Lizard in head downwards, and the poor little thing was waving its tail about in a melancholy way, being quite unable to move. She soon got it out again, and put it right; ‘not that it signifies much,' she said to herself; ‘I should think it would be quite as much use in the trial one way up as the other.'As soon as the jury had a little recovered from the shock of being upset, and their slates and pencils had been found and handed back to them, they set to work very diligently to write out a history of the accident, all except the Lizard, who seemed too much overcome to do anything but sit with its mouth open, gazing up into the roof of the court.‘What do you know about this business?' the King said to Alice.‘Nothing,' said Alice.‘Nothing whatever?' persisted the King.‘Nothing whatever,' said Alice.",-0.432678,0.487498,"Alice looked at the jury-box, and saw that, in her haste, she had put the Lizard in head downwards, and the poor little thing was waving its tail about in a melancholy way, being quite unable to move. She soon got it out again, and put it right; ‘not that it signifies much,' she said to herself; ‘I should think it would be quite as much use in the trial one way up as the other.'As soon as the jury had a little recovered from the shock of being upset, and their slates and pencils had been found and handed back to them, they set to work very diligently to write out a history of the accident, all except the Lizard, who seemed too much overcome to do anything but sit with its mouth open, gazing up into the roof of the court.‘What do you know about this business?' the King said to Alice.‘Nothing,' said Alice.‘Nothing whatever?' persisted the King.‘Nothing whatever,' said Alice.",885,4.083333,1.220238,...,9.33,17,26.5,33.83,9.0,45.7,45.84,25.55,2.6,0
1,4d403fd57,https://en.wikipedia.org/wiki/Artificial_intelligence,CC BY-SA 3.0,"Artificial intelligence (AI) is intelligence exhibited by machines. In computer science, an ideal ""intelligent"" machine is a flexible rational agent that perceives its environment and takes actions that maximize its chance of success at some goal. Colloquially, the term ""artificial intelligence"" is applied when a machine mimics ""cognitive"" functions that humans associate with other human minds, such as ""learning"" and ""problem solving"". As machines become increasingly capable, facilities once thought to require intelligence are removed from the definition. For example, optical character recognition is no longer perceived as an exemplar of ""artificial intelligence"" having become a routine technology. Capabilities still classified as AI include advanced Chess and Go systems and self-driving cars.AI research is divided into subfields that focus on specific problems or on specific approaches or on the use of a particular tool or towards satisfying particular applications.",-1.161746,0.458396,"Artificial intelligence (AI) is intelligence exhibited by machines. In computer science, an ideal ""intelligent"" machine is a flexible rational agent that perceives its environment and takes actions that maximize its chance of success at some goal. Colloquially, the term ""artificial intelligence"" is applied when a machine mimics ""cognitive"" functions that humans associate with other human minds, such as ""learning"" and ""problem solving"". As machines become increasingly capable, facilities once thought to require intelligence are removed from the definition. For example, optical character recognition is no longer perceived as an exemplar of ""artificial intelligence"" having become a routine technology. Capabilities still classified as AI include advanced Chess and Go systems and self-driving cars.AI research is divided into subfields that focus on specific problems or on specific approaches or on the use of a particular tool or towards satisfying particular applications.",981,5.835714,1.971429,...,11.04,55,15.8,17.91,18.0,63.2,60.41,30.15,5.4,0
2,0f789ee41,,,"A gruff squire on horseback with shiny top boots. Soft day, sir John! Soft day, your honor!... Day!... Day!... Two top boots jog dangling on to Dublin. Lal the ral the ra. Lal the ral the raddy.—That reminds me, Mr. Deasy said. You can do me a favor, Mr. Dedalus, with some of your literary friends. I have a letter here for the press. Sit down a moment. I have just to copy the end.He went to the desk near the window, pulled in his chair twice and read off some words from the sheet on the drum of his typewriter.—Sit down. Excuse me, he said over his shoulder, the dictates of common sense. Just a moment.He peered from under his shaggy brows at the manuscript by his elbow and, muttering, began to prod the stiff buttons of the keyboard slowly, sometimes blowing as he screwed up the drum to erase an error.",-2.367914,0.519369,"A gruff squire on horseback with shiny top boots. Soft day, sir John! Soft day, your honor!... Day!... Day!... Two top boots jog dangling on to Dublin. Lal the ral the ra. Lal the ral the raddy.—That reminds me, Mr. Deasy said. You can do me a favor, Mr. Dedalus, with some of your literary friends. I have a letter here for the press. Sit down a moment. I have just to copy the end.He went to the desk near the window, pulled in his chair twice and read off some words from the sheet on the drum of his typewriter.—Sit down. Excuse me, he said over his shoulder, the dictates of common sense. Just a moment.He peered from under his shaggy brows at the manuscript by his elbow and, muttering, began to prod the stiff buttons of the keyboard slowly, sometimes blowing as he screwed up the drum to erase an error.",811,3.929936,1.133758,...,5.67,14,3.636364,5.77,6.0,122.8,119.33,51.84,0.8,0


In [18]:
test_dataset = CLRDataset(test_df, train=False)
test_df = test_dataset.get_df()

test_df.head()

Unnamed: 0,id,url_legal,license,excerpt,excerpt_preprocessed,chars_per_word,syll_per_word,words_per_sent,tobeverb,auxverb,...,automated_readability_index,dale_chall_readability_score,difficult_words,linsear_write_formula,gunning_fog,text_standard,fernandez_huerta,szigriszt_pazos,gutierrez_polini,crawford
0,c0f722661,,,"My hope lay in Jack's promise that he would keep a bright light burning in the upper story to guide me on my course. On a clear night this light was visible from the village, but somehow or other I failed to take into account the state of the weather. The air was full of eddying flakes, which would render the headlight of a locomotive invisible a hundred yards distant. Strange that this important fact never occurred to me until I was fully a fourth of a mile from the village. Then, after looking in vain for the beacon light, the danger of my situation struck me, and I halted.""I am certain to go wrong,"" I said to myself.""It is out of my power to follow a direct course without something to serve as a compass. I will go back to the village and wait till morning.""","When the young people returned to the ballroom, it presented a decidedly changed appearance. Instead of an interior scene, it was a winter landscape.The floor was covered with snow-white canvas, not laid on smoothly, but rumpled over bumps and hillocks, like a real snow field. The numerous palms and evergreens that had decorated the room, were powdered with flour and strewn with tufts of cotton, like snow. Also diamond dust had been lightly sprinkled on them, and glittering crystal icicles hung from the branches.At each end of the room, on the wall, hung a beautiful bear-skin rug.These rugs were for prizes, one for the girls and one for the boys. And this was the game.The girls were gathered at one end of the room and the boys at the other, and one end was called the North Pole, and the other the South Pole. Each player was given a small flag which they were to plant on reaching the Pole.This would have been an easy matter, but each traveller was obliged to wear snowshoes.",4.407821,1.234637,179.0,12,1,...,10.8,6.25,13,11.0,10.89,11.0,103.9,99.27,46.7,2.3
1,f0953f0a5,,,"Dotty continued to go to Mrs. Gray's every night with the milk. Sometimes Katie went with her, and then they always paused a while under the acorn-tree and played ""King and Queen."" Dotty said she wished they could ever remember to bring their nipperkins, for in that case the milk would taste a great deal more like nectar. The ""nipperkins"" were a pair of handled cups which the children supposed to be silver, and which they always used at table.Dotty knew she was doing wrong every time she played ""King and Queen."" She knew the milk was not hers, but Mrs. Gray's; still she said to herself, ""Ruthie needn't give so much measure, all pressed down and run over. If Queenie and I should drink a great deal more, there would always be a quart left. Yes, I know there would.""Mrs. Gray never said anything about the milk; she merely poured it out in a pan, and gave back the pail to Dotty, asking her at the same time as many questions as the child would stay to hear.","All through dinner time, Mrs. Fayre was somewhat silent, her eyes resting on Dolly with a wistful, uncertain expression. She wanted to give the child the pleasure she craved, but she had hard work to bring herself to the point of overcoming her own objections.At last, however, when the meal was nearly over, she smiled at her little daughter, and said, ""All right, Dolly, you may go.""""Oh, mother!"" Dolly cried, overwhelmed with sudden delight. ""Really?Oh, I am so glad! Are you sure you're willing?""""I've persuaded myself to be willing, against my will,"" returned Mrs. Fayre, whimsically. ""I confess I just hate to have you go, but I can't bear to deprive you of the pleasure trip. And, as you say, it would also keep Dotty at home, and so, altogether, I think I shall have to give in.""""Oh, you angel mother! You blessed lady! How good you are!"" And Dolly flew around the table and gave her mother a hug that nearly suffocated her.",4.145349,1.197674,172.0,5,5,...,8.2,5.49,11,7.833333,7.38,6.0,116.6,114.45,48.24,1.3
2,0df072751,,,"It was a bright and cheerful scene that greeted the eyes of Captain Raymond and his son as they entered the parlor of the adjacent cottage.It was strictly a family gathering, yet the room was quite full. Mr. Dinsmore was there with his wife, his daughter Elsie and her children, Edward and Zoe, Elsie Leland with her husband and babe, Violet Raymond with her husband's two little girls, Lulu and Grace, and lastly Rosie and Walter.Everybody had a kindly greeting for the captain, and Violet's bright face grew still brighter as she made room for him on the sofa by her side.""We were beginning to wonder what was keeping you,"" she said.""Yes, I'm afraid I am rather behind time,"" he returned. ""I hope you have not delayed your tea for me, Mrs. Dinsmore.""""No; it is but just ready,"" she said. ""Ah, there's the bell. Please, all of you walk out.""When the meal was over all returned to the parlor, where they spent the next hour in desultory chat.","As Roger had predicted, the snow departed as quickly as it came, and two days after their sleigh ride there was scarcely a vestige of white on the ground. Tennis was again possible and a great game was in progress on the court at Pine Laurel. Patty and Roger were playing against Elise and Sam Blaney, and the pairs were well matched.But the long-contested victory finally went against Patty, and she laughingly accepted defeat.""Only because Patty's not quite back on her game yet,"" Roger defended; ""this child has been on the sick list, you know, Sam, and she isn't up to her own mark.""""Well, I like that!"" cried Patty; ""suppose you bear half the blame, Roger. You see, Mr. Blaney, he is so absorbed in his own Love Game, he can't play with his old-time skill.""""All right, Patsy, let it go at that. And it's so, too. I suddenly remembered something Mona told me to tell you, and it affected my service.""",4.105882,1.211765,170.0,7,1,...,21.3,7.69,21,28.0,17.75,9.0,86.0,84.74,38.29,2.4
3,04caf4e0c,https://en.wikipedia.org/wiki/Cell_division,CC BY-SA 3.0,"Cell division is the process by which a parent cell divides into two or more daughter cells. Cell division usually occurs as part of a larger cell cycle. In eukaryotes, there are two distinct types of cell division: a vegetative division, whereby each daughter cell is genetically identical to the parent cell (mitosis), and a reproductive cell division, whereby the number of chromosomes in the daughter cells is reduced by half, to produce haploid gametes (meiosis). Meiosis results in four haploid daughter cells by undergoing one round of DNA replication followed by two divisions: homologous chromosomes are separated in the first division, and sister chromatids are separated in the second division.Both of these cell division cycles are used in sexually reproducing organisms at some point in their life cycle, and both are believed to be present in the last eukaryotic common ancestor. Prokaryotes also undergo a vegetative cell division known as binary fission, where their genetic material is segregated equally into two daughter cells. All cell divisions, regardless of organism, are preceded by a single round of DNA replication.","And outside before the palace a great garden was walled round, filled full of stately fruit-trees, gray olives and sweet figs, and pomegranates, pears, and apples, which bore the whole year round. For the rich south-west wind fed them, till pear grew ripe on pear, fig on fig, and grape on grape, all the winter and the spring. And at the farther end gay flower-beds bloomed through all seasons of the year; and two fair fountains rose, and ran, one through the garden grounds, and one beneath the palace gate, to water all the town. Such noble gifts the heavens had given to Alcinous the wise.So they went in, and saw him sitting, like Poseidon, on his throne, with his golden sceptre by him, in garments stiff with gold, and in his hand a sculptured goblet, as he pledged the merchant kings; and beside him stood Arete, his wise and lovely queen, and leaned against a pillar as she spun her golden threads.",4.365854,1.189024,164.0,1,0,...,18.8,9.26,47,17.25,17.95,18.0,74.5,68.41,33.82,4.4
4,0e63f8bea,https://en.wikipedia.org/wiki/Debugging,CC BY-SA 3.0,"Debugging is the process of finding and resolving of defects that prevent correct operation of computer software or a system. Debugging tends to be harder when various subsystems are tightly coupled, as changes in one may cause bugs to emerge in another.Numerous books have been written about debugging, as it involves numerous aspects, including interactive debugging, control flow, integration testing, log files, monitoring (application, system), memory dumps, profiling, Statistical Process Control, and special design tactics to improve detection while simplifying changes. The terms ""bug"" and ""debugging"" are popularly attributed to Admiral Grace Hopper in the 1940s. While she was working on a Mark II Computer at Harvard University, her associates discovered a moth stuck in a relay and thereby impeding operation, whereupon she remarked that they were ""debugging"" the system. However the term ""bug"" in the meaning of technical error dates back at least to 1878 and Thomas Edison, and ""debugging"" seems to have been used as a term in aeronautics before entering the world of computers.","Once upon a time there were Three Bears who lived together in a house of their own in a wood. One of them was a Little, Small, Wee Bear; and one was a Middle-sized Bear, and the other was a Great, Huge Bear. They had each a pot for their porridge; a little pot for the Little, Small, Wee Bear; and a middle-sized pot for the Middle Bear; and a great pot for the Great, Huge Bear. And they had each a chair to sit in; a little chair for the Little, Small, Wee Bear; and a middle-sized chair for the Middle Bear; and a great chair for the Great, Huge Bear. And they had each a bed to sleep in; a little bed for the Little, Small, Wee Bear; and a middle-sized bed for the Middle Bear; and a great bed for the Great, Huge Bear.",3.734694,1.027211,147.0,4,0,...,21.4,10.3,53,18.75,20.07,19.0,70.8,66.0,31.58,4.4


# Vectorize By BERT Function

## Fine Tuning

In [19]:
MODEL_PATH = '../input/huggingface-roberta/roberta-large'

tokenizer = RobertaTokenizer.from_pretrained(MODEL_PATH)

In [20]:
class RobertaForSequenceClassification_pl(pl.LightningModule):
    def __init__(self, model_name, num_labels, lr):
        super().__init__()
        
        self.save_hyperparameters()
        
        self.roberta_sc = RobertaForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels
        )
        
    def training_step(self, batch, batch_idx):
        output = self.roberta_sc(**batch)
        loss = output.loss
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        output = self.roberta_sc(**batch)
        val_loss = output.loss
        self.log('val_loss', val_loss)
        
    def test_step(self, batch, batch_idx):
        labels = batch.pop('labels')
        output = self.roberta_sc(**batch)
        labels_predicted = output.logits.argmax(-1)
        num_correct = (labels_predicted == labels).sum().item()
        accuracy = num_correct / labels.size(0)
        self.log('accuracy', accuracy)
        
    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.hparams.lr)

In [21]:
checkpoint = pl.callbacks.ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    save_top_k=1,
    save_weights_only=True,
    dirpath='model/'
)

trainer = pl.Trainer(
    gpus=1,
    max_epochs=25,
    callbacks=[checkpoint]
)

In [22]:
def createRobertaFineDataSet(excerpts, targets):
    data = []    
    for excerpt, target in zip(excerpts, targets):
        encoding = tokenizer(
            excerpt,
            max_length = 240,
            padding='max_length',
            truncation=True
        )

        encoding['labels'] = target
        encoding = { k: torch.tensor(v) for k, v in encoding.items() }

        data.append(encoding)

    return data

In [23]:
# update kfold values for fine tune
kfolds = []

for i in progress_bar(train_df.index):
    kfolds.append(i % 5)
    
train_df['kfold'] = kfolds

In [24]:
torch.cuda.empty_cache()

In [25]:
model = RobertaForSequenceClassification_pl(
    MODEL_PATH,
    num_labels=1,
    lr=1e-5
)

Some weights of the model checkpoint at ../input/huggingface-roberta/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ../input/huggingface-roberta/roberta-large and a

In [26]:
for i in progress_bar(train_df['kfold'].unique()):
    train_df_for_fine_tune = train_df[train_df['kfold'] != i]
    test_df_for_fine_tune = train_df[train_df['kfold'] == i]
    
    dataset_train = createRobertaFineDataSet(
        train_df_for_fine_tune['excerpt'],
        train_df_for_fine_tune['target']
    )
    
    dataset_val = createRobertaFineDataSet(
        test_df_for_fine_tune['excerpt'],
        test_df_for_fine_tune['target']
    )

    train_dataloader = DataLoader(
        dataset_train,
        batch_size=8,
        shuffle=True
    )
    val_dataloader = DataLoader(
        dataset_val, 
        batch_size=128
    )

    trainer.fit(model, train_dataloader, val_dataloader)

Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validation sanity check: 0it [00:00, ?it/s]

Training: 283it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validation sanity check: 0it [00:00, ?it/s]

Training: 283it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validation sanity check: 0it [00:00, ?it/s]

Training: 283it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validation sanity check: 0it [00:00, ?it/s]

Training: 283it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [27]:
best_model_path = checkpoint.best_model_path

model = RobertaForSequenceClassification_pl.load_from_checkpoint(
    best_model_path
)

FINE_TUNED_MODEL_PATH = '/kaggle/working/model_transformers'

model.roberta_sc.save_pretrained(FINE_TUNED_MODEL_PATH)

Some weights of the model checkpoint at ../input/huggingface-roberta/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ../input/huggingface-roberta/roberta-large and a

## Roberta interface

In [28]:
class RobertaDataset(nn.Module):
    def __init__(self, df, tokenizer, max_len=128):
        self.excerpt = df['excerpt'].to_numpy()
        self.max_len = max_len
        self.tokenizer = tokenizer
    
    def __getitem__(self,idx):
        encode = self.tokenizer(
            self.excerpt[idx],
            return_tensors='pt',
            max_length=self.max_len,
            padding='max_length',
            truncation=True
        )
        return encode
    
    def __len__(self):
        return len(self.excerpt)
    

def get_embeddings(df, path, plot_losses=True, verbose=True):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"{device} is used")
            
    MODEL_PATH = path
    model = RobertaModel.from_pretrained(MODEL_PATH, num_labels=1)
    model.to(device)
    model.eval()
    
    ds = RobertaDataset(df, tokenizer, config['max_len'])
    dl = DataLoader(
        ds,
        batch_size=config["batch_size"],
        shuffle=False,
        num_workers = 4,
        pin_memory=True,
        drop_last=False
    )

    embeddings = list()
    with torch.no_grad():
        for i, inputs in progress_bar(list(enumerate(dl))):
            inputs = {key:val.reshape(val.shape[0], -1).to(device) for key, val in inputs.items()}
            outputs = model(**inputs)
            #outputs = outputs[0][:, -1].detach().cpu().numpy()
            outputs = np.sum(outputs[0][:, -4:].detach().cpu().numpy(),axis=1)
            embeddings.extend(outputs)
            
    return np.array(embeddings)

In [29]:
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONASSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

In [30]:
config = {
    'batch_size': 8,
    'max_len': 240,
    'seed': 42,
}
seed_everything(seed=config['seed'])

train_embeddings =  get_embeddings(train_df, FINE_TUNED_MODEL_PATH)
test_embeddings = get_embeddings(test_df, FINE_TUNED_MODEL_PATH)

cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at /kaggle/working/model_transformers and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at /kaggle/working/model_transformers and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
train_embeddings.shape

(2834, 1024)

# Prepare train and test data

In [32]:
pd.set_option('display.max_rows', 500)

best_feat_idx=train_df.filter(regex='^(?!.*spacy_).*$').corr().query('target < -0.2 | 0.2 < target')['target']
best_feat_idx

target                          1.000000
excerpt_length                 -0.366427
chars_per_word                 -0.431207
syll_per_word                  -0.444332
pronoun                         0.309376
preposition                    -0.330594
nominalization                 -0.401848
DT                             -0.221746
IN                             -0.459998
JJ                             -0.258960
NN                             -0.253861
PRP                             0.368470
RP                              0.213603
VB                              0.230837
VBD                             0.322142
WDT                            -0.229986
WRB                             0.206875
periods                         0.265055
exclaims                        0.215480
questions                       0.205436
num_char                       -0.366470
longest_word                   -0.207259
avg_len_word                   -0.397504
avg sent length                -0.313869
normalized_word_

In [33]:
columns=set(best_feat_idx.index.values)

intersection = columns. intersection(test_df.columns) #Find common elements of set and list.
columns = list(intersection)
columns

['normalized_stopword_freq',
 'longest_word',
 'difficult_words',
 'NN',
 'nominalization',
 'avg_len_word',
 'automated_readability_index',
 'dale_chall_readability_score',
 'flesch_reading_ease',
 'coleman_liau_index',
 'PRP',
 'exclaims',
 'pronoun',
 'syll_per_word',
 'IN',
 'gunning_fog',
 'szigriszt_pazos',
 'gutierrez_polini',
 'periods',
 'questions',
 'text_standard',
 'fernandez_huerta',
 'WDT',
 'crawford',
 'flesch_kincaid_grade',
 'DT',
 'smog_index',
 'JJ',
 'RP',
 'linsear_write_formula',
 'VBD',
 'preposition',
 'chars_per_word',
 'normalized_word_count',
 'VB',
 'avg sent length',
 'num_char',
 'WRB']

In [34]:
X_train = pd.DataFrame(train_embeddings)
X_train = pd.concat([X_train, train_df[columns]], axis=1)

X_test = pd.DataFrame(test_embeddings)
X_test = pd.concat([X_test, test_df[columns]], axis=1)

In [35]:
y_train = train_df[['target']]

In [36]:
kf = KFold(n_splits=5, shuffle=True, random_state=71)

cv = list(kf.split(X_train, y_train))

# Light GBM

In [37]:
params = {
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'objective': 'regression',
    'seed': 42,
    'learning_rate': 0.04,
    'max_depth': 6,
    'n_jobs': -1,
    'verbose': -1,
    'num_leaves': 10,
    'max_bin': 63,
    'feature_fraction': 0.25,
    'extra_trees': True,
    'path_smooth':0.1
}
pred = np.zeros(X_test.shape[0])
rmses = []

for tr_idx, val_idx in progress_bar(cv):
    x_tr, x_va = X_train.iloc[tr_idx], X_train.iloc[val_idx]
    y_tr, y_va = y_train.iloc[tr_idx], y_train.iloc[val_idx]

    train_set = lgb.Dataset(x_tr, y_tr)
    val_set = lgb.Dataset(x_va, y_va, reference=train_set)

    model = lgb.train(
        params,
        train_set, 
        num_boost_round=10000,
        early_stopping_rounds=50,
        valid_sets=[train_set, val_set], 
        verbose_eval=-1
    )

    y_pred = model.predict(x_va)
    rmse = np.sqrt(mse(y_va, y_pred))
    rmses.append(rmse)
    
    tmp_pred = model.predict(X_test)
    pred += tmp_pred / 5
    
print("\n", "Mean Fold RMSE:", np.mean(rmses))

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[588]	training's rmse: 0.0967336	valid_1's rmse: 0.171669
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[358]	training's rmse: 0.117046	valid_1's rmse: 0.183937
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[303]	training's rmse: 0.127849	valid_1's rmse: 0.172584
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[624]	training's rmse: 0.0933398	valid_1's rmse: 0.170033
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[767]	training's rmse: 0.0818854	valid_1's rmse: 0.173934

 Mean Fold RMSE: 0.1744314735825725


In [38]:
# check xgboost version
#import xgboost
#print(xgboost.__version__)

In [39]:
# create an xgboost regression model
#model = xgboost.XGBRegressor(n_estimators=1000, max_depth=8, eta=0.1, subsample=0.7, colsample_bytree=0.8)

In [40]:
# define model evaluation method
#cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
#scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)

In [41]:
# force scores to be positive
#scores = absolute(scores)
#print('Mean MAE: %.3f (%.3f)' % (scores.mean(), scores.std()) )

In [42]:
predictions = pd.DataFrame()
predictions['id'] = test_df['id']
predictions['target'] = pred
predictions.to_csv("submission.csv", index=False)

predictions

Unnamed: 0,id,target
0,c0f722661,-0.357089
1,f0953f0a5,-0.292511
2,0df072751,-0.461666
3,04caf4e0c,-1.893593
4,0e63f8bea,-1.717949
5,12537fe78,-0.999733
6,965e592c0,0.458713
