## Import libraries

In [1]:
import sys
sys.path.append('../input/iterativestratification/')

import warnings
warnings.simplefilter(action='ignore', category=UserWarning)

import gc
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm

import re
import nltk
import string
from textblob import TextBlob
from nltk.corpus import stopwords

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import QuantileTransformer
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression

tqdm.pandas()
np.random.seed(42)

## Load source datasets

In [2]:
train = pd.read_csv('../input/feedback-prize-english-language-learning/train.csv')
print(f"train: {train.shape}")
train.head()

train: (3911, 8)


Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


In [3]:
test = pd.read_csv('../input/feedback-prize-english-language-learning/test.csv')
print(f"test: {test.shape}")
test.head()

test: (3, 2)


Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."


In [4]:
labels = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
data_labels = train[labels].values

## Feature Engineering

### Helper Functions

In [5]:
def contraction_count(sent):
    count = 0
    count += re.subn(r"won\'t", '', sent)[1]
    count += re.subn(r"can\'t", '', sent)[1]
    count += re.subn(r"n\'t", '', sent)[1]
    count += re.subn(r"\'re", '', sent)[1]
    count += re.subn(r"\'s", '', sent)[1]
    count += re.subn(r"\'d", '', sent)[1]
    count += re.subn(r"\'ll", '', sent)[1]
    count += re.subn(r"\'t", '', sent)[1]
    count += re.subn(r"\'ve", '', sent)[1]
    count += re.subn(r"\'m", '', sent)[1]
    return count

In [6]:
def decontraction(phrase):
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    phrase = re.sub(r"he's", "he is", phrase)
    phrase = re.sub(r"there's", "there is", phrase)
    phrase = re.sub(r"We're", "We are", phrase)
    phrase = re.sub(r"That's", "That is", phrase)
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"they're", "they are", phrase)
    phrase = re.sub(r"Can't", "Cannot", phrase)
    phrase = re.sub(r"wasn't", "was not", phrase)
    phrase = re.sub(r"don\x89Ûªt", "do not", phrase)
    phrase = re.sub(r"donãât", "do not", phrase)
    phrase = re.sub(r"aren't", "are not", phrase)
    phrase = re.sub(r"isn't", "is not", phrase)
    phrase = re.sub(r"What's", "What is", phrase)
    phrase = re.sub(r"haven't", "have not", phrase)
    phrase = re.sub(r"hasn't", "has not", phrase)
    phrase = re.sub(r"There's", "There is", phrase)
    phrase = re.sub(r"He's", "He is", phrase)
    phrase = re.sub(r"It's", "It is", phrase)
    phrase = re.sub(r"You're", "You are", phrase)
    phrase = re.sub(r"I'M", "I am", phrase)
    phrase = re.sub(r"shouldn't", "should not", phrase)
    phrase = re.sub(r"wouldn't", "would not", phrase)
    phrase = re.sub(r"i'm", "I am", phrase)
    phrase = re.sub(r"I\x89Ûªm", "I am", phrase)
    phrase = re.sub(r"I'm", "I am", phrase)
    phrase = re.sub(r"Isn't", "is not", phrase)
    phrase = re.sub(r"Here's", "Here is", phrase)
    phrase = re.sub(r"you've", "you have", phrase)
    phrase = re.sub(r"you\x89Ûªve", "you have", phrase)
    phrase = re.sub(r"we're", "we are", phrase)
    phrase = re.sub(r"what's", "what is", phrase)
    phrase = re.sub(r"couldn't", "could not", phrase)
    phrase = re.sub(r"we've", "we have", phrase)
    phrase = re.sub(r"it\x89Ûªs", "it is", phrase)
    phrase = re.sub(r"doesn\x89Ûªt", "does not", phrase)
    phrase = re.sub(r"It\x89Ûªs", "It is", phrase)
    phrase = re.sub(r"Here\x89Ûªs", "Here is", phrase)
    phrase = re.sub(r"who's", "who is", phrase)
    phrase = re.sub(r"I\x89Ûªve", "I have", phrase)
    phrase = re.sub(r"y'all", "you all", phrase)
    phrase = re.sub(r"can\x89Ûªt", "cannot", phrase)
    phrase = re.sub(r"would've", "would have", phrase)
    phrase = re.sub(r"it'll", "it will", phrase)
    phrase = re.sub(r"we'll", "we will", phrase)
    phrase = re.sub(r"wouldn\x89Ûªt", "would not", phrase)
    phrase = re.sub(r"We've", "We have", phrase)
    phrase = re.sub(r"he'll", "he will", phrase)
    phrase = re.sub(r"Y'all", "You all", phrase)
    phrase = re.sub(r"Weren't", "Were not", phrase)
    phrase = re.sub(r"Didn't", "Did not", phrase)
    phrase = re.sub(r"they'll", "they will", phrase)
    phrase = re.sub(r"they'd", "they would", phrase)
    phrase = re.sub(r"DON'T", "DO NOT", phrase)
    phrase = re.sub(r"That\x89Ûªs", "That is", phrase)
    phrase = re.sub(r"they've", "they have", phrase)
    phrase = re.sub(r"i'd", "I would", phrase)
    phrase = re.sub(r"should've", "should have", phrase)
    phrase = re.sub(r"You\x89Ûªre", "You are", phrase)
    phrase = re.sub(r"where's", "where is", phrase)
    phrase = re.sub(r"Don\x89Ûªt", "Do not", phrase)
    phrase = re.sub(r"we'd", "we would", phrase)
    phrase = re.sub(r"i'll", "I will", phrase)
    phrase = re.sub(r"weren't", "were not", phrase)
    phrase = re.sub(r"They're", "They are", phrase)
    phrase = re.sub(r"Can\x89Ûªt", "Cannot", phrase)
    phrase = re.sub(r"you\x89Ûªll", "you will", phrase)
    phrase = re.sub(r"I\x89Ûªd", "I would", phrase)
    phrase = re.sub(r"let's", "let us", phrase)
    phrase = re.sub(r"it's", "it is", phrase)
    phrase = re.sub(r"can't", "cannot", phrase)
    phrase = re.sub(r"don't", "do not", phrase)
    phrase = re.sub(r"you're", "you are", phrase)
    phrase = re.sub(r"i've", "I have", phrase)
    phrase = re.sub(r"that's", "that is", phrase)
    phrase = re.sub(r"i'll", "I will", phrase)
    phrase = re.sub(r"doesn't", "does not",phrase)
    phrase = re.sub(r"i'd", "I would", phrase)
    phrase = re.sub(r"didn't", "did not", phrase)
    phrase = re.sub(r"ain't", "am not", phrase)
    phrase = re.sub(r"you'll", "you will", phrase)
    phrase = re.sub(r"I've", "I have", phrase)
    phrase = re.sub(r"Don't", "do not", phrase)
    phrase = re.sub(r"I'll", "I will", phrase)
    phrase = re.sub(r"I'd", "I would", phrase)
    phrase = re.sub(r"Let's", "Let us", phrase)
    phrase = re.sub(r"you'd", "You would", phrase)
    phrase = re.sub(r"It's", "It is", phrase)
    phrase = re.sub(r"Ain't", "am not", phrase)
    phrase = re.sub(r"Haven't", "Have not", phrase)
    phrase = re.sub(r"Could've", "Could have", phrase)
    phrase = re.sub(r"youve", "you have", phrase)  
    phrase = re.sub(r"donå«t", "do not", phrase)
    return phrase

In [7]:
def remove_punctuations(text):
    for punctuation in list(string.punctuation):
        text = text.replace(punctuation, '')
    return text

In [8]:
def clean_number(text):
    text = re.sub(r'(\d+)([a-zA-Z])', '\g<1> \g<2>', text)
    text = re.sub(r'(\d+) (th|st|nd|rd) ', '\g<1>\g<2> ', text)
    text = re.sub(r'(\d+),(\d+)', '\g<1>\g<2>', text)
    return text

In [9]:
def sent2vec(text):
    words = nltk.word_tokenize(text)
    words = [w for w in words if w.isalpha()]
    
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            M.append(embeddings_index['unk'])
            continue
    
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    
    return v / np.sqrt((v ** 2).sum())

### Create basic text features

In [10]:
def text_features(df, col):
    df[f"{col}_num_words"] = df[col].progress_apply(lambda x: len(str(x).split()))
    df[f"{col}_num_unique_words"] = df[col].progress_apply(lambda x: len(set(str(x).split())))
    df[f"{col}_num_chars"] = df[col].progress_apply(lambda x: len(str(x)))
    df[f"{col}_num_stopwords"] = df[col].progress_apply(lambda x: len([w for w in str(x).lower().split() if w in stopwords.words('english')]))
    df[f"{col}_num_punctuations"] = df[col].progress_apply(lambda x: len([c for c in str(x) if c in list(string.punctuation)]))
    df[f"{col}_num_words_upper"] = df[col].progress_apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
    df[f"{col}_num_words_title"] = df[col].progress_apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
    df[f"{col}_mean_word_len"] = df[col].progress_apply(lambda x: np.mean([len(w) for w in str(x).split()]))
    df[f"{col}_num_paragraphs"] = df[col].progress_apply(lambda x: len(x.split('\n')))
    df[f"{col}_num_contractions"] = df[col].progress_apply(contraction_count)
    df[f"{col}_polarity"] = df[col].progress_apply(lambda x: TextBlob(x).sentiment[0])
    df[f"{col}_subjectivity"] = df[col].progress_apply(lambda x: TextBlob(x).sentiment[1])
    return df

In [11]:
train = text_features(train, "full_text")
train.head()

100%|██████████| 3911/3911 [00:00<00:00, 41039.05it/s]
100%|██████████| 3911/3911 [00:00<00:00, 16539.56it/s]
100%|██████████| 3911/3911 [00:00<00:00, 463636.50it/s]
100%|██████████| 3911/3911 [04:01<00:00, 16.18it/s]
100%|██████████| 3911/3911 [00:13<00:00, 296.81it/s]
100%|██████████| 3911/3911 [00:00<00:00, 14476.21it/s]
100%|██████████| 3911/3911 [00:00<00:00, 14002.22it/s]
100%|██████████| 3911/3911 [00:00<00:00, 7645.81it/s]
100%|██████████| 3911/3911 [00:00<00:00, 184301.32it/s]
100%|██████████| 3911/3911 [00:00<00:00, 28784.47it/s]
100%|██████████| 3911/3911 [00:12<00:00, 321.22it/s]
100%|██████████| 3911/3911 [00:12<00:00, 323.04it/s]


Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,full_text_num_words,full_text_num_unique_words,full_text_num_chars,full_text_num_stopwords,full_text_num_punctuations,full_text_num_words_upper,full_text_num_words_title,full_text_mean_word_len,full_text_num_paragraphs,full_text_num_contractions,full_text_polarity,full_text_subjectivity
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,261,144,1387,129,21,1,3,4.252874,7,2,0.103571,0.55119
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5,533,151,2635,311,21,2,12,3.93621,11,6,0.084853,0.530833
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5,320,136,1663,177,36,9,27,4.196875,3,15,0.037857,0.516468
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0,728,260,3973,420,108,9,57,4.447802,15,41,0.225062,0.608395
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5,234,106,1326,122,3,0,3,4.089744,5,0,0.200216,0.527706


In [12]:
test = text_features(test, "full_text")
test.head()

100%|██████████| 3/3 [00:00<00:00, 2699.04it/s]
100%|██████████| 3/3 [00:00<00:00, 2263.11it/s]
100%|██████████| 3/3 [00:00<00:00, 3225.56it/s]
100%|██████████| 3/3 [00:00<00:00, 11.95it/s]
100%|██████████| 3/3 [00:00<00:00, 229.64it/s]
100%|██████████| 3/3 [00:00<00:00, 2952.35it/s]
100%|██████████| 3/3 [00:00<00:00, 3155.98it/s]
100%|██████████| 3/3 [00:00<00:00, 2533.81it/s]
100%|██████████| 3/3 [00:00<00:00, 5075.80it/s]
100%|██████████| 3/3 [00:00<00:00, 3708.49it/s]
100%|██████████| 3/3 [00:00<00:00, 254.97it/s]
100%|██████████| 3/3 [00:00<00:00, 262.65it/s]


Unnamed: 0,text_id,full_text,full_text_num_words,full_text_num_unique_words,full_text_num_chars,full_text_num_stopwords,full_text_num_punctuations,full_text_num_words_upper,full_text_num_words_title,full_text_mean_word_len,full_text_num_paragraphs,full_text_num_contractions,full_text_polarity,full_text_subjectivity
0,0000C359D63E,when a person has no experience on a job their...,835,263,4224,454,37,1,25,4.053892,11,2,0.263041,0.492126
1,000BAD50D026,Do you think students would benefit from being...,386,140,2167,207,36,1,11,4.608808,5,20,0.182193,0.502193
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde...",442,187,2361,244,33,1,11,4.332579,9,0,0.30821,0.516237


### Text Preprocessing

In [13]:
def clean_text(text):
    text = decontraction(text)
    text = text.lower()
    text = re.sub(r'[^\w\s]','',text, re.UNICODE)
    text = remove_punctuations(text)
    text = clean_number(text)
    return text

In [14]:
train['full_text'] = train['full_text'].progress_apply(clean_text)
test['full_text'] = test['full_text'].progress_apply(clean_text)

100%|██████████| 3911/3911 [00:02<00:00, 1505.44it/s]
100%|██████████| 3/3 [00:00<00:00, 1021.17it/s]


### Glove Embeddings

In [15]:
with open("../input/nlp-word-embeddings/Glove_Embeddings.txt", 'rb') as handle: 
    data = handle.read()

processed_data = pickle.loads(data)
embeddings_index = processed_data['glove_embeddings_index']
print('Word vectors found: {}'.format(len(embeddings_index)))

del processed_data
gc.collect()

Word vectors found: 2196017


464

In [16]:
train.set_index('text_id', inplace=True)

glove_vec = [sent2vec(x) for x in tqdm(train["full_text"].values)]
col_list = ['glove_'+str(i) for i in range(300)]
glove_vec_df = pd.DataFrame(np.array(glove_vec), columns=col_list, index=train.index)
print(f"glove_vec_df: {glove_vec_df.shape}")

train = pd.merge(
    train, 
    glove_vec_df, 
    how="inner", 
    on="text_id", 
    sort=False
)

del glove_vec, glove_vec_df
gc.collect()

print(f"train: {train.shape}")
train.head()

100%|██████████| 3911/3911 [00:09<00:00, 417.36it/s]


glove_vec_df: (3911, 300)
train: (3911, 319)


Unnamed: 0_level_0,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,full_text_num_words,full_text_num_unique_words,full_text_num_chars,...,glove_290,glove_291,glove_292,glove_293,glove_294,glove_295,glove_296,glove_297,glove_298,glove_299
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0016926B079C,i think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,261,144,1387,...,-0.047897,0.012196,-0.005005,-0.018625,0.015539,0.000387,-0.024421,-0.015618,0.03968,0.028479
0022683E9EA5,when a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5,533,151,2635,...,-0.059975,0.013082,-0.006524,-0.023567,0.03204,-0.00351,-0.023261,-0.011262,0.033279,0.039578
00299B378633,dear principal\n\nif u change the school polic...,3.0,3.5,3.0,3.0,3.0,2.5,320,136,1663,...,-0.052307,-0.002266,-0.009439,-0.032528,0.024091,0.00136,-0.040144,-0.038722,0.042079,0.02327
003885A45F42,the best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0,728,260,3973,...,-0.069191,0.011354,-0.025531,-0.026579,0.024683,0.006697,-0.023632,-0.022341,0.040106,0.026516
0049B1DF5CCC,small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5,234,106,1326,...,-0.065113,0.012299,-0.010551,-0.03528,0.00956,0.012831,-0.013666,-0.018179,0.016979,0.031928


In [17]:
test.set_index('text_id', inplace=True)

glove_vec = [sent2vec(x) for x in tqdm(test["full_text"].values)]
col_list = ['glove_'+str(i) for i in range(300)]
glove_vec_df = pd.DataFrame(np.array(glove_vec), columns=col_list, index=test.index)
print(f"glove_vec_df: {glove_vec_df.shape}")

test = pd.merge(
    test, 
    glove_vec_df, 
    how="inner", 
    on="text_id", 
    sort=False
)

del glove_vec, glove_vec_df
gc.collect()

print(f"test: {test.shape}")
test.head()

100%|██████████| 3/3 [00:00<00:00, 268.49it/s]

glove_vec_df: (3, 300)
test: (3, 313)





Unnamed: 0_level_0,full_text,full_text_num_words,full_text_num_unique_words,full_text_num_chars,full_text_num_stopwords,full_text_num_punctuations,full_text_num_words_upper,full_text_num_words_title,full_text_mean_word_len,full_text_num_paragraphs,...,glove_290,glove_291,glove_292,glove_293,glove_294,glove_295,glove_296,glove_297,glove_298,glove_299
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000C359D63E,when a person has no experience on a job their...,835,263,4224,454,37,1,25,4.053892,11,...,-0.051433,0.007779,-0.016645,-0.031097,0.028537,-0.00261,-0.027199,-0.017325,0.027618,0.032211
000BAD50D026,do you think students would benefit from being...,386,140,2167,207,36,1,11,4.608808,5,...,-0.023414,-0.001185,-0.001112,-0.006998,0.014711,0.003211,-0.024542,-0.025076,0.029015,0.018912
00367BB2546B,thomas jefferson once states that it is wonder...,442,187,2361,244,33,1,11,4.332579,9,...,-0.054315,0.01905,-0.012756,-0.022174,0.019773,-0.015299,-0.026611,-0.018428,0.035217,0.029322


In [18]:
del embeddings_index
gc.collect()

24

### FastText Embeddings

In [20]:
with open("../input/nlp-word-embeddings/FastText_Embeddings.txt", 'rb') as handle: 
    data = handle.read()

processed_data = pickle.loads(data)
embeddings_index = processed_data['fasttext_embeddings_index']
print('Word vectors found: {}'.format(len(embeddings_index)))

del processed_data
gc.collect()

Word vectors found: 1000000


0

In [21]:
fasttext_vec = [sent2vec(x) for x in tqdm(train["full_text"].values)]
col_list = ['fasttext_'+str(i) for i in range(300)]
fasttext_vec_df = pd.DataFrame(np.array(fasttext_vec), columns=col_list, index=train.index)
print(f"fasttext_vec_df: {fasttext_vec_df.shape}")

train = pd.merge(
    train, 
    fasttext_vec_df, 
    how="inner", 
    on="text_id", 
    sort=False
)

del fasttext_vec, fasttext_vec_df
gc.collect()

train.drop('full_text', axis=1, inplace=True)
print(f"train: {train.shape}")
train.head()

100%|██████████| 3911/3911 [00:10<00:00, 368.32it/s]


fasttext_vec_df: (3911, 300)
train: (3911, 618)


Unnamed: 0_level_0,cohesion,syntax,vocabulary,phraseology,grammar,conventions,full_text_num_words,full_text_num_unique_words,full_text_num_chars,full_text_num_stopwords,...,fasttext_290,fasttext_291,fasttext_292,fasttext_293,fasttext_294,fasttext_295,fasttext_296,fasttext_297,fasttext_298,fasttext_299
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0016926B079C,3.5,3.5,3.0,3.0,4.0,3.0,261,144,1387,129,...,0.003124,-0.004187,-0.008275,0.004614,0.006928,0.00048,-0.014724,0.152922,0.012428,0.005484
0022683E9EA5,2.5,2.5,3.0,2.0,2.0,2.5,533,151,2635,311,...,0.01517,-0.001463,-0.005018,0.003173,-0.006586,-0.015554,-0.025787,0.164027,0.023467,-0.000588
00299B378633,3.0,3.5,3.0,3.0,3.0,2.5,320,136,1663,177,...,0.009466,-0.007042,-0.013316,0.001678,-0.004676,0.007161,-0.016156,0.141131,0.01713,-0.013145
003885A45F42,4.5,4.5,4.5,4.5,4.0,5.0,728,260,3973,420,...,0.013575,-0.009068,-0.014698,-0.003738,-0.001768,-0.015897,-0.030869,0.16531,0.004362,-0.012372
0049B1DF5CCC,2.5,3.0,3.0,3.0,2.5,2.5,234,106,1326,122,...,0.010878,-0.017811,-6.3e-05,-0.008606,-0.008423,-0.017439,-0.026796,0.160969,0.002208,0.006025


In [22]:
fasttext_vec = [sent2vec(x) for x in tqdm(test["full_text"].values)]
col_list = ['fasttext_'+str(i) for i in range(300)]
fasttext_vec_df = pd.DataFrame(np.array(fasttext_vec), columns=col_list, index=test.index)
print(f"fasttext_vec_df: {fasttext_vec_df.shape}")

test = pd.merge(
    test, 
    fasttext_vec_df, 
    how="inner", 
    on="text_id", 
    sort=False
)

del fasttext_vec, fasttext_vec_df
gc.collect()

test.drop('full_text', axis=1, inplace=True)
print(f"test: {test.shape}")
test.head()

100%|██████████| 3/3 [00:00<00:00, 245.99it/s]

fasttext_vec_df: (3, 300)
test: (3, 612)





Unnamed: 0_level_0,full_text_num_words,full_text_num_unique_words,full_text_num_chars,full_text_num_stopwords,full_text_num_punctuations,full_text_num_words_upper,full_text_num_words_title,full_text_mean_word_len,full_text_num_paragraphs,full_text_num_contractions,...,fasttext_290,fasttext_291,fasttext_292,fasttext_293,fasttext_294,fasttext_295,fasttext_296,fasttext_297,fasttext_298,fasttext_299
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000C359D63E,835,263,4224,454,37,1,25,4.053892,11,2,...,0.007568,-0.011315,-0.017608,-0.002028,0.003305,-0.004966,-0.025996,0.160157,0.011031,-0.009291
000BAD50D026,386,140,2167,207,36,1,11,4.608808,5,20,...,-0.000962,-0.020579,-0.001811,0.000838,0.004391,0.01777,-0.027009,0.12449,0.002267,0.007213
00367BB2546B,442,187,2361,244,33,1,11,4.332579,9,0,...,0.014041,-0.002292,-0.020868,0.001919,-0.015535,-0.013856,-0.022666,0.164269,0.008171,-0.008842


In [23]:
del embeddings_index
gc.collect()

24

In [24]:
features = test.columns.tolist()

qt = QuantileTransformer(n_quantiles=1000, 
                         output_distribution='normal', 
                         random_state=42).fit(train[features])

train[features] = qt.transform(train[features])
test[features] = qt.transform(test[features])

## Models Training

### XGBoost

In [25]:
counter = 0
oof_score = 0
y_pred_final_xgb = np.zeros((test.shape[0], 6))

    
mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(mskf.split(train, data_labels)):
    counter += 1

    train_x, val_x = train.iloc[train_idx][features], train.iloc[val_idx][features]
    
    mean_score = 0
    for i in range(len(labels)):
        train_y, val_y = train.iloc[train_idx][labels[i]], train.iloc[val_idx][labels[i]]

        model = XGBRegressor(
            objective='reg:squarederror',
            eval_metric='rmse',
            booster='gbtree',
            sample_type='weighted',
            tree_method='hist',
            grow_policy='lossguide',
            use_label_encoder=False,
            num_round=5000,
            max_depth=9, 
            max_leaves=36,
            learning_rate=0.095,
            subsample=0.7024,
            colsample_bytree=0.5289,
            min_child_weight=15,
            reg_lambda=0.05465,
            verbosity=0,
            random_state=42
        )
        
        model.fit(train_x, train_y, eval_set=[(train_x, train_y), (val_x, val_y)], 
                  early_stopping_rounds=100, verbose=50)
    
        y_pred = model.predict(val_x, iteration_range=(0, model.best_iteration))
        y_pred_final_xgb[:,i] += model.predict(test[features], iteration_range=(0, model.best_iteration))
    
        score = np.sqrt(mean_squared_error(val_y, y_pred))
        print(f"Fold-{counter} | Col-{labels[i]} | RMSE Score: {score}")
        mean_score += score/float(len(labels))
    
    
    oof_score += mean_score
    print(f"\nFold-{counter} | OOF Score: {mean_score}\n\n")
    
    del model, y_pred
    del train_x, train_y
    del val_x, val_y
    gc.collect()


y_pred_final_xgb = y_pred_final_xgb / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

[0]	validation_0-rmse:2.47475	validation_1-rmse:2.41365
[50]	validation_0-rmse:0.28840	validation_1-rmse:0.54842
[99]	validation_0-rmse:0.16359	validation_1-rmse:0.54888
Fold-1 | Col-cohesion | RMSE Score: 0.5473253091490693
[0]	validation_0-rmse:2.38587	validation_1-rmse:2.31828
[50]	validation_0-rmse:0.28300	validation_1-rmse:0.53819
[99]	validation_0-rmse:0.15908	validation_1-rmse:0.53929
Fold-1 | Col-syntax | RMSE Score: 0.537687406819246
[0]	validation_0-rmse:2.54708	validation_1-rmse:2.50897
[50]	validation_0-rmse:0.24668	validation_1-rmse:0.46684
[99]	validation_0-rmse:0.14235	validation_1-rmse:0.46900
Fold-1 | Col-vocabulary | RMSE Score: 0.4665429537842513
[0]	validation_0-rmse:2.46448	validation_1-rmse:2.40254
[50]	validation_0-rmse:0.28490	validation_1-rmse:0.54076
[99]	validation_0-rmse:0.15924	validation_1-rmse:0.54077
Fold-1 | Col-phraseology | RMSE Score: 0.5396945513598742
[0]	validation_0-rmse:2.40977	validation_1-rmse:2.31801
[50]	validation_0-rmse:0.30970	validation_

### LightGBM

In [26]:
counter = 0
oof_score = 0
y_pred_final_lgb = np.zeros((test.shape[0], 6))

    
mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(mskf.split(train, data_labels)):
    counter += 1

    train_x, val_x = train.iloc[train_idx][features], train.iloc[val_idx][features]
    
    mean_score = 0
    for i in range(len(labels)):
        train_y, val_y = train.iloc[train_idx][labels[i]], train.iloc[val_idx][labels[i]]

        model = LGBMRegressor(
            boosting_type='gbdt', 
            num_leaves=175, 
            max_depth=9, 
            learning_rate=0.05, 
            n_estimators=5000,
            objective='regression',
            min_child_samples=5, 
            subsample=0.85, 
            subsample_freq=5, 
            colsample_bytree=0.55, 
            reg_lambda=0.05,
            random_state=0,
            bagging_seed=0,
            feature_fraction_seed=0
        )
        
        model.fit(train_x, train_y, eval_metric='rmse',
                  eval_set=[(train_x, train_y), (val_x, val_y)],
                  early_stopping_rounds=100, verbose=100)
    
        y_pred = model.predict(val_x, num_iteration=model.best_iteration_)
        y_pred_final_lgb[:,i] += model.predict(test[features], num_iteration=model.best_iteration_)
    
        score = np.sqrt(mean_squared_error(val_y, y_pred))
        print(f"Fold-{counter} | Col-{labels[i]} | RMSE Score: {score}")
        mean_score += score/float(len(labels))
    
    
    oof_score += mean_score
    print(f"\nFold-{counter} | OOF Score: {mean_score}\n\n")
    
    del model, y_pred
    del train_x, train_y
    del val_x, val_y
    gc.collect()


y_pred_final_lgb = y_pred_final_lgb / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

[100]	training's rmse: 0.0835898	training's l2: 0.00698725	valid_1's rmse: 0.547678	valid_1's l2: 0.299951
[200]	training's rmse: 0.0143904	training's l2: 0.000207085	valid_1's rmse: 0.547177	valid_1's l2: 0.299403
[300]	training's rmse: 0.00300559	training's l2: 9.03359e-06	valid_1's rmse: 0.54701	valid_1's l2: 0.29922
[400]	training's rmse: 0.000952217	training's l2: 9.06717e-07	valid_1's rmse: 0.547002	valid_1's l2: 0.299212
Fold-1 | Col-cohesion | RMSE Score: 0.5469858296080827
[100]	training's rmse: 0.0774156	training's l2: 0.00599317	valid_1's rmse: 0.531931	valid_1's l2: 0.282951
[200]	training's rmse: 0.0138761	training's l2: 0.000192546	valid_1's rmse: 0.531521	valid_1's l2: 0.282514
Fold-1 | Col-syntax | RMSE Score: 0.5314830417946587
[100]	training's rmse: 0.0679072	training's l2: 0.00461139	valid_1's rmse: 0.463387	valid_1's l2: 0.214728
[200]	training's rmse: 0.0113637	training's l2: 0.000129134	valid_1's rmse: 0.462448	valid_1's l2: 0.213858
Fold-1 | Col-vocabulary | RMSE

## Create submission file

In [27]:
y_pred_final = (y_pred_final_xgb * 0.65) + (y_pred_final_lgb * 0.35)

submission = pd.read_csv("../input/feedback-prize-english-language-learning/sample_submission.csv")

submission['cohesion'] = y_pred_final[:,0]
submission['syntax'] = y_pred_final[:,1]
submission['vocabulary'] = y_pred_final[:,2]
submission['phraseology'] = y_pred_final[:,3]
submission['grammar'] = y_pred_final[:,4]
submission['conventions'] = y_pred_final[:,5]

submission.to_csv("./submission.csv", index=False)
submission.head()

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.888193,2.818543,3.10875,2.962972,2.629677,2.711964
1,000BAD50D026,2.895682,2.890732,3.07528,2.897982,2.707038,2.890968
2,00367BB2546B,3.22001,3.120133,3.304714,3.170132,3.040692,3.095837


In [None]:
## Good Day!!