In [12]:
!pip install textstat

Collecting textstat
  Downloading textstat-0.7.1-py3-none-any.whl (99 kB)
Collecting pyphen
  Downloading pyphen-0.11.0-py3-none-any.whl (2.0 MB)
Installing collected packages: pyphen, textstat
Successfully installed pyphen-0.11.0 textstat-0.7.1


In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the training split from the working directory and preview its first rows.
train = pd.read_csv(filepath_or_buffer="train.csv")
train.head(n=5)

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [3]:
# The hidden test set carries no url_legal / license information,
# so these two columns cannot contribute anything to the model.
train = train.drop(columns=['url_legal', 'license'])

## EDA
Word:

    1. Less common words are usually more complex
    2. Longer words are usually more complex
    
Sentence:

    1. Longer sentences are usually more complex

### Meta data - excerpt features

In [4]:
def _mean_or_nan(values):
    """Return the mean of ``values``, or NaN (without a warning) when empty."""
    return np.mean(values) if len(values) else np.nan


# get the meta data for each excerpt
def get_meta(df, col):
    """Add simple length-based features for the text column ``col`` to ``df``.

    The frame is modified in place and also returned.

    Columns added, in this order:
        level                    -- 'complex' / 'medium' / 'simple', from binning
                                    ``df['target']`` into (-4, -2], (-2, 0], (0, 2];
                                    targets outside those bins become the string 'nan'.
                                    Only added when ``df`` has a 'target' column, so
                                    the function also works on unlabeled data.
        sentences_per_excerpt    -- number of '.' characters in the text
        words_per_excerpt        -- number of whitespace-separated tokens
        characters_per_excerpt   -- length of the text
        words_per_sentence       -- mean token count of the '.'-delimited segments
        characters_per_sentense  -- mean length of the '.'-delimited segments
                                    (misspelt name kept so existing column
                                    references keep working)
        characters_per_word      -- mean token length

    Args:
        df: DataFrame holding the text (and optionally a 'target' column).
        col: name of the text column.

    Returns:
        The same ``df`` object with the columns above added.
    """
    # Bin the target into three levels (skipped for frames without a target)
    if 'target' in df.columns:
        target_bins = list(range(-4, 3, 2))
        target_labels = ['complex', 'medium', 'simple']
        df['level'] = pd.cut(
            df['target'], bins=target_bins, labels=target_labels).astype('str')

    words = df[col].str.split()
    # Splitting on '.' leaves whatever follows the final period as the last
    # piece (an empty string when the text ends with '.'); that piece is
    # dropped, so the number of segments equals the number of periods.
    segments = df[col].str.split('.').map(lambda parts: parts[:-1])

    # Add meta data
    df['sentences_per_excerpt'] = df[col].map(lambda text: text.count('.'))
    df['words_per_excerpt'] = words.map(len)
    df['characters_per_excerpt'] = df[col].map(len)

    df['words_per_sentence'] = segments.map(
        lambda parts: _mean_or_nan([len(part.split()) for part in parts]))
    df['characters_per_sentense'] = segments.map(
        lambda parts: _mean_or_nan([len(part) for part in parts]))
    df['characters_per_word'] = words.map(
        lambda tokens: _mean_or_nan([len(token) for token in tokens]))
    return df

In [5]:
def get_samples(df, col_1='level', random_state=None):
    """Pick one random row per distinct value of ``df[col_1]``.

    Args:
        df: DataFrame with 'target' and 'excerpt' columns plus the grouping
            column ``col_1``.
        col_1: name of the grouping column (default 'level').
        random_state: forwarded to ``DataFrame.sample``; pass a fixed value to
            get the same rows on every run. The default ``None`` keeps the
            previous behaviour of drawing a fresh random row each call.

    Returns:
        dict mapping each group value (as a string), in reverse-sorted order,
        to ``[target rounded to 2 decimals, excerpt text]``.
    """
    samples = {}
    groups = sorted(df[col_1].unique().tolist(), reverse=True)
    for group in groups:
        # One random row from this group; keep its index label for lookups.
        idx = df.loc[df[col_1] == group].sample(random_state=random_state).index[0]
        samples[f'{group}'] = [round(df.loc[idx, 'target'], 2), df.loc[idx, 'excerpt']]
    return samples

In [6]:
# Enrich the training frame with the excerpt-level meta features.
train = get_meta(df=train, col='excerpt')

In [7]:
# Inspect the first rows now that the meta columns are present.
train.head(n=5)

Unnamed: 0,id,excerpt,target,standard_error,level,sentences_per_excerpt,words_per_excerpt,characters_per_excerpt,words_per_sentence,characters_per_sentense,characters_per_word
0,c12129c31,When the young people returned to the ballroom...,-0.340259,0.464009,medium,11,179,992,16.272727,89.181818,4.547486
1,85aa80a4c,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805,medium,10,169,937,17.1,92.7,4.550296
2,b69ac6792,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676,medium,11,166,908,15.272727,81.454545,4.475904
3,dd1000b26,And outside before the palace a great garden w...,-1.054013,0.450007,medium,5,164,909,32.8,180.8,4.54878
4,37c1b32fb,Once upon a time there were Three Bears who li...,0.247197,0.510845,simple,5,147,723,29.4,143.6,3.92517


In [15]:
from termcolor import colored, cprint

# Draw one random excerpt per difficulty level and gather corpus-wide word counts.
samples = get_samples(train)
n_words = train['words_per_excerpt'].tolist()
# Takes the six trailing columns of the frame; this assumes the excerpt
# features were the most recently added columns when this cell runs.
excerpt_features = [i for i in train.columns[-6:]]

print(
    colored(
        f'Train contains {train.shape[0]}'
        + f' excerpts, ranging from {min(n_words)} to {max(n_words)}'
        + f' (avg {round(np.mean(n_words))}) words long.',
        'yellow',
        attrs=['bold']
    )
)

# Colour is keyed by the level label rather than re-derived from the rounded
# target, so the colour can never disagree with the label printed next to it
# (e.g. a "simple" excerpt whose target rounds to 0.0). Unknown labels fall
# back to yellow.
level_colors = {'simple': 'green', 'medium': 'yellow', 'complex': 'red'}

for level, (target, text) in samples.items():
    color = level_colors.get(level, 'yellow')
    rule = colored('---' * 10, color)
    level_label = colored(level.upper(), color, attrs=['bold'])
    target_label = colored(target, color, attrs=['bold'])
    print(rule)
    print(f'Train sample of {level_label} text with the target value: {target_label}')
    print(rule)
    print(text)

[1m[33mTrain contains 2834 excerpts, ranging from 135 to 205 (avg 173) words long.[0m
[32m------------------------------[0m
Train sample of [1m[32mSIMPLE[0m text with the target value: [1m[32m0.55[0m
[32m------------------------------[0m
Annie and her baby-brother went to ride with their papa and mamma. They crossed the river on a long bridge; and beyond it they saw horses and cows feeding on the green prairie.
"What are all these heaps of dirt for?" said Annie.
"We are just entering 'dog-town,'" said her papa; "and those are the houses of the inhabitants. Do you see the two little fellows sitting up on that mound?"
"Yes," said Annie; "but they look like little fat squirrels; don't they, mamma?"
Baby pointed his little chubby finger, and said, "Ish!"
"They are prairie-dogs," said mamma; "but are sometimes called the 'wish-ton-wish' and 'prairie marmot,' and sometimes 'prairie marmot squirrel.' It is like the marmot because it burrows in the ground, and like the squirrel be

### Complexity - textstat features

In [16]:
import textstat

def get_stat(df, col):
    """Add one column per textstat readability score for the text in ``df[col]``.

    Every column is named after the textstat function that produces it;
    ``text_standard`` is requested with ``float_output=True``. The frame is
    modified in place and also returned.
    """
    # Scores that are called with the text alone, in output-column order.
    single_argument_scores = (
        'flesch_reading_ease',
        'flesch_kincaid_grade',
        'gunning_fog',
        'smog_index',
        'automated_readability_index',
        'coleman_liau_index',
        'linsear_write_formula',
        'dale_chall_readability_score',
    )
    for score_name in single_argument_scores:
        scorer = getattr(textstat, score_name)
        df[score_name] = df[col].apply(scorer)

    # text_standard takes an extra keyword, so it is handled separately.
    df['text_standard'] = df[col].apply(
        lambda text: textstat.text_standard(text, float_output=True)
    )
    return df

In [17]:
# Score every excerpt, then record the names of the nine trailing columns
# (get_stat assigns nine score columns) for later use.
train = get_stat(df=train, col='excerpt')
textstat_features = list(train.columns[-9:])
train.head(n=5)

Unnamed: 0,id,excerpt,target,standard_error,level,sentences_per_excerpt,words_per_excerpt,characters_per_excerpt,words_per_sentence,characters_per_sentense,characters_per_word,flesch_reading_ease,flesch_kincaid_grade,gunning_fog,smog_index,automated_readability_index,coleman_liau_index,linsear_write_formula,dale_chall_readability_score,text_standard
0,c12129c31,When the young people returned to the ballroom...,-0.340259,0.464009,medium,11,179,992,16.272727,89.181818,4.547486,80.31,6.1,8.31,8.6,8.3,8.06,9.0,6.65,9.0
1,85aa80a4c,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805,medium,10,169,937,17.1,92.7,4.550296,82.54,5.2,7.53,8.3,7.2,6.78,7.285714,5.92,8.0
2,b69ac6792,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676,medium,11,166,908,15.272727,81.454545,4.475904,75.74,7.9,10.49,10.1,10.1,7.2,14.75,6.29,8.0
3,dd1000b26,And outside before the palace a great garden w...,-1.054013,0.450007,medium,5,164,909,32.8,180.8,4.54878,72.02,11.4,13.61,6.7,16.4,8.54,12.5,6.61,7.0
4,37c1b32fb,Once upon a time there were Three Bears who li...,0.247197,0.510845,simple,5,147,723,29.4,143.6,3.92517,75.47,10.0,11.76,8.8,11.8,4.83,13.5,1.57,12.0
