In [32]:
!pip install tabulate

Collecting tabulate
  Downloading tabulate-0.8.9-py3-none-any.whl (25 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.8.9


In [6]:
import numpy as np
import pandas as pd

In [7]:
train = pd.read_csv("train.csv")
train.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [8]:
# Since there is no url_legal and license info in the hidden testing dataset
# These two columns are meaningless for our model
train = train.drop(['url_legal', 'license'], axis=1)

## Pre-processing
Word:

    1. less common words are usually more complex
    2. Longer words are usually more complex
    
Sentence:

    1. Longer sentence are usually more complex

### Meta data - excerpt features

In [9]:
# get the meta data for each excerpt
def get_meta(df, col):
    
    # Add meta data
    df['sentences_per_excerpt'] = df[col].apply(lambda x: x.count('.'))
    df['words_per_excerpt'] = df[col].str.split().map(lambda x: len(x))
    df['characters_per_excerpt'] = df[col].apply(lambda x: len(x))

    df['words_per_sentence'] = df[col].str.split('.').apply(
    lambda x: [len(i.split()) for i in x]).map(
    lambda x: np.mean(x[:-1])
    )
    df['characters_per_sentense'] = df[col].str.split('.').apply(
    lambda x: [len(i) for i in x]).map(
    lambda x: np.mean(x[:-1])
    )
    df['characters_per_word'] = df[col].str.split().apply(
    lambda x: [len(i) for i in x]).map(
    lambda x: np.mean(x)
    )
    return df

In [10]:
train = get_meta(train, 'excerpt')

In [11]:
train.head()

Unnamed: 0,id,excerpt,target,standard_error,sentences_per_excerpt,words_per_excerpt,characters_per_excerpt,words_per_sentence,characters_per_sentense,characters_per_word
0,c12129c31,When the young people returned to the ballroom...,-0.340259,0.464009,11,179,992,16.272727,89.181818,4.547486
1,85aa80a4c,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805,10,169,937,17.1,92.7,4.550296
2,b69ac6792,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676,11,166,908,15.272727,81.454545,4.475904
3,dd1000b26,And outside before the palace a great garden w...,-1.054013,0.450007,5,164,909,32.8,180.8,4.54878
4,37c1b32fb,Once upon a time there were Three Bears who li...,0.247197,0.510845,5,147,723,29.4,143.6,3.92517


### words features

In [14]:
import regex as re
import nltk
from nltk.util import ngrams

def tokenize(text):
    return re.findall(r'[\w-]*\p{L}[\w-]*', text)


# Remove stop words
stopwords = nltk.corpus.stopwords.words('english')

def remove_stopwords(tokens):
    return [token for token in tokens if token not in stopwords]


# Piplene
pipeline = [str.lower, tokenize, remove_stopwords]


# Preprocess
def preprocess(text, pipeline):
    tokens = text
    for transform in pipeline:
        tokens = transform(tokens)
    return tokens


In [15]:
train['tokens'] = train['excerpt'].apply(preprocess, pipeline=pipeline)

In [19]:
from collections import defaultdict, Counter

def count_words(df, column='tokens', preprocess=None, min_freq=2):

    # Process tokens and update counter
    def update(doc):
        tokens = doc if preprocess is None else preprocess(doc)
        counter.update(tokens)

    # Create counter and run through all data
    counter = Counter()
    df[column].map(update)

    # Transform counter into data frame
    freq_df = pd.DataFrame.from_dict(counter, orient='index', columns=['frequency'])
    freq_df = freq_df.query('frequency >= @min_freq')
    freq_df.index.name = column
    
    return freq_df.sort_values('frequency', ascending=False)

In [20]:
freq_df = count_words(train)
freq_df

Unnamed: 0_level_0,frequency
tokens,Unnamed: 1_level_1
one,1919
little,1137
would,1134
said,1112
could,905
...,...
grotesque,2
joys,2
equinox,2
cleverness,2


In [24]:
freq_dict = freq_df.frequency.to_dict()
# rare words
rare_words = {k: v for k, v in freq_dict.items() if v <= 5}
# common_words
common_words = {k: v for k, v in freq_dict.items() if v > 100}
# long words
long_df = count_words(
    train,
    column='excerpt',
    preprocess=lambda text: re.findall(r'\w{6,}', text),
)
long_words = long_df.frequency.to_dict()

def count_freq(corpus, vocab):
    counter = 0
    for word in vocab:
        if word in corpus:
            counter += 1
    return counter

def token_stats (df, col):
    
    df['rare_tokens'] = df[col].apply(count_freq, vocab=rare_words)
    df['common_tokens'] = df[col].apply(count_freq, vocab=common_words)
    df['long_tokens'] = df[col].apply(count_freq, vocab=long_words)
    
    return df

In [25]:
train = token_stats(train, 'tokens')

In [26]:
train.head()

Unnamed: 0,id,excerpt,target,standard_error,sentences_per_excerpt,words_per_excerpt,characters_per_excerpt,words_per_sentence,characters_per_sentense,characters_per_word,...,smog_index,automated_readability_index,coleman_liau_index,linsear_write_formula,dale_chall_readability_score,text_standard,tokens,rare_tokens,common_tokens,long_tokens
0,c12129c31,When the young people returned to the ballroom...,-0.340259,0.464009,11,179,992,16.272727,89.181818,4.547486,...,8.6,8.3,8.06,9.0,6.65,9.0,"[young, people, returned, ballroom, presented,...",10,16,35
1,85aa80a4c,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805,10,169,937,17.1,92.7,4.550296,...,8.3,7.2,6.78,7.285714,5.92,8.0,"[dinner, time, mrs, fayre, somewhat, silent, e...",9,32,29
2,b69ac6792,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676,11,166,908,15.272727,81.454545,4.475904,...,10.1,10.1,7.2,14.75,6.29,8.0,"[roger, predicted, snow, departed, quickly, ca...",7,27,23
3,dd1000b26,And outside before the palace a great garden w...,-1.054013,0.450007,5,164,909,32.8,180.8,4.54878,...,6.7,16.4,8.54,12.5,6.61,7.0,"[outside, palace, great, garden, walled, round...",16,21,30
4,37c1b32fb,Once upon a time there were Three Bears who li...,0.247197,0.510845,5,147,723,29.4,143.6,3.92517,...,8.8,11.8,4.83,13.5,1.57,12.0,"[upon, time, three, bears, lived, together, ho...",2,12,4


## Model

### Model Selection

In [37]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from tabulate import tabulate
from xgboost import XGBRegressor

In [28]:
cat_col = ['id', 'excerpt', 'target', 'standard_error', 'tokens']
X = train.drop(cat_col, axis=1)
y = train['target']

In [29]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
models = [LinearRegression(), XGBRegressor()]
performance = {}
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    rmse = mean_squared_error(y_pred, y_valid, squared=True)
    performance[model] = rmse
    
print(tabulate(performance.items(), headers=['Model', 'RMSE']))

Model                                                                               RMSE
------------------------------------------------------------------------------  --------
LinearRegression()                                                              0.554025
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,             0.657856
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=16, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)


### Train selected model

In [46]:
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

In [None]:
cat_col = ['id', 'excerpt', 'target', 'standard_error', 'tokens']
X = train.drop(cat_col, axis=1)
y = train['target']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [47]:
xgb = XGBRegressor()
xgb.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=16, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

## Submission

In [39]:
test = pd.read_csv("test.csv")

In [43]:
test = test.drop(['url_legal', 'license'], axis=1)
test = get_meta(test, 'excerpt')
test = get_stat(test, 'excerpt')
test['tokens'] = test['excerpt'].apply(preprocess, pipeline=pipeline)
test = token_stats(test, 'tokens')

In [44]:
test.head()

Unnamed: 0,id,excerpt,sentences_per_excerpt,words_per_excerpt,characters_per_excerpt,words_per_sentence,characters_per_sentense,characters_per_word,flesch_reading_ease,flesch_kincaid_grade,...,smog_index,automated_readability_index,coleman_liau_index,linsear_write_formula,dale_chall_readability_score,text_standard,tokens,rare_tokens,common_tokens,long_tokens
0,c0f722661,My hope lay in Jack's promise that he would ke...,8,149,772,18.625,95.375,4.187919,71.68,9.4,...,8.8,10.8,6.62,11.0,6.24,11.0,"[hope, lay, jack, promise, would, keep, bright...",3,27,32
1,f0953f0a5,Dotty continued to go to Mrs. Gray's every nig...,12,181,967,15.333333,79.583333,4.348066,88.57,5.0,...,5.6,7.3,6.55,6.428571,5.41,7.0,"[dotty, continued, go, mrs, gray, every, night...",2,32,19
2,0df072751,It was a bright and cheerful scene that greete...,13,174,948,13.538462,71.923077,4.454023,80.04,8.3,...,9.2,12.1,7.61,14.0,6.78,8.0,"[bright, cheerful, scene, greeted, eyes, capta...",8,26,30
3,04caf4e0c,Cell division is the process by which a parent...,7,180,1144,25.714286,162.428571,5.35,26.48,18.5,...,19.3,21.9,13.7,17.25,9.55,19.0,"[cell, division, process, parent, cell, divide...",10,20,41
4,0e63f8bea,Debugging is the process of finding and resolv...,6,168,1094,28.0,181.333333,5.517857,34.6,15.4,...,17.7,18.6,13.93,15.2,10.01,18.0,"[debugging, process, finding, resolving, defec...",18,12,44


In [45]:
cat_col = ['id', 'excerpt','tokens']
X_test = test.drop(cat_col, axis=1)

In [48]:
preds = xgb.predict(X_test)
ids = test['id']

In [50]:
submission_df = pd.DataFrame({'id': ids, 'target': preds})
# submission_df.to_csv('/kaggle/working/submission.csv', index=False)