In [1]:
# Store NLTK resources in an `nltk_data` folder under the current working directory.
import os
download_dir = f'{os.getcwd()}/nltk_data'
# Set before `import nltk` below -- presumably so NLTK picks the folder up at
# import time (TODO confirm).
os.environ['NLTK_DATA'] = download_dir

import nltk
# Fetch the resources used by the stop-word list, tokeniser, lemmatiser and
# POS tagger further down the notebook.
nltk.download('stopwords', download_dir=download_dir)
nltk.download('punkt', download_dir=download_dir)
nltk.download('wordnet', download_dir=download_dir)
nltk.download('averaged_perceptron_tagger', download_dir=download_dir)
nltk.download('universal_tagset', download_dir=download_dir)
# Also register the folder explicitly on NLTK's data search path.
nltk.data.path.append(download_dir)
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk import ngrams

import pandas as pd
import numpy as np
from tqdm import tqdm
from joblib import Parallel, delayed
import plotly.express as px
from sklearn.model_selection import GroupKFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import lightgbm as lgb
from math import sqrt

from functools import partial
from pathlib import Path
from pprint import pprint
from typing import Optional, Union, List, Tuple, Dict

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chris/repos/student-summary-
[nltk_data]     evaluation/notebooks/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/chris/repos/student-
[nltk_data]     summary-evaluation/notebooks/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/chris/repos/student-
[nltk_data]     summary-evaluation/notebooks/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/chris/repos/student-summary-
[nltk_data]     evaluation/notebooks/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/chris/repos/student-summary-
[nltk_data]     evaluation/notebooks/nltk_data...
[nltk_data]   Package univers

In [2]:
# Shared NLP helpers: both are read as globals by `clear_stopwords`.
stop_words = set(stopwords.words('english'))
lemmatiser = WordNetLemmatizer()

# All input files are expected in `../data`, relative to the notebook's working directory.
data_dir = Path('../data')

sample_submission = data_dir / 'sample_submission.csv'
summaries_train = data_dir / 'summaries_train.csv'
summaries_test = data_dir / 'summaries_test.csv'
prompts_train = data_dir / 'prompts_train.csv'
prompts_test = data_dir / 'prompts_test.csv'

In [3]:
def make_split(summaries_path: Path, prompts_path: Path, dtype_backend: Optional[str] = 'pyarrow') -> pd.DataFrame:
    """Load a summaries CSV and attach the matching prompt to every summary.

    Args:
        summaries_path: CSV of summaries; must contain a ``prompt_id`` column.
        prompts_path: CSV of prompts; must contain a ``prompt_id`` column.
        dtype_backend: Forwarded to ``pd.read_csv`` for both files. ``None``
            means "do not pass the option", i.e. use pandas' default backend.

    Returns:
        The inner join of summaries and prompts on ``prompt_id``.

    Raises:
        AssertionError: If the joined frame does not have exactly as many rows
            as the summaries file (e.g. a summary references an unknown prompt).
    """
    # `pd.read_csv` rejects `dtype_backend=None`, so the keyword is only
    # forwarded when the caller actually selected a backend.
    read_options = {} if dtype_backend is None else {'dtype_backend': dtype_backend}

    summaries_df = pd.read_csv(summaries_path, **read_options)
    prompts_df = pd.read_csv(prompts_path, **read_options)
    df = pd.merge(summaries_df, prompts_df, how='inner', on='prompt_id')

    # An inner join silently drops summaries without a prompt; catch that here.
    if len(df) != len(summaries_df):
        raise AssertionError('Could not match all prompt ids to a prompt')

    return df

def clear_stopwords(column: pd.Series, idx: int) -> Tuple[List[str], List[str], List[str], set]:
    """Tokenise one entry of ``column`` and derive its cleaned-up variants.

    Args:
        column: Series of raw text.
        idx: Position (``iloc``) of the entry to process.

    Returns:
        A 4-tuple ``(tokens, cleared_stopwords, lemmas, bigram)``:
        lower-cased alphanumeric tokens; those tokens without the entries of
        the module-level ``stop_words``; their lemmas from the module-level
        ``lemmatiser``; and the set of distinct consecutive lemma pairs.
    """
    # `isalnum` drops punctuation-only tokens before lower-casing.
    tokens = [tok.lower() for tok in word_tokenize(column.iloc[idx]) if tok.isalnum()]
    cleared_stopwords = [tok for tok in tokens if tok not in stop_words]
    lemmas = [lemmatiser.lemmatize(tok) for tok in cleared_stopwords]
    # A set, so repeated bigrams are counted once.
    bigram = set(ngrams(lemmas, 2))

    return tokens, cleared_stopwords, lemmas, bigram

def nlp_splits(df: pd.DataFrame, column: str, n_jobs: int = 4) -> None:
    """Add token, stop-word-free, lemma and bigram columns for one text column.

    Runs ``clear_stopwords`` on every row of ``df[column]`` through joblib and
    writes the four results back to ``df`` **in place** as
    ``{column}_tokens``, ``{column}_no_stopwords``, ``{column}_lemmas`` and
    ``{column}_bigram``.

    Args:
        df: Frame to read from and to extend.
        column: Name of the raw text column.
        n_jobs: Number of joblib workers (defaults to the previous fixed value of 4).
    """
    texts = df[column]
    output = Parallel(n_jobs=n_jobs)(delayed(clear_stopwords)(texts, idx) for idx in range(len(df)))

    # Each element of `output` is the 4-tuple returned by `clear_stopwords`.
    df[f'{column}_tokens'] = [part[0] for part in output]
    df[f'{column}_no_stopwords'] = [part[1] for part in output]
    df[f'{column}_lemmas'] = [part[2] for part in output]
    df[f'{column}_bigram'] = [part[3] for part in output]

In [4]:
# Load the training summaries joined with their prompts
df = make_split(summaries_train, prompts_train)

# Tokenise / lemmatise every text column and build its bigram set.
# `nlp_splits` adds the derived columns to `df` in place; the number of distinct
# bigrams per row is then stored as `{column}_unique_bigrams`.
text_columns = ['prompt_title', 'prompt_question', 'prompt_text', 'text']
for column in tqdm(text_columns):
    nlp_splits(df, column)
    df[f'{column}_unique_bigrams'] = df[f'{column}_bigram'].str.len()

100%|██████████| 4/4 [01:50<00:00, 27.71s/it]


In [5]:
df_train = df.copy(deep=True)


def _bigram_set_ratio(frame: pd.DataFrame, prompt_column: str, set_size) -> pd.Series:
    """Combine a prompt bigram set with the summary bigram set, row by row.

    ``set_size(prompt_bigrams, text_bigrams)`` is evaluated for every row and
    the result is divided by that row's ``text_unique_bigrams``.

    The two columns are zipped directly instead of using
    ``DataFrame.apply(axis=1)`` with ``row[0]`` / ``row[1]``: integer keys on a
    label-indexed Series rely on the deprecated positional fallback of
    ``Series.__getitem__``.
    """
    sizes = pd.Series(
        [set_size(prompt_bigrams, text_bigrams)
         for prompt_bigrams, text_bigrams in zip(frame[prompt_column], frame['text_bigram'])],
        index=frame.index,
        dtype='int64',
    )
    return sizes / frame['text_unique_bigrams']


def _n_shared(prompt_bigrams, text_bigrams):
    """Number of bigrams present in both the prompt and the summary."""
    return len(prompt_bigrams & text_bigrams)


def _n_text_only(prompt_bigrams, text_bigrams):
    """Number of summary bigrams that do not occur in the prompt."""
    return len(text_bigrams - prompt_bigrams)


def _n_exclusive(prompt_bigrams, text_bigrams):
    """Number of bigrams that occur in exactly one of prompt and summary."""
    return len(prompt_bigrams ^ text_bigrams)


# Create n-gram based features (all normalised by the summary's unique bigram count)
df_train['text_bigram_overlap'] = _bigram_set_ratio(df_train, 'prompt_text_bigram', _n_shared)
df_train['question_bigram_overlap'] = _bigram_set_ratio(df_train, 'prompt_question_bigram', _n_shared)
df_train['text_bigram_ratio'] = df_train['text_unique_bigrams'] / (df_train['prompt_text_unique_bigrams'])

df_train['text_bigram_diff'] = _bigram_set_ratio(df_train, 'prompt_text_bigram', _n_text_only)
df_train['question_bigram_diff'] = _bigram_set_ratio(df_train, 'prompt_question_bigram', _n_text_only)

df_train['text_bigram_exclusive'] = _bigram_set_ratio(df_train, 'prompt_text_bigram', _n_exclusive)
df_train['question_bigram_exclusive'] = _bigram_set_ratio(df_train, 'prompt_question_bigram', _n_exclusive)

In [6]:
# Size features on the summary's lemmas (i.e. after stop-word removal):
# total lemma count, distinct lemma count, and their ratio.
df_train['n_words'] = df_train.text_lemmas.str.len()
df_train['unique_words'] = df_train.text_lemmas.apply(set).str.len()
# NOTE(review): a summary with no lemmas divides 0 by 0 here -- confirm how
# downstream code should treat that row.
df_train['unique_ratio'] = df_train.unique_words / df_train.n_words

In [7]:
# Character length of every lemma in the summary, and the mean of those lengths.
df_train['word_lengths'] = df_train.text_lemmas.apply(lambda x: [len(y) for y in x])
df_train['word_len_avg'] = df_train.word_lengths.apply(np.mean)

In [8]:
# 10th and 90th percentile of the per-summary lemma lengths.
df_train['word_len_q10'] = df_train.word_lengths.apply(partial(np.percentile, q=10))
df_train['word_len_q90'] = df_train.word_lengths.apply(partial(np.percentile, q=90))

In [9]:
# Exploratory check on one summary: tag the lemmas of the row labelled 0 and
# tally how often each universal POS tag occurs.
x = pos_tag(df_train.text_lemmas[0], tagset='universal')
# NOTE: within the visible notebook this is the only import of `defaultdict`;
# the `pos_counts` function defined in a later cell uses it, so this cell has
# to run first.
from collections import defaultdict

dd = defaultdict(lambda: 0)
for _, pos in x:
    dd[pos] += 1

In [10]:
# POS-tag the lemmas of every summary using the universal tagset.
df_train['pos'] = df_train.text_lemmas.apply(partial(pos_tag, tagset='universal'))

In [11]:
def pos_counts(tags):
    """Count how often each part-of-speech tag occurs.

    Args:
        tags: Iterable of ``(token, tag)`` pairs, e.g. the output of ``pos_tag``.

    Returns:
        A ``defaultdict`` mapping each tag to its number of occurrences;
        looking up a tag that never occurred yields 0.
    """
    # `int` (not a lambda) as the default factory keeps the result picklable,
    # so a frame holding these mappings can be pickled or sent to joblib workers.
    counts = defaultdict(int)
    for _, pos in tags:
        counts[pos] += 1
    return counts

# One tag -> count mapping per summary.
df_train['pos_counts'] = df_train.pos.apply(pos_counts)

In [12]:
# Pull individual tag counts out of the per-row mappings. Rows in which a tag
# does not occur yield a missing value, which is replaced by 0.
df_train['verb_count'] = df_train.pos_counts.str['VERB'].replace(np.nan, 0)
df_train['noun_count'] = df_train.pos_counts.str['NOUN'].replace(np.nan, 0)
df_train['adv_count'] = df_train.pos_counts.str['ADV'].replace(np.nan, 0)
df_train['adj_count'] = df_train.pos_counts.str['ADJ'].replace(np.nan, 0)
df_train['det_count'] = df_train.pos_counts.str['DET'].replace(np.nan, 0)

In [13]:
# Pairwise correlation between the POS count features.
df_train[['verb_count','noun_count','adv_count','adj_count','det_count']].corr()

Unnamed: 0,verb_count,noun_count,adv_count,adj_count,det_count
verb_count,1.0,0.805601,0.650115,0.642748,0.196229
noun_count,0.805601,1.0,0.566242,0.848588,0.197472
adv_count,0.650115,0.566242,1.0,0.502692,0.156091
adj_count,0.642748,0.848588,0.502692,1.0,0.132898
det_count,0.196229,0.197472,0.156091,0.132898,1.0


In [14]:
# Every numeric column is a candidate feature, except the two score columns,
# which are the prediction targets.
numeric_features = df_train.select_dtypes(include=np.number)
target_columns = ['content', 'wording']
feature_columns = [col for col in numeric_features if col not in target_columns]

targets = numeric_features[target_columns]
features = numeric_features[feature_columns]
# Grouping key for cross-validation. Built from `df` rather than `df_train`;
# `df_train` was created as a deep copy of `df`, so rows line up as long as
# neither frame has been reordered since.
prompt_group = pd.Categorical(df['prompt_title'])

In [15]:
def calculate_errors(y, y_pred):
    """Score predictions against the true values.

    Returns:
        A dict with the keys ``r2``, ``rmse`` and ``mae``.
    """
    scores = {}
    scores['r2'] = r2_score(y, y_pred)
    # RMSE is taken as the square root of sklearn's mean squared error.
    scores['rmse'] = sqrt(mean_squared_error(y, y_pred))
    scores['mae'] = mean_absolute_error(y, y_pred)
    return scores


def train_lgb_kfold(
        target: str,
        prompt_group: Union[pd.Categorical, pd.Series],
        features: pd.DataFrame,
        targets: pd.DataFrame,
        feature_names: List[str],
        model_params: dict) -> Tuple[pd.DataFrame, lgb.Booster]:
    """Cross-validate a LightGBM model for one target, holding out one group per fold.

    ``GroupKFold`` is configured with as many splits as there are distinct
    values in ``prompt_group``, so every fold validates on a single group.

    Args:
        target: Column of ``targets`` to predict.
        prompt_group: Group label per row, aligned with ``features``.
        features: Frame holding (at least) the columns in ``feature_names``.
        targets: Frame holding (at least) the ``target`` column.
        feature_names: Feature columns to train on.
        model_params: Parameters passed unchanged to ``lgb.train``.

    Returns:
        ``(metric_df, bst)``. ``metric_df`` stacks the ``describe()`` summaries
        of the per-fold r2/rmse/mae for the training and validation parts,
        labelled by a ``set`` column (``'train'`` / ``'val'``). ``bst`` is the
        booster trained in the **last** fold only.
    """
    n_groups = len(prompt_group.unique())
    group_kfold = GroupKFold(n_splits=n_groups)

    X_all = features[feature_names]
    y_all = targets[target]

    train_errors, val_errors = [], []
    for train_index, test_index in group_kfold.split(features, targets, prompt_group):
        # Training data is converted to numpy-nullable dtypes before being
        # handed to `lgb.Dataset`.
        X_train = X_all.iloc[train_index].convert_dtypes(dtype_backend='numpy_nullable')
        y_train = y_all.iloc[train_index].convert_dtypes(dtype_backend='numpy_nullable')

        X_val = X_all.iloc[test_index]
        y_val = y_all.iloc[test_index]

        bst = lgb.train(model_params, lgb.Dataset(X_train, label=y_train))

        train_errors.append(calculate_errors(y_train, bst.predict(X_train)))
        val_errors.append(calculate_errors(y_val, bst.predict(X_val)))

    def summarise(errors, set_name):
        """Describe the per-fold metrics and tag them with the data split."""
        summary = pd.DataFrame.from_records(errors).describe()
        summary['set'] = set_name
        return summary

    metric_df = pd.concat([summarise(train_errors, 'train'), summarise(val_errors, 'val')])

    return metric_df, bst

def train_lgb(
        target: str,
        prompt_group: Union[pd.Categorical, pd.Series],
        features: pd.DataFrame,
        targets: pd.DataFrame,
        feature_names: List[str],
        model_params: dict) -> Tuple[pd.DataFrame, lgb.Booster]:
    """Train a LightGBM model for one target on all rows (no hold-out).

    Args:
        target: Column of ``targets`` to predict.
        prompt_group: Unused; accepted so the signature mirrors ``train_lgb_kfold``.
        features: Frame holding (at least) the columns in ``feature_names``.
        targets: Frame holding (at least) the ``target`` column.
        feature_names: Feature columns to train on.
        model_params: Parameters passed unchanged to ``lgb.train``.

    Returns:
        ``(train_metrics, bst)``: a one-row frame with r2/rmse/mae measured on
        the **training data itself**, and the trained booster.
    """
    # Same dtype conversion as the training folds in `train_lgb_kfold`.
    X_train = features[feature_names].convert_dtypes(dtype_backend='numpy_nullable')
    y_train = targets[target].convert_dtypes(dtype_backend='numpy_nullable')

    bst = lgb.train(model_params, lgb.Dataset(X_train, label=y_train))

    train_errors = [calculate_errors(y_train, bst.predict(X_train))]
    train_metrics = pd.DataFrame.from_records(train_errors)

    return train_metrics, bst

In [16]:
def eval_validation(f_cols, model_params):
    """Cross-validate both targets on ``f_cols`` and print a report.

    Uses the notebook-level ``prompt_group``, ``features`` and ``targets``.
    Prints the mean/std of the per-fold metrics, the train and validation
    MCRMSE (average of the mean ``content`` and ``wording`` RMSE), and the
    feature importances of the ``wording`` model returned by
    ``train_lgb_kfold``.

    Args:
        f_cols: Feature columns to train on.
        model_params: Parameters passed through to ``lgb.train``.
    """
    metric_df_content, _ = train_lgb_kfold('content', prompt_group, features, targets, f_cols, model_params)
    metric_df_wording, bst_wording = train_lgb_kfold('wording', prompt_group, features, targets, f_cols, model_params)

    metric_df_content['target'] = 'content'
    metric_df_wording['target'] = 'wording'
    metric_df = pd.concat([metric_df_content, metric_df_wording])
    metric_df = metric_df.loc[['mean', 'std']]
    print(metric_df)

    # Select the rows by their `set` / `target` labels rather than relying on
    # the position of rows inside a frame whose index labels repeat.
    fold_means = metric_df.loc[['mean']].reset_index(drop=True)

    def mean_columnwise_rmse(set_name):
        """Average the mean RMSE of both targets for one data split."""
        rmse = fold_means.loc[fold_means['set'] == set_name].set_index('target')['rmse']
        return (rmse['content'] + rmse['wording']) / 2

    train_mcrmse = mean_columnwise_rmse('train')
    val_mcrmse = mean_columnwise_rmse('val')
    print(f'\nTrain MCRMSE:\t   {train_mcrmse}')
    print(f'Validation MCRMSE: {val_mcrmse}\n')

    importance = pd.DataFrame({
        'importance': bst_wording.feature_importance(),
        'feature': bst_wording.feature_name()}).sort_values(by='importance', ascending=False)
    print(importance)

In [17]:
# List the candidate feature columns.
features.columns

Index(['prompt_title_unique_bigrams', 'prompt_question_unique_bigrams',
       'prompt_text_unique_bigrams', 'text_unique_bigrams',
       'text_bigram_overlap', 'question_bigram_overlap', 'text_bigram_ratio',
       'text_bigram_diff', 'question_bigram_diff', 'text_bigram_exclusive',
       'question_bigram_exclusive', 'n_words', 'unique_words', 'unique_ratio',
       'word_len_avg', 'word_len_q10', 'word_len_q90', 'verb_count',
       'noun_count', 'adv_count', 'adj_count', 'det_count'],
      dtype='object')

In [18]:
# LightGBM parameters shared by every experiment below; they are passed
# unchanged to `lgb.train`.
model_params = {
    'objective': 'fair', 
    'verbose': 0, 
    'force_col_wise': True,
    'learning_rate': 0.08,
    'boosting_type': 'dart',
    'num_leaves': 11,
}
# Experiment: bigram, word-count, word-length and POS-count features together.
f_cols = ['text_bigram_overlap', 'text_unique_bigrams', 'unique_ratio', 
          'n_words', 'unique_words', 'word_len_avg', 'word_len_q10', 'word_len_q90',
          'verb_count','noun_count','adv_count','adj_count','det_count']

eval_validation(f_cols, model_params)

            r2      rmse       mae    set   target
mean  0.806390  0.459261  0.358195  train  content
mean  0.759987  0.513684  0.402202    val  content
mean  0.658127  0.604183  0.476375  train  wording
mean  0.509102  0.706457  0.564051    val  wording
std   0.006303  0.017028  0.013151  train  content
std   0.015049  0.053489  0.036848    val  content
std   0.030951  0.024802  0.018259  train  wording
std   0.118705  0.122813  0.105093    val  wording

Train MCRMSE:	   0.5317219417174222
Validation MCRMSE: 0.6100704488099216

    importance              feature
0          406  text_bigram_overlap
3          154              n_words
5          103         word_len_avg
1           94  text_unique_bigrams
9           68           noun_count
2           56         unique_ratio
10          55            adv_count
4           23         unique_words
11          22            adj_count
8           13           verb_count
7            6         word_len_q90
6            0         word_len_q

In [19]:
# Experiment: only the two bigram features plus the unique-word ratio.
f_cols = ['text_bigram_overlap', 'text_unique_bigrams', 'unique_ratio']

eval_validation(f_cols, model_params)

            r2      rmse       mae    set   target
mean  0.791520  0.476565  0.370783  train  content
mean  0.756455  0.517225  0.402603    val  content
mean  0.621879  0.635708  0.500014  train  wording
mean  0.498623  0.713939  0.568026    val  wording
std   0.001926  0.013957  0.010144  train  content
std   0.017666  0.052774  0.035961    val  content
std   0.032610  0.031037  0.022116  train  wording
std   0.133426  0.135442  0.113713    val  wording

Train MCRMSE:	   0.5561367552440215
Validation MCRMSE: 0.6155821194684965

   importance              feature
0         465  text_bigram_overlap
1         427  text_unique_bigrams
2         108         unique_ratio


In [20]:
# Experiment: only the two bigram features.
f_cols = ['text_bigram_overlap', 'text_unique_bigrams']

eval_validation(f_cols, model_params)

            r2      rmse       mae    set   target
mean  0.790097  0.478199  0.371981  train  content
mean  0.758234  0.515405  0.401389    val  content
mean  0.616357  0.640318  0.503853  train  wording
mean  0.510684  0.705167  0.559819    val  wording
std   0.002559  0.014749  0.010732  train  content
std   0.017286  0.053294  0.036332    val  content
std   0.033239  0.031048  0.021957  train  wording
std   0.128460  0.131434  0.109709    val  wording

Train MCRMSE:	   0.559258697469001
Validation MCRMSE: 0.6102856988744446

   importance              feature
0         527  text_bigram_overlap
1         473  text_unique_bigrams


In [21]:
# Fit one model per target on all training rows, using whatever `f_cols` and
# `model_params` are currently in scope -- in notebook order, the two-feature
# list from the cell directly above.
metric_df_content, bst_content = train_lgb('content', prompt_group, features, targets, f_cols, model_params)
metric_df_wording, bst_wording = train_lgb('wording', prompt_group, features, targets, f_cols, model_params)

In [22]:
# Experiment: bigram, word-count and word-length features, without the POS counts.
f_cols = ['text_bigram_overlap', 'text_unique_bigrams', 'unique_ratio', 
          'n_words', 'unique_words', 'word_len_avg', 'word_len_q10', 'word_len_q90']

eval_validation(f_cols, model_params)

            r2      rmse       mae    set   target
mean  0.804425  0.461585  0.359985  train  content
mean  0.765076  0.508234  0.397803    val  content
mean  0.649260  0.612031  0.482072  train  wording
mean  0.515658  0.701880  0.559380    val  wording
std   0.006252  0.017009  0.012971  train  content
std   0.013946  0.052668  0.036411    val  content
std   0.032625  0.028631  0.021088  train  wording
std   0.117498  0.122713  0.104864    val  wording

Train MCRMSE:	   0.536808078044499
Validation MCRMSE: 0.6050568632489971

   importance              feature
0         403  text_bigram_overlap
3         218              n_words
1         137  text_unique_bigrams
5         132         word_len_avg
2          59         unique_ratio
4          39         unique_words
7          11         word_len_q90
6           1         word_len_q10


In [23]:
# Scores of the two full-data models. `train_lgb` measures them on the
# training data itself, so these are not validation scores.
print(f'\n{"-"*35}\n\tContent scores')
pprint(metric_df_content)
print(f'\n{"-"*35}\n\tWording scores')
pprint(metric_df_wording)


-----------------------------------
	Content scores
         r2      rmse      mae
0  0.785977  0.482748  0.37519

-----------------------------------
	Wording scores
         r2      rmse       mae
0  0.611153  0.646009  0.508004


In [None]:
# Average of the two full-data training RMSEs; as the cell's last expression
# the resulting Series is shown as the cell output.
print('Full data MCRMSE: ')
(metric_df_content.rmse + metric_df_wording.rmse) / 2