In [1]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import ngrams

import pandas as pd
import numpy as np
from joblib import Parallel, delayed
import plotly.express as px
from sklearn.model_selection import GroupKFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import lightgbm as lgb
from math import sqrt

from pathlib import Path
from typing import Optional, Union, List, Tuple, Dict

[nltk_data] Downloading package stopwords to /home/chris/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/chris/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/chris/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
stop_words = set(stopwords.words('english'))
lemmatiser = WordNetLemmatizer()

In [3]:
data_dir = Path('../data')

sample_submission = data_dir / 'sample_submission.csv'
summaries_train = data_dir / 'summaries_train.csv'
summaries_test = data_dir / 'summaries_test.csv'
prompts_train = data_dir / 'prompts_train.csv'
prompts_test = data_dir / 'prompts_test.csv'

def make_split(summaries_path: Path, prompts_path: Path, dtype_backend: Optional[str] = 'pyarrow') -> pd.DataFrame:
    summaries_df = pd.read_csv(summaries_path, dtype_backend=dtype_backend)
    prompts_df = pd.read_csv(prompts_path, dtype_backend=dtype_backend)
    df = pd.merge(summaries_df, prompts_df, how='inner', on='prompt_id')

    if len(df) != len(summaries_df):
        raise AssertionError('Could not match all prompt ids to a prompt')
    
    return df

df = make_split(summaries_train, prompts_train)

In [4]:
df.head(1)

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background The Third Wave experiment took pl...


In [5]:
def clear_stopwords(column: pd.Series, idx: int) -> Union[List[str], List[str], List[str], List[str], List[str]]:
    tokens = [tok.lower() for tok in word_tokenize(column.iloc[idx]) if tok.isalnum()]
    cleared_stopwords = [tok for tok in tokens if tok not in stop_words]
    lemmas = [lemmatiser.lemmatize(tok) for tok in cleared_stopwords]
    bigram = set(ngrams(lemmas, 2))
    # tri_gram = set(ngrams(lemmas, 3))
    # four_gram = set(ngrams(lemmas, 4))

    return tokens, cleared_stopwords, lemmas, bigram #, tri_gram, four_gram

def nlp_splits(df: pd.DataFrame, column: str) -> None:
    output = Parallel(n_jobs=4, backend='multiprocessing')(delayed(clear_stopwords)(df[column], idx) for idx in range(len(df)))

    df[f'{column}_tokens'] = [part[0] for part in output]
    df[f'{column}_no_stopwords'] = [part[1] for part in output]
    df[f'{column}_lemmas'] = [part[2] for part in output]
    df[f'{column}_bigram'] = [part[3] for part in output]

x = clear_stopwords(df.text, 0)

In [6]:
text_columns = ['prompt_title', 'prompt_question', 'prompt_text', 'text']
for column in text_columns:
    nlp_splits(df, column)
    df[f'{column}_unique_bigrams'] = df[f'{column}_bigram'].str.len()

In [7]:
df_train = df.copy(deep=True)

df_train['text_bigram_overlap'] = df_train[['prompt_text_bigram', 'text_bigram']].apply(lambda row: len(row[0] & row[1]), axis=1) / df_train.text_unique_bigrams
df_train['question_bigram_overlap'] = df_train[['prompt_question_bigram', 'text_bigram']].apply(lambda row: len(row[0] & row[1]), axis=1) / df_train.text_unique_bigrams
df_train['text_bigram_ratio'] = df_train['text_unique_bigrams'] / (df_train['prompt_text_unique_bigrams'])

df_train['text_bigram_diff'] = df_train[['prompt_text_bigram', 'text_bigram']].apply(lambda row: len(row[1] - row[0]), axis=1) / df_train.text_unique_bigrams
df_train['question_bigram_diff'] = df_train[['prompt_question_bigram', 'text_bigram']].apply(lambda row: len(row[1] - row[0]), axis=1) / df_train.text_unique_bigrams

df_train['text_bigram_exclusive'] = df_train[['prompt_text_bigram', 'text_bigram']].apply(lambda row: len(row[0] ^ row[1]), axis=1) / ((df_train.text_unique_bigrams + df_train.prompt_text_unique_bigrams) / 2)
df_train['question_bigram_exclusive'] = df_train[['prompt_question_bigram', 'text_bigram']].apply(lambda row: len(row[0] ^ row[1]), axis=1) / ((df_train.text_unique_bigrams + df_train.prompt_text_unique_bigrams) / 2)

In [8]:
numeric_features = df_train.select_dtypes(include=np.number)
corr = numeric_features.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,content,wording,prompt_title_unique_bigrams,prompt_question_unique_bigrams,prompt_text_unique_bigrams,text_unique_bigrams,text_bigram_overlap,question_bigram_overlap,text_bigram_ratio,text_bigram_diff,question_bigram_diff,text_bigram_exclusive,question_bigram_exclusive
content,1.0,0.75138,0.052423,0.026536,-0.03023,0.793555,-0.011486,-0.090704,0.795084,0.011486,0.090704,-0.399939,0.779024
wording,0.75138,1.0,0.026611,-0.056031,-0.125305,0.529123,-0.3199,-0.028342,0.559532,0.3199,0.028342,-0.014476,0.542503
prompt_title_unique_bigrams,0.052423,0.026611,1.0,0.940945,0.198507,0.162399,0.125865,-0.014407,0.114988,-0.125865,0.014407,-0.195834,0.251099
prompt_question_unique_bigrams,0.026536,-0.056031,0.940945,1.0,0.261953,0.162973,0.178714,-0.073393,0.101703,-0.178714,0.073393,-0.218553,0.251833
prompt_text_unique_bigrams,-0.03023,-0.125305,0.198507,0.261953,1.0,0.087776,0.277909,0.015343,-0.141215,-0.277909,-0.015343,-0.108869,-0.184709
text_unique_bigrams,0.793555,0.529123,0.162399,0.162973,0.087776,1.0,0.186255,-0.141032,0.961015,-0.186255,0.141032,-0.675624,0.918799
text_bigram_overlap,-0.011486,-0.3199,0.125865,0.178714,0.277909,0.186255,1.0,-0.157848,0.117397,-1.0,0.157848,-0.72732,0.137636
question_bigram_overlap,-0.090704,-0.028342,-0.014407,-0.073393,0.015343,-0.141032,-0.157848,1.0,-0.146373,0.157848,-1.0,0.160251,-0.296151
text_bigram_ratio,0.795084,0.559532,0.114988,0.101703,-0.141215,0.961015,0.117397,-0.146373,1.0,-0.117397,0.146373,-0.631491,0.962103
text_bigram_diff,0.011486,0.3199,-0.125865,-0.178714,-0.277909,-0.186255,-1.0,0.157848,-0.117397,1.0,-0.157848,0.72732,-0.137636


In [9]:
df_train.select_dtypes(include=np.number).describe()

Unnamed: 0,content,wording,prompt_title_unique_bigrams,prompt_question_unique_bigrams,prompt_text_unique_bigrams,text_unique_bigrams,text_bigram_overlap,question_bigram_overlap,text_bigram_ratio,text_bigram_diff,question_bigram_diff,text_bigram_exclusive,question_bigram_exclusive
count,7165.0,7165.0,7165.0,7165.0,7165.0,7165.0,7165.0,7165.0,7165.0,7165.0,7165.0,7165.0,7165.0
mean,-0.014853,-0.063072,0.993301,10.511375,323.413957,34.973761,0.25657,0.045554,0.11037,0.74343,0.954446,1.895557,0.234856
std,1.043569,1.036048,0.753336,3.198101,62.4909,25.55395,0.245381,0.064807,0.081772,0.245381,0.064807,0.130174,0.114135
min,-1.729859,-1.962614,0.0,7.0,268.0,6.0,0.0,0.0,0.016588,0.0,0.4,0.424,0.039867
25%,-0.799545,-0.87272,0.0,7.0,268.0,18.0,0.0625,0.0,0.058419,0.608247,0.933333,1.855626,0.155477
50%,-0.093814,-0.081769,1.0,11.0,300.0,27.0,0.177419,0.02439,0.085911,0.822581,0.97561,1.944637,0.206186
75%,0.49966,0.503833,2.0,15.0,422.0,43.0,0.391753,0.066667,0.134021,0.9375,1.0,1.985765,0.281437
max,3.900326,4.310693,2.0,15.0,422.0,325.0,1.0,0.6,1.093284,1.0,1.0,2.0,1.088


In [10]:
target_columns = ['content', 'wording']
feature_columns = [col for col in numeric_features if col not in target_columns]

targets = numeric_features[target_columns]
features = numeric_features[feature_columns]
prompt_group = pd.Categorical(df['prompt_title'])

In [11]:
def calculate_errors(y, y_pred):
    return {
        'r2': r2_score(y, y_pred),
        'rmse': sqrt(mean_squared_error(y, y_pred)),
        'mae': mean_absolute_error(y, y_pred)
    }

In [12]:
def train_lgb(
        target: str, 
        prompt_group: pd.DataFrame, 
        features: pd.DataFrame, 
        targets: pd.DataFrame, 
        feature_names: List[str],
        model_params: dict) -> Tuple[pd.DataFrame, lgb.LGBMRegressor]:
    
    group_kfold = GroupKFold(n_splits=prompt_group.unique().size)
    assert group_kfold.get_n_splits(features, targets, prompt_group) == len(prompt_group.unique())

    
    train_errors, val_errors = [], []
    for i, (train_index, test_index) in enumerate(group_kfold.split(features, targets, prompt_group)):
        # print(f'Fold {i}')
        # print(f'\tTest prompt: {df.iloc[test_index].prompt_title.unique().tolist()}')

        X_train = features[feature_names].iloc[train_index].convert_dtypes(dtype_backend='numpy_nullable')
        y_train = targets.iloc[train_index][target].convert_dtypes(dtype_backend='numpy_nullable')

        X_val = features[feature_names].iloc[test_index]
        y_val = targets.iloc[test_index][target]

        train_data = lgb.Dataset(X_train, label=y_train)
        val_data = lgb.Dataset(X_val, y_val)
        bst = lgb.train(model_params, train_data, )#, feval=[r2_score, mean_absolute_error])

        train_errors.append(calculate_errors(y_train, bst.predict(X_train)))
        val_errors.append(calculate_errors(y_val, bst.predict(X_val)))

    train_metrics = pd.DataFrame.from_records(train_errors).describe()
    train_metrics['set'] = 'train'
    val_metrics = pd.DataFrame.from_records(val_errors).describe()
    val_metrics['set'] = 'val'
    metric_df = pd.concat([train_metrics, val_metrics])

    return metric_df, bst

In [13]:
def train(f_cols):
    
    model_params = {
        'objective': 'fair', 
        'verbose': 0, 
        'force_col_wise': True,
        'learning_rate': 0.08,
        'boosting_type': 'dart',
        'num_leaves': 11,
    }

    metric_df_content, bst_content = train_lgb('content', prompt_group, features, targets, f_cols, model_params)
    metric_df_wording, bst_wording = train_lgb('wording', prompt_group, features, targets, f_cols, model_params)

    metric_df_content['target'] = 'content'
    metric_df_wording['target'] = 'wording'
    metric_df = pd.concat([metric_df_content, metric_df_wording])
    metric_df = metric_df.loc[['mean', 'std']]
    print(metric_df)

    mcrmse = (metric_df.loc[metric_df.target=='content', 'rmse'] + metric_df.loc[metric_df.target=='wording', 'rmse']) / 2
    print(f'\nMCRMSE: {mcrmse.iloc[1]}\n')

    importance = pd.DataFrame({
    'importance': bst_wording.feature_importance(),
    'feature': bst_wording.feature_name()}).sort_values(by='importance', ascending=False)
    print(importance)

In [14]:
train(feature_columns)

            r2      rmse       mae    set   target
mean  0.806634  0.458984  0.357693  train  content
mean  0.760823  0.512585  0.401044    val  content
mean  0.661757  0.600901  0.473157  train  wording
mean  0.438680  0.758031  0.606133    val  wording
std   0.004038  0.015497  0.011840  train  content
std   0.023556  0.057368  0.040232    val  content
std   0.029967  0.020707  0.015021  train  wording
std   0.066966  0.093496  0.079438    val  wording

MCRMSE: 0.635307968278987

    importance                         feature
3          270             text_unique_bigrams
4          252             text_bigram_overlap
6          135               text_bigram_ratio
7          117                text_bigram_diff
9           85           text_bigram_exclusive
10          67       question_bigram_exclusive
0           44     prompt_title_unique_bigrams
5           23         question_bigram_overlap
8            5            question_bigram_diff
2            2      prompt_text_unique_bigr

In [15]:
train(['text_bigram_overlap', 'text_unique_bigrams'])

            r2      rmse       mae    set   target
mean  0.790097  0.478199  0.371981  train  content
mean  0.758234  0.515405  0.401389    val  content
mean  0.616357  0.640318  0.503853  train  wording
mean  0.510684  0.705167  0.559819    val  wording
std   0.002559  0.014749  0.010732  train  content
std   0.017286  0.053294  0.036332    val  content
std   0.033239  0.031048  0.021957  train  wording
std   0.128460  0.131434  0.109709    val  wording

MCRMSE: 0.6102856988744446

   importance              feature
0         527  text_bigram_overlap
1         473  text_unique_bigrams


In [16]:
train(['text_bigram_exclusive', 'text_bigram_overlap', 'text_unique_bigrams'])

            r2      rmse       mae    set   target
mean  0.792974  0.474924  0.369432  train  content
mean  0.759408  0.514202  0.400492    val  content
mean  0.625913  0.632181  0.496424  train  wording
mean  0.502043  0.711969  0.564591    val  wording
std   0.003546  0.015729  0.011344  train  content
std   0.019022  0.055204  0.037566    val  content
std   0.033135  0.029229  0.019858  train  wording
std   0.121501  0.127058  0.106459    val  wording

MCRMSE: 0.613085594970603

   importance                feature
1         481    text_bigram_overlap
2         442    text_unique_bigrams
0          77  text_bigram_exclusive


In summary, it looks like adding more n-gram based features does improve train-set performance.  However due to low variance in the train set, any features that encode information based on the prompt question, title or text lead to heavy overfitting, so the best validation performance so far has been seen using only the n_unique_bigrams and the size of the intersection of bigrams in the summary and original text.