In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import ngrams
import string

import pandas as pd
import numpy as np
import numba
import joblib
from sklearn.ensemble import HistGradientBoostingRegressor
from joblib import Parallel, delayed
import plotly.express as px
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import GroupKFold, HalvingRandomSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import lightgbm as lgb
from math import sqrt

from pathlib import Path
from typing import Optional, Union, List, Tuple

## TODO
- Add sentence embeddings -> clustering, with the cluster ID used as a categorical feature

^ This might not actually work: the train set is smaller than our test set and has fewer unique prompts, so the clusters will not be very transferable.

In [None]:
# Shared NLP resources: built once here and reused for every text that is tokenised.
lemmatiser = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

In [None]:
# All input files live in one directory, addressed relative to the notebook.
data_dir = Path('../data')

sample_submission = data_dir.joinpath('sample_submission.csv')
summaries_train = data_dir.joinpath('summaries_train.csv')
summaries_test = data_dir.joinpath('summaries_test.csv')
prompts_train = data_dir.joinpath('prompts_train.csv')
prompts_test = data_dir.joinpath('prompts_test.csv')

def make_split(summaries_path: Path, prompts_path: Path, dtype_backend: Optional[str] = 'pyarrow') -> pd.DataFrame:
    """Load the summaries and prompts CSVs and join them on ``prompt_id``.

    Args:
        summaries_path: CSV of summaries; must contain a ``prompt_id`` column.
        prompts_path: CSV of prompts; must contain a ``prompt_id`` column.
        dtype_backend: Forwarded to ``pd.read_csv`` for both files. ``None``
            selects the pandas default backend.

    Returns:
        The inner join of both files, with the same number of rows as the
        summaries file.

    Raises:
        AssertionError: if the join has fewer rows than the summaries file
            (some prompt ids had no prompt) or more rows (a prompt id matched
            several prompts).
    """
    # read_csv rejects an explicit dtype_backend=None, so the keyword is only
    # passed when the caller actually chose a backend.
    read_kwargs = {} if dtype_backend is None else {'dtype_backend': dtype_backend}
    summaries_df = pd.read_csv(summaries_path, **read_kwargs)
    prompts_df = pd.read_csv(prompts_path, **read_kwargs)
    df = pd.merge(summaries_df, prompts_df, how='inner', on='prompt_id')

    # An inner join drops unmatched summaries and duplicates summaries whose
    # prompt id appears more than once; report the two causes separately.
    if len(df) < len(summaries_df):
        raise AssertionError('Could not match all prompt ids to a prompt')
    if len(df) > len(summaries_df):
        raise AssertionError('Some prompt ids match more than one prompt')

    return df

# Training frame: every training summary joined with its prompt.
df = make_split(summaries_path=summaries_train, prompts_path=prompts_train)

In [None]:
# Peek at a single joined row to check the available columns.
df.head(n=1)

In [None]:
def _extract_text_features(text: str) -> Tuple[List[str], List[str], List[str], set, set, set]:
    """Tokenise one text and derive its stop-word-free lemmas and n-gram sets.

    Returns, in order: the lower-cased alphanumeric tokens, those tokens with
    stop words removed, the lemmas of the remaining tokens, and the sets of
    distinct lemma bigrams, trigrams and four-grams.
    """
    tokens = [tok.lower() for tok in word_tokenize(text) if tok.isalnum()]
    cleared_stopwords = [tok for tok in tokens if tok not in stop_words]
    lemmas = [lemmatiser.lemmatize(tok) for tok in cleared_stopwords]
    bigram = set(ngrams(lemmas, 2))
    tri_gram = set(ngrams(lemmas, 3))
    four_gram = set(ngrams(lemmas, 4))

    return tokens, cleared_stopwords, lemmas, bigram, tri_gram, four_gram


def clear_stopwords(column: pd.Series, idx: int) -> Tuple[List[str], List[str], List[str], set, set, set]:
    """Extract tokens, lemmas and n-gram sets for the text at position ``idx``.

    Args:
        column: Series of raw texts.
        idx: Positional (``iloc``) index of the text to process.

    Returns:
        A 6-tuple ``(tokens, no_stopwords, lemmas, bigrams, trigrams, fourgrams)``;
        the first three are lists of strings, the last three are sets of
        lemma tuples.
    """
    return _extract_text_features(column.iloc[idx])


def nlp_splits(df: pd.DataFrame, column: str, n_jobs: int = 4) -> None:
    """Add token, lemma and n-gram columns for ``column`` to ``df`` in place.

    Args:
        df: Frame holding the text column; six ``{column}_*`` columns are added.
        column: Name of the text column to process.
        n_jobs: Number of joblib workers used for the per-row extraction.
    """
    # Hand each task only its own text: passing the whole Series to every task
    # would make joblib serialise the full column over and over.
    texts = df[column].tolist()
    output = Parallel(n_jobs=n_jobs)(delayed(_extract_text_features)(text) for text in texts)

    # Suffixes are listed in the same order as the tuple returned per text.
    suffixes = ('tokens', 'no_stopwords', 'lemmas', 'bigram', 'trigram', 'fourgram')
    for position, suffix in enumerate(suffixes):
        df[f'{column}_{suffix}'] = [parts[position] for parts in output]

# Smoke test: run the extraction on the first row of the `text` column.
x = clear_stopwords(df['text'], idx=0)

In [None]:
# Tokenise every text field, then record how many distinct bigrams and
# trigrams each row contains.
text_columns = ['prompt_title', 'prompt_question', 'prompt_text', 'text']
for column in text_columns:
    nlp_splits(df, column)
    for ngram_name in ('bigram', 'trigram'):
        # The n-gram columns hold sets, so their length is the unique count.
        df[f'{column}_unique_{ngram_name}s'] = df[f'{column}_{ngram_name}'].str.len()

In [None]:
# Set comparisons between each prompt field's bigrams and the bigrams of the
# `text` column. The prompt set is always the left operand, so 'diff' counts
# prompt bigrams that do not occur in the text.
bigram_comparisons = {
    'overlap': lambda prompt_bigrams, summary_bigrams: prompt_bigrams & summary_bigrams,
    'diff': lambda prompt_bigrams, summary_bigrams: prompt_bigrams - summary_bigrams,
    'exclusive': lambda prompt_bigrams, summary_bigrams: prompt_bigrams ^ summary_bigrams,
}
prompt_bigram_columns = {
    'text': 'prompt_text_bigram',
    'question': 'prompt_question_bigram',
    'title': 'prompt_title_bigram',
}

# Comparison-first iteration keeps the new columns grouped by comparison type
# (all overlaps, then all diffs, then all exclusives).
for comparison_name, compare in bigram_comparisons.items():
    for source_name, prompt_column in prompt_bigram_columns.items():
        # Pair the two columns directly instead of a row-wise apply with
        # row[0] / row[1]: integer keys on a label-indexed Series are a
        # deprecated positional fallback in pandas.
        df[f'{source_name}_bigram_{comparison_name}'] = [
            len(compare(prompt_bigrams, summary_bigrams))
            for prompt_bigrams, summary_bigrams in zip(df[prompt_column], df['text_bigram'])
        ]

In [None]:
# Preview the engineered columns; only the first rows are shown because the
# frame now holds long token and n-gram collections in every row.
df.head()

In [None]:
# Pairwise correlations between all numeric columns (features and targets),
# rendered as a heat map.
numeric_features = df.select_dtypes(include=[np.number])
corr = numeric_features.corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
# Summary statistics for every numeric column.
(
    df
    .select_dtypes(include=[np.number])
    .describe()
)

In [None]:
# Split the numeric columns into the two regression targets and the model inputs.
target_columns = ['content', 'wording']
targets = numeric_features[target_columns]

features = numeric_features.drop(columns=target_columns)
feature_columns = list(features.columns)

# One label per row, used to keep each prompt's rows in the same CV fold.
prompt_group = pd.Categorical(df['prompt_title'])

In [None]:
# Number of distinct prompts, i.e. the number of CV groups.
len(prompt_group.unique())

In [None]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import GroupKFold, HalvingRandomSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import lightgbm as lgb
from math import sqrt

In [None]:
from sklearn.model_selection import GroupKFold

# One fold per distinct prompt, so every fold holds out a single prompt.
group_kfold = GroupKFold(n_splits=prompt_group.unique().size)
group_kfold.get_n_splits(X=features, y=targets, groups=prompt_group)

In [None]:
# Stand-alone regressor instance; train_kfold builds its own estimator from
# the parameters it is given.
model = HistGradientBoostingRegressor(
    learning_rate=0.2,
)

In [None]:
def calculate_errors(y, y_pred):
    """Score predictions against the true values.

    Returns a dict with the R² score ('r2'), the root mean squared error
    ('rmse') and the mean absolute error ('mae').
    """
    root_mean_squared = sqrt(mean_squared_error(y, y_pred))
    return dict(
        r2=r2_score(y, y_pred),
        rmse=root_mean_squared,
        mae=mean_absolute_error(y, y_pred),
    )

def train_kfold(target: str, prompt_group: pd.Categorical, features: pd.DataFrame, targets: pd.DataFrame, model_params: dict) -> pd.DataFrame:
    """Cross-validate a HistGradientBoostingRegressor with one fold per prompt.

    Args:
        target: Name of the column in ``targets`` to predict.
        prompt_group: One group label per row of ``features``; rows sharing a
            label always end up in the same fold.
        features: Model inputs, row-aligned with ``targets`` and ``prompt_group``.
        targets: Frame containing the ``target`` column.
        model_params: Keyword arguments for ``HistGradientBoostingRegressor``
            (``None`` or ``{}`` for the defaults).

    Returns:
        The ``describe()`` statistics of the per-fold 'r2', 'rmse' and 'mae'
        scores for the training and validation sides, stacked on top of each
        other with a 'set' column ('train' / 'val') telling them apart. The
        index labels (e.g. 'mean', 'std') therefore appear once per set.
    """
    # Work on a plain array of labels so the held-out prompt can be looked up
    # positionally from the argument itself rather than from a global frame.
    group_labels = np.asarray(prompt_group)
    group_kfold = GroupKFold(n_splits=len(pd.unique(group_labels)))

    train_errors, val_errors = [], []
    for i, (train_index, test_index) in enumerate(group_kfold.split(features, targets, group_labels)):
        print(f'Fold {i}')
        print(f'\tTest prompt: {pd.unique(group_labels[test_index]).tolist()}')

        X_train = features.iloc[train_index]
        y_train = targets.iloc[train_index][target]

        X_val = features.iloc[test_index]
        y_val = targets.iloc[test_index][target]

        # Fresh estimator per fold so no fitted state can carry over between
        # folds, whatever parameters the caller supplies.
        model = HistGradientBoostingRegressor(**(model_params or {}))
        model.fit(X_train, y_train)
        train_errors.append(calculate_errors(y_train, model.predict(X_train)))
        val_errors.append(calculate_errors(y_val, model.predict(X_val)))

    train_metrics = pd.DataFrame.from_records(train_errors).describe()
    train_metrics['set'] = 'train'
    val_metrics = pd.DataFrame.from_records(val_errors).describe()
    val_metrics['set'] = 'val'
    metric_df = pd.concat([train_metrics, val_metrics])

    return metric_df

In [None]:
# Leave-one-prompt-out scores for the 'content' target with default model parameters.
metric_df = train_kfold(
    target='content',
    prompt_group=prompt_group,
    features=features,
    targets=targets,
    model_params={},
)
metric_df.loc[['mean', 'std']]

In [None]:
# Leave-one-prompt-out scores for the 'wording' target with default model parameters.
metric_df = train_kfold(
    target='wording',
    prompt_group=prompt_group,
    features=features,
    targets=targets,
    model_params={},
)
metric_df.loc[['mean', 'std']]