## 1 - Imports and nltk downloads

In [1]:
import os
download_dir = f'{os.getcwd()}/nltk_data'
os.environ['NLTK_DATA'] = download_dir

import nltk
nltk.download('stopwords', download_dir=download_dir)
nltk.download('punkt', download_dir=download_dir)
nltk.download('wordnet', download_dir=download_dir)
nltk.download('averaged_perceptron_tagger', download_dir=download_dir)
nltk.download('universal_tagset', download_dir=download_dir)
nltk.download('words', download_dir=download_dir)
nltk.data.path.append(download_dir)
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords, words
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk import ngrams
import spellwise
from spellwise import Levenshtein

import datasets
import pandas as pd
import numpy as np
from tqdm import tqdm
from joblib import Parallel, delayed
import plotly.express as px
from sklearn.model_selection import GroupKFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import optuna
import lightgbm as lgb
from math import sqrt

from functools import partial, reduce
from operator import or_
from pathlib import Path
from pprint import pprint
import random
from typing import Optional, Union, List, Tuple, Dict, Set, Any

seed = 42
random.seed(seed)
np.random.seed(seed)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chris/repos/student-summary-
[nltk_data]     evaluation/notebooks/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/chris/repos/student-
[nltk_data]     summary-evaluation/notebooks/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/chris/repos/student-
[nltk_data]     summary-evaluation/notebooks/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/chris/repos/student-summary-
[nltk_data]     evaluation/notebooks/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/chris/repos/student-summary-
[nltk_data]     evaluation/notebooks/nltk_data...
[nltk_data]   Package univers

## 2 - Fix nltk installation

In [2]:
%%sh

yes | unzip -q nltk_data/corpora/wordnet.zip -d nltk_data/corpora/
ls nltk_data/corpora/

[1m[36mbrown[m[m
brown.zip
[1m[36mstopwords[m[m
stopwords.zip
[1m[36mwordnet[m[m
wordnet.zip
[1m[36mwords[m[m
words.zip


replace nltk_data/corpora/wordnet/lexnames? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/data.verb? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/index.adv? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/adv.exc? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/index.verb? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/cntlist.rev? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/data.adj? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/index.adj? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/LICENSE? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/citation.bib? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/noun.exc? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/verb.exc? [y]es, [n]o, [A]ll, [N]one, [r]ename: replac

## 3 - Set data paths

In [3]:
stop_words = set(stopwords.words('english'))
lemmatiser = WordNetLemmatizer()
tqdm.pandas()

data_dir = Path('../data/commonlit-evaluate-student-summaries')

sample_submission = data_dir / 'sample_submission.csv'
summaries_train = data_dir / 'summaries_train.csv'
summaries_test = data_dir / 'summaries_test.csv'
prompts_train = data_dir / 'prompts_train.csv'
prompts_test = data_dir / 'prompts_test.csv'

content_model = 'content.txt'
wording_model = 'wording.txt'

## 4 - Preprocessing functions

In [4]:
def predict(model: lgb.Booster, df: pd.DataFrame, features: List[str]) -> pd.Series:
    return model.predict(df[features])

In [5]:
def text_tokenize(text: str) -> List[str]:
    return [lemmatiser.lemmatize(tok.lower()) for tok in word_tokenize(text) if tok.isalnum() and tok not in stop_words]

        
def make_bigram(tokens: List[str]) -> Set[str]:
    if type(tokens) != list:
        tokens = tokens.tolist()
    return set(ngrams(tokens, 2))


def tokenize(row: Dict[str, Any]) -> List[str]:
    for col in ['prompt_text', 'prompt_title', 'prompt_question', 'prompt_text']:
        row[f'{col}_lemmas'] = lemmas = text_tokenize(row[col])
        row[f'{col}_bigram'] = make_bigram(lemmas)

        
def nlp_preprocess(df: pd.DataFrame, column: str):
    df[f'{column}_lemmas'] = df[column].apply(text_tokenize)
    df[f'{column}_bigram'] = df[f'{column}_lemmas'].apply(make_bigram)
    
    
def batch_tokenize(data: Dict[str, Any]) -> List[str]:
    lemmas = [text_tokenize(row) for row in data]
    return lemmas

def process_col(data: Dict[str, Any], col: str) -> List[str]:
    lemmas = [text_tokenize(row) for row in data]
    bigrams = [make_bigram(lemma) for lemma in lemmas]
    return {f'{col}_lemmas': lemmas, f'{col}_bigram': bigrams}

def make_split(summaries_path: Path, prompts_path: Path, dtype_backend: Optional[str] = 'pyarrow') -> pd.DataFrame:
    summaries_df = pd.read_csv(summaries_path)
    prompts_df = pd.read_csv(prompts_path)
    
    for column in ['prompt_title', 'prompt_question', 'prompt_text']:
        nlp_preprocess(prompts_df, column)
        prompts_df[f'{column}_unique_bigrams'] = prompts_df[f'{column}_bigram'].str.len()
    
    summaries_dataset = datasets.Dataset.from_pandas(summaries_df, preserve_index=False)
    proc_func = partial(process_col, col='text')
    summaries_df = summaries_dataset.map(function=lambda example: {**proc_func(example['text']), **example}, num_proc=os.cpu_count(), keep_in_memory=True, batched=True).to_pandas()
    summaries_df['text_bigram'] = summaries_df.text_bigram.apply(lambda row: {(x[0], x[1]) for x in row})
    summaries_df['text_unique_bigrams'] = summaries_df['text_bigram'].str.len()
    
    df = pd.merge(summaries_df, prompts_df, how='left', on='prompt_id')
    df.fillna('')
    
    return df

## 5 - Load data and preprocess

In [6]:
# Load data
# df = make_split(summaries_test, prompts_test)
df = make_split(summaries_train, prompts_train)

Map (num_proc=8):   0%|          | 0/7165 [00:00<?, ? examples/s]

## 6 - Create bigram based features

In [7]:
df_train = df.copy(deep=True)

# Create n-gram based features
df_train['text_bigram_overlap'] = df_train[['prompt_text_bigram', 'text_bigram']].apply(lambda row: len(row[0] & row[1]), axis=1) / df_train.text_unique_bigrams
df_train['question_bigram_overlap'] = df_train[['prompt_question_bigram', 'text_bigram']].apply(lambda row: len(row[0] & row[1]), axis=1) / df_train.text_unique_bigrams
df_train['text_bigram_ratio'] = df_train['text_unique_bigrams'] / (df_train['prompt_text_unique_bigrams'])

df_train['text_bigram_diff'] = df_train[['prompt_text_bigram', 'text_bigram']].apply(lambda row: len(row[1] - row[0]), axis=1) / df_train.text_unique_bigrams
df_train['question_bigram_diff'] = df_train[['prompt_question_bigram', 'text_bigram']].apply(lambda row: len(row[1] - row[0]), axis=1) / df_train.text_unique_bigrams

df_train['text_bigram_exclusive'] = df_train[['prompt_text_bigram', 'text_bigram']].apply(lambda row: len(row[0] ^ row[1]), axis=1) / df_train.text_unique_bigrams
df_train['question_bigram_exclusive'] = df_train[['prompt_question_bigram', 'text_bigram']].apply(lambda row: len(row[0] ^ row[1]), axis=1) / df_train.text_unique_bigrams

## 7 - Create word based features

In [8]:
df_train['n_words'] = df_train.text_lemmas.str.len()
df_train['unique_words'] = df_train.text_lemmas.apply(set).str.len()
df_train['unique_ratio'] = df_train.unique_words / df_train.n_words

In [9]:
df_train['word_lengths'] = df_train.text_lemmas.apply(lambda x: [len(y) for y in x])
df_train['word_len_avg'] = df_train.word_lengths.apply(np.mean)

In [10]:
df_train['word_len_q10'] = df_train.word_lengths.apply(partial(np.percentile, q=10))
df_train['word_len_q90'] = df_train.word_lengths.apply(partial(np.percentile, q=90))

In [11]:
x = pos_tag(df_train.text_lemmas[0], tagset='universal')
from collections import defaultdict

dd = defaultdict(lambda: 0)
for _, pos in x:
    dd[pos] += 1

In [12]:
df_train['pos'] = df_train.text_lemmas.apply(partial(pos_tag, tagset='universal'))

In [13]:
def pos_counts(tags):
    dd = defaultdict(lambda: 0)
    for _, pos in tags:
        dd[pos] += 1
    return dd

df_train['pos_counts'] = df_train.pos.apply(pos_counts)

In [14]:
df_train['verb_count'] = df_train.pos_counts.str['VERB'].replace(np.nan, 0)
df_train['noun_count'] = df_train.pos_counts.str['NOUN'].replace(np.nan, 0)
df_train['adv_count'] = df_train.pos_counts.str['ADV'].replace(np.nan, 0)
df_train['adj_count'] = df_train.pos_counts.str['ADJ'].replace(np.nan, 0)
df_train['det_count'] = df_train.pos_counts.str['DET'].replace(np.nan, 0)

In [15]:
df_train[['verb_count','noun_count','adv_count','adj_count','det_count']].corr()

Unnamed: 0,verb_count,noun_count,adv_count,adj_count,det_count
verb_count,1.0,0.812849,0.68108,0.64503,0.486026
noun_count,0.812849,1.0,0.603829,0.844499,0.628654
adv_count,0.68108,0.603829,1.0,0.533738,0.34228
adj_count,0.64503,0.844499,0.533738,1.0,0.566271
det_count,0.486026,0.628654,0.34228,0.566271,1.0


## 8 - Create spelling based features

In [16]:
%%sh
wget -nv https://github.com/dwyl/english-words/archive/refs/heads/master.zip -O master.zip
yes | unzip -q master.zip

2023-08-22 11:09:38 URL:https://codeload.github.com/dwyl/english-words/zip/refs/heads/master [7118481] -> "master.zip" [1]
replace english-words-master/CONTRIBUTING.md? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace english-words-master/LICENSE.md? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace english-words-master/README.md? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace english-words-master/read_english_dictionary.py? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace english-words-master/scripts/create_json.py? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace english-words-master/scripts/gen.sh? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace english-words-master/word_list_moby_README.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace english-words-master/word_list_moby_all_moby_words.icss.yaml? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace english-words-master/word_list_moby_credits.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace english-words-master/words.txt? [y]es, [n]o, [A]ll, [N]one, [r

In [17]:
with open('./english-words-master/words.txt', 'r') as f:
    en_words = [line.strip() for line in f.read().split('\n')]

en_words = set([word for word in en_words if word.isalpha()])

In [18]:
def get_unique_words(col: str) -> Set[str]:
    word_sets = df_train[col].apply(set).tolist()
    return reduce(or_, word_sets)

prompt_words = get_unique_words('prompt_text_lemmas')
question_words = get_unique_words('prompt_question_lemmas')
title_words = get_unique_words('prompt_title_lemmas')

word_set = en_words | prompt_words | question_words | title_words

In [19]:
with open('commonlit_words.txt', 'w') as f:
    f.write('\n'.join(word_set))

```
# word_dir = words.abspath('en')
# Initialise the algorithm
metric = Levenshtein()
# Index the words from a dictionary
# metric.add_from_path('./brown_words.txt')
# metric.add_from_path(words.abspath('en'))
# metric.add_from_path('./english-words-master/words.txt')
metric.add_from_path('./commonlit_words.txt')

def get_distances(tokens: List[str], metric: spellwise.algorithms.base.Base) -> List[str]:
    distances = []
    for idx, token in enumerate(tokens):
        suggestions = metric.get_suggestions(token)
        if suggestions == []:
            distance = len(token) if token.isalpha() else 0
        else:
            distance = suggestions[0]['distance']
        distances.append(distance)
    return sum(distances)


def distance_func(chunk: pd.DataFrame):
    return chunk.apply(partial(get_distances, metric=metric))

n_jobs = 6
df_chunks = np.array_split(df_train.text_lemmas, n_jobs * 2)
# total_edit_distances = Parallel(n_jobs=n_jobs, backend='loky')(delayed(distance_func)(chunk) for chunk in tqdm(df_chunks))
```

In [20]:
count_missing_words = lambda tokens: sum([word not in word_set for word in tokens if word.isalpha()])
df_train['missing_wordcount'] = df_train.text_lemmas.progress_apply(count_missing_words)

100%|██████████| 7165/7165 [00:00<00:00, 266781.97it/s]


In [21]:
numeric_features = df_train.select_dtypes(include=np.number)
target_columns = ['content', 'wording']
feature_columns = [col for col in numeric_features if col not in target_columns]

targets = numeric_features[target_columns]
features = numeric_features[feature_columns]
prompt_group = pd.Categorical(df['prompt_title'])

In [22]:
def calculate_errors(y, y_pred):
    return {
        'r2': r2_score(y, y_pred),
        'rmse': sqrt(mean_squared_error(y, y_pred)),
        'mae': mean_absolute_error(y, y_pred)
    }


def train_lgb_kfold(
        target: str, 
        prompt_group: pd.DataFrame, 
        features: pd.DataFrame, 
        targets: pd.DataFrame, 
        feature_names: List[str],
        model_params: dict) -> Tuple[pd.DataFrame, lgb.LGBMRegressor]:
    
    group_kfold = GroupKFold(n_splits=prompt_group.unique().size)
    assert group_kfold.get_n_splits(features, targets, prompt_group) == len(prompt_group.unique())

    train_errors, val_errors = [], []
    for i, (train_index, test_index) in enumerate(group_kfold.split(features, targets, prompt_group)):
        X_train = features[feature_names].iloc[train_index]
        y_train = targets.iloc[train_index][target]

        X_val = features[feature_names].iloc[test_index]
        y_val = targets.iloc[test_index][target]

        train_data = lgb.Dataset(X_train, label=y_train)
        val_data = lgb.Dataset(X_val, y_val)
        bst = lgb.train(model_params, 
                        train_set=train_data, valid_sets=(train_data, val_data), 
                        valid_names=('fit', 'val'), callbacks=[lgb.log_evaluation(100)])#, lgb.early_stopping(stopping_rounds=50)])

        train_errors.append(calculate_errors(y_train, bst.predict(X_train)))
        val_errors.append(calculate_errors(y_val, bst.predict(X_val)))

    train_metrics = pd.DataFrame.from_records(train_errors).describe()
    train_metrics['set'] = 'train'
    val_metrics = pd.DataFrame.from_records(val_errors).describe()
    val_metrics['set'] = 'val'
    metric_df = pd.concat([train_metrics, val_metrics])

    return metric_df, bst


def tune_kfold(
    trial, 
    prompt_group: pd.DataFrame, 
    features: pd.DataFrame, 
    targets: pd.DataFrame, 
    feature_names: List[str]):
    model_params = {
        'objective': 'fair', 
        'verbose': -1, 
        'force_col_wise': True,
        'boosting_type': 'dart',
        'num_leaves': trial.suggest_int('num_leaves', 5, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 0, 100),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'seed': 0
    }
    
    group_kfold = GroupKFold(n_splits=prompt_group.unique().size)
    assert group_kfold.get_n_splits(features, targets, prompt_group) == len(prompt_group.unique())
    
    val_errors, train_errors = [], []
    for target in ['content', 'wording']:
        val_errors_kf, train_errors_kf = [], []
        for i, (train_index, test_index) in enumerate(group_kfold.split(features, targets, prompt_group)):
            X_train = features[feature_names].iloc[train_index]
            y_train = targets.iloc[train_index][target]

            X_val = features[feature_names].iloc[test_index]
            y_val = targets.iloc[test_index][target]

            train_data = lgb.Dataset(X_train, label=y_train)
            val_data = lgb.Dataset(X_val, y_val)
            bst = lgb.train(model_params, 
                            train_set=train_data,valid_sets=(train_data, val_data), 
                            valid_names=('fit', 'val'))#, callbacks=[lgb.early_stopping(stopping_rounds=50)])

            y_pred = bst.predict(X_val)
            val_errors_kf.append(mean_squared_error(y_val, y_pred, squared=False))

            y_pred = bst.predict(X_train)
            train_errors_kf.append(mean_squared_error(y_train, y_pred, squared=False))

        val_avg = sum(val_errors_kf) / len(val_errors_kf)
        train_avg = sum(train_errors_kf) / len(train_errors_kf)
        val_errors.append(val_avg)
        train_errors.append(train_avg)
    
    mcrmse_val = sum(val_errors) / len(val_errors)
    mcrmse_train = sum(train_errors) / len(train_errors)
    
    return mcrmse_val, mcrmse_val - mcrmse_train
    
    
def train_lgb(
        target: str, 
        prompt_group: pd.DataFrame, 
        features: pd.DataFrame, 
        targets: pd.DataFrame, 
        feature_names: List[str],
        model_params: dict) -> Tuple[pd.DataFrame, lgb.LGBMRegressor]:
    
    
    X_train = features[feature_names]
    y_train = targets[target]


    train_data = lgb.Dataset(X_train, label=y_train)
    bst = lgb.train(model_params, train_data, )

    train_errors = [calculate_errors(y_train, bst.predict(X_train))]
    train_metrics = pd.DataFrame.from_records(train_errors)

    return train_metrics, bst

In [23]:
def eval_validation(f_cols, model_params):
    metric_df_content, bst_content = train_lgb_kfold('content', prompt_group, features, targets, f_cols, model_params)
    metric_df_wording, bst_wording = train_lgb_kfold('wording', prompt_group, features, targets, f_cols, model_params)

    metric_df_content['target'] = 'content'
    metric_df_wording['target'] = 'wording'
    metric_df = pd.concat([metric_df_content, metric_df_wording])
    metric_df = metric_df.loc[['mean', 'std']]
    print(metric_df)

    mcrmse = (metric_df.loc[metric_df.target=='content', 'rmse'] + metric_df.loc[metric_df.target=='wording', 'rmse']) / 2
    
    print(f'\nTrain MCRMSE:\t   {mcrmse.iloc[0]}')
    print(f'Validation MCRMSE: {mcrmse.iloc[1]}')
    print(f'Diff:\t {mcrmse.iloc[1]-mcrmse.iloc[0]}\n')

    importance = pd.DataFrame({
    'importance': bst_wording.feature_importance(),
    'feature': bst_wording.feature_name()}).sort_values(by='importance', ascending=False)
    print(importance)

In [24]:
features.columns

Index(['text_unique_bigrams', 'prompt_title_unique_bigrams',
       'prompt_question_unique_bigrams', 'prompt_text_unique_bigrams',
       'text_bigram_overlap', 'question_bigram_overlap', 'text_bigram_ratio',
       'text_bigram_diff', 'question_bigram_diff', 'text_bigram_exclusive',
       'question_bigram_exclusive', 'n_words', 'unique_words', 'unique_ratio',
       'word_len_avg', 'word_len_q10', 'word_len_q90', 'verb_count',
       'noun_count', 'adv_count', 'adj_count', 'det_count',
       'missing_wordcount'],
      dtype='object')

In [25]:
model_params = {
    'objective': 'fair', 
    'verbose': 0, 
    'force_col_wise': True,
    'learning_rate': 0.08,
    'boosting_type': 'dart',
    'num_leaves': 11,
    'seed': 42
}
f_cols = ['text_bigram_overlap', 'text_unique_bigrams', 'unique_ratio', 
          'n_words', 'unique_words', 'word_len_avg', 'word_len_q10', 'word_len_q90',
          'verb_count','noun_count','adv_count','adj_count','det_count', 'missing_wordcount']

eval_validation(f_cols, model_params)

[100]	fit's fair: 0.073898	val's fair: 0.0744072
[100]	fit's fair: 0.0643943	val's fair: 0.0925891
[100]	fit's fair: 0.072399	val's fair: 0.0732018
[100]	fit's fair: 0.0694643	val's fair: 0.100794
[100]	fit's fair: 0.116925	val's fair: 0.12074
[100]	fit's fair: 0.104969	val's fair: 0.140358
[100]	fit's fair: 0.120796	val's fair: 0.114311
[100]	fit's fair: 0.105359	val's fair: 0.218076
            r2      rmse       mae    set   target
mean  0.811794  0.452791  0.351500  train  content
mean  0.766295  0.506680  0.394682    val  content
mean  0.668659  0.594840  0.468061  train  wording
mean  0.521146  0.697150  0.554830    val  wording
std   0.006052  0.016114  0.012697  train  content
std   0.014283  0.049848  0.033615    val  content
std   0.030331  0.026025  0.019478  train  wording
std   0.129282  0.131706  0.108750    val  wording

Train MCRMSE:	   0.5238150404420341
Validation MCRMSE: 0.6019149121749355
Diff:	 0.07809987173290145

    importance              feature
0          425

In [26]:
f_cols = ['text_bigram_overlap', 'text_unique_bigrams', 'unique_ratio', 
          'n_words', 'unique_words', 'word_len_avg', 'word_len_q10', 'word_len_q90',
          'verb_count','noun_count','adv_count','adj_count','det_count']

eval_validation(f_cols, model_params)

[100]	fit's fair: 0.0741563	val's fair: 0.0742505
[100]	fit's fair: 0.0645325	val's fair: 0.093427
[100]	fit's fair: 0.0732094	val's fair: 0.0734625
[100]	fit's fair: 0.0701525	val's fair: 0.101113
[100]	fit's fair: 0.117944	val's fair: 0.119873
[100]	fit's fair: 0.104969	val's fair: 0.140358
[100]	fit's fair: 0.121675	val's fair: 0.115192
[100]	fit's fair: 0.105977	val's fair: 0.216439
            r2      rmse       mae    set   target
mean  0.810313  0.454563  0.352959  train  content
mean  0.765264  0.507853  0.395379    val  content
mean  0.666055  0.597105  0.469482  train  wording
mean  0.522469  0.696117  0.554072    val  wording
std   0.006732  0.016666  0.013065  train  content
std   0.014317  0.050683  0.034150    val  content
std   0.032146  0.027350  0.020190  train  wording
std   0.127061  0.129481  0.107284    val  wording

Train MCRMSE:	   0.5258341152542029
Validation MCRMSE: 0.6019850055804638
Diff:	 0.07615089032626088

    importance              feature
0          4

In [27]:
f_cols = ['text_bigram_overlap', 'text_unique_bigrams', 'unique_ratio', 'missing_wordcount']

eval_validation(f_cols, model_params)

[100]	fit's fair: 0.0776343	val's fair: 0.0755168
[100]	fit's fair: 0.0701341	val's fair: 0.0902022
[100]	fit's fair: 0.0760303	val's fair: 0.0739233
[100]	fit's fair: 0.0719717	val's fair: 0.104128
[100]	fit's fair: 0.127563	val's fair: 0.122846
[100]	fit's fair: 0.116546	val's fair: 0.140982
[100]	fit's fair: 0.129288	val's fair: 0.11315
[100]	fit's fair: 0.109169	val's fair: 0.227647
            r2      rmse       mae    set   target
mean  0.798857  0.468097  0.362222  train  content
mean  0.762893  0.510188  0.395320    val  content
mean  0.636879  0.623065  0.488989  train  wording
mean  0.508990  0.705901  0.559822    val  wording
std   0.001739  0.013353  0.009884  train  content
std   0.018845  0.051431  0.034738    val  content
std   0.029810  0.030341  0.022301  train  wording
std   0.142668  0.142475  0.117932    val  wording

Train MCRMSE:	   0.5455812394143367
Validation MCRMSE: 0.6080446852834198
Diff:	 0.062463445869083145

   importance              feature
0         46

In [28]:
f_cols = ['text_bigram_overlap', 'text_unique_bigrams', 'missing_wordcount']

eval_validation(f_cols, model_params)

[100]	fit's fair: 0.0779461	val's fair: 0.0748764
[100]	fit's fair: 0.0701431	val's fair: 0.0900859
[100]	fit's fair: 0.0761504	val's fair: 0.0737349
[100]	fit's fair: 0.0720479	val's fair: 0.104012
[100]	fit's fair: 0.128939	val's fair: 0.120712
[100]	fit's fair: 0.117832	val's fair: 0.140246
[100]	fit's fair: 0.13033	val's fair: 0.111115
[100]	fit's fair: 0.110146	val's fair: 0.222764
            r2      rmse       mae    set   target
mean  0.798424  0.468607  0.362574  train  content
mean  0.763910  0.509140  0.394541    val  content
mean  0.632504  0.626811  0.491608  train  wording
mean  0.518152  0.699185  0.554063    val  wording
std   0.002040  0.013757  0.010228  train  content
std   0.018770  0.051935  0.035275    val  content
std   0.030240  0.030610  0.022224  train  wording
std   0.139047  0.139814  0.115087    val  wording

Train MCRMSE:	   0.5477087851123347
Validation MCRMSE: 0.60416254058714
Diff:	 0.056453755474805356

   importance              feature
0         497 

In [29]:
f_cols = ['text_bigram_overlap', 'text_unique_bigrams', 'unique_ratio', 
          'n_words', 'unique_words', 'word_len_avg', 'word_len_q10', 'word_len_q90',
         'missing_wordcount']

eval_validation(f_cols, model_params)

[100]	fit's fair: 0.0746903	val's fair: 0.0716788
[100]	fit's fair: 0.0649443	val's fair: 0.0926592
[100]	fit's fair: 0.0729142	val's fair: 0.0735282
[100]	fit's fair: 0.0698164	val's fair: 0.0984167
[100]	fit's fair: 0.119734	val's fair: 0.12299
[100]	fit's fair: 0.10825	val's fair: 0.143696
[100]	fit's fair: 0.125268	val's fair: 0.110272
[100]	fit's fair: 0.107239	val's fair: 0.221045
            r2      rmse       mae    set   target
mean  0.810067  0.454874  0.353205  train  content
mean  0.770312  0.502426  0.391577    val  content
mean  0.657491  0.604866  0.475687  train  wording
mean  0.516314  0.700617  0.556636    val  wording
std   0.006007  0.016508  0.012620  train  content
std   0.011636  0.049457  0.033816    val  content
std   0.030926  0.028063  0.021284  train  wording
std   0.135122  0.136031  0.113217    val  wording

Train MCRMSE:	   0.5298699974920608
Validation MCRMSE: 0.6015214329442573
Diff:	 0.07165143545219654

   importance              feature
0         440

In [41]:
content_tune = partial(
    tune_kfold, 
    prompt_group=prompt_group, features=features, targets=targets, feature_names=f_cols)

study = optuna.create_study(directions=['minimize', 'minimize'])
study.optimize(content_tune, n_trials=5)

[I 2023-08-22 11:11:55,645] A new study created in memory with name: no-name-cd8c221d-37b3-4eae-8142-d779e99fed16
[I 2023-08-22 11:12:00,897] Trial 0 finished with values: [0.6107791619636729, 0.10081000959094022] and parameters: {'num_leaves': 90, 'learning_rate': 0.06861958254410183, 'max_depth': 11, 'min_child_samples': 24, 'colsample_bytree': 0.8200218973945176, 'reg_alpha': 0.410794115838205, 'reg_lambda': 0.656657659590142, 'bagging_freq': 83, 'bagging_fraction': 0.6102638128019751}. 
[I 2023-08-22 11:12:03,721] Trial 1 finished with values: [0.631378794697304, 0.07178421363413312] and parameters: {'num_leaves': 43, 'learning_rate': 0.06055076695990209, 'max_depth': 15, 'min_child_samples': 76, 'colsample_bytree': 0.5152538946306845, 'reg_alpha': 0.4870717705691793, 'reg_lambda': 0.11510687444483525, 'bagging_freq': 19, 'bagging_fraction': 0.7493662588141362}. 
[I 2023-08-22 11:12:06,305] Trial 2 finished with values: [0.6301579725598281, 0.071656009301731] and parameters: {'num_

In [31]:
optuna.visualization.plot_pareto_front(study, target_names=["mcrmse val", "train - val"])

In [32]:
fig = optuna.visualization.plot_param_importances(
    study, target=lambda t: t.values[0], target_name="mcrmse val"
)
fig.show()
fig = optuna.visualization.plot_param_importances(
    study, target=lambda t: t.values[1], target_name="regularisation"
)
fig.show()

In [33]:
best_trial = min(filter(lambda x: x.values[1] < 0.0685, study.best_trials), key=lambda t: t.values[0])
best_trial

FrozenTrial(number=4, state=1, values=[0.6185144823474051, 0.06276667402229008], datetime_start=datetime.datetime(2023, 8, 22, 11, 9, 54, 954989), datetime_complete=datetime.datetime(2023, 8, 22, 11, 9, 56, 420781), params={'num_leaves': 43, 'learning_rate': 0.05477353077639313, 'max_depth': 4, 'min_child_samples': 8, 'colsample_bytree': 0.9770736875539814, 'reg_alpha': 0.7071735629450199, 'reg_lambda': 0.6418604659487895, 'bagging_freq': 74, 'bagging_fraction': 0.8741298202075501}, user_attrs={}, system_attrs={'nsga2:generation': 0}, intermediate_values={}, distributions={'num_leaves': IntDistribution(high=100, log=False, low=5, step=1), 'learning_rate': FloatDistribution(high=0.1, log=False, low=0.001, step=None), 'max_depth': IntDistribution(high=15, log=False, low=3, step=1), 'min_child_samples': IntDistribution(high=100, log=False, low=5, step=1), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'reg_alpha': FloatDistribution(high=1.0, log=False, low

In [34]:
# best_trial = min(study.best_trials, key=lambda t: t.values[0])
# best_params = best_trial.params
# best_params
# best_trial = study.trials[0]
best_params = best_trial.params
best_params

{'num_leaves': 43,
 'learning_rate': 0.05477353077639313,
 'max_depth': 4,
 'min_child_samples': 8,
 'colsample_bytree': 0.9770736875539814,
 'reg_alpha': 0.7071735629450199,
 'reg_lambda': 0.6418604659487895,
 'bagging_freq': 74,
 'bagging_fraction': 0.8741298202075501}

In [35]:
model_params = {
    'objective': 'fair', 
    'verbose': 0, 
    'force_col_wise': True,
    'boosting_type': 'dart',
    'seed': 42,
    **best_params
}

eval_validation(f_cols, model_params)

[100]	fit's fair: 0.0813811	val's fair: 0.0768605
[100]	fit's fair: 0.0712462	val's fair: 0.0974732
[100]	fit's fair: 0.0799288	val's fair: 0.0792272
[100]	fit's fair: 0.0763027	val's fair: 0.107278
[100]	fit's fair: 0.127587	val's fair: 0.132557
[100]	fit's fair: 0.115251	val's fair: 0.14028
[100]	fit's fair: 0.132712	val's fair: 0.116869
[100]	fit's fair: 0.114298	val's fair: 0.224446
            r2      rmse       mae    set   target
mean  0.788876  0.479577  0.371500  train  content
mean  0.749893  0.524195  0.406892    val  content
mean  0.630084  0.628590  0.493799  train  wording
mean  0.501261  0.712869  0.566811    val  wording
std   0.006125  0.016816  0.013137  train  content
std   0.014686  0.051684  0.036357    val  content
std   0.033378  0.028813  0.021098  train  wording
std   0.124751  0.131962  0.108005    val  wording

Train MCRMSE:	   0.5540838295339732
Validation MCRMSE: 0.6185322436738567
Diff:	 0.06444841413988356

   importance              feature
0         655

In [36]:
# model_params = {
#     'objective': 'fair', 
#     'verbose': 0, 
#     'force_col_wise': True,
#     'learning_rate': 0.08,
#     'boosting_type': 'dart',
#     'num_leaves': 11,
#     'seed': 42
# }
model_params = {
    'objective': 'fair', 
    'verbose': 0, 
    'force_col_wise': True,
    'boosting_type': 'dart',
    'seed': 42,
    'num_leaves': 12,
    'learning_rate': 0.08713136443642772,
    'max_depth': 4,
    'min_child_samples': 97,
    'colsample_bytree': 0.8553368999584898,
    'reg_alpha': 0.12136690110575765,
    'reg_lambda': 0.46182111174463625,
    'bagging_freq': 2,
    'bagging_fraction': 0.9254170180688875
}

f_cols = ['text_bigram_overlap', 'text_unique_bigrams', 'unique_ratio', 
          'n_words', 'unique_words', 'word_len_avg', 'word_len_q10', 'word_len_q90']
          

eval_validation(f_cols, model_params)

[100]	fit's fair: 0.0756988	val's fair: 0.070912
[100]	fit's fair: 0.0660576	val's fair: 0.0943326
[100]	fit's fair: 0.0757735	val's fair: 0.0764035
[100]	fit's fair: 0.0709473	val's fair: 0.0971064
[100]	fit's fair: 0.122352	val's fair: 0.123106
[100]	fit's fair: 0.108723	val's fair: 0.143737
[100]	fit's fair: 0.12768	val's fair: 0.109678
[100]	fit's fair: 0.108529	val's fair: 0.217485
            r2      rmse       mae    set   target
mean  0.804755  0.461204  0.357080  train  content
mean  0.767938  0.504948  0.393192    val  content
mean  0.649593  0.611654  0.477781  train  wording
mean  0.519724  0.698300  0.554084    val  wording
std   0.006816  0.017889  0.013529  train  content
std   0.008896  0.047174  0.033185    val  content
std   0.035346  0.031255  0.022961  train  wording
std   0.129640  0.131854  0.110662    val  wording

Train MCRMSE:	   0.5364292138610853
Validation MCRMSE: 0.6016238526029356
Diff:	 0.06519463874185027

   importance              feature
0         440

In [37]:
metric_df_content, bst_content = train_lgb('content', prompt_group, features, targets, f_cols, model_params)
metric_df_wording, bst_wording = train_lgb('wording', prompt_group, features, targets, f_cols, model_params)

In [38]:
print(f'\n{"-"*35}\n\tContent scores')
pprint(metric_df_content)
print(f'\n{"-"*35}\n\tWording scores')
pprint(metric_df_wording)


-----------------------------------
	Content scores
         r2      rmse       mae
0  0.802414  0.463841  0.359997

-----------------------------------
	Wording scores
         r2      rmse       mae
0  0.643225  0.618795  0.483608


In [39]:
print('Full data MCRMSE: ')
(metric_df_content.rmse + metric_df_wording.rmse) / 2

Full data MCRMSE: 


0    0.541318
Name: rmse, dtype: float64

In [40]:
bst_content.save_model(content_model)
bst_wording.save_model(wording_model)

<lightgbm.basic.Booster at 0x2a329afd0>