## 1 - Imports and nltk downloads

In [1]:
import os
download_dir = f'{os.getcwd()}/nltk_data'
os.environ['NLTK_DATA'] = download_dir

import nltk
nltk.download('stopwords', download_dir=download_dir)
nltk.download('punkt', download_dir=download_dir)
nltk.download('wordnet', download_dir=download_dir)
nltk.download('averaged_perceptron_tagger', download_dir=download_dir)
nltk.download('universal_tagset', download_dir=download_dir)
nltk.download('words', download_dir=download_dir)
nltk.data.path.append(download_dir)
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords, words
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk import ngrams
import spellwise
from spellwise import Levenshtein

import pandas as pd
import numpy as np
from tqdm import tqdm
from joblib import Parallel, delayed
import plotly.express as px
from sklearn.model_selection import GroupKFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import lightgbm as lgb
from math import sqrt

from functools import partial, reduce
from operator import or_
from pathlib import Path
from pprint import pprint
from typing import Optional, Union, List, Tuple, Dict, Set

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chris/repos/student-summary-
[nltk_data]     evaluation/notebooks/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/chris/repos/student-
[nltk_data]     summary-evaluation/notebooks/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/chris/repos/student-
[nltk_data]     summary-evaluation/notebooks/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/chris/repos/student-summary-
[nltk_data]     evaluation/notebooks/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/chris/repos/student-summary-
[nltk_data]     evaluation/notebooks/nltk_data...
[nltk_data]   Package univers

## 2 - Fix nltk installation

In [2]:
%%sh

yes | unzip -q nltk_data/corpora/wordnet.zip -d nltk_data/corpora/
ls nltk_data/corpora/

[1m[36mbrown[m[m
brown.zip
[1m[36mstopwords[m[m
stopwords.zip
[1m[36mwordnet[m[m
wordnet.zip
[1m[36mwords[m[m
words.zip


replace nltk_data/corpora/wordnet/lexnames? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/data.verb? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/index.adv? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/adv.exc? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/index.verb? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/cntlist.rev? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/data.adj? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/index.adj? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/LICENSE? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/citation.bib? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/noun.exc? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/verb.exc? [y]es, [n]o, [A]ll, [N]one, [r]ename: replac

## 3 - Set data paths

In [3]:
stop_words = set(stopwords.words('english'))
lemmatiser = WordNetLemmatizer()
tqdm.pandas()

data_dir = Path('../data/commonlit-evaluate-student-summaries')

sample_submission = data_dir / 'sample_submission.csv'
summaries_train = data_dir / 'summaries_train.csv'
summaries_test = data_dir / 'summaries_test.csv'
prompts_train = data_dir / 'prompts_train.csv'
prompts_test = data_dir / 'prompts_test.csv'

content_model = '../data/models/content.txt'
wording_model = '../data/models/wording.txt'

## 4 - Preprocessing functions

In [4]:
def make_split(summaries_path: Path, prompts_path: Path, dtype_backend: Optional[str] = 'pyarrow') -> pd.DataFrame:
    summaries_df = pd.read_csv(summaries_path)#, dtype_backend=dtype_backend)
    prompts_df = pd.read_csv(prompts_path)#, dtype_backend=dtype_backend)
    df = pd.merge(summaries_df, prompts_df, how='left', on='prompt_id')
    df.fillna('')
    
    return df


def tokenize(text: str) -> List[str]:
    return [lemmatiser.lemmatize(tok.lower()) for tok in word_tokenize(text) if tok.isalnum() and tok not in stop_words]


def make_bigram(tokens: List[str]) -> Set[str]:
    return set(ngrams(tokens, 2))


def clear_stopwords(column: pd.Series, idx: int) -> Union[List[str], List[str], List[str]]:
    tokens = [tok.lower() for tok in word_tokenize(column.iloc[idx]) if tok.isalnum()]
    cleared_stopwords = [tok for tok in tokens if tok not in stop_words]
    lemmas = [lemmatiser.lemmatize(tok) for tok in cleared_stopwords]
    bigram = set(ngrams(lemmas, 2))

    return tokens, cleared_stopwords, lemmas, bigram
    
    
def nlp_preprocess(df: pd.DataFrame, column: str):
    df[f'{column}_lemmas'] = df[column].apply(tokenize)
    df[f'{column}_bigram'] = df[f'{column}_lemmas'].apply(make_bigram)
    
def predict(model: lgb.Booster, df: pd.DataFrame, features: List[str]) -> pd.Series:
    return model.predict(df[features])

## 5 - Load data and preprocess

In [5]:
# Load data
# df = make_split(summaries_test, prompts_test)
df = make_split(summaries_train, prompts_train)

# Make n-grams for all text columns
text_columns = ['prompt_title', 'prompt_question', 'prompt_text', 'text']
for column in tqdm(text_columns):
    nlp_preprocess(df, column)
    df[f'{column}_unique_bigrams'] = df[f'{column}_bigram'].str.len()

100%|██████████| 4/4 [00:17<00:00,  4.45s/it]


## 6 - Create bigram based features

In [6]:
df_train = df.copy(deep=True)

# Create n-gram based features
df_train['text_bigram_overlap'] = df_train[['prompt_text_bigram', 'text_bigram']].apply(
    lambda row: len(row[0] & row[1]), axis=1) / df_train.text_unique_bigrams
df_train['question_bigram_overlap'] = df_train[['prompt_question_bigram', 'text_bigram']].apply(
    lambda row: len(row[0] & row[1]), axis=1) / df_train.text_unique_bigrams
df_train['text_bigram_ratio'] = df_train['text_unique_bigrams'] / (df_train['prompt_text_unique_bigrams'])

df_train['text_bigram_diff'] = df_train[['prompt_text_bigram', 'text_bigram']].apply(
    lambda row: len(row[1] - row[0]), axis=1) / df_train.text_unique_bigrams
df_train['question_bigram_diff'] = df_train[['prompt_question_bigram', 'text_bigram']].apply(
    lambda row: len(row[1] - row[0]), axis=1) / df_train.text_unique_bigrams

df_train['text_bigram_exclusive'] = df_train[['prompt_text_bigram', 'text_bigram']].apply(
    lambda row: len(row[0] ^ row[1]), axis=1) / df_train.text_unique_bigrams
df_train['question_bigram_exclusive'] = df_train[['prompt_question_bigram', 'text_bigram']].apply(
    lambda row: len(row[0] ^ row[1]), axis=1) / df_train.text_unique_bigrams

## 7 - Create word based features

In [7]:
df_train['n_words'] = df_train.text_lemmas.str.len()
df_train['unique_words'] = df_train.text_lemmas.apply(set).str.len()
df_train['unique_ratio'] = df_train.unique_words / df_train.n_words

In [8]:
df_train['word_lengths'] = df_train.text_lemmas.apply(lambda x: [len(y) for y in x])
df_train['word_len_avg'] = df_train.word_lengths.apply(np.mean)

In [9]:
df_train['word_len_q10'] = df_train.word_lengths.apply(partial(np.percentile, q=10))
df_train['word_len_q90'] = df_train.word_lengths.apply(partial(np.percentile, q=90))

In [10]:
x = pos_tag(df_train.text_lemmas[0], tagset='universal')
from collections import defaultdict

dd = defaultdict(lambda: 0)
for _, pos in x:
    dd[pos] += 1

In [11]:
df_train['pos'] = df_train.text_lemmas.apply(partial(pos_tag, tagset='universal'))

In [12]:
def pos_counts(tags):
    dd = defaultdict(lambda: 0)
    for _, pos in tags:
        dd[pos] += 1
    return dd

df_train['pos_counts'] = df_train.pos.apply(pos_counts)

In [13]:
df_train['verb_count'] = df_train.pos_counts.str['VERB'].replace(np.nan, 0)
df_train['noun_count'] = df_train.pos_counts.str['NOUN'].replace(np.nan, 0)
df_train['adv_count'] = df_train.pos_counts.str['ADV'].replace(np.nan, 0)
df_train['adj_count'] = df_train.pos_counts.str['ADJ'].replace(np.nan, 0)
df_train['det_count'] = df_train.pos_counts.str['DET'].replace(np.nan, 0)

In [14]:
df_train[['verb_count','noun_count','adv_count','adj_count','det_count']].corr()

Unnamed: 0,verb_count,noun_count,adv_count,adj_count,det_count
verb_count,1.0,0.812849,0.68108,0.64503,0.486026
noun_count,0.812849,1.0,0.603829,0.844499,0.628654
adv_count,0.68108,0.603829,1.0,0.533738,0.34228
adj_count,0.64503,0.844499,0.533738,1.0,0.566271
det_count,0.486026,0.628654,0.34228,0.566271,1.0


## 8 - Create spelling based features

In [15]:
%%sh
wget -nv https://github.com/dwyl/english-words/archive/refs/heads/master.zip -O master.zip
yes | unzip -q master.zip

2023-08-21 18:08:34 URL:https://codeload.github.com/dwyl/english-words/zip/refs/heads/master [7118481] -> "master.zip" [1]
replace english-words-master/CONTRIBUTING.md? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace english-words-master/LICENSE.md? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace english-words-master/README.md? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace english-words-master/read_english_dictionary.py? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace english-words-master/scripts/create_json.py? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace english-words-master/scripts/gen.sh? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace english-words-master/word_list_moby_README.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace english-words-master/word_list_moby_all_moby_words.icss.yaml? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace english-words-master/word_list_moby_credits.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace english-words-master/words.txt? [y]es, [n]o, [A]ll, [N]one, [r

In [16]:
with open('./english-words-master/words.txt', 'r') as f:
    en_words = [line.strip() for line in f.read().split('\n')]

en_words = set([word for word in en_words if word.isalpha()])

In [17]:
def get_unique_words(col: str) -> Set[str]:
    word_sets = df_train[col].apply(set).tolist()
    return reduce(or_, word_sets)

prompt_words = get_unique_words('prompt_text_lemmas')
question_words = get_unique_words('prompt_question_lemmas')
title_words = get_unique_words('prompt_title_lemmas')

word_set = en_words | prompt_words | question_words | title_words

In [18]:
with open('commonlit_words.txt', 'w') as f:
    f.write('\n'.join(word_set))

In [19]:
# word_dir = words.abspath('en')
# Initialise the algorithm
metric = Levenshtein()
# Index the words from a dictionary
# metric.add_from_path('./brown_words.txt')
# metric.add_from_path(words.abspath('en'))
# metric.add_from_path('./english-words-master/words.txt')
metric.add_from_path('./commonlit_words.txt')

def get_distances(tokens: List[str], metric: spellwise.algorithms.base.Base) -> List[str]:
    distances = []
    for idx, token in enumerate(tokens):
        suggestions = metric.get_suggestions(token)
        if suggestions == []:
            distance = len(token) if token.isalpha() else 0
        else:
            distance = suggestions[0]['distance']
        distances.append(distance)
    return sum(distances)

In [20]:
# df_train.text_lemmas.progress_apply(partial(get_distances, metric=metric))

def distance_func(chunk: pd.DataFrame):
    return chunk.apply(partial(get_distances, metric=metric))

n_jobs = 6
# df_chunks = np.array_split(df_train.text_lemmas, n_jobs * 2)

# total_edit_distances = Parallel(n_jobs=n_jobs, backend='loky')(delayed(distance_func)(chunk) for chunk in tqdm(df_chunks))

In [21]:
count_missing_words = lambda tokens: sum([word not in word_set for word in tokens])
df_train['missing_wordcount'] = df_train.text_lemmas.progress_apply(count_missing_words)

100%|██████████| 7165/7165 [00:00<00:00, 219697.40it/s]


In [22]:
numeric_features = df_train.select_dtypes(include=np.number)
target_columns = ['content', 'wording']
feature_columns = [col for col in numeric_features if col not in target_columns]

targets = numeric_features[target_columns]
features = numeric_features[feature_columns]
prompt_group = pd.Categorical(df['prompt_title'])

In [23]:
def calculate_errors(y, y_pred):
    return {
        'r2': r2_score(y, y_pred),
        'rmse': sqrt(mean_squared_error(y, y_pred)),
        'mae': mean_absolute_error(y, y_pred)
    }


def train_lgb_kfold(
        target: str, 
        prompt_group: pd.DataFrame, 
        features: pd.DataFrame, 
        targets: pd.DataFrame, 
        feature_names: List[str],
        model_params: dict) -> Tuple[pd.DataFrame, lgb.LGBMRegressor]:
    
    group_kfold = GroupKFold(n_splits=prompt_group.unique().size)
    assert group_kfold.get_n_splits(features, targets, prompt_group) == len(prompt_group.unique())

    
    train_errors, val_errors = [], []
    for i, (train_index, test_index) in enumerate(group_kfold.split(features, targets, prompt_group)):
        # print(f'Fold {i}')
        # print(f'\tTest prompt: {df.iloc[test_index].prompt_title.unique().tolist()}')

        X_train = features[feature_names].iloc[train_index].convert_dtypes(dtype_backend='numpy_nullable')
        y_train = targets.iloc[train_index][target].convert_dtypes(dtype_backend='numpy_nullable')

        X_val = features[feature_names].iloc[test_index]
        y_val = targets.iloc[test_index][target]

        train_data = lgb.Dataset(X_train, label=y_train)
        val_data = lgb.Dataset(X_val, y_val)
        bst = lgb.train(model_params, train_data, )#, feval=[r2_score, mean_absolute_error])

        train_errors.append(calculate_errors(y_train, bst.predict(X_train)))
        val_errors.append(calculate_errors(y_val, bst.predict(X_val)))

    train_metrics = pd.DataFrame.from_records(train_errors).describe()
    train_metrics['set'] = 'train'
    val_metrics = pd.DataFrame.from_records(val_errors).describe()
    val_metrics['set'] = 'val'
    metric_df = pd.concat([train_metrics, val_metrics])

    return metric_df, bst

def train_lgb(
        target: str, 
        prompt_group: pd.DataFrame, 
        features: pd.DataFrame, 
        targets: pd.DataFrame, 
        feature_names: List[str],
        model_params: dict) -> Tuple[pd.DataFrame, lgb.LGBMRegressor]:
    
    
    X_train = features[feature_names].convert_dtypes(dtype_backend='numpy_nullable')
    y_train = targets[target].convert_dtypes(dtype_backend='numpy_nullable')


    train_data = lgb.Dataset(X_train, label=y_train)
    bst = lgb.train(model_params, train_data, )#, feval=[r2_score, mean_absolute_error])

    train_errors = [calculate_errors(y_train, bst.predict(X_train))]
    train_metrics = pd.DataFrame.from_records(train_errors)

    return train_metrics, bst

In [24]:
def eval_validation(f_cols, model_params):
    metric_df_content, bst_content = train_lgb_kfold('content', prompt_group, features, targets, f_cols, model_params)
    metric_df_wording, bst_wording = train_lgb_kfold('wording', prompt_group, features, targets, f_cols, model_params)

    metric_df_content['target'] = 'content'
    metric_df_wording['target'] = 'wording'
    metric_df = pd.concat([metric_df_content, metric_df_wording])
    metric_df = metric_df.loc[['mean', 'std']]
    print(metric_df)

    mcrmse = (metric_df.loc[metric_df.target=='content', 'rmse'] + metric_df.loc[metric_df.target=='wording', 'rmse']) / 2
    print(f'\nTrain MCRMSE:\t   {mcrmse.iloc[0]}')
    print(f'Validation MCRMSE: {mcrmse.iloc[1]}\n')

    importance = pd.DataFrame({
    'importance': bst_wording.feature_importance(),
    'feature': bst_wording.feature_name()}).sort_values(by='importance', ascending=False)
    print(importance)

In [25]:
features.columns

Index(['prompt_title_unique_bigrams', 'prompt_question_unique_bigrams',
       'prompt_text_unique_bigrams', 'text_unique_bigrams',
       'text_bigram_overlap', 'question_bigram_overlap', 'text_bigram_ratio',
       'text_bigram_diff', 'question_bigram_diff', 'text_bigram_exclusive',
       'question_bigram_exclusive', 'n_words', 'unique_words', 'unique_ratio',
       'word_len_avg', 'word_len_q10', 'word_len_q90', 'verb_count',
       'noun_count', 'adv_count', 'adj_count', 'det_count',
       'missing_wordcount'],
      dtype='object')

In [26]:
model_params = {
    'objective': 'fair', 
    'verbose': 0, 
    'force_col_wise': True,
    'learning_rate': 0.08,
    'boosting_type': 'dart',
    'num_leaves': 11,
}
f_cols = ['text_bigram_overlap', 'text_unique_bigrams', 'unique_ratio', 
          'n_words', 'unique_words', 'word_len_avg', 'word_len_q10', 'word_len_q90',
          'verb_count','noun_count','adv_count','adj_count','det_count', 'missing_wordcount']

eval_validation(f_cols, model_params)

            r2      rmse       mae    set   target
mean  0.815664  0.448115  0.347847  train  content
mean  0.770489  0.502267  0.391396    val  content
mean  0.674074  0.589973  0.464470  train  wording
mean  0.525551  0.693794  0.551752    val  wording
std   0.005658  0.015824  0.012227  train  content
std   0.013918  0.051325  0.035137    val  content
std   0.029167  0.024933  0.018364  train  wording
std   0.126036  0.128326  0.106174    val  wording

Train MCRMSE:	   0.519044030509193
Validation MCRMSE: 0.5980305966609889

    importance              feature
0          399  text_bigram_overlap
3          186              n_words
1           89  text_unique_bigrams
5           75         word_len_avg
12          49            det_count
2           48         unique_ratio
13          39    missing_wordcount
4           32         unique_words
9           28           noun_count
10          27            adv_count
8           16           verb_count
11          11            adj_coun

In [31]:
f_cols = ['text_bigram_overlap', 'text_unique_bigrams', 'unique_ratio', 
          'n_words', 'unique_words', 'word_len_avg', 'word_len_q10', 'word_len_q90',
          'verb_count','noun_count','adv_count','adj_count','det_count']

eval_validation(f_cols, model_params)

            r2      rmse       mae    set   target
mean  0.812866  0.451490  0.350580  train  content
mean  0.766385  0.506711  0.395081    val  content
mean  0.670775  0.592887  0.466304  train  wording
mean  0.524203  0.694905  0.552141    val  wording
std   0.006487  0.016275  0.012728  train  content
std   0.015214  0.052164  0.034654    val  content
std   0.031142  0.026552  0.019574  train  wording
std   0.125346  0.128070  0.106474    val  wording

Train MCRMSE:	   0.5221885556899797
Validation MCRMSE: 0.6008081678438388

    importance              feature
0          402  text_bigram_overlap
3          186              n_words
1          103  text_unique_bigrams
5           91         word_len_avg
2           52         unique_ratio
12          46            det_count
10          34            adv_count
4           31         unique_words
9           27           noun_count
8           13           verb_count
11          13            adj_count
6            2         word_len_q

In [27]:
f_cols = ['text_bigram_overlap', 'text_unique_bigrams', 'unique_ratio', 'missing_wordcount']

eval_validation(f_cols, model_params)

            r2      rmse       mae    set   target
mean  0.802370  0.463986  0.359229  train  content
mean  0.766108  0.506714  0.392697    val  content
mean  0.641367  0.619213  0.486414  train  wording
mean  0.512604  0.703468  0.557803    val  wording
std   0.001473  0.012892  0.009490  train  content
std   0.018112  0.050612  0.034275    val  content
std   0.028503  0.029089  0.021428  train  wording
std   0.138852  0.140077  0.116270    val  wording

Train MCRMSE:	   0.541599318767548
Validation MCRMSE: 0.6050906212676062

   importance              feature
0         447  text_bigram_overlap
1         413  text_unique_bigrams
3          79    missing_wordcount
2          61         unique_ratio


In [28]:
f_cols = ['text_bigram_overlap', 'text_unique_bigrams', 'missing_wordcount']

eval_validation(f_cols, model_params)

            r2      rmse       mae    set   target
mean  0.801467  0.465053  0.360118  train  content
mean  0.767347  0.505449  0.391702    val  content
mean  0.636399  0.623498  0.489565  train  wording
mean  0.522515  0.696076  0.551708    val  wording
std   0.001735  0.013392  0.009936  train  content
std   0.018237  0.051704  0.035528    val  content
std   0.028995  0.029648  0.021851  train  wording
std   0.136099  0.137848  0.113507    val  wording

Train MCRMSE:	   0.5442754400162207
Validation MCRMSE: 0.6007627017618824

   importance              feature
0         468  text_bigram_overlap
1         432  text_unique_bigrams
2         100    missing_wordcount


In [29]:
f_cols = ['text_bigram_overlap', 'text_unique_bigrams', 'unique_ratio', 
          'n_words', 'unique_words', 'word_len_avg', 'word_len_q10', 'word_len_q90']

eval_validation(f_cols, model_params)

            r2      rmse       mae    set   target
mean  0.809939  0.455013  0.353506  train  content
mean  0.768888  0.503966  0.392758    val  content
mean  0.656957  0.605207  0.474982  train  wording
mean  0.519213  0.698588  0.554200    val  wording
std   0.006624  0.016646  0.012717  train  content
std   0.010546  0.048736  0.033313    val  content
std   0.033915  0.029978  0.022120  train  wording
std   0.129986  0.131704  0.110075    val  wording

Train MCRMSE:	   0.5301098046221431
Validation MCRMSE: 0.6012768396782324

   importance              feature
0         426  text_bigram_overlap
3         232              n_words
1         126  text_unique_bigrams
5         107         word_len_avg
2          57         unique_ratio
4          37         unique_words
6          12         word_len_q10
7           3         word_len_q90


In [30]:
f_cols = ['text_bigram_overlap', 'text_unique_bigrams', 'unique_ratio', 
          'n_words', 'unique_words', 'word_len_avg', 'word_len_q10', 'word_len_q90',
          'missing_wordcount']

eval_validation(f_cols, model_params)

            r2      rmse       mae    set   target
mean  0.813635  0.450579  0.349997  train  content
mean  0.773873  0.498526  0.388436    val  content
mean  0.662685  0.600267  0.472171  train  wording
mean  0.520081  0.698155  0.554405    val  wording
std   0.005523  0.015899  0.011967  train  content
std   0.010618  0.048704  0.033635    val  content
std   0.030107  0.027383  0.020360  train  wording
std   0.126758  0.129643  0.108927    val  wording

Train MCRMSE:	   0.5254230479083919
Validation MCRMSE: 0.5983402765159832

   importance              feature
0         403  text_bigram_overlap
3         230              n_words
1         124  text_unique_bigrams
5          85         word_len_avg
8          64    missing_wordcount
2          44         unique_ratio
4          41         unique_words
6           9         word_len_q10
7           0         word_len_q90


In [30]:
metric_df_content, bst_content = train_lgb('content', prompt_group, features, targets, f_cols, model_params)
metric_df_wording, bst_wording = train_lgb('wording', prompt_group, features, targets, f_cols, model_params)

In [31]:
print(f'\n{"-"*35}\n\tContent scores')
pprint(metric_df_content)
print(f'\n{"-"*35}\n\tWording scores')
pprint(metric_df_wording)


-----------------------------------
	Content scores
         r2      rmse       mae
0  0.808225  0.456969  0.354846

-----------------------------------
	Wording scores
         r2      rmse       mae
0  0.652207  0.610957  0.480325


In [32]:
print('Full data MCRMSE: ')
(metric_df_content.rmse + metric_df_wording.rmse) / 2

Full data MCRMSE: 


0    0.533963
Name: rmse, dtype: float64

In [33]:
bst_content.save_model(content_model)
bst_wording.save_model(wording_model)

<lightgbm.basic.Booster at 0x2b01100d0>