## 1 - Imports and nltk downloads

In [25]:
import os
download_dir = f'{os.getcwd()}/nltk_data'
os.environ['NLTK_DATA'] = download_dir

import nltk
nltk.download('stopwords', download_dir=download_dir)
nltk.download('punkt', download_dir=download_dir)
nltk.download('wordnet', download_dir=download_dir)
nltk.download('averaged_perceptron_tagger', download_dir=download_dir)
nltk.download('universal_tagset', download_dir=download_dir)
nltk.download('words', download_dir=download_dir)
nltk.data.path.append(download_dir)
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords, words
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk import ngrams

import datasets
import pandas as pd
import numpy as np
from tqdm import tqdm
import plotly.express as px
from sklearn.model_selection import GroupKFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import optuna
import lightgbm as lgb
from catboost import CatBoostRegressor, Pool
from catboost.utils import eval_metric
from math import sqrt

from collections import defaultdict
from functools import partial, reduce
from operator import or_
from pathlib import Path
from pprint import pprint
import random
from typing import Optional, Union, List, Tuple, Dict, Set, Any

seed = 42
random.seed(seed)
np.random.seed(seed)

[nltk_data] Downloading package stopwords to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /kaggle/working/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package words to /kaggle/working/nltk_data...
[nltk_data]   Package words is already up-to-date!


## 2 - Fix nltk installation

In [26]:
%%sh

yes | unzip -q nltk_data/corpora/wordnet.zip -d nltk_data/corpora/
ls nltk_data/corpora/

stopwords
stopwords.zip
wordnet
wordnet.zip
words
words.zip


replace nltk_data/corpora/wordnet/lexnames? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/data.verb? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/index.adv? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/adv.exc? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/index.verb? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/cntlist.rev? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/data.adj? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/index.adj? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/LICENSE? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/citation.bib? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/noun.exc? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace nltk_data/corpora/wordnet/verb.exc? [y]es, [n]o, [A]ll, [N]one, [r]ename: replac

## 3 - Set data paths

In [27]:
stop_words = set(stopwords.words('english'))
lemmatiser = WordNetLemmatizer()
tqdm.pandas()

data_dir = Path('/kaggle/input/commonlit-evaluate-student-summaries')

sample_submission = data_dir / 'sample_submission.csv'
summaries_train = data_dir / 'summaries_train.csv'
summaries_test = data_dir / 'summaries_test.csv'
prompts_train = data_dir / 'prompts_train.csv'
prompts_test = data_dir / 'prompts_test.csv'

content_model = 'content.txt'
wording_model = 'wording.txt'

## 4 - Preprocessing functions

In [28]:
def text_tokenize(text: str) -> List[str]:
    return [lemmatiser.lemmatize(tok.lower()) for tok in word_tokenize(text) if tok.isalnum() and tok not in stop_words]


def make_bigram(tokens: List[str]) -> Set[str]:
    if type(tokens) != list:
        tokens = tokens.tolist()
    return set(ngrams(tokens, 2))


def tokenize(row: Dict[str, Any]) -> List[str]:
    for col in ['prompt_text', 'prompt_title', 'prompt_question', 'prompt_text']:
        row[f'{col}_lemmas'] = lemmas = text_tokenize(row[col])
        row[f'{col}_bigram'] = make_bigram(lemmas)


def nlp_preprocess(df: pd.DataFrame, column: str):
    df[f'{column}_lemmas'] = df[column].apply(text_tokenize)
    df[f'{column}_bigram'] = df[f'{column}_lemmas'].apply(make_bigram)


# def batch_tokenize(data: Dict[str, Any]) -> List[str]:
#     lemmas = [text_tokenize(row) for row in data]
#     return lemmas


def process_col(data: Dict[str, Any], col: str) -> List[str]:
    lemmas = [text_tokenize(text) for text in data]
    bigrams = [make_bigram(lemma) for lemma in lemmas]
    n_stopwords = []
    for row in lemmas:
        n_stopwords.append(sum([lemma in stop_words for lemma in row]))
    pos = [pos_tag(lemma, tagset='universal') for lemma in lemmas]
    sentences = [sent_tokenize(text) for text in data]
    sentence_len_avg, sentence_len_std = [], []
    for i in range(len(sentences)):
        for j in range(len(sentences[i])):
            sentences[i][j] = len(sentences[i][j].split())
        sentence_len_avg.append(np.mean(sentences[i]))
        sentence_len_std.append(np.std(sentences[i]))

    return {f'{col}_lemmas': lemmas,
            f'{col}_bigram': bigrams,
            f'{col}_sentence_len_avg': sentence_len_avg,
            f'{col}_sentence_len_std': sentence_len_std,
            f'{col}_stopwords': n_stopwords,
            'pos': pos
            }


def make_split(summaries_path: Path, prompts_path: Path, dtype_backend: Optional[str] = 'pyarrow') -> pd.DataFrame:
    summaries_df = pd.read_csv(summaries_path)
    prompts_df = pd.read_csv(prompts_path)

    for column in ['prompt_title', 'prompt_question', 'prompt_text']:
        nlp_preprocess(prompts_df, column)
        prompts_df[f'{column}_unique_bigrams'] = prompts_df[f'{column}_bigram'].str.len()

    summaries_dataset = datasets.Dataset.from_pandas(summaries_df, preserve_index=False)
    proc_func = partial(process_col, col='text')
    summaries_df = summaries_dataset.map(function=lambda example: {
                                         **proc_func(example['text']), **example}, num_proc=os.cpu_count(), keep_in_memory=True, batched=True).to_pandas()
    summaries_df['text_bigram'] = summaries_df.text_bigram.apply(lambda row: {(x[0], x[1]) for x in row})
    summaries_df['text_unique_bigrams'] = summaries_df['text_bigram'].str.len()

    df = pd.merge(summaries_df, prompts_df, how='left', on='prompt_id')
    df.fillna('')

    return df

## 5 - Load data and preprocess

In [29]:
%%time
# Load data
# df = make_split(summaries_test, prompts_test)
df = make_split(summaries_train, prompts_train)

     

#0:   0%|          | 0/2 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/2 [00:00<?, ?ba/s]

  

#2:   0%|          | 0/2 [00:00<?, ?ba/s]

#3:   0%|          | 0/2 [00:00<?, ?ba/s]

CPU times: user 1.03 s, sys: 278 ms, total: 1.3 s
Wall time: 21.8 s


## 6 - Create bigram based features

In [30]:
df_train = df.copy(deep=True)

# Create n-gram based features
df_train['text_bigram_overlap'] = df_train[['prompt_text_bigram', 'text_bigram']].apply(lambda row: len(row[0] & row[1]), axis=1) / df_train.text_unique_bigrams
df_train['question_bigram_overlap'] = df_train[['prompt_question_bigram', 'text_bigram']].apply(lambda row: len(row[0] & row[1]), axis=1) / df_train.text_unique_bigrams
df_train['text_bigram_ratio'] = df_train['text_unique_bigrams'] / (df_train['prompt_text_unique_bigrams'])

df_train['text_bigram_diff'] = df_train[['prompt_text_bigram', 'text_bigram']].apply(lambda row: len(row[1] - row[0]), axis=1) / df_train.text_unique_bigrams
df_train['question_bigram_diff'] = df_train[['prompt_question_bigram', 'text_bigram']].apply(lambda row: len(row[1] - row[0]), axis=1) / df_train.text_unique_bigrams

df_train['text_bigram_exclusive'] = df_train[['prompt_text_bigram', 'text_bigram']].apply(lambda row: len(row[0] ^ row[1]), axis=1) / df_train.text_unique_bigrams
df_train['question_bigram_exclusive'] = df_train[['prompt_question_bigram', 'text_bigram']].apply(lambda row: len(row[0] ^ row[1]), axis=1) / df_train.text_unique_bigrams

## 7 - Create word based features

In [31]:
df_train['n_words'] = df_train.text_lemmas.str.len()
df_train['unique_words'] = df_train.text_lemmas.apply(set).str.len()
df_train['unique_ratio'] = df_train.unique_words / df_train.n_words

In [32]:
df_train['word_lengths'] = df_train.text_lemmas.apply(lambda x: [len(y) for y in x])
df_train['word_len_avg'] = df_train.word_lengths.apply(np.mean)

In [33]:
df_train['word_len_q10'] = df_train.word_lengths.apply(partial(np.percentile, q=10))
df_train['word_len_q90'] = df_train.word_lengths.apply(partial(np.percentile, q=90))
df_train['word_len_std'] = df_train.word_lengths.apply(np.std)

In [34]:
def pos_counts(tags):
    dd = defaultdict(lambda: 0)
    for _, pos in tags:
        dd[pos] += 1
    return dd

df_train['pos_counts'] = df_train.pos.apply(pos_counts)

In [35]:
df_train['verb_count'] = df_train.pos_counts.str['VERB'].replace(np.nan, 0)
df_train['noun_count'] = df_train.pos_counts.str['NOUN'].replace(np.nan, 0)
df_train['adv_count'] = df_train.pos_counts.str['ADV'].replace(np.nan, 0)
df_train['adj_count'] = df_train.pos_counts.str['ADJ'].replace(np.nan, 0)
df_train['det_count'] = df_train.pos_counts.str['DET'].replace(np.nan, 0)

## 8 - Create spelling based features

In [36]:
%%sh
wget -nv https://github.com/dwyl/english-words/archive/refs/heads/master.zip -O master.zip
yes | unzip -q master.zip

2023-08-29 19:00:34 URL:https://codeload.github.com/dwyl/english-words/zip/refs/heads/master [7118481] -> "master.zip" [1]
replace english-words-master/CONTRIBUTING.md? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace english-words-master/LICENSE.md? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace english-words-master/README.md? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace english-words-master/read_english_dictionary.py? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace english-words-master/scripts/create_json.py? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace english-words-master/scripts/gen.sh? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace english-words-master/word_list_moby_README.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace english-words-master/word_list_moby_all_moby_words.icss.yaml? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace english-words-master/word_list_moby_credits.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace english-words-master/words.txt? [y]es, [n]o, [A]ll, [N]one, [r

In [37]:
with open('./english-words-master/words.txt', 'r') as f:
    en_words = [line.strip() for line in f.read().split('\n')]

en_words = set([word for word in en_words if word.isalpha()])

In [38]:
def get_unique_words(col: str) -> Set[str]:
    word_sets = df_train[col].apply(set).tolist()
    return reduce(or_, word_sets)

prompt_words = get_unique_words('prompt_text_lemmas')
question_words = get_unique_words('prompt_question_lemmas')
title_words = get_unique_words('prompt_title_lemmas')

word_set = en_words | prompt_words | question_words | title_words

In [39]:
with open('commonlit_words.txt', 'w') as f:
    f.write('\n'.join(word_set))

```
# word_dir = words.abspath('en')
# Initialise the algorithm
metric = Levenshtein()
# Index the words from a dictionary
# metric.add_from_path('./brown_words.txt')
# metric.add_from_path(words.abspath('en'))
# metric.add_from_path('./english-words-master/words.txt')
metric.add_from_path('./commonlit_words.txt')

def get_distances(tokens: List[str], metric: spellwise.algorithms.base.Base) -> List[str]:
    distances = []
    for idx, token in enumerate(tokens):
        suggestions = metric.get_suggestions(token)
        if suggestions == []:
            distance = len(token) if token.isalpha() else 0
        else:
            distance = suggestions[0]['distance']
        distances.append(distance)
    return sum(distances)


def distance_func(chunk: pd.DataFrame):
    return chunk.apply(partial(get_distances, metric=metric))

n_jobs = 6
df_chunks = np.array_split(df_train.text_lemmas, n_jobs * 2)
# total_edit_distances = Parallel(n_jobs=n_jobs, backend='loky')(delayed(distance_func)(chunk) for chunk in tqdm(df_chunks))
```

In [40]:
count_missing_words = lambda tokens: sum([word not in word_set for word in tokens if word.isalpha()])
df_train['missing_wordcount'] = df_train.text_lemmas.progress_apply(count_missing_words)

100%|██████████| 7165/7165 [00:00<00:00, 60337.28it/s]


In [41]:
numeric_features = df_train.select_dtypes(include=np.number)
target_columns = ['content', 'wording']
feature_columns = [col for col in numeric_features if col not in target_columns]

targets = numeric_features[target_columns]
features = numeric_features[feature_columns]
prompt_group = pd.Categorical(df['prompt_title'])

In [42]:
target_correlations = pd.concat([features.corrwith(df_train['content']), features.corrwith(df_train['wording'])], axis=1).rename(columns={0:'content', 1:'wording'})
target_correlations

Unnamed: 0,content,wording
text_sentence_len_avg,-0.029619,-0.136316
text_sentence_len_std,0.40948,0.27117
text_stopwords,0.624798,0.465088
text_unique_bigrams,0.796601,0.535982
prompt_title_unique_bigrams,0.049018,0.047581
prompt_question_unique_bigrams,0.029809,-0.045439
prompt_text_unique_bigrams,-0.031313,-0.134639
text_bigram_overlap,-0.014675,-0.316449
question_bigram_overlap,-0.091749,-0.034915
text_bigram_ratio,0.799985,0.568999


In [43]:
wording_top_features = target_correlations.loc[target_correlations['content'].abs() > 0.3, 'content']
content_top_features = target_correlations.loc[target_correlations['wording'].abs() > 0.3, 'wording']

from IPython.display import display
display(wording_top_features)
display(content_top_features)

text_sentence_len_std        0.409480
text_stopwords               0.624798
text_unique_bigrams          0.796601
text_bigram_ratio            0.799985
text_bigram_exclusive       -0.760313
question_bigram_exclusive   -0.567776
n_words                      0.793450
unique_words                 0.799003
unique_ratio                -0.526725
verb_count                   0.686144
noun_count                   0.771478
adv_count                    0.566053
adj_count                    0.709521
det_count                    0.563414
missing_wordcount            0.396537
Name: content, dtype: float64

text_stopwords               0.465088
text_unique_bigrams          0.535982
text_bigram_overlap         -0.316449
text_bigram_ratio            0.568999
text_bigram_diff             0.316449
text_bigram_exclusive       -0.576763
question_bigram_exclusive   -0.445547
n_words                      0.534346
unique_words                 0.530608
unique_ratio                -0.431120
verb_count                   0.436233
noun_count                   0.523740
adv_count                    0.400297
adj_count                    0.476357
det_count                    0.469131
missing_wordcount            0.357206
Name: wording, dtype: float64

In [44]:
corr = numeric_features.corr()

import plotly.graph_objects as go
import plotly.io as pio

pio.templates.default = "plotly_white"

fig = go.Heatmap(
    z=corr,
    x=corr.columns,
    y=corr.columns,
    colorscale=px.colors.diverging.RdBu,
    zmin=-1,
    zmax=1
)

fig = go.Figure(fig)
fig.update_layout(height=800)
fig.show()



In [45]:
def calculate_errors(y, y_pred):
    return {
        'r2': r2_score(y, y_pred),
        'rmse': sqrt(mean_squared_error(y, y_pred)),
        'mae': mean_absolute_error(y, y_pred)
    }


def train_cat_kfold(
        target: str, 
        prompt_group: pd.DataFrame, 
        features: pd.DataFrame, 
        targets: pd.DataFrame, 
        feature_names: List[str],
        model_params: dict) -> Tuple[pd.DataFrame, lgb.LGBMRegressor]:
    
    group_kfold = GroupKFold(n_splits=prompt_group.unique().size)
    assert group_kfold.get_n_splits(features, targets, prompt_group) == len(prompt_group.unique())

    train_errors, val_errors = [], []
    for i, (train_index, test_index) in enumerate(group_kfold.split(features, targets, prompt_group)):
        X_train = features[feature_names].iloc[train_index]
        y_train = targets.iloc[train_index][target]

        X_val = features[feature_names].iloc[test_index]
        y_val = targets.iloc[test_index][target]



        train_pool = Pool(X_train, label=y_train)
        test_pool = Pool(X_val, label=y_val)
        cb_rmse = CatBoostRegressor(loss_function='RMSE', random_seed=8, verbose=0)
        cb_rmse.fit(train_pool, eval_set=test_pool)
                        
        train_errors.append(calculate_errors(y_train, cb_rmse.predict(X_train)))
        val_errors.append(calculate_errors(y_val, cb_rmse.predict(X_val)))

    train_metrics = pd.DataFrame.from_records(train_errors).describe()
    train_metrics['set'] = 'train'
    val_metrics = pd.DataFrame.from_records(val_errors).describe()
    val_metrics['set'] = 'val'
    metric_df = pd.concat([train_metrics, val_metrics])

    return metric_df, cb_rmse

  
def train_cat(
        target: str, 
        prompt_group: pd.DataFrame, 
        features: pd.DataFrame, 
        targets: pd.DataFrame, 
        feature_names: List[str],
        model_params: dict) -> Tuple[pd.DataFrame, CatBoostRegressor]:
    
    
    X_train = features[feature_names]
    y_train = targets[target]


    train_data = Pool(X_train, label=y_train)
    cb_rmse = CatBoostRegressor(loss_function='RMSE', random_seed=8, verbose=0)
    cb_rmse.fit(train_data)

    train_errors = [calculate_errors(y_train, cb_rmse.predict(X_train))]
    train_metrics = pd.DataFrame.from_records(train_errors)

    return train_metrics, cb_rmse

In [46]:
def eval_validation(f_cols_content, f_cols_wording, model_params):
    metric_df_content, bst_content = train_cat_kfold('content', prompt_group, features, targets, f_cols_content, model_params)
    metric_df_wording, bst_wording = train_cat_kfold('wording', prompt_group, features, targets, f_cols_wording, model_params)

    metric_df_content['target'] = 'content'
    metric_df_wording['target'] = 'wording'
    metric_df = pd.concat([metric_df_content, metric_df_wording])
    metric_df = metric_df.loc[['mean', 'std']]
    print(metric_df)

    mcrmse = (metric_df.loc[metric_df.target=='content', 'rmse'] + metric_df.loc[metric_df.target=='wording', 'rmse']) / 2
    
    print(f'\nTrain MCRMSE:\t   {mcrmse.iloc[0]}')
    print(f'Validation MCRMSE: {mcrmse.iloc[1]}')
    print(f'Diff:\t {mcrmse.iloc[1]-mcrmse.iloc[0]}\n')

    # importance = pd.DataFrame({
    # 'importance': bst_wording.feature_importance(),
    # 'feature': bst_wording.feature_name()}).sort_values(by='importance', ascending=False)
    # print(importance)

In [47]:
features.columns

Index(['text_sentence_len_avg', 'text_sentence_len_std', 'text_stopwords',
       'text_unique_bigrams', 'prompt_title_unique_bigrams',
       'prompt_question_unique_bigrams', 'prompt_text_unique_bigrams',
       'text_bigram_overlap', 'question_bigram_overlap', 'text_bigram_ratio',
       'text_bigram_diff', 'question_bigram_diff', 'text_bigram_exclusive',
       'question_bigram_exclusive', 'n_words', 'unique_words', 'unique_ratio',
       'word_len_avg', 'word_len_q10', 'word_len_q90', 'word_len_std',
       'verb_count', 'noun_count', 'adv_count', 'adj_count', 'det_count',
       'missing_wordcount'],
      dtype='object')

In [48]:
model_params={}
f_cols_content = content_top_features.index.tolist()
f_cols_wording = wording_top_features.index.tolist()

eval_validation(f_cols_content=f_cols_content, f_cols_wording=f_cols_wording, model_params=model_params)

            r2      rmse       mae    set   target
mean  0.842277  0.414428  0.323053  train  content
mean  0.797496  0.471400  0.365360    val  content
mean  0.609656  0.637595  0.503403  train  wording
mean  0.409458  0.770637  0.614760    val  wording
std   0.005967  0.012922  0.009865  train  content
std   0.008314  0.039774  0.028145    val  content
std   0.161019  0.167340  0.131719  train  wording
std   0.126847  0.083025  0.066270    val  wording

Train MCRMSE:	   0.5260115582151618
Validation MCRMSE: 0.6210187076607189
Diff:	 0.0950071494455571



model_params = {
    'objective': 'fair', 
    'verbose': 0, 
    'force_col_wise': True,
    'learning_rate': 0.08,
    'boosting_type': 'dart',
    'num_leaves': 11,
    'seed': 42
}
f_cols = ['text_bigram_overlap', 'text_unique_bigrams', 'unique_ratio', 
          'n_words', 'unique_words', 'word_len_avg', 'word_len_q10', 'word_len_q90',
          'verb_count','noun_count','adv_count','adj_count','det_count', 'missing_wordcount']

eval_validation(f_cols, f_cols, model_params)

f_cols = ['text_bigram_overlap', 'text_unique_bigrams', 'unique_ratio', 
          'n_words', 'unique_words', 'word_len_avg', 'word_len_q10', 'word_len_q90',
          'verb_count','noun_count','adv_count','adj_count','det_count']

eval_validation(f_cols, f_cols, model_params)

f_cols = ['text_bigram_overlap', 'text_unique_bigrams', 'unique_ratio', 'missing_wordcount']

eval_validation(f_cols, f_cols, model_params)

f_cols = ['text_bigram_overlap', 'text_unique_bigrams', 'missing_wordcount', 'text_sentence_len_std']

eval_validation(f_cols, f_cols, model_params)

f_cols = ['text_bigram_overlap', 'text_unique_bigrams', 'unique_ratio', 
          'n_words', 'unique_words', 'word_len_avg', 'word_len_q10', 'word_len_q90',
          'text_sentence_len_std']

eval_validation(f_cols, f_cols, model_params)

In [49]:
f_cols = ['text_bigram_overlap', 'text_unique_bigrams', 'unique_ratio', 
          'n_words', 'unique_words', 'word_len_avg', 'word_len_std',
         'missing_wordcount', 'text_sentence_len_std']
eval_validation(f_cols, f_cols, model_params)

            r2      rmse       mae    set   target
mean  0.841182  0.415950  0.323937  train  content
mean  0.797165  0.471909  0.367017    val  content
mean  0.701503  0.565277  0.442705  train  wording
mean  0.594591  0.639909  0.507180    val  wording
std   0.009890  0.021874  0.016581  train  content
std   0.017265  0.048673  0.033805    val  content
std   0.035167  0.049381  0.040893  train  wording
std   0.121224  0.124923  0.106219    val  wording

Train MCRMSE:	   0.4906135232441523
Validation MCRMSE: 0.5559089608427017
Diff:	 0.06529543759854939



In [51]:
f_cols_content = f_cols
f_cols_wording = f_cols

metric_df_content, bst_content = train_cat('content', prompt_group, features, targets, f_cols_content, model_params)
metric_df_wording, bst_wording = train_cat('wording', prompt_group, features, targets, f_cols_wording, model_params)

inference_features = set(f_cols_content) | set(f_cols_wording)
display(f_cols_content)
display(f_cols_wording)
display(inference_features)

['text_bigram_overlap',
 'text_unique_bigrams',
 'unique_ratio',
 'n_words',
 'unique_words',
 'word_len_avg',
 'word_len_std',
 'missing_wordcount',
 'text_sentence_len_std',
 'text_stopwords']

['text_bigram_overlap',
 'text_unique_bigrams',
 'unique_ratio',
 'n_words',
 'unique_words',
 'word_len_avg',
 'word_len_std',
 'missing_wordcount',
 'text_sentence_len_std',
 'text_stopwords']

{'missing_wordcount',
 'n_words',
 'text_bigram_overlap',
 'text_sentence_len_std',
 'text_stopwords',
 'text_unique_bigrams',
 'unique_ratio',
 'unique_words',
 'word_len_avg',
 'word_len_std'}

In [52]:
print(f'\n{"-"*35}\n\tContent scores')
pprint(metric_df_content)
print(f'\n{"-"*35}\n\tWording scores')
pprint(metric_df_wording)


-----------------------------------
	Content scores
         r2      rmse       mae
0  0.896982  0.334925  0.262351

-----------------------------------
	Wording scores
         r2      rmse       mae
0  0.820214  0.439266  0.343177


In [53]:
print('Full data MCRMSE: ')
(metric_df_content.rmse + metric_df_wording.rmse) / 2

Full data MCRMSE: 


0    0.387095
Name: rmse, dtype: float64

In [54]:
bst_content.save_model(content_model)
bst_wording.save_model(wording_model)