In [1]:
import os
import random
from collections import defaultdict
from functools import partial, reduce
from math import sqrt
from operator import or_
from pathlib import Path
from pprint import pprint
from typing import Any, Dict, List, Optional, Set, Tuple, Union

import datasets
import nltk
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool
from catboost.utils import eval_metric
from nltk import ngrams
from nltk.corpus import stopwords, words
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm

# Location of the NLTK corpora bundled with the training-output dataset.
# (Plain string: the original f-string had no placeholders.)
download_dir = '/kaggle/input/commonlit-cat-nlp-train-01/nltk_data'

# The environment variable is exported for any code that reads NLTK_DATA later;
# `nltk` has already been imported above, so the directory is also appended to
# its live search path explicitly.
os.environ['NLTK_DATA'] = download_dir
nltk.data.path.append(download_dir)

# Seed both the stdlib and the NumPy global random generators.
seed = 42
random.seed(seed)
np.random.seed(seed)



In [2]:
# NLP resources shared by the tokenisation helpers below.
stop_words = set(stopwords.words('english'))
lemmatiser = WordNetLemmatizer()

# Kaggle input layout. `train_output` names the dataset that holds the trained
# artefacts; alternatives that were left commented out here:
#   Path('commonlit-lgb-nlp-train-01'), Path('commonlit-lgb-lucky')
kaggle_input = Path('/kaggle/input')
train_output = Path('commonlit-cat-nlp-train-01')
data_dir = kaggle_input.joinpath('commonlit-evaluate-student-summaries')

# Competition CSV files.
sample_submission = data_dir.joinpath('sample_submission.csv')
summaries_train = data_dir.joinpath('summaries_train.csv')
summaries_test = data_dir.joinpath('summaries_test.csv')
prompts_train = data_dir.joinpath('prompts_train.csv')
prompts_test = data_dir.joinpath('prompts_test.csv')

# Trained model files and the word list, all read from the artefact dataset.
content_model = kaggle_input.joinpath(train_output, 'content.txt')
wording_model = kaggle_input.joinpath(train_output, 'wording.txt')
word_dictionary = kaggle_input.joinpath(train_output, 'commonlit_words.txt')

In [3]:
def text_tokenize(text: str) -> List[str]:
    """Tokenise ``text`` and return the lower-cased lemma of every kept token.

    A token is kept when it is purely alphanumeric and is not in ``stop_words``.

    NOTE(review): the stop-word test is applied to the token *before* it is
    lower-cased, so capitalised stop words (e.g. "The") are kept. This is left
    untouched on purpose: the shipped models were presumably trained on
    features produced by this exact behaviour - confirm before changing.
    """
    lemmas = []
    for tok in word_tokenize(text):
        if not tok.isalnum() or tok in stop_words:
            continue
        lemmas.append(lemmatiser.lemmatize(tok.lower()))
    return lemmas


def make_bigram(tokens: List[str]) -> Set[Tuple[str, str]]:
    """Return the set of distinct adjacent token pairs in ``tokens``.

    Args:
        tokens: Ordered tokens. A ``list`` (or list subclass) is used as is;
            objects exposing ``tolist()`` (NumPy arrays, pandas Series) are
            converted through it; any other iterable is materialised with
            ``list()``.

    Returns:
        A set of ``(first, second)`` tuples, one per distinct pair of
        consecutive tokens. Empty when fewer than two tokens are given.
    """
    if not isinstance(tokens, list):
        # Columns that went through Arrow/pandas come back as arrays rather
        # than lists; tuples and generators have no tolist() at all.
        tokens = tokens.tolist() if hasattr(tokens, 'tolist') else list(tokens)
    # Pairing the sequence with itself shifted by one yields the same tuples
    # as a sliding window of width two.
    return set(zip(tokens, tokens[1:]))


def tokenize(row: Dict[str, Any]) -> None:
    """Add ``<col>_lemmas`` and ``<col>_bigram`` entries to ``row`` in place.

    Handles the three prompt fields ``prompt_text``, ``prompt_title`` and
    ``prompt_question``. Nothing is returned; ``row`` itself is mutated.

    NOTE(review): the original column list named ``prompt_text`` twice, which
    only recomputed identical values. If the duplicate was meant to be another
    field, add it here.
    """
    for col in ['prompt_text', 'prompt_title', 'prompt_question']:
        lemmas = text_tokenize(row[col])
        row[f'{col}_lemmas'] = lemmas
        row[f'{col}_bigram'] = make_bigram(lemmas)


def nlp_preprocess(df: pd.DataFrame, column: str):
    """Add lemma and bigram columns derived from ``df[column]``, in place.

    Two columns are written to ``df``: ``<column>_lemmas`` (output of
    ``text_tokenize`` per row) and ``<column>_bigram`` (output of
    ``make_bigram`` on those lemmas). Nothing is returned.
    """
    lemma_column = f'{column}_lemmas'
    bigram_column = f'{column}_bigram'
    df[lemma_column] = df[column].apply(text_tokenize)
    df[bigram_column] = df[lemma_column].apply(make_bigram)


# def batch_tokenize(data: Dict[str, Any]) -> List[str]:
#     lemmas = [text_tokenize(row) for row in data]
#     return lemmas


def process_col(data: List[str], col: str) -> Dict[str, List[Any]]:
    """Compute per-text NLP features for a batch of texts.

    Args:
        data: The batch of raw texts (one string per example).
        col: Prefix used for the keys of the returned mapping.

    Returns:
        A dict of equal-length lists keyed by
        ``<col>_lemmas``, ``<col>_bigram``, ``<col>_sentence_len_avg``,
        ``<col>_sentence_len_std`` and ``<col>_stopwords``.
    """
    lemmas = [text_tokenize(text) for text in data]
    bigrams = [make_bigram(tokens) for tokens in lemmas]

    # NOTE(review): text_tokenize already drops tokens found in stop_words
    # (tested before lower-casing), so this only counts lemmas that match a
    # stop word after lower-casing/lemmatising.
    n_stopwords = [sum(lemma in stop_words for lemma in tokens) for tokens in lemmas]

    # Part-of-speech tagging is currently disabled:
    #     pos = [pos_tag(lemma, tagset='universal') for lemma in lemmas]

    sentence_len_avg, sentence_len_std = [], []
    for text in data:
        # Sentence length = number of whitespace-separated words.
        lengths = [len(sentence.split()) for sentence in sent_tokenize(text)]
        if lengths:
            sentence_len_avg.append(np.mean(lengths))
            sentence_len_std.append(np.std(lengths))
        else:
            # Same value NumPy produces for an empty input, without the
            # "mean of empty slice" RuntimeWarning.
            sentence_len_avg.append(np.nan)
            sentence_len_std.append(np.nan)

    return {f'{col}_lemmas': lemmas,
            f'{col}_bigram': bigrams,
            f'{col}_sentence_len_avg': sentence_len_avg,
            f'{col}_sentence_len_std': sentence_len_std,
            f'{col}_stopwords': n_stopwords,
            # 'pos': pos
            }


def make_split(summaries_path: Path, prompts_path: Path, dtype_backend: Optional[str] = 'pyarrow') -> pd.DataFrame:
    """Load one data split and attach the text features to every summary.

    Args:
        summaries_path: CSV with the student summaries (needs ``text`` and
            ``prompt_id`` columns).
        prompts_path: CSV with the prompts (needs ``prompt_id``,
            ``prompt_title``, ``prompt_question`` and ``prompt_text``).
        dtype_backend: Accepted for backward compatibility; the body does not
            use it.

    Returns:
        The summaries left-joined with their prompt on ``prompt_id``, including
        the lemma/bigram/sentence features computed here.
    """
    summaries_raw = pd.read_csv(summaries_path)
    prompts_df = pd.read_csv(prompts_path)

    for column in ['prompt_title', 'prompt_question', 'prompt_text']:
        nlp_preprocess(prompts_df, column)
        prompts_df[f'{column}_unique_bigrams'] = prompts_df[f'{column}_bigram'].str.len()

    def add_text_features(batch):
        # Existing batch columns are spread last so they win on a key clash.
        return {**process_col(batch['text'], col='text'), **batch}

    summaries_dataset = datasets.Dataset.from_pandas(summaries_raw, preserve_index=False)
    summaries_df = summaries_dataset.map(
        function=add_text_features,
        num_proc=os.cpu_count(),
        keep_in_memory=True,
        batched=True,
    ).to_pandas()

    # After the round trip through `datasets` the bigrams are no longer sets of
    # tuples; rebuild them so set algebra works downstream.
    summaries_df['text_bigram'] = summaries_df['text_bigram'].apply(
        lambda pairs: {(pair[0], pair[1]) for pair in pairs})
    summaries_df['text_unique_bigrams'] = summaries_df['text_bigram'].str.len()

    # The original ended with a bare `df.fillna('')` whose result was thrown
    # away, so missing values were never filled. The no-op is dropped rather
    # than "fixed": filling with '' would turn any numeric column containing
    # NaN into an object column.
    return pd.merge(summaries_df, prompts_df, how='left', on='prompt_id')

In [4]:
# Build the feature frame for the test split.
# For the training split use: make_split(summaries_train, prompts_train)
df = make_split(summaries_path=summaries_test, prompts_path=prompts_test)

      

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

In [5]:
# `df_test` is an alias of `df` (no copy is made): every column added through
# one name is visible through the other.
df_test = df

# Set operations between a prompt's bigrams and a summary's bigrams.
_BIGRAM_SET_OPS = {
    'overlap': lambda prompt, summary: prompt & summary,    # shared bigrams
    'diff': lambda prompt, summary: summary - prompt,       # only in the summary
    'exclusive': lambda prompt, summary: prompt ^ summary,  # in exactly one of the two
}


def _bigram_set_share(frame: pd.DataFrame, prompt_col: str, kind: str) -> pd.Series:
    """Size of a bigram set operation divided by the summary's unique bigrams.

    Args:
        frame: Frame holding ``prompt_col``, ``text_bigram`` (both sets) and
            ``text_unique_bigrams``.
        prompt_col: Name of the prompt-side bigram column.
        kind: Key of ``_BIGRAM_SET_OPS`` selecting the set operation.

    The two columns are zipped instead of using ``apply(axis=1)`` with
    ``row[0]`` / ``row[1]``: integer keys on a string-labelled Series are
    deprecated positional indexing in recent pandas. A summary with zero
    unique bigrams still divides by zero, as before.
    """
    set_op = _BIGRAM_SET_OPS[kind]
    sizes = [len(set_op(prompt_bigrams, summary_bigrams))
             for prompt_bigrams, summary_bigrams in zip(frame[prompt_col], frame['text_bigram'])]
    return pd.Series(sizes, index=frame.index, dtype='int64') / frame['text_unique_bigrams']


# Create n-gram based features
df_test['text_bigram_overlap'] = _bigram_set_share(df_test, 'prompt_text_bigram', 'overlap')
df_test['question_bigram_overlap'] = _bigram_set_share(df_test, 'prompt_question_bigram', 'overlap')
df_test['text_bigram_ratio'] = df_test['text_unique_bigrams'] / df_test['prompt_text_unique_bigrams']

df_test['text_bigram_diff'] = _bigram_set_share(df_test, 'prompt_text_bigram', 'diff')
df_test['question_bigram_diff'] = _bigram_set_share(df_test, 'prompt_question_bigram', 'diff')

df_test['text_bigram_exclusive'] = _bigram_set_share(df_test, 'prompt_text_bigram', 'exclusive')
df_test['question_bigram_exclusive'] = _bigram_set_share(df_test, 'prompt_question_bigram', 'exclusive')

In [6]:
# Vocabulary-size features: total lemmas, distinct lemmas, and their ratio.
df_test['n_words'] = df_test['text_lemmas'].str.len()
df_test['unique_words'] = df_test['text_lemmas'].apply(set).str.len()
df_test['unique_ratio'] = df_test['unique_words'].div(df_test['n_words'])

In [7]:
# Character length of every lemma, then the mean length per summary.
df_test['word_lengths'] = df_test['text_lemmas'].apply(lambda lemmas: list(map(len, lemmas)))
df_test['word_len_avg'] = df_test['word_lengths'].apply(np.mean)

In [8]:
# Spread of lemma lengths per summary: 10th / 90th percentile and standard deviation.
df_test['word_len_q10'] = df_test.word_lengths.apply(partial(np.percentile, q=10))
df_test['word_len_q90'] = df_test.word_lengths.apply(partial(np.percentile, q=90))
df_test['word_len_std'] = df_test.word_lengths.apply(np.std)


def pos_counts(tags):
    """Count how often each part-of-speech tag occurs.

    Args:
        tags: Iterable of ``(token, tag)`` pairs.

    Returns:
        A ``defaultdict(int)`` mapping tag -> number of occurrences.
    """
    counts = defaultdict(int)
    for _, pos in tags:
        counts[pos] += 1
    return counts


# The `pos` column only exists when POS tagging is enabled in `process_col`
# (it is commented out there at the moment). Without this guard the cell raised
# AttributeError on a clean run; the POS features are skipped when the column is absent.
if 'pos' in df_test.columns:
    df_test['pos_counts'] = df_test['pos'].apply(pos_counts)
    for tag in ('VERB', 'NOUN', 'ADV', 'ADJ', 'DET'):
        # Tags that never occur in a summary come back missing -> count 0.
        df_test[f'{tag.lower()}_count'] = df_test['pos_counts'].str[tag].fillna(0)

In [9]:
# Known-word list: every whitespace-separated entry of the dictionary file.
# str.split() with no argument already discards surrounding whitespace.
en_words = set(word_dictionary.read_text().split())

In [10]:
def get_unique_words(col: str) -> Set[str]:
    """Return the union of all tokens found in column ``col`` of ``df_test``.

    Each cell of the column is expected to be an iterable of tokens. An empty
    column yields an empty set (the previous ``reduce`` without an initial
    value raised ``TypeError`` in that case).
    """
    vocabulary: Set[str] = set()
    for tokens in df_test[col]:
        # Accumulate in place instead of building one temporary set per row.
        vocabulary.update(tokens)
    return vocabulary


prompt_words = get_unique_words('prompt_text_lemmas')
question_words = get_unique_words('prompt_question_lemmas')
title_words = get_unique_words('prompt_title_lemmas')

# Everything counted as a "known" word: the dictionary plus all prompt vocabulary.
word_set = en_words | prompt_words | question_words | title_words

In [11]:
def count_missing_words(tokens):
    """Count the purely alphabetic tokens that are absent from ``word_set``."""
    missing = 0
    for word in tokens:
        if word.isalpha() and word not in word_set:
            missing += 1
    return missing


df_test['missing_wordcount'] = df_test['text_lemmas'].apply(count_missing_words)

In [12]:
# Every numeric column except the two targets is a candidate feature.
numeric_features = df_test.select_dtypes(include=np.number)
target_columns = ['content', 'wording']
feature_columns = [name for name in numeric_features.columns if name not in target_columns]

features = numeric_features.loc[:, feature_columns]
prompt_group = pd.Categorical(df['prompt_title'])

# Per-target feature lists. Both names are reassigned in the following cell,
# which is what the prediction step ends up using.
f_cols_content = [
    'text_unique_bigrams', 'text_bigram_overlap', 'text_bigram_ratio',
    'text_bigram_diff', 'text_bigram_exclusive', 'question_bigram_exclusive',
    'n_words', 'unique_words', 'unique_ratio',
    'verb_count', 'noun_count', 'adv_count', 'adj_count', 'det_count',
    'missing_wordcount',
]

f_cols_wording = [
    'text_sentence_len_std',
    'text_unique_bigrams', 'text_bigram_ratio',
    'text_bigram_exclusive', 'question_bigram_exclusive',
    'n_words', 'unique_words', 'unique_ratio',
    'verb_count', 'noun_count', 'adv_count', 'adj_count', 'det_count',
    'missing_wordcount',
]

In [13]:
# Final feature lists fed to the models; the order is the order the models receive.
# Both targets use the same features, kept as two independent list objects.
f_cols_content = [
    'text_bigram_overlap',
    'text_unique_bigrams',
    'unique_ratio',
    'n_words',
    'unique_words',
    'word_len_avg',
    'word_len_std',
    'missing_wordcount',
    'text_sentence_len_std',
]
f_cols_wording = list(f_cols_content)

In [14]:
# Restore the two trained regressors (one per target) from the artefact dataset.
content_bst = CatBoostRegressor()
content_bst.load_model(fname=content_model)

wording_bst = CatBoostRegressor()
wording_bst.load_model(fname=wording_model)

In [15]:
# Score every summary with both models. `df` and `df_test` are the same frame,
# so the prediction columns are stored next to the features.
content_pred = content_bst.predict(df_test[f_cols_content])
df['content'] = content_pred
wording_pred = wording_bst.predict(df_test[f_cols_wording])
df['wording'] = wording_pred

# Submission file: one row per student with the two predicted scores.
submission_df = df.loc[:, ['student_id', 'content', 'wording']]
submission_df.to_csv('submission.csv', index=False)