In [None]:
import os
download_dir = '/kaggle/input/commonlit-lgb-nlp-train-01/nltk_data'
os.environ['NLTK_DATA'] = download_dir

import nltk
nltk.data.path.append(download_dir)
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk import ngrams

import datasets
import pandas as pd
import numpy as np
from tqdm import tqdm
from joblib import Parallel, delayed
import plotly.express as px
from sklearn.model_selection import GroupKFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import lightgbm as lgb
from math import sqrt

from collections import defaultdict
from functools import partial, reduce
from operator import or_
from pathlib import Path
from pprint import pprint
from typing import Optional, Union, List, Tuple, Dict, Set, Any

In [None]:
# Shared NLP resources used by text_tokenize.
stop_words = set(stopwords.words('english'))
lemmatiser = WordNetLemmatizer()

# Root of the mounted Kaggle datasets and the dataset holding the trained artefacts.
kaggle_input = Path('/kaggle/input')
# train_output = Path('commonlit-lgb-nlp-train-01')
train_output = Path('commonlit-lgb-lucky')
data_dir = kaggle_input.joinpath('commonlit-evaluate-student-summaries')

# Competition CSV files.
sample_submission = data_dir.joinpath('sample_submission.csv')
summaries_train = data_dir.joinpath('summaries_train.csv')
summaries_test = data_dir.joinpath('summaries_test.csv')
prompts_train = data_dir.joinpath('prompts_train.csv')
prompts_test = data_dir.joinpath('prompts_test.csv')

# Model files and word list stored under the training-output dataset.
content_model = kaggle_input.joinpath(train_output, 'content.txt')
wording_model = kaggle_input.joinpath(train_output, 'wording.txt')
word_dictionary = kaggle_input.joinpath(train_output, 'commonlit_words.txt')

In [None]:
def predict(model: lgb.Booster, df: pd.DataFrame, features: List[str]) -> pd.Series:
    """Run ``model`` on the ``features`` columns of ``df``.

    Args:
        model: Object exposing ``predict``; annotated as a LightGBM booster.
        df: Frame containing at least the columns named in ``features``.
        features: Column names, selected in this order.

    Returns:
        Whatever ``model.predict`` returns for the selected columns.
    """
    feature_frame = df[features]
    predictions = model.predict(feature_frame)
    return predictions

In [None]:
def text_tokenize(text: str) -> List[str]:
    """Split ``text`` into lower-cased, lemmatised word tokens.

    Tokens that are not purely alphanumeric, or that appear in ``stop_words``,
    are dropped; the survivors are lower-cased and lemmatised.

    NOTE(review): the stop-word test runs on the token *before* lower-casing,
    so capitalised stop words (e.g. "The") are kept. The trained models were
    presumably fitted on features built the same way - confirm before changing.
    """
    lemmas = []
    for tok in word_tokenize(text):
        if not tok.isalnum() or tok in stop_words:
            continue
        lemmas.append(lemmatiser.lemmatize(tok.lower()))
    return lemmas

        
def make_bigram(tokens: List[str]) -> Set[Tuple[str, str]]:
    """Return the set of distinct adjacent token pairs in ``tokens``.

    Args:
        tokens: Token sequence. Lists are used directly; array-likes exposing
            ``tolist()`` are converted through it; any other iterable (tuple,
            generator, ...) is materialised with ``list()``.

    Returns:
        A set of ``(first, second)`` tuples, one per distinct pair of
        consecutive tokens. Empty when fewer than two tokens are given.
    """
    if hasattr(tokens, 'tolist'):
        tokens = tokens.tolist()
    elif not isinstance(tokens, list):
        tokens = list(tokens)
    # Pairing the sequence with itself shifted by one yields the same tuples
    # as a sliding window of size two.
    return set(zip(tokens, tokens[1:]))


def tokenize(row: Dict[str, Any]) -> None:
    """Add lemma and bigram entries for each prompt field of ``row`` in place.

    For every prompt column ``c`` this writes ``row['<c>_lemmas']`` (output of
    ``text_tokenize``) and ``row['<c>_bigram']`` (output of ``make_bigram``).

    Args:
        row: Mapping with ``prompt_text``, ``prompt_title`` and
            ``prompt_question`` entries; it is mutated.

    Returns:
        None - the results are stored on ``row``.
    """
    # Each prompt column is processed exactly once.
    for col in ('prompt_text', 'prompt_title', 'prompt_question'):
        lemmas = text_tokenize(row[col])
        row[f'{col}_lemmas'] = lemmas
        row[f'{col}_bigram'] = make_bigram(lemmas)

        
def nlp_preprocess(df: pd.DataFrame, column: str):
    """Add ``<column>_lemmas`` and ``<column>_bigram`` columns to ``df`` in place.

    The lemma column holds ``text_tokenize`` applied to each value of
    ``column``; the bigram column holds ``make_bigram`` applied to those lemmas.
    """
    lemma_col = f'{column}_lemmas'
    bigram_col = f'{column}_bigram'
    df[lemma_col] = df[column].apply(text_tokenize)
    df[bigram_col] = df[lemma_col].apply(make_bigram)
    
    
def batch_tokenize(data: Dict[str, Any]) -> List[str]:
    """Apply ``text_tokenize`` to every item yielded by iterating ``data``.

    Returns a list with one token list per item, in iteration order.
    NOTE(review): the annotations look looser than the actual behaviour
    (items are passed straight to ``text_tokenize``) - confirm intended types.
    """
    return list(map(text_tokenize, data))

def process_col(data: Dict[str, Any], col: str) -> List[str]:
    """Tokenise a batch of texts and derive their bigram sets.

    Args:
        data: Iterable of values, each passed to ``text_tokenize``.
        col: Prefix used to name the two output keys.

    Returns:
        A dict with ``'<col>_lemmas'`` (list of token lists) and
        ``'<col>_bigram'`` (list of bigram sets), aligned with ``data``.
        NOTE(review): the ``List[str]`` return annotation does not match this.
    """
    lemmas, bigrams = [], []
    for row in data:
        row_lemmas = text_tokenize(row)
        lemmas.append(row_lemmas)
        bigrams.append(make_bigram(row_lemmas))
    return {f'{col}_lemmas': lemmas, f'{col}_bigram': bigrams}

def make_split(summaries_path: Path, prompts_path: Path, dtype_backend: Optional[str] = 'pyarrow') -> pd.DataFrame:
    """Load summaries and prompts, tokenise them and join them on ``prompt_id``.

    Args:
        summaries_path: CSV with at least ``prompt_id`` and ``text`` columns.
        prompts_path: CSV with ``prompt_id``, ``prompt_title``,
            ``prompt_question`` and ``prompt_text`` columns.
        dtype_backend: Kept for backward compatibility; this function does not
            use it.

    Returns:
        One row per summary (left join), carrying the ``*_lemmas``,
        ``*_bigram`` and ``*_unique_bigrams`` columns for the summary text and
        for each prompt field. Summaries whose ``prompt_id`` has no match keep
        NaN in the prompt columns.
    """
    summaries_df = pd.read_csv(summaries_path)
    prompts_df = pd.read_csv(prompts_path)

    for column in ['prompt_title', 'prompt_question', 'prompt_text']:
        # read_csv yields NaN for empty fields, which the tokenizer cannot
        # process; treat them as empty strings before tokenising.
        prompts_df[column] = prompts_df[column].fillna('')
        nlp_preprocess(prompts_df, column)
        prompts_df[f'{column}_unique_bigrams'] = prompts_df[f'{column}_bigram'].str.len()

    # Same NaN -> '' normalisation for the summary text.
    summaries_df['text'] = summaries_df['text'].fillna('')

    # Tokenise the summaries in parallel, batch by batch.
    summaries_dataset = datasets.Dataset.from_pandas(summaries_df, preserve_index=False)
    proc_func = partial(process_col, col='text')
    summaries_df = summaries_dataset.map(
        function=lambda example: {**proc_func(example['text']), **example},
        num_proc=os.cpu_count(),
        keep_in_memory=True,
        batched=True,
    ).to_pandas()
    # Rebuild each row's bigrams as a set of 2-tuples so that set operations
    # against the prompt bigram sets work (presumably the dataset round trip
    # returns them as nested sequences - confirm).
    summaries_df['text_bigram'] = summaries_df.text_bigram.apply(lambda row: {(x[0], x[1]) for x in row})
    summaries_df['text_unique_bigrams'] = summaries_df['text_bigram'].str.len()

    return pd.merge(summaries_df, prompts_df, how='left', on='prompt_id')

In [None]:
# Load data: build the merged summaries + prompts frame for the test split.
# The commented-out call below builds the same frame from the training files.
# df = make_split(summaries_train, prompts_train)
df = make_split(summaries_test, prompts_test)

In [None]:
# `df_test` is an alias of `df`, not a copy: every column added below is also
# visible through `df`.
df_test = df


def _bigram_set_sizes(prompt_col: str, combine) -> pd.Series:
    """Size of ``combine(prompt_bigrams, summary_bigrams)`` for every row.

    Args:
        prompt_col: Name of the prompt-side bigram-set column in ``df_test``.
        combine: Callable taking (prompt set, summary set) and returning a set.

    Returns:
        Integer Series aligned with ``df_test.index``.
    """
    # Iterating the two columns by label avoids integer-position lookups on a
    # label-indexed row Series and the overhead of a row-wise DataFrame.apply.
    sizes = [
        len(combine(prompt_bigrams, summary_bigrams))
        for prompt_bigrams, summary_bigrams in zip(df_test[prompt_col], df_test['text_bigram'])
    ]
    return pd.Series(sizes, index=df_test.index, dtype='int64')


def _shared(prompt_bigrams, summary_bigrams):
    """Bigrams present in both the prompt and the summary."""
    return prompt_bigrams & summary_bigrams


def _summary_only(prompt_bigrams, summary_bigrams):
    """Bigrams present in the summary but not in the prompt."""
    return summary_bigrams - prompt_bigrams


def _exclusive(prompt_bigrams, summary_bigrams):
    """Bigrams present in exactly one of the two sets."""
    return prompt_bigrams ^ summary_bigrams


# Create n-gram based features; each count is normalised by the number of
# unique bigrams in the summary.
df_test['text_bigram_overlap'] = _bigram_set_sizes('prompt_text_bigram', _shared) / df_test['text_unique_bigrams']
df_test['question_bigram_overlap'] = _bigram_set_sizes('prompt_question_bigram', _shared) / df_test['text_unique_bigrams']
df_test['text_bigram_ratio'] = df_test['text_unique_bigrams'] / df_test['prompt_text_unique_bigrams']

df_test['text_bigram_diff'] = _bigram_set_sizes('prompt_text_bigram', _summary_only) / df_test['text_unique_bigrams']
df_test['question_bigram_diff'] = _bigram_set_sizes('prompt_question_bigram', _summary_only) / df_test['text_unique_bigrams']

df_test['text_bigram_exclusive'] = _bigram_set_sizes('prompt_text_bigram', _exclusive) / df_test['text_unique_bigrams']
df_test['question_bigram_exclusive'] = _bigram_set_sizes('prompt_question_bigram', _exclusive) / df_test['text_unique_bigrams']

In [None]:
# Create word based features: token count, distinct-token count and their ratio.
df_test['n_words'] = df_test['text_lemmas'].str.len()
df_test['unique_words'] = df_test['text_lemmas'].apply(set).str.len()
df_test['unique_ratio'] = df_test['unique_words'] / df_test['n_words']

In [None]:
# Per-summary list of lemma lengths, and the mean of that list.
df_test['word_lengths'] = df_test['text_lemmas'].apply(lambda lemmas: [len(lemma) for lemma in lemmas])
df_test['word_len_avg'] = df_test['word_lengths'].apply(np.mean)

In [None]:
def _length_percentile(lengths, q):
    """``q``-th percentile of ``lengths``, or NaN when ``lengths`` is empty.

    The explicit empty check keeps a summary with no surviving tokens from
    depending on how np.percentile treats an empty sequence (it may raise);
    NaN matches what np.mean gives for the same row.
    """
    if len(lengths) == 0:
        return np.nan
    return np.percentile(lengths, q=q)


# 10th and 90th percentile of lemma length per summary.
df_test['word_len_q10'] = df_test['word_lengths'].apply(partial(_length_percentile, q=10))
df_test['word_len_q90'] = df_test['word_lengths'].apply(partial(_length_percentile, q=90))

# Read the word list: one entry per line, surrounding whitespace stripped.
with open(word_dictionary, 'r') as f:
    word_set = {line.strip() for line in f.read().split('\n')}
    
def get_unique_words(col: str) -> Set[str]:
    """Return every distinct token found in column ``col`` of ``df_test``.

    Args:
        col: Name of a ``df_test`` column whose values are iterables of tokens.

    Returns:
        The union of all tokens across rows; an empty set when the frame has
        no rows.
    """
    # A single n-ary union avoids building an intermediate set per row and,
    # unlike a fold without a start value, is well defined for zero rows.
    return set().union(*df_test[col])

# Vocabulary seen in the prompt text, question and title columns.
prompt_words = get_unique_words('prompt_text_lemmas')
question_words = get_unique_words('prompt_question_lemmas')
title_words = get_unique_words('prompt_title_lemmas')

# Words from the prompts count as known words too.
word_set = word_set.union(prompt_words, question_words, title_words)


def count_missing_words(tokens):
    """Number of purely alphabetic tokens that are absent from ``word_set``."""
    return sum(1 for word in tokens if word.isalpha() and word not in word_set)


df_test['missing_wordcount'] = df_test['text_lemmas'].apply(count_missing_words)

In [None]:
# Load the trained boosters. The paths are pathlib.Path objects; passing them
# through str() keeps this working on LightGBM releases whose Booster only
# accepts a string filename, and is harmless on releases that accept Path.
content_bst = lgb.Booster(model_file=str(content_model))
wording_bst = lgb.Booster(model_file=str(wording_model))

In [None]:
# Columns passed to both boosters.
# NOTE(review): presumably this must match the feature list used at training
# time - confirm against the training notebook.
features = [
    'text_bigram_overlap', 'text_unique_bigrams', 'unique_ratio',
    'n_words', 'unique_words', 'word_len_avg', 'word_len_q10', 'word_len_q90',
    # 'missing_wordcount',
]

# One prediction column per target, written onto `df`.
for target, booster in (('content', content_bst), ('wording', wording_bst)):
    df[target] = predict(booster, df_test, features)

# Write the submission file.
submission_df = df.loc[:, ['student_id', 'content', 'wording']]
submission_df.to_csv('submission.csv', index=False)