In [1]:
import string
from pathlib import Path
from typing import Optional, Union, List, Set, Tuple

import joblib
import nltk
import numba
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/chris/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/chris/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/chris/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Shared WordNet lemmatiser instance used by the feature-extraction helpers.
lemmatiser = WordNetLemmatizer()
# English stop words held in a set so membership tests are cheap.
stop_words = set(stopwords.words('english'))

In [3]:
# Folder holding the input CSV files, relative to the notebook's working directory.
data_dir = Path('../data')

# Individual input files, each resolved against ``data_dir``.
sample_submission = data_dir.joinpath('sample_submission.csv')
summaries_train = data_dir.joinpath('summaries_train.csv')
summaries_test = data_dir.joinpath('summaries_test.csv')
prompts_train = data_dir.joinpath('prompts_train.csv')
prompts_test = data_dir.joinpath('prompts_test.csv')

def make_split(summaries_path: Path, prompts_path: Path, dtype_backend: Optional[str] = 'pyarrow') -> pd.DataFrame:
    """Load a summaries CSV and a prompts CSV and join them on ``prompt_id``.

    Args:
        summaries_path: CSV with one row per summary; must contain ``prompt_id``.
        prompts_path: CSV with one row per prompt; must contain ``prompt_id``.
        dtype_backend: Value forwarded to ``pd.read_csv(dtype_backend=...)``.
            ``None`` means "use pandas' default backend".

    Returns:
        The inner join of both files, one row per summary.

    Raises:
        AssertionError: If the prompts file repeats a ``prompt_id`` or if any
            summary has no matching prompt.
    """
    # pandas uses a sentinel (not ``None``) as the default for ``dtype_backend``
    # and rejects an explicit ``None``, so only forward the argument when set.
    read_kwargs = {} if dtype_backend is None else {'dtype_backend': dtype_backend}
    summaries_df = pd.read_csv(summaries_path, **read_kwargs)
    prompts_df = pd.read_csv(prompts_path, **read_kwargs)

    # Repeated prompt ids would duplicate summary rows in the join, which can
    # offset dropped rows and let the length check below pass by accident.
    if not prompts_df['prompt_id'].is_unique:
        raise AssertionError('Duplicate prompt ids found in the prompts file')

    df = pd.merge(summaries_df, prompts_df, how='inner', on='prompt_id')

    # With unique prompt ids, a shorter result means some summaries were unmatched.
    if len(df) != len(summaries_df):
        raise AssertionError('Could not match all prompt ids to a prompt')

    return df

# Build the training frame: every training summary joined to its prompt.
df = make_split(summaries_path=summaries_train, prompts_path=prompts_train)

In [4]:
# Show the first merged row as a quick check of the joined columns.
df.head(1)

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background The Third Wave experiment took pl...


In [5]:
def clear_stopwords(column: pd.Series, idx: int) -> Tuple[
    List[str], List[str], List[str],
    Set[Tuple[str, ...]], Set[Tuple[str, ...]], Set[Tuple[str, ...]],
]:
    """Tokenise one entry of ``column`` and derive its n-gram sets.

    Args:
        column: Series of raw text.
        idx: Positional index (``iloc``) of the entry to process.

    Returns:
        A 6-tuple ``(tokens, cleared_stopwords, lemmas, bigram, tri_gram,
        four_gram)``:

        * ``tokens`` - lower-cased alphanumeric tokens,
        * ``cleared_stopwords`` - ``tokens`` without English stop words,
        * ``lemmas`` - lemmatised ``cleared_stopwords``,
        * ``bigram`` / ``tri_gram`` / ``four_gram`` - sets of unique
          2-, 3- and 4-grams built from ``lemmas``.
    """
    text = column.iloc[idx]
    # A missing value cannot be tokenised; treat it as empty text so the row
    # yields empty features instead of aborting the whole run.
    if pd.isna(text):
        text = ''

    # ``isalnum`` drops punctuation and tokens that mix letters with symbols.
    tokens = [tok.lower() for tok in word_tokenize(text) if tok.isalnum()]
    cleared_stopwords = [tok for tok in tokens if tok not in stop_words]
    lemmas = [lemmatiser.lemmatize(tok) for tok in cleared_stopwords]
    # Sets keep only the unique n-grams; order and repetition are discarded.
    bigram = set(ngrams(lemmas, 2))
    tri_gram = set(ngrams(lemmas, 3))
    four_gram = set(ngrams(lemmas, 4))

    return tokens, cleared_stopwords, lemmas, bigram, tri_gram, four_gram

def nlp_splits(df: pd.DataFrame, column: str, n_jobs: int = 4) -> None:
    """Add token, lemma and n-gram columns for ``column`` to ``df`` in place.

    Runs ``clear_stopwords`` on every row of ``df[column]`` through joblib and
    stores the six results as ``{column}_tokens``, ``{column}_no_stopwords``,
    ``{column}_lemmas``, ``{column}_bigram``, ``{column}_trigram`` and
    ``{column}_fourgram``.

    Args:
        df: Frame to extend; it is mutated, nothing is returned.
        column: Name of the text column to process.
        n_jobs: Number of joblib workers (defaults to the previous fixed value).
    """
    series = df[column]
    # Hand each task a one-row slice rather than the whole column, so the full
    # Series does not have to be shipped to the workers with the dispatched tasks.
    output = Parallel(n_jobs=n_jobs)(
        delayed(clear_stopwords)(series.iloc[[idx]], 0) for idx in range(len(series))
    )

    # Suffixes are listed in the order ``clear_stopwords`` returns its results.
    suffixes = ('tokens', 'no_stopwords', 'lemmas', 'bigram', 'trigram', 'fourgram')
    for position, suffix in enumerate(suffixes):
        df[f'{column}_{suffix}'] = [part[position] for part in output]

# Run the extractor on the first summary only.
# NOTE(review): looks like a manual smoke test; confirm ``x`` is still needed.
x = clear_stopwords(df['text'], 0)

In [6]:
# Text columns that get token / lemma / n-gram features.
text_columns = ['prompt_title', 'prompt_question', 'prompt_text', 'text']
for column in text_columns:
    nlp_splits(df, column)
    # Size of each row's n-gram set, i.e. the number of distinct n-grams.
    df[f'{column}_unique_bigrams'], df[f'{column}_unique_trigrams'] = (
        df[f'{column}_bigram'].str.len(),
        df[f'{column}_trigram'].str.len(),
    )

In [7]:
def _add_bigram_set_features(frame: pd.DataFrame, answer_column: str = 'text_bigram') -> None:
    """Add bigram set-comparison counts between each prompt field and the answer.

    For every prompt field (text, question, title) and every set operation,
    a column ``{field}_bigram_{operation}`` is written to ``frame`` in place:

    * ``overlap`` - bigrams present in both prompt field and answer,
    * ``diff`` - bigrams in the prompt field that the answer lacks,
    * ``exclusive`` - bigrams present in exactly one of the two.

    Args:
        frame: Frame holding the ``prompt_*_bigram`` columns and
            ``answer_column``; every cell must be a ``set``.
        answer_column: Column with the answer's bigram sets.
    """
    set_operations = (
        ('overlap', set.intersection),
        ('diff', set.difference),
        ('exclusive', set.symmetric_difference),
    )
    prompt_sources = (
        ('text', 'prompt_text_bigram'),
        ('question', 'prompt_question_bigram'),
        ('title', 'prompt_title_bigram'),
    )

    answer_sets = frame[answer_column]
    # Operation is the outer loop so the columns are created in the same order
    # as before: all overlaps, then all diffs, then all exclusives.
    for suffix, combine in set_operations:
        for prefix, prompt_column in prompt_sources:
            # Pairing the columns with zip avoids ``row[0]`` / ``row[1]``, whose
            # positional meaning on a label-indexed row is deprecated in pandas.
            frame[f'{prefix}_bigram_{suffix}'] = [
                len(combine(prompt_set, answer_set))
                for prompt_set, answer_set in zip(frame[prompt_column], answer_sets)
            ]


_add_bigram_set_features(df)

In [8]:
# Display the frame, which now also carries the engineered feature columns.
df

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text,prompt_title_tokens,prompt_title_no_stopwords,...,text_unique_trigrams,text_bigram_overlap,question_bigram_overlap,title_bigram_overlap,text_bigram_diff,question_bigram_diff,title_bigram_diff,text_bigram_exclusive,question_bigram_exclusive,title_bigram_exclusive
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background The Third Wave experiment took pl...,"[the, third, wave]","[third, wave]",...,34,1,1,1,290,7,0,324,41,34
1,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,Summarize how the Third Wave developed over su...,The Third Wave,Background The Third Wave experiment took pl...,"[the, third, wave]","[third, wave]",...,106,5,3,1,286,5,0,384,105,102
2,0095993991fe,814d6b,The third wave only started as an experiment w...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background The Third Wave experiment took pl...,"[the, third, wave]","[third, wave]",...,28,5,1,1,286,7,0,310,35,28
3,00c20c6ddd23,814d6b,The experimen was orginally about how even whe...,0.567975,0.969062,Summarize how the Third Wave developed over su...,The Third Wave,Background The Third Wave experiment took pl...,"[the, third, wave]","[third, wave]",...,36,7,1,1,284,7,0,314,43,36
4,00d40ad10dc9,814d6b,The third wave developed so quickly due to the...,-0.910596,-0.081769,Summarize how the Third Wave developed over su...,The Third Wave,Background The Third Wave experiment took pl...,"[the, third, wave]","[third, wave]",...,13,1,2,1,290,6,0,303,18,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7160,fef3e85236e5,39c16e,"It has to be made on a complex storyline, with...",-0.981265,-1.548900,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 As the sequel to what has already...,"[on, tragedy]",[tragedy],...,15,1,0,0,267,7,0,282,23,16
7161,ff0f65eecf02,39c16e,Aristotle descirbes an ideal tradgedy as being...,-0.511077,-1.589115,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 As the sequel to what has already...,"[on, tragedy]",[tragedy],...,13,2,0,0,266,7,0,278,21,14
7162,ff186473ea0a,39c16e,A tragedy should have a complex plan not a sim...,-0.834946,-0.593749,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 As the sequel to what has already...,"[on, tragedy]",[tragedy],...,9,4,0,0,264,7,0,270,17,10
7163,ff5e9e6068da,39c16e,Aristotle believed that the ideal tradegy shou...,-0.157460,-0.165811,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 As the sequel to what has already...,"[on, tragedy]",[tragedy],...,24,8,0,0,260,7,0,276,31,24


In [9]:
# Keep only the numeric columns (targets plus engineered counts).
numeric_features = df.select_dtypes(include=[np.number])
# Pairwise correlation matrix of those columns.
corr = numeric_features.corr()
# Render the matrix as a heat map; this expression is the cell's output.
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,content,wording,prompt_title_unique_bigrams,prompt_title_unique_trigrams,prompt_question_unique_bigrams,prompt_question_unique_trigrams,prompt_text_unique_bigrams,prompt_text_unique_trigrams,text_unique_bigrams,text_unique_trigrams,text_bigram_overlap,question_bigram_overlap,title_bigram_overlap,text_bigram_diff,question_bigram_diff,title_bigram_diff,text_bigram_exclusive,question_bigram_exclusive,title_bigram_exclusive
content,1.0,0.75138,0.052423,0.038543,0.026536,0.026536,-0.03023,-0.028431,0.793555,0.7934,0.409327,0.278565,0.208482,-0.125654,-0.104591,-0.063937,0.110611,0.756706,0.788501
wording,0.75138,1.0,0.026611,-0.003296,-0.056031,-0.056031,-0.125305,-0.117793,0.529123,0.530187,0.03436,0.239429,0.233526,-0.136332,-0.159429,-0.1032,0.076351,0.488407,0.522367
prompt_title_unique_bigrams,0.052423,0.026611,1.0,0.834209,0.940945,0.940945,0.198507,0.192351,0.162399,0.159938,0.192275,0.041176,0.251819,0.15891,0.810938,0.844119,0.179985,0.273401,0.183592
prompt_title_unique_trigrams,0.038543,-0.003296,0.834209,1.0,0.876163,0.876163,-0.233895,-0.25266,0.118931,0.118706,0.113623,-0.069481,0.151078,-0.265925,0.80465,0.736854,-0.23907,0.236444,0.138564
prompt_question_unique_bigrams,0.026536,-0.056031,0.940945,0.876163,1.0,1.0,0.261953,0.244308,0.162973,0.16232,0.219791,-0.013643,0.081916,0.21755,0.88821,0.880158,0.231785,0.288438,0.188069
prompt_question_unique_trigrams,0.026536,-0.056031,0.940945,0.876163,1.0,1.0,0.261953,0.244308,0.162973,0.16232,0.219791,-0.013643,0.081916,0.21755,0.88821,0.880158,0.231785,0.288438,0.188069
prompt_text_unique_bigrams,-0.03023,-0.125305,0.198507,-0.233895,0.261953,0.261953,1.0,0.99933,0.087776,0.087376,0.217837,0.097107,-0.179423,0.974184,0.186408,0.294659,0.947463,0.107351,0.099627
prompt_text_unique_trigrams,-0.028431,-0.117793,0.192351,-0.25266,0.244308,0.244308,0.99933,1.0,0.086171,0.085605,0.214042,0.103397,-0.166109,0.974376,0.167956,0.281228,0.947855,0.102741,0.097398
text_unique_bigrams,0.793555,0.529123,0.162399,0.118931,0.162973,0.162973,0.087776,0.086171,1.0,0.998908,0.723058,0.244324,0.178804,-0.077317,0.031471,0.060681,0.172024,0.98354,0.99926
text_unique_trigrams,0.7934,0.530187,0.159938,0.118706,0.16232,0.16232,0.087376,0.085605,0.998908,1.0,0.719653,0.249532,0.177449,-0.076939,0.028501,0.059011,0.172722,0.981696,0.998141


In [10]:
# Summary statistics for every numeric column.
df.select_dtypes(include=[np.number]).describe()

Unnamed: 0,content,wording,prompt_title_unique_bigrams,prompt_title_unique_trigrams,prompt_question_unique_bigrams,prompt_question_unique_trigrams,prompt_text_unique_bigrams,prompt_text_unique_trigrams,text_unique_bigrams,text_unique_trigrams,text_bigram_overlap,question_bigram_overlap,title_bigram_overlap,text_bigram_diff,question_bigram_diff,title_bigram_diff,text_bigram_exclusive,question_bigram_exclusive,title_bigram_exclusive
count,7165.0,7165.0,7165.0,7165.0,7165.0,7165.0,7165.0,7165.0,7165.0,7165.0,7165.0,7165.0,7165.0,7165.0,7165.0,7165.0,7165.0,7165.0,7165.0
mean,-0.014853,-0.063072,0.993301,0.280391,10.511375,9.511375,323.413957,329.335799,34.973761,34.823866,10.140963,1.359665,0.180042,313.272994,9.15171,0.813259,338.105792,42.765806,35.606978
std,1.043569,1.036048,0.753336,0.449222,3.198101,3.198101,62.4909,62.598278,25.55395,26.62115,14.10803,1.66617,0.424301,60.992201,3.626206,0.765874,61.836086,25.68374,25.543449
min,-1.729859,-1.962614,0.0,0.0,7.0,6.0,268.0,273.0,6.0,5.0,0.0,0.0,0.0,103.0,1.0,0.0,106.0,6.0,6.0
25%,-0.799545,-0.87272,0.0,0.0,7.0,6.0,268.0,273.0,18.0,17.0,1.0,0.0,0.0,267.0,7.0,0.0,292.0,26.0,19.0
50%,-0.093814,-0.081769,1.0,0.0,11.0,10.0,300.0,304.0,27.0,26.0,5.0,1.0,0.0,290.0,8.0,1.0,314.0,35.0,28.0
75%,0.49966,0.503833,2.0,1.0,15.0,14.0,422.0,428.0,43.0,43.0,14.0,2.0,0.0,390.0,11.0,1.0,410.0,51.0,44.0
max,3.900326,4.310693,2.0,1.0,15.0,14.0,422.0,428.0,325.0,336.0,197.0,9.0,2.0,422.0,15.0,2.0,533.0,340.0,325.0


## TODO
- Needs feature cross correlations
- Outlier analysis
- Start basic modelling (XGBoost, etc.) with k-fold cross-validation, split by prompt (i.e. leave-one-prompt-out)
  - all answers for one prompt form the validation set, and the rest form the training set
  - rotate the held-out prompt over all prompts
  - get the distribution of errors / metrics across the folds