In [1]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import ngrams

import pandas as pd
import numpy as np
import numba
import joblib
from joblib import Parallel, delayed

from pathlib import Path
from typing import Optional, Union, List

[nltk_data] Downloading package stopwords to /home/chris/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/chris/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/chris/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
stop_words = set(stopwords.words('english'))
lemmatiser = WordNetLemmatizer()

In [3]:
data_dir = Path('../data')

sample_submission = data_dir / 'sample_submission.csv'
summaries_train = data_dir / 'summaries_train.csv'
summaries_test = data_dir / 'summaries_test.csv'
prompts_train = data_dir / 'prompts_train.csv'
prompts_test = data_dir / 'prompts_test.csv'

def make_split(summaries_path: Path, prompts_path: Path, dtype_backend: Optional[str] = 'pyarrow') -> pd.DataFrame:
    summaries_df = pd.read_csv(summaries_path, dtype_backend=dtype_backend)
    prompts_df = pd.read_csv(prompts_path, dtype_backend=dtype_backend)
    df = pd.merge(summaries_df, prompts_df, how='inner', on='prompt_id')

    if len(df) != len(summaries_df):
        raise AssertionError('Could not match all prompt ids to a prompt')
    
    return df

df = make_split(summaries_train, prompts_train)

In [4]:
df.head(1)

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background The Third Wave experiment took pl...


In [5]:
def clear_stopwords(column: pd.Series, idx: int) -> Union[List[str], List[str], List[str], List[str], List[str]]:
    tokens = word_tokenize(column.iloc[idx])
    cleared_stopwords = [tok for tok in tokens if tok not in stop_words]
    lemmas = [lemmatiser.lemmatize(tok) for tok in cleared_stopwords]
    tri_gram = list(ngrams(lemmas, 3))
    four_gram = list(ngrams(lemmas, 4))

    return tokens, cleared_stopwords, lemmas, tri_gram, four_gram

def nlp_splits(df: pd.DataFrame, column: str) -> None:
    output = Parallel(n_jobs=4)(delayed(clear_stopwords)(df[column], idx) for idx in range(len(df)))

    df[f'{column}_tokens'] = [part[0] for part in output]
    df[f'{column}_no_stopwords'] = [part[1] for part in output]
    df[f'{column}_lemmas'] = [part[2] for part in output]
    df[f'{column}_trigram'] = [part[3] for part in output]
    df[f'{column}_fourgram'] = [part[4] for part in output]

x = clear_stopwords(df.text, 0)

In [7]:
nlp_splits(df, 'text')

In [8]:
df

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text,text_tokens,text_no_stopwords,text_lemmas,text_trigram,text_fourgram
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background The Third Wave experiment took pl...,"[The, third, wave, was, an, experimentto, see,...","[The, third, wave, experimentto, see, people, ...","[The, third, wave, experimentto, see, people, ...","[(The, third, wave), (third, wave, experimentt...","[(The, third, wave, experimentto), (third, wav..."
1,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,Summarize how the Third Wave developed over su...,The Third Wave,Background The Third Wave experiment took pl...,"[The, Third, Wave, developed, rapidly, because...","[The, Third, Wave, developed, rapidly, student...","[The, Third, Wave, developed, rapidly, student...","[(The, Third, Wave), (Third, Wave, developed),...","[(The, Third, Wave, developed), (Third, Wave, ..."
2,0095993991fe,814d6b,The third wave only started as an experiment w...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background The Third Wave experiment took pl...,"[The, third, wave, only, started, as, an, expe...","[The, third, wave, started, experiment, within...","[The, third, wave, started, experiment, within...","[(The, third, wave), (third, wave, started), (...","[(The, third, wave, started), (third, wave, st..."
3,00c20c6ddd23,814d6b,The experimen was orginally about how even whe...,0.567975,0.969062,Summarize how the Third Wave developed over su...,The Third Wave,Background The Third Wave experiment took pl...,"[The, experimen, was, orginally, about, how, e...","[The, experimen, orginally, even, terrible, th...","[The, experimen, orginally, even, terrible, th...","[(The, experimen, orginally), (experimen, orgi...","[(The, experimen, orginally, even), (experimen..."
4,00d40ad10dc9,814d6b,The third wave developed so quickly due to the...,-0.910596,-0.081769,Summarize how the Third Wave developed over su...,The Third Wave,Background The Third Wave experiment took pl...,"[The, third, wave, developed, so, quickly, due...","[The, third, wave, developed, quickly, due, st...","[The, third, wave, developed, quickly, due, st...","[(The, third, wave), (third, wave, developed),...","[(The, third, wave, developed), (third, wave, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7160,fef3e85236e5,39c16e,"It has to be made on a complex storyline, with...",-0.981265,-1.548900,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 As the sequel to what has already...,"[It, has, to, be, made, on, a, complex, storyl...","[It, made, complex, storyline, ,, plot, makes,...","[It, made, complex, storyline, ,, plot, make, ...","[(It, made, complex), (made, complex, storylin...","[(It, made, complex, storyline), (made, comple..."
7161,ff0f65eecf02,39c16e,Aristotle descirbes an ideal tradgedy as being...,-0.511077,-1.589115,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 As the sequel to what has already...,"[Aristotle, descirbes, an, ideal, tradgedy, as...","[Aristotle, descirbes, ideal, tradgedy, one, c...","[Aristotle, descirbes, ideal, tradgedy, one, c...","[(Aristotle, descirbes, ideal), (descirbes, id...","[(Aristotle, descirbes, ideal, tradgedy), (des..."
7162,ff186473ea0a,39c16e,A tragedy should have a complex plan not a sim...,-0.834946,-0.593749,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 As the sequel to what has already...,"[A, tragedy, should, have, a, complex, plan, n...","[A, tragedy, complex, plan, simple, one.A, goo...","[A, tragedy, complex, plan, simple, one.A, goo...","[(A, tragedy, complex), (tragedy, complex, pla...","[(A, tragedy, complex, plan), (tragedy, comple..."
7163,ff5e9e6068da,39c16e,Aristotle believed that the ideal tradegy shou...,-0.157460,-0.165811,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 As the sequel to what has already...,"[Aristotle, believed, that, the, ideal, tradeg...","[Aristotle, believed, ideal, tradegy, include,...","[Aristotle, believed, ideal, tradegy, include,...","[(Aristotle, believed, ideal), (believed, idea...","[(Aristotle, believed, ideal, tradegy), (belie..."
