In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords

In [2]:
# Load the training prompts and student summaries from the local data folder.
# NOTE(review): read_data() below reads these same two files again, and the
# visible cells never use these module-level frames afterwards — confirm
# whether this cell is still needed.
prompts_df = pd.read_csv("./data/prompts_train.csv")
summaries_df = pd.read_csv("./data/summaries_train.csv")

In [3]:
def read_data(training=True):
    """Load the prompts and summaries CSV files for the requested split.

    Args:
        training: When True (default) the ``*_train.csv`` files are read;
            when False the ``*_test.csv`` files are read instead.

    Returns:
        A ``(prompts_df, summaries_df)`` tuple of DataFrames.
    """
    source = "train" if training else "test"
    # Build both paths from `source` so the `training` flag actually selects
    # the split (previously the train files were always loaded).
    prompts_df = pd.read_csv(f"./data/prompts_{source}.csv")
    summaries_df = pd.read_csv(f"./data/summaries_{source}.csv")

    return prompts_df, summaries_df

In [4]:
def merge_data(prompts_df, summaries_df):
    """Inner-join prompts with summaries on every column name the two share.

    Args:
        prompts_df: DataFrame of prompts.
        summaries_df: DataFrame of summaries.

    Returns:
        The merged DataFrame (rows whose shared-column values match in both).
    """
    # The join keys are whatever column labels appear in both frames.
    shared_keys = list(prompts_df.columns.intersection(summaries_df.columns))
    return prompts_df.merge(summaries_df, how="inner", on=shared_keys)

In [5]:
# Load the (prompts, summaries) pair, then join them into one working frame.
dfs = read_data()
df = merge_data(*dfs)

In [6]:
# Preview the first rows of the merged frame.
df.head()

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,student_id,text,content,wording
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00791789cc1f,1 element of an ideal tragedy is that it shoul...,-0.210614,-0.471415
1,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0086ef22de8f,The three elements of an ideal tragedy are: H...,-0.970237,-0.417058
2,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0094589c7a22,Aristotle states that an ideal tragedy should ...,-0.387791,-0.584181
3,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00cd5736026a,One element of an Ideal tragedy is having a co...,0.088882,-0.59471
4,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00d98b8ff756,The 3 ideal of tragedy is how complex you need...,-0.687288,-0.460886


In [7]:
# Lower-cased copy of each summary, kept alongside the original text column.
df["textLowerCase"] = df["text"].str.lower()

In [8]:
# One-time setup: on an environment without the NLTK data, uncomment the two
# download calls below and run this cell once.
# nltk.download('stopwords')
# nltk.download('punkt')
# English stopword list as a set, used by remove_stopwords() for membership tests.
stop_words = set(stopwords.words('english'))

In [9]:
# Show the lower-cased text of the row labelled 0 as a sanity check.
df.loc[0, "textLowerCase"]

'1 element of an ideal tragedy is that it should be arranged on a complex plan.  another element of an ideal tragedy is that it should only have one main issue. the last element of an ideal tragedy is that it should have a double thread plot and an opposite catastrophe for both good and bad.'

In [10]:
def tokenize_text(text_str):
    """Split a string into tokens with NLTK's ``word_tokenize``.

    Args:
        text_str: The text to tokenize.

    Returns:
        Whatever ``nltk.tokenize.word_tokenize`` returns for ``text_str``.
    """
    tokens = nltk.tokenize.word_tokenize(text_str)
    return tokens

In [11]:
def remove_stopwords(tokens, stopword_set=None):
    """Return the tokens that are not stopwords, preserving their order.

    Args:
        tokens: Iterable of token strings.
        stopword_set: Optional collection of words to drop. When omitted, the
            notebook-level ``stop_words`` set is used, which matches the
            original single-argument behavior.

    Returns:
        A new list containing only the tokens absent from the stopword set.
    """
    # Resolve the global lazily so callers that pass their own set do not
    # depend on the stopword cell having been run.
    active_stopwords = stop_words if stopword_set is None else stopword_set
    return [token for token in tokens if token not in active_stopwords]

In [12]:
# Tokenize every lower-cased summary; each cell of the new column holds that row's tokens.
df["textTokens"] = df["textLowerCase"].apply(tokenize_text)

In [13]:
# Preview again to check the new textLowerCase and textTokens columns.
df.head()

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,student_id,text,content,wording,textLowerCase,textTokens
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00791789cc1f,1 element of an ideal tragedy is that it shoul...,-0.210614,-0.471415,1 element of an ideal tragedy is that it shoul...,"[1, element, of, an, ideal, tragedy, is, that,..."
1,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0086ef22de8f,The three elements of an ideal tragedy are: H...,-0.970237,-0.417058,the three elements of an ideal tragedy are: h...,"[the, three, elements, of, an, ideal, tragedy,..."
2,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0094589c7a22,Aristotle states that an ideal tragedy should ...,-0.387791,-0.584181,aristotle states that an ideal tragedy should ...,"[aristotle, states, that, an, ideal, tragedy, ..."
3,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00cd5736026a,One element of an Ideal tragedy is having a co...,0.088882,-0.59471,one element of an ideal tragedy is having a co...,"[one, element, of, an, ideal, tragedy, is, hav..."
4,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00d98b8ff756,The 3 ideal of tragedy is how complex you need...,-0.687288,-0.460886,the 3 ideal of tragedy is how complex you need...,"[the, 3, ideal, of, tragedy, is, how, complex,..."


In [17]:
# Whitespace-delimited word count per summary. The vectorised .str accessors
# replace a per-row Python lambda and propagate missing text as NaN instead of
# raising AttributeError on a non-string value.
df["text_length"] = df["text"].str.split().str.len()

In [18]:
# Preview again to check the new text_length column.
df.head()

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,student_id,text,content,wording,textLowerCase,textTokens,text_length
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00791789cc1f,1 element of an ideal tragedy is that it shoul...,-0.210614,-0.471415,1 element of an ideal tragedy is that it shoul...,"[1, element, of, an, ideal, tragedy, is, that,...",56
1,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0086ef22de8f,The three elements of an ideal tragedy are: H...,-0.970237,-0.417058,the three elements of an ideal tragedy are: h...,"[the, three, elements, of, an, ideal, tragedy,...",25
2,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0094589c7a22,Aristotle states that an ideal tragedy should ...,-0.387791,-0.584181,aristotle states that an ideal tragedy should ...,"[aristotle, states, that, an, ideal, tragedy, ...",52
3,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00cd5736026a,One element of an Ideal tragedy is having a co...,0.088882,-0.59471,one element of an ideal tragedy is having a co...,"[one, element, of, an, ideal, tragedy, is, hav...",58
4,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00d98b8ff756,The 3 ideal of tragedy is how complex you need...,-0.687288,-0.460886,the 3 ideal of tragedy is how complex you need...,"[the, 3, ideal, of, tragedy, is, how, complex,...",60
