In [33]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords

In [21]:
# Load the training prompts and the training summaries from ./data into DataFrames.
prompts_df = pd.read_csv("./data/prompts_train.csv")
summaries_df = pd.read_csv("./data/summaries_train.csv")

In [22]:
def read_data(training=True, data_dir="./data"):
    """Load the prompts and summaries CSV files for one data split.

    Parameters
    ----------
    training : bool, default True
        When True the ``*_train.csv`` files are read, otherwise the
        ``*_test.csv`` files.
    data_dir : str, default "./data"
        Directory that contains the CSV files.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(prompts_df, summaries_df)``, in that order.
    """
    # The split name selects the file suffix, so training=False really reads
    # the test files instead of silently returning the training data.
    source = "train" if training else "test"
    prompts_df = pd.read_csv(f"{data_dir}/prompts_{source}.csv")
    summaries_df = pd.read_csv(f"{data_dir}/summaries_{source}.csv")

    return prompts_df, summaries_df

In [25]:
def merge_data(prompts_df, summaries_df):
    """Inner-join the two frames on every column name they have in common.

    ``prompts_df`` is the left side of the merge and ``summaries_df`` the
    right side. The merged DataFrame is returned.
    """
    shared_keys = prompts_df.columns.intersection(summaries_df.columns).tolist()
    return prompts_df.merge(summaries_df, how="inner", on=shared_keys)

In [26]:
# Load the default split, then merge the two frames that read_data returns.
dfs = read_data()
df = merge_data(*dfs)

In [27]:
# Preview the first rows of the merged frame.
df.head()

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,student_id,text,content,wording
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00791789cc1f,1 element of an ideal tragedy is that it shoul...,-0.210614,-0.471415
1,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0086ef22de8f,The three elements of an ideal tragedy are: H...,-0.970237,-0.417058
2,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0094589c7a22,Aristotle states that an ideal tragedy should ...,-0.387791,-0.584181
3,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00cd5736026a,One element of an Ideal tragedy is having a co...,0.088882,-0.59471
4,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00d98b8ff756,The 3 ideal of tragedy is how complex you need...,-0.687288,-0.460886


In [28]:
# Store a lower-cased copy of each summary in its own column.
df["textLowerCase"] = df["text"].str.lower()

In [38]:
# Fetch the NLTK resources: the stop-word lists and the 'punkt' tokenizer package.
nltk.download('stopwords')
nltk.download('punkt')
# Keep the English stop words in a set.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bastienwinant/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bastienwinant/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [39]:
# Inspect the lower-cased summary stored under index label 0.
# A single .loc lookup is used instead of chained indexing (df[col][row]).
df.loc[0, "textLowerCase"]

'1 element of an ideal tragedy is that it should be arranged on a complex plan.  another element of an ideal tragedy is that it should only have one main issue. the last element of an ideal tragedy is that it should have a double thread plot and an opposite catastrophe for both good and bad.'

In [44]:
def tokenize_text(text_str):
    """Split ``text_str`` into tokens using NLTK's ``word_tokenize``."""
    tokens = nltk.tokenize.word_tokenize(text_str)
    return tokens

In [46]:
def remove_stopwords(tokens, stopword_set=None):
    """Return the tokens that are not stop words, in their original order.

    Parameters
    ----------
    tokens : iterable
        Tokens to filter.
    stopword_set : container, optional
        Tokens to discard. When omitted, the notebook-level ``stop_words``
        is used, so existing single-argument calls behave as before.

    Returns
    -------
    list
        The tokens that are not members of the stop-word container.
    """
    # Compare against None (not truthiness) so an explicitly empty container
    # is honoured rather than replaced by the global default.
    if stopword_set is None:
        stopword_set = stop_words
    return [token for token in tokens if token not in stopword_set]

In [47]:
# Tokenize every lower-cased summary; the new column holds what tokenize_text returns for each row.
df["textTokens"] = df["textLowerCase"].apply(tokenize_text)

In [None]:
# Preview the lower-cased text next to its tokens.
# NOTE(review): this cell held the unfinished expression `df[]`, which is a
# SyntaxError and stops "Run All" — confirm the intended column selection.
df[["textLowerCase", "textTokens"]].head()

In [3]:
# Read the training prompts and the training summaries CSV files from ./data.
prompts_df = pd.read_csv("./data/prompts_train.csv")
summaries_df = pd.read_csv("./data/summaries_train.csv")

In [4]:
# Report both row counts when the two training files differ in length.
# An explicit `if` is used instead of catching an AssertionError, because
# assert statements are removed when Python runs with -O and the report would
# then never be printed.
if prompts_df.shape[0] != summaries_df.shape[0]:
    print("Number of training prompts:", prompts_df.shape[0])
    print("Number of training summaries:", summaries_df.shape[0])

Number of training prompts: 4
Number of training summaries: 7165


In [5]:
def checkSharedColumn(left_df=None, right_df=None):
    """Return the set of column names shared by two DataFrames.

    Parameters
    ----------
    left_df, right_df : pandas.DataFrame, optional
        Frames whose columns are compared. When omitted, the notebook-level
        ``prompts_df`` and ``summaries_df`` are used respectively, so calls
        without arguments behave as before.

    Returns
    -------
    set
        Column names present in both frames.

    Raises
    ------
    AssertionError
        If the two frames have no column name in common.
    """
    if left_df is None:
        left_df = prompts_df
    if right_df is None:
        right_df = summaries_df

    shared_columns = set(left_df.columns) & set(right_df.columns)

    # Raised explicitly rather than through `assert` so the check still runs
    # when Python is started with -O; the exception type is unchanged.
    if not shared_columns:
        raise AssertionError("the two dataframes do not share any column")

    return shared_columns

In [6]:
# Column names shared by prompts_df and summaries_df; used below as the merge keys.
merge_columns = checkSharedColumn()

In [7]:
# Inner-join the summaries (left) with the prompts (right) on the shared columns.
df = pd.merge(summaries_df, prompts_df, on=list(merge_columns), how="inner")

In [8]:
# Verify that each summary matches one of the prompts.
# The merge keys must not be missing anywhere in the merged frame.
assert df.loc[:, list(merge_columns)].isna().sum().sum() == 0
# An inner merge drops summaries without a matching prompt instead of leaving
# NaNs behind, so the NaN check alone cannot detect them: also require that the
# merged frame has exactly one row per summary.
assert df.shape[0] == summaries_df.shape[0], "merge changed the number of summary rows"

In [9]:
# Dimensions (rows, columns) of the merged frame.
df.shape

(7165, 8)

In [13]:
# Preview the first rows of the training prompts.
prompts_df.head()

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
1,3b9047,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
2,814d6b,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
3,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."


In [14]:
# Preview the first rows of the training summaries.
summaries_df.head()

Unnamed: 0,student_id,prompt_id,text,content,wording
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757


In [17]:
# Load the test-split prompts from ./data.
test_prompts_df = pd.read_csv("./data/prompts_test.csv")

In [18]:
# Preview the first rows of the test prompts.
test_prompts_df.head()

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text
0,abc123,Summarize...,Example Title 1,Heading\nText...
1,def789,Summarize...,Example Title 2,Heading\nText...


In [20]:
# Load the test-split summaries from ./data and display their last rows.
test_summaries_df = pd.read_csv("./data/summaries_test.csv")
test_summaries_df.tail()

Unnamed: 0,student_id,prompt_id,text
0,000000ffffff,abc123,Example text 1
1,111111eeeeee,def789,Example text 2
2,222222cccccc,abc123,Example text 3
3,333333dddddd,def789,Example text 4
