## M2 HW - Andrew Cornfeld

In [1]:
import pandas as pd

In [2]:
import configparser

# Load machine-specific directories from the shared env.ini file.
env_file = "../../../env.ini"
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty list means the path is wrong.
# Fail here with a clear message instead of a bare KeyError on the lookups below.
if not config.read(env_file):
    raise FileNotFoundError(f"Could not read config file: {env_file}")
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']

In [3]:
data_home, output_dir

('/home/cpm6gh/Documents/MSDS/DS5001/data',
 '/home/cpm6gh/Documents/MSDS/DS5001/output')

In [4]:
# Source text to tokenize in this notebook (read line by line further down).
text_file = f"{data_home}/gutenberg/pg161.txt"
# Token table for Persuasion; it is *read* later with pd.read_csv even though
# it lives under output_dir.
csv_file = f"{output_dir}/austen-persuasion.csv"

In [5]:
# Index level names for the text hierarchy; the cells below use the slices
# OHCO[:1] .. OHCO[:4] to name chapter / paragraph / sentence / token levels.
# NOTE(review): 'book_num' is never reached by those slices, and the book level
# added later is called 'book_id' — confirm which name is intended.
OHCO = ['chap_num', 'para_num', 'sent_num', 'token_num', 'book_num']

In [6]:
# Read the source text, one row per physical line. 'utf-8-sig' drops a leading
# BOM if present. The context manager closes the file handle, which the
# previous bare open(...).readlines() call left open.
with open(text_file, 'r', encoding='utf-8-sig') as text_handle:
    LINES = pd.DataFrame(text_handle.readlines(), columns=['line_str'])
LINES.index.name = 'line_num'
# Replace line terminators with a space, then trim surrounding whitespace.
LINES['line_str'] = LINES['line_str'].str.replace(r'\n+', ' ', regex=True).str.strip()

In [7]:
LINES.sample(20)

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
12972,spread public support and donations to carry o...
3870,"""Is Mr. Willoughby much known in your part of ..."
4312,"meant to be a great secret, and I am sure has ..."
156,"propriety of going, and her own tender love fo..."
1486,favourite authors were brought forward and dwe...
7912,"next morning more openly, for at her particula..."
4265,will do me the justice of believing that I do ...
6621,"Dashwood's communication, in such an instantan..."
12731,and help preserve free future access to Projec...
11271,


In [8]:
# Derive the title from the first line of the file by removing the fixed
# Gutenberg prefix; if the prefix is absent the line is kept unchanged.
title = LINES.loc[0].line_str.replace('The Project Gutenberg EBook of ', '')

In [9]:
print(title)

Sense and Sensibility, by Jane Austen


# Remove front and back matter

In [10]:
# Regexes for the marker lines that bracket the book text:
# index 0 matches the "*** START OF THE/THIS PROJECT" line,
# index 1 matches the "*** END OF THE/THIS PROJECT" line.
clip_pats = [
    r"\*\*\*\s*START OF (?:THE|THIS) PROJECT",
    r"\*\*\*\s*END OF (?:THE|THIS) PROJECT"
]

In [11]:
# Boolean masks over LINES: pat_a flags the START marker line, pat_b the END marker line.
pat_a, pat_b = (LINES['line_str'].str.match(marker) for marker in clip_pats)

In [12]:
# Locate the marker lines and fail with a clear message when either is missing
# (previously an opaque IndexError from .index[0]).
start_hits = LINES.loc[pat_a].index
end_hits = LINES.loc[pat_b].index
if start_hits.empty or end_hits.empty:
    raise ValueError("Project Gutenberg START/END marker line not found in text_file")
# Body text runs from the line after the START marker to the line before the END marker.
# NOTE(review): the LINES.tail(10) output further down still shows an
# "End of the Project Gutenberg EBook of ..." line just before the END marker,
# so that line stays in the corpus — consider clipping it as well.
line_a = start_hits[0] + 1
line_b = end_hits[0] - 1

In [13]:
line_a, line_b

(20, 12666)

In [14]:
# Keep only the lines between the markers (label slice, both ends inclusive).
# .copy() makes LINES an independent frame so the later column assignments
# (chap_num) do not write into a slice of the original and trigger
# SettingWithCopyWarning.
LINES = LINES.loc[line_a : line_b].copy()

In [15]:
LINES.head(10)

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,


In [16]:
LINES.tail(10)

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
12657,
12658,
12659,
12660,
12661,
12662,
12663,
12664,
12665,End of the Project Gutenberg EBook of Sense an...
12666,


# Chunk by chapter

In [17]:
# A chapter heading is a line that starts (after optional whitespace) with
# "chapter" or "letter" followed by a number; matched case-insensitively below.
chap_pat = r"^\s*(?:chapter|letter)\s+\d+"

In [18]:
# Boolean Series indexed like LINES: True on chapter heading lines.
chap_lines = LINES.line_str.str.match(chap_pat, case=False) # Returns a truth vector

In [19]:
LINES.loc[chap_lines] # Use as filter for dataframe

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
42,CHAPTER 1
196,CHAPTER 2
399,CHAPTER 3
561,CHAPTER 4
756,CHAPTER 5
858,CHAPTER 6
986,CHAPTER 7
1112,CHAPTER 8
1244,CHAPTER 9
1448,CHAPTER 10


In [20]:
# Number the chapter heading rows 1..N in order of appearance; all other rows
# get NaN in the new chap_num column.
n_chapters = len(LINES.loc[chap_lines])
LINES.loc[chap_lines, 'chap_num'] = list(range(1, n_chapters + 1))

In [21]:
LINES.loc[chap_lines]

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
42,CHAPTER 1,1.0
196,CHAPTER 2,2.0
399,CHAPTER 3,3.0
561,CHAPTER 4,4.0
756,CHAPTER 5,5.0
858,CHAPTER 6,6.0
986,CHAPTER 7,7.0
1112,CHAPTER 8,8.0
1244,CHAPTER 9,9.0
1448,CHAPTER 10,10.0


In [22]:
LINES.sample(10)

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
3084,,
9166,"he did not regard his mother's anger, while he...",
825,time when her son-in-law's promise to his fath...,
5015,to be speaking their united inclinations. The...,
2892,"""Oh,"" cried Marianne, ""with what transporting ...",
5932,"lady, however, saw only that Marianne had rece...",
4561,"forgive! He had been blamable, highly blamabl...",
9941,"mother was half frantic.""",
34,,
8706,"Marianne.--THEN, if I had not been bound to si...",


In [23]:
# Propagate each heading's chapter number down to the lines that follow it.
LINES['chap_num'] = LINES['chap_num'].ffill()

In [24]:
LINES.sample(10)

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
6472,public conversation. I must do THIS justice t...,30.0
12012,"swell his heart, and raise his spirits. He wa...",49.0
1298,"unwillingly, to turn back, for no shelter was ...",9.0
2515,He then hastily took leave of them all and lef...,15.0
11193,young man!--and without selfishness--without e...,45.0
10045,summits Combe Magna might be seen.,42.0
11608,slowly continued--,47.0
12219,"smallest regard, and who had only two thousand...",49.0
10785,"journey!""",44.0
5856,,28.0


In [25]:
LINES.head(20)

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
20,,
21,,
22,,
23,,
24,,
25,,
26,,
27,,
28,,
29,,


In [26]:
# Keep only body lines: rows that belong to a chapter (chap_num set) and are
# not themselves chapter headings.
# chap_lines was computed against an earlier version of LINES, so align it to
# the current index explicitly rather than relying on implicit alignment.
in_chapter = LINES['chap_num'].notna()
is_heading = chap_lines.reindex(LINES.index, fill_value=False)
# .copy() so the dtype assignment below does not write into a filtered slice.
LINES = LINES.loc[in_chapter & ~is_heading].copy()
LINES['chap_num'] = LINES['chap_num'].astype('int')

In [27]:
LINES.sample(10)

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
6776,"""You have probably entirely forgotten a conver...",31
7573,marriage by every possible attention. He had ...,33
1343,against Marianne received particular spirit fr...,9
7993,It was a very awkward moment; and the countena...,35
2601,once. Secrecy may be advisable; but still I c...,15
11171,"characters, or feelings, could be given;--but ...",45
1711,"Willoughby, and the fond attachment to Norland...",11
2986,,17
4340,"""Our acquaintance, however, is of many years d...",22
11895,"""He comes from Mr. Pratt's purposely to see us...",48


In [28]:
OHCO[:1]

['chap_num']

In [29]:
def _join_lines(lines):
    """Concatenate an iterable of line strings with newline separators."""
    return '\n'.join(lines)

# One row per chapter: all of the chapter's lines joined into a single string.
chap_groups = LINES.groupby(OHCO[:1])['line_str']
CHAPS = chap_groups.apply(_join_lines).to_frame('chap_str')

In [30]:
CHAPS.head(10)

Unnamed: 0_level_0,chap_str
chap_num,Unnamed: 1_level_1
1,\n\nThe family of Dashwood had long been settl...
2,\n\nMrs. John Dashwood now installed herself m...
3,\n\nMrs. Dashwood remained at Norland several ...
4,"\n\n""What a pity it is, Elinor,"" said Marianne..."
5,"\n\nNo sooner was her answer dispatched, than ..."
6,\n\nThe first part of their journey was perfor...
7,\n\nBarton Park was about half a mile from the...
8,\n\nMrs. Jennings was a widow with an ample jo...
9,\n\nThe Dashwoods were now settled at Barton w...
10,"\n\nMarianne's preserver, as Margaret, with mo..."


In [31]:
# Trim leading/trailing whitespace (including the blank lines after each heading).
CHAPS['chap_str'] = CHAPS.chap_str.str.strip()

In [32]:
CHAPS

Unnamed: 0_level_0,chap_str
chap_num,Unnamed: 1_level_1
1,The family of Dashwood had long been settled i...
2,Mrs. John Dashwood now installed herself mistr...
3,Mrs. Dashwood remained at Norland several mont...
4,"""What a pity it is, Elinor,"" said Marianne, ""t..."
5,"No sooner was her answer dispatched, than Mrs...."
6,The first part of their journey was performed ...
7,Barton Park was about half a mile from the cot...
8,Mrs. Jennings was a widow with an ample jointu...
9,The Dashwoods were now settled at Barton with ...
10,"Marianne's preserver, as Margaret, with more e..."


# Split into paragraphs

In [33]:
# Paragraph boundary: two or more consecutive newlines (i.e. at least one blank line).
para_pat = r'\n\n+'

In [34]:
# Split every chapter on blank lines (one column per piece), then stack the
# pieces into rows so the index becomes (chapter, paragraph position).
chap_pieces = CHAPS['chap_str'].str.split(para_pat, expand=True)
PARAS = chap_pieces.stack().to_frame('para_str').sort_index()
PARAS.index.names = OHCO[:2]

In [35]:
PARAS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
1,0,The family of Dashwood had long been settled i...
1,1,"By a former marriage, Mr. Henry Dashwood had o..."
1,2,"The old gentleman died: his will was read, and..."
1,3,"Mr. Dashwood's disappointment was, at first, s..."
1,4,His son was sent for as soon as his danger was...


In [36]:
# Flatten remaining single newlines to spaces and trim each paragraph ...
para_text = PARAS['para_str'].str.replace(r'\n', ' ', regex=True).str.strip()
PARAS = PARAS.assign(para_str=para_text)
# ... then discard paragraphs that are empty or whitespace-only.
is_blank_para = PARAS['para_str'].str.match(r'^\s*$')
PARAS = PARAS[~is_blank_para]

In [37]:
PARAS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
1,0,The family of Dashwood had long been settled i...
1,1,"By a former marriage, Mr. Henry Dashwood had o..."
1,2,"The old gentleman died: his will was read, and..."
1,3,"Mr. Dashwood's disappointment was, at first, s..."
1,4,His son was sent for as soon as his danger was...


# Split into sentences

In [38]:
# Sentence delimiter: a run of . ? ! ; or : characters. The delimiters
# themselves are consumed by the split and do not appear in the output.
sent_pat = r'[.?!;:]+'
para_pieces = PARAS['para_str'].str.split(sent_pat, expand=True)
SENTS = para_pieces.stack().to_frame('sent_str')
SENTS.index.names = OHCO[:3]

In [39]:
# Drop empty / whitespace-only sentence fragments. .copy() gives an independent
# frame so the column assignment below does not write into a filtered slice
# (which raises SettingWithCopyWarning).
SENTS = SENTS.loc[~SENTS['sent_str'].str.match(r'^\s*$')].copy()
# Trim surrounding whitespace on the fragments that remain.
SENTS['sent_str'] = SENTS['sent_str'].str.strip()

In [40]:
SENTS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
chap_num,para_num,sent_num,Unnamed: 3_level_1
1,0,0,The family of Dashwood had long been settled i...
1,0,1,"Their estate was large, and their residence wa..."
1,0,2,The late owner of this estate was a single man...
1,0,3,"But her death, which happened ten years before..."
1,0,4,"for to supply her loss, he invited and receive..."


In [41]:
SENTS.sample(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
chap_num,para_num,sent_num,Unnamed: 3_level_1
44,22,2,"to open my whole heart to you, and by convinci..."
46,29,5,"You, my mother, and Margaret, must henceforth ..."
45,4,17,Willoughby's death
20,42,6,"Willoughby at Cleveland, and whether they were..."
35,20,0,"Her manners gave some re-assurance to Edward, ..."
42,11,4,they talked of the friends they had left behin...
38,11,9,I do not care if it IS the Doctor's favourite ...
29,63,6,"no one, no one--he talked to me only of myself"
3,14,0,"""In a few months, my dear Marianne"
33,35,0,"""Certainly,"" said Elinor"


# Split into tokens

In [42]:
# Token delimiter: a run of whitespace, apostrophes, commas or hyphens.
token_pat = r"[\s',-]+"
sent_pieces = SENTS['sent_str'].str.split(token_pat, expand=True)
# Stack the per-sentence pieces into one row per token.
TOKENS = sent_pieces.stack().to_frame('token_str')

In [43]:
# Name the four index levels: chapter, paragraph, sentence, token.
TOKENS.index.names = OHCO[:4]

In [44]:
# Writes the token table to 'asdf.csv' in the current working directory.
# NOTE(review): the file name looks like a scratch/debug output and nothing
# visible in this notebook reads it back — confirm whether this cell is needed.
TOKENS.to_csv('asdf.csv')

# Bring in Persuasion

In [45]:
# Load the previously tokenized Persuasion table.
# NOTE(review): read_csv converts empty fields (and strings such as "NA") to
# NaN by default, so token_str may contain NaN here, whereas the Sense and
# Sensibility tokens built above keep empty strings — verify the two books are
# treated consistently in the counts below.
persuasion = pd.read_csv(csv_file)

In [46]:
persuasion

Unnamed: 0,chap_num,para_num,sent_num,token_num,token_str,term_str
0,1,0,0,0,Sir,sir
1,1,0,0,1,Walter,walter
2,1,0,0,2,Elliot,elliot
3,1,0,0,3,of,of
4,1,0,0,4,Kellynch,kellynch
...,...,...,...,...,...,...
85009,24,13,0,6,of,of
85010,24,13,0,7,Persuasion,persuasion
85011,24,13,0,8,by,by
85012,24,13,0,9,Jane,jane


In [47]:
# Remove the precomputed term_str column; it is rebuilt for the combined table later.
persuasion = persuasion.drop(columns="term_str")
persuasion

Unnamed: 0,chap_num,para_num,sent_num,token_num,token_str
0,1,0,0,0,Sir
1,1,0,0,1,Walter
2,1,0,0,2,Elliot
3,1,0,0,3,of
4,1,0,0,4,Kellynch
...,...,...,...,...,...
85009,24,13,0,6,of
85010,24,13,0,7,Persuasion
85011,24,13,0,8,by
85012,24,13,0,9,Jane


In [48]:
# Tag every Persuasion row with book_id = 1 and move it to the first column.
# Round-tripping through the index also materialises the old row index as a
# 'level_1' column, which is dropped from the merged table further down.
persuasion = (
    persuasion
    .assign(book_id=1)
    .set_index(['book_id', persuasion.index])
    .reset_index()
)
persuasion

Unnamed: 0,book_id,level_1,chap_num,para_num,sent_num,token_num,token_str
0,1,0,1,0,0,0,Sir
1,1,1,1,0,0,1,Walter
2,1,2,1,0,0,2,Elliot
3,1,3,1,0,0,3,of
4,1,4,1,0,0,4,Kellynch
...,...,...,...,...,...,...,...
85009,1,85009,24,13,0,6,of
85010,1,85010,24,13,0,7,Persuasion
85011,1,85011,24,13,0,8,by
85012,1,85012,24,13,0,9,Jane


In [49]:
# Turn the (chapter, paragraph, sentence, token) index levels into ordinary
# columns, then put book_id = 0 in front to tag these rows.
TOKENS = TOKENS.reset_index()
TOKENS.insert(0, 'book_id', 0)
TOKENS

Unnamed: 0,book_id,chap_num,para_num,sent_num,token_num,token_str
0,0,1,0,0,0,The
1,0,1,0,0,1,family
2,0,1,0,0,2,of
3,0,1,0,0,3,Dashwood
4,0,1,0,0,4,had
...,...,...,...,...,...,...
122877,0,50,22,0,8,and
122878,0,50,22,0,9,Sensibility
122879,0,50,22,0,10,by
122880,0,50,22,0,11,Jane


# Combine dataframes

In [50]:
# Stack the two books' token tables row-wise and renumber the rows 0..N-1.
book_frames = [TOKENS, persuasion]
merged_df = pd.concat(book_frames, axis=0, ignore_index=True)

In [51]:
# 'level_1' is the leftover row index from the Persuasion frame; it carries no information.
merged_df = merged_df.drop(columns='level_1')

In [52]:
# Short alias for the combined table. This binds a second name to the same
# object (no copy), so columns added to m below also appear on merged_df.
m = merged_df

In [53]:
m

Unnamed: 0,book_id,chap_num,para_num,sent_num,token_num,token_str
0,0,1,0,0,0,The
1,0,1,0,0,1,family
2,0,1,0,0,2,of
3,0,1,0,0,3,Dashwood
4,0,1,0,0,4,had
...,...,...,...,...,...,...
207891,1,24,13,0,6,of
207892,1,24,13,0,7,Persuasion
207893,1,24,13,0,8,by
207894,1,24,13,0,9,Jane


# Q1: There are 207896 raw tokens in the combined dataframe.

# Extract Vocabulary

In [54]:
# Normalise tokens into terms: remove every non-word character and underscore, then lowercase.
m['term_str'] = m['token_str'].replace(r'[\W_]+', '', regex=True).str.lower()
# One row per distinct term with its frequency 'n'; the rename covers pandas
# versions in which reset_index() labels the term column 'index'.
term_counts = m['term_str'].value_counts()
VOCAB = term_counts.to_frame('n').reset_index().rename(columns={'index':'term_str'})
VOCAB.index.name = 'term_id'

In [55]:
VOCAB

Unnamed: 0_level_0,term_str,n
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,the,7435
1,to,6923
2,and,6290
3,of,6146
4,her,3747
...,...,...
8234,unconquerable,1
8235,outgrown,1
8236,prosperously,1
8237,nominal,1


# Q2: There are 8239 distinct terms in the combined dataframe.

In [56]:
# Same term normalisation and counting as for the combined table, restricted
# to the Sense and Sensibility tokens.
TOKENS['term_str'] = TOKENS['token_str'].replace(r'[\W_]+', '', regex=True).str.lower()
sense_counts = TOKENS['term_str'].value_counts()
VOCAB_SENSE = sense_counts.to_frame('n').reset_index().rename(columns={'index':'term_str'})
VOCAB_SENSE.index.name = 'term_id'
VOCAB_SENSE

Unnamed: 0_level_0,term_str,n
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,to,4115
1,the,4105
2,of,3574
3,and,3490
4,her,2543
...,...,...
6275,prefer,1
6276,dissolving,1
6277,beset,1
6278,effectually,1


In [57]:
# Same term normalisation and counting as for the combined table, restricted
# to the Persuasion tokens.
persuasion['term_str'] = persuasion['token_str'].replace(r'[\W_]+', '', regex=True).str.lower()
persuasion_counts = persuasion['term_str'].value_counts()
VOCAB_P = persuasion_counts.to_frame('n').reset_index().rename(columns={'index':'term_str'})
VOCAB_P.index.name = 'term_id'
VOCAB_P

Unnamed: 0_level_0,term_str,n
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,the,3330
1,to,2808
2,and,2800
3,of,2572
4,a,1595
...,...,...
5755,reins,1
5756,judiciously,1
5757,rut,1
5758,dung,1


# Q3: Sense and Sensibility has 6280 vocab words and Persuasion has 5760 vocab words, which is a difference of 520 words.

In [58]:
m

Unnamed: 0,book_id,chap_num,para_num,sent_num,token_num,token_str,term_str
0,0,1,0,0,0,The,the
1,0,1,0,0,1,family,family
2,0,1,0,0,2,of,of
3,0,1,0,0,3,Dashwood,dashwood
4,0,1,0,0,4,had,had
...,...,...,...,...,...,...,...
207891,1,24,13,0,6,of,of
207892,1,24,13,0,7,Persuasion,persuasion
207893,1,24,13,0,8,by,by
207894,1,24,13,0,9,Jane,jane


In [59]:
# Per (book, chapter) counts of non-null cells in every remaining column.
# count() skips NaN, so the 'term_str' column read in the next cell excludes
# rows whose term_str is missing.
# NOTE(review): the mean printed below (2807.919) times 74 book/chapter groups
# is 207,786 — 110 fewer than the 207,896 rows reported for the combined
# table. Presumably those are NaN tokens from the Persuasion CSV; confirm
# whether they should be counted (e.g. use .size() instead).
tokens_per_chap = m.groupby(['book_id', 'chap_num']).count()

In [60]:
tokens_per_chap['term_str'].mean()

2807.9189189189187

# Q4: The average number of tokens per chapter is 2808, rounded to the nearest integer.

In [61]:
# Per (book, chapter, paragraph) counts of non-null cells in every remaining column.
# NOTE(review): count() skips NaN, so rows with a missing term_str are not
# included in the 'term_str' figure averaged in the next cell — confirm this
# is the intended definition of "token".
tokens_per_par = m.groupby(['book_id', 'chap_num', 'para_num']).count()

In [62]:
tokens_per_par['term_str'].mean()

73.7091167080525

# Q5: The average number of tokens per paragraph is 74, rounded to the nearest integer.

In [64]:
# Save the combined token table (including its integer row index) as CSV.
# NOTE(review): this writes to the current working directory rather than the
# output_dir read from env.ini — confirm the intended location.
m.to_csv("austen-combo-TOKENS.csv")