# Project 1: Insights into Classic Texts

## Importing and Preprocessing Text Data

In [7]:
# Importing NLTK functions for part-of-speech tagging and regex-based parsing

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag, RegexpParser

#Importing function that returns most common noun phrases and verb phrases chunks
from chunk_counters import np_chunk_counter, vp_chunk_counter
#Importing function that tokenizes text input firstly into sentences, then into words
from tokenize_words import word_sentence_tokenize



# Load Homer's "The Iliad" (plain-text edition found on Project Gutenberg) and
# lowercase it for the processing steps that follow.
# The `with` block closes the file handle as soon as the text has been read,
# instead of leaving it open for the rest of the kernel session.
# NOTE(review): lowercasing happens before part-of-speech tagging; the tagged
# output further down shows 'i' tagged as NN/JJ -- confirm this is acceptable.
with open('the_iliad.txt', encoding='utf-8') as iliad_file:
    text = iliad_file.read().lower()



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Usuari\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Usuari\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [8]:
# Tokenize the text into sentences and then into words, so that later cells can
# parse and analyze it sentence by sentence.
# presumably returns a list of sentences, each a list of word tokens (see the
# sample sentence printed in the next cell) -- verify against tokenize_words
word_tokenized_text = word_sentence_tokenize(text)



In [9]:
# Store and print one word-tokenized sentence to show what
# word_sentence_tokenize() produced for a single sentence.
# The index 50 is a fixed, arbitrarily chosen position (it is not drawn at random).

test_word_tokenized_sentence = word_tokenized_text[50]
print(test_word_tokenized_sentence)

['neptune', 'rising', 'from', 'the', 'sea', '.']


## Part-of-speech Tag Text

In [10]:
# Start with an empty container for the part-of-speech tagged sentences
# of the text under analysis.
pos_tagged_text = list()

In [11]:
# Part-of-speech tag every word-tokenized sentence with nltk's pos_tag().
# The list is rebuilt from scratch on every run, so re-executing this cell
# cannot append a second copy of the tagged sentences to pos_tagged_text.
pos_tagged_text = [pos_tag(sentence_tokens) for sentence_tokens in word_tokenized_text]


In [12]:
# Store and print one part-of-speech tagged sentence to show what a single
# tagged sentence looks like: a list of (word, tag) pairs.
# The index 89 is a fixed, arbitrarily chosen position (it is not drawn at random).
test_pos_sentence = pos_tagged_text[89]
print(test_pos_sentence)



[('probability', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('powerful', 'JJ'), ('and', 'CC'), ('troublesome', 'JJ'), ('test', 'NN'), (';', ':'), ('and', 'CC'), ('it', 'PRP'), ('is', 'VBZ'), ('by', 'IN'), ('this', 'DT'), ('troublesome', 'JJ'), ('standard', 'NN'), ('that', 'IN'), ('a', 'DT'), ('large', 'JJ'), ('portion', 'NN'), ('of', 'IN'), ('historical', 'JJ'), ('evidence', 'NN'), ('is', 'VBZ'), ('sifted', 'VBN'), ('.', '.')]


## Chunk Sentences

In [13]:
# Noun-phrase (NP) chunk grammar: an optional determiner (<DT>?), any number of
# adjectives (<JJ>*) and one obligatory noun (<NN>), in this order.
# The noun slot is written as <NN> only; in the sample output further down,
# arms/NNS is left outside any NP chunk.
np_chunk_grammar = 'NP: {<DT>?<JJ>*<NN>}'


In [14]:
# Build the RegexpParser object that applies the noun-phrase chunk grammar.
np_chunk_parser = RegexpParser(np_chunk_grammar)

In [15]:
# Verb-phrase (VP) chunk grammar: the noun-phrase pattern (<DT>?<JJ>*<NN>),
# followed by a verb in any form (<VB.*>) and an optional adverb (<RB>?),
# in this order.
vp_chunk_grammar = 'VP: {<DT>?<JJ>*<NN><VB.*><RB>?}'


In [16]:
# Build the RegexpParser object that applies the verb-phrase chunk grammar.
vp_chunk_parser = RegexpParser(vp_chunk_grammar)


In [17]:
# Two separate, initially empty containers: one for the noun-phrase chunked
# sentences and one for the verb-phrase chunked sentences.
np_chunked_text, vp_chunked_text = [], []



In [18]:
# Chunk every pos-tagged sentence with both the noun-phrase and the
# verb-phrase parser.
# Both lists are rebuilt from scratch on every run, so re-executing this cell
# cannot append duplicate chunked sentences (which would inflate the chunk
# counts computed later).
np_chunked_text = [np_chunk_parser.parse(tagged_sentence) for tagged_sentence in pos_tagged_text]
vp_chunked_text = [vp_chunk_parser.parse(tagged_sentence) for tagged_sentence in pos_tagged_text]
 
    
    


In [19]:
# Show the same sentence as chunked by each parser: the verb-phrase version
# first, then the noun-phrase version. The position is a fixed, arbitrary pick.
sample_index = 3478
print(vp_chunked_text[sample_index])
print(np_chunked_text[sample_index])

(S
  these/DT
  ,/,
  as/IN
  my/PRP$
  first/JJ
  essay/NN
  of/IN
  arms/NNS
  ,/,
  i/JJ
  won/VBD
  ;/:
  (VP old/JJ neleus/NN gloried/VBN)
  in/IN
  his/PRP$
  conquering/VBG
  son/NN
  ./.)
(S
  these/DT
  ,/,
  as/IN
  my/PRP$
  (NP first/JJ essay/NN)
  of/IN
  arms/NNS
  ,/,
  i/JJ
  won/VBD
  ;/:
  (NP old/JJ neleus/NN)
  gloried/VBN
  in/IN
  his/PRP$
  conquering/VBG
  (NP son/NN)
  ./.)


## Analyze Chunks

In [20]:
# Count the NP chunks with np_chunk_counter(), then store and print the most
# common ones to see which noun phrases recur most often in the text.

most_common_np_chunks = np_chunk_counter(np_chunked_text)
print(most_common_np_chunks)

# This gives a clear view of the text's main protagonists, recurring themes and
# symbols, and recurring associations.


[((('hector', 'NN'),), 322), ((('i', 'NN'),), 277), ((('jove', 'NN'),), 257), ((('troy', 'NN'),), 208), ((('vain', 'NN'),), 195), ((('war', 'NN'),), 193), ((('son', 'NN'),), 170), ((('thou', 'NN'),), 158), ((('the', 'DT'), ('plain', 'NN')), 157), ((('the', 'DT'), ('field', 'NN')), 154), ((('the', 'DT'), ('ground', 'NN')), 138), ((('death', 'NN'),), 134), ((('hand', 'NN'),), 134), ((('greece', 'NN'),), 128), ((('heaven', 'NN'),), 127), ((('fate', 'NN'),), 127), ((('thee', 'NN'),), 122), ((('breast', 'NN'),), 121), ((('the', 'DT'), ('trojan', 'NN')), 120), ((('the', 'DT'), ('god', 'NN')), 119), ((('the', 'DT'), ('war', 'NN')), 117), ((('the', 'DT'), ('greeks', 'NN')), 116), ((('blood', 'NN'),), 115), ((('homer', 'NN'),), 112), ((('the', 'DT'), ('king', 'NN')), 105), ((('rage', 'NN'),), 103), ((('force', 'NN'),), 103), ((('care', 'NN'),), 99), ((('head', 'NN'),), 98), ((('man', 'NN'),), 97)]


In [21]:
# Count the VP chunks with vp_chunk_counter(), then store and print the most
# common ones.
most_common_vp_chunks = vp_chunk_counter(vp_chunked_text)
print(most_common_vp_chunks)


[((("'t", 'NN'), ('is', 'VBZ')), 19), ((('i', 'NN'), ('am', 'VBP')), 11), ((("'t", 'NN'), ('was', 'VBD')), 11), ((('the', 'DT'), ('hero', 'NN'), ('said', 'VBD')), 9), ((('i', 'NN'), ('know', 'VBP')), 8), ((('i', 'NN'), ('saw', 'VBD')), 8), ((('the', 'DT'), ('scene', 'NN'), ('lies', 'VBZ')), 7), ((('i', 'NN'), ('was', 'VBD')), 6), ((('confess', 'NN'), ("'d", 'VBD')), 6), ((('the', 'DT'), ('scene', 'NN'), ('is', 'VBZ')), 6), ((('view', 'NN'), ("'d", 'VBD')), 5), ((('i', 'NN'), ('felt', 'VBD')), 5), ((('i', 'NN'), ('bear', 'VBP')), 5), ((('hector', 'NN'), ('is', 'VBZ')), 5), ((('vain', 'NN'), ('was', 'VBD')), 5), ((('homer', 'NN'), ('was', 'VBD')), 4), ((('i', 'NN'), ('have', 'VBP')), 4), ((('hunger', 'NN'), ('was', 'VBD')), 4), ((('glory', 'NN'), ('lost', 'VBN')), 4), ((('i', 'NN'), ('see', 'VBP')), 4), ((('war', 'NN'), ('be', 'VB')), 4), ((('the', 'DT'), ('weapon', 'NN'), ('stood', 'VBD')), 4), ((('i', 'NN'), ('go', 'VBP')), 4), ((('the', 'DT'), ('silence', 'NN'), ('broke', 'VBD')), 4),