# Project 1.2 : Insights into Nicki Minaj

## Importing and Preprocessing Text Data

In [39]:
# importing NLTK functions for part-of-speech tagging and regex-based parsing

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag, RegexpParser

# importing function that returns most common noun phrases and verb phrases chunks
from chunk_counters import np_chunk_counter, vp_chunk_counter
# importing function that tokenizes text input firstly into sentences, then into words
from tokenize_words import word_sentence_tokenize



# loading the lyrics of the song Chun-Li by Nicki Minaj, lower-cased for processing.
# the context manager closes the file handle as soon as the text has been read,
# instead of leaving it open until the garbage collector gets to it.
with open('nicki_minaj.txt', encoding='utf-8') as lyrics_file:
    text = lyrics_file.read().lower()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Usuari\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Usuari\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [40]:
# splitting the lyrics into sentences and each sentence into word tokens,
# so that tagging and chunking can later run sentence by sentence.
word_tokenized_text = word_sentence_tokenize(text)



In [41]:
# storing and printing one word-tokenized sentence (index 1, an arbitrary fixed choice, not a random one)
# to visualize what word_sentence_tokenize() returns: each sentence is a list of word tokens

test_word_tokenized_sentence = word_tokenized_text[1]
print(test_word_tokenized_sentence)

['now', 'everybody', 'like', ',', '``', 'she', 'really', 'is', 'the', 'best', "''", 'you', 'play', 'checkers', ',', 'could', "n't", 'beat', 'me', 'playin', "'", 'chess', 'now', 'i', "'m", 'about', 'to', 'turn', 'around', 'and', 'beat', 'my', 'chest', 'bitch', ',', 'it', "'s", 'king', 'kong', ',', 'yes', ',', 'it', "'s", 'king', 'kong', 'bitch', ',', 'it', "'s", 'king', 'kong', ',', 'this', 'is', 'kin', "'", 'kong', 'chinese', 'ink', 'on', ',', 'siamese', 'links', 'on', 'call', 'me', '2', 'chainz', ',', 'name', 'go', 'ding-dong', 'bitch', ',', 'it', "'s", 'king', 'kong', ',', 'yes', ',', 'i', "'m", 'king', 'kong', 'this', 'is', 'king', 'kong', '?']


## Part-of-speech Tag Text

In [42]:
# creating an empty list that will hold the part-of-speech tagged sentences of the lyrics
# (one entry per sentence).
pos_tagged_text = []

In [43]:
# part-of-speech tagging every word-tokenized sentence with nltk's pos_tag() function.
# the list is rebuilt in a single expression, so re-running this cell cannot append
# the same tagged sentences a second time.
pos_tagged_text = [pos_tag(token) for token in word_tokenized_text]


In [44]:
# storing and printing one part-of-speech tagged sentence (index 4, an arbitrary fixed choice, not a random one)
# to visualize the (word, tag) pairs that make up a pos-tagged sentence
test_pos_sentence = pos_tagged_text[4]
print(test_pos_sentence)



[('six', 'CD'), ('rings', 'NNS'), ('on', 'IN'), ('they', 'PRP'), ('need', 'VBP'), ('rappers', 'NNS'), ('like', 'IN'), ('me', 'PRP'), ('they', 'PRP'), ('need', 'VBP'), ('rappers', 'NNS'), ('like', 'IN'), ('me', 'PRP'), ('so', 'IN'), ('they', 'PRP'), ('can', 'MD'), ('get', 'VB'), ('on', 'IN'), ('their', 'PRP$'), ('fuckin', 'NN'), ("'", "''"), ('keyboards', 'NNS'), ('and', 'CC'), ('make', 'VB'), ('me', 'PRP'), (',', ','), ('the', 'DT'), ('bad', 'JJ'), ('guy', 'NN'), ('chun-li', 'JJ'), ('ayy', 'NN'), (',', ','), ('yo', 'NN'), (',', ','), ('i', 'RB'), ('been', 'VBN'), ('on', 'IN'), (',', ','), ('bitch', 'NN'), (',', ','), ('you', 'PRP'), ('been', 'VBN'), ('corn', 'NN'), ('bentley', 'NN'), ('tints', 'NNS'), ('on', 'IN'), (',', ','), ('fendi', 'JJ'), ('prints', 'NNS'), ('on', 'IN'), ('i', 'JJ'), ('mean', 'VBP'), ('i', 'VBP'), ('been', 'VBN'), ('storm', 'JJ'), (',', ','), ('x-men', 'JJ'), ('been', 'VBN'), ('formed', 'VBN'), ('he', 'PRP'), ('keep', 'VB'), ('on', 'IN'), ('dialin', 'NN'), ("'", "

## Chunk Sentences

In [45]:
# defining the noun phrase (NP) chunk grammar: an optional determiner (<DT>?),
# any number of adjectives (<JJ>*) and one obligatory noun (<NN>), in this order.
# NOTE: in the sample parse printed further down, tokens tagged NNS (e.g. 'timbs') stay outside the NP chunks.
np_chunk_grammar = 'NP: {<DT>?<JJ>*<NN>}'


In [46]:
# creating the RegexpParser object that applies the noun phrase chunk grammar
np_chunk_parser = RegexpParser(np_chunk_grammar)

In [47]:
# defining the verb phrase (VP) chunk grammar: a noun phrase (optional determiner, any number of adjectives,
# one noun) followed by a verb in any form (<VB.*>) and an optional adverb (<RB>?), in this order
vp_chunk_grammar = 'VP: {<DT>?<JJ>*<NN><VB.*><RB>?}'


In [48]:
# creating the RegexpParser object that applies the verb phrase chunk grammar
vp_chunk_parser = RegexpParser(vp_chunk_grammar)


In [49]:
# creating empty lists that will hold the NP-chunked and the VP-chunked version of every sentence
np_chunked_text = []
vp_chunked_text = []



In [50]:
# chunking every pos-tagged sentence with the noun phrase parser and with the verb phrase parser.
# both lists are rebuilt in a single expression each, so re-running this cell cannot append
# the same chunked sentences a second time.
np_chunked_text = [np_chunk_parser.parse(pos_tagged) for pos_tagged in pos_tagged_text]
vp_chunked_text = [vp_chunk_parser.parse(pos_tagged) for pos_tagged in pos_tagged_text]
 
    
    


In [51]:
# visualizing the chunked version of one sentence (index 2, an arbitrary fixed choice, not a random one):
# first the VP-chunked tree, then the NP-chunked tree of the same sentence
print(vp_chunked_text[2])  
print(np_chunked_text[2])

(S
  yes/UH
  ,/,
  miss/JJ
  king/NN
  kong/NN
  in/IN
  my/PRP$
  kingdom/NN
  with/IN
  my/PRP$
  timbs/NNS
  on/IN
  (/(
  how/WRB
  many/JJ
  championships/NNS
  ?/.
  )/))
(S
  yes/UH
  ,/,
  (NP miss/JJ king/NN)
  (NP kong/NN)
  in/IN
  my/PRP$
  (NP kingdom/NN)
  with/IN
  my/PRP$
  timbs/NNS
  on/IN
  (/(
  how/WRB
  many/JJ
  championships/NNS
  ?/.
  )/))


## Analyze Chunks

In [52]:
# storing and printing the most common NP-chunks in order to gain insights about the song's most relevant noun phrases
# using the function np_chunk_counter()

most_common_np_chunks = np_chunk_counter(np_chunked_text)
print(most_common_np_chunks)

# thanks to this process, we gain some insight into the song's main themes.
# however, the analysis is noticeably less accurate than with a novel like The Iliad: the lyrics contain many
# interjections and slang words that end up tagged as nouns (e.g. 'i', 'ayy' and 'yo' in the printed result).


[((('bitch', 'NN'),), 6), ((('i', 'NN'),), 5), ((('ayy', 'NN'),), 4), ((('yo', 'NN'),), 4), ((('corn', 'NN'),), 4), ((('bentley', 'NN'),), 4), ((('dialin', 'NN'),), 2), ((('the', 'DT'), ('prince', 'NN')), 2), ((('song', 'NN'),), 2), ((('the', 'DT'), ('benz', 'NN')), 2), ((('the', 'DT'), ('bad', 'JJ'), ('guy', 'NN')), 2), ((('kong', 'NN'),), 2), ((('high', 'JJ'), ('designer', 'NN')), 2), ((('goin', 'NN'),), 1), ((('a', 'DT'), ('swim', 'NN')), 1), ((('swingin', 'NN'),), 1), ((('the', 'DT'), ('rim', 'JJ'), ('bitch', 'NN')), 1), ((('the', 'DT'), ('bench', 'NN')), 1), ((('the', 'DT'), ('court', 'NN')), 1), ((('some', 'DT'), ('haterade', 'NN')), 1), ((('thirst', 'JJ'), ('quenched', 'JJ'), ('style', 'NN')), 1), ((('this', 'DT'), ('burberry', 'NN')), 1), ((('every', 'DT'), ('word', 'NN')), 1), ((('every', 'DT'), ('inch', 'NN')), 1), ((('the', 'DT'), ('hammer', 'NN')), 1), ((('the', 'DT'), ('wrench', 'NN')), 1), ((('brrt', 'NN'),), 1), ((('that', 'DT'), ('quarter', 'NN')), 1), ((('the', 'DT'), 

In [53]:
# storing and printing the most common VP-chunks using the function vp_chunk_counter();
# in the printed result each entry pairs a chunk (a tuple of (word, tag) pairs) with its count
most_common_vp_chunks = vp_chunk_counter(vp_chunked_text)
print(most_common_vp_chunks)


[((('i', 'NN'), ("'m", 'VBP')), 2), ((('a', 'DT'), ('swim', 'NN'), ('dunked', 'VBN')), 1), ((('the', 'DT'), ('rim', 'JJ'), ('bitch', 'NN'), ('ai', 'VBP'), ("n't", 'RB')), 1), ((('this', 'DT'), ('burberry', 'NN'), ('trench', 'VBZ')), 1), ((('i', 'NN'), ('pull', 'VBP')), 1), ((('that', 'DT'), ('quarter', 'NN'), ('milli', 'VBD')), 1), ((('i', 'NN'), ('forgot', 'VBD'), ('show', 'RB')), 1), ((('the', 'DT'), ('roc', 'NN'), ('ai', 'VBP'), ("n't", 'RB')), 1), ((('a', 'DT'), ('bad', 'JJ'), ('guy', 'NN'), ('do', 'VBP')), 1), ((('i', 'NN'), ("'m", 'VBP'), ('always', 'RB')), 1), ((('name', 'NN'), ('go', 'VB')), 1)]
