In [None]:
import numpy as np
import pandas as pd

In [None]:
# Sample passage (two sentences) used as input by every step of this notebook.
text = (
    'Years after a rebellion spurred by a stolen bride to be and the blind '
    'ambitions of a mad King, Robert of the house Baratheon (Mark Addy) sits '
    'on the much desired Iron Throne. '
    'In the mythical land of Westeros, nine noble families fight for every '
    'inch of control and every drop of power.'
)

In [None]:
import nltk
from nltk import word_tokenize, sent_tokenize
# Tokenizer models needed by word_tokenize / sent_tokenize.
# Recent NLTK releases load the 'punkt_tab' resource instead of the pickled
# 'punkt' models, so fetch both to keep the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Split the passage into word and punctuation tokens (English tokenizer).
word_tokens = word_tokenize(text, language='english')
word_tokens

In [None]:
# Split the passage into sentences (English tokenizer).
sentence_tokens = sent_tokenize(text, language='english')
sentence_tokens

['Years after a rebellion spurred by a stolen bride to be and the blind ambitions of a mad King, Robert of the house Baratheon (Mark Addy) sits on the much desired Iron Throne.',
 'In the mythical land of Westeros, nine noble families fight for every inch of control and every drop of power.']

### POS Tagging

In [None]:
# Model used by nltk.pos_tag.
# Recent NLTK releases look up 'averaged_perceptron_tagger_eng' instead of the
# pickled 'averaged_perceptron_tagger', so fetch both for compatibility.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# Pair every token with its part-of-speech tag.
tag_words = nltk.pos_tag(tokens=word_tokens)
tag_words

[('Years', 'NNS'),
 ('after', 'IN'),
 ('a', 'DT'),
 ('rebellion', 'NN'),
 ('spurred', 'VBN'),
 ('by', 'IN'),
 ('a', 'DT'),
 ('stolen', 'VBN'),
 ('bride', 'NN'),
 ('to', 'TO'),
 ('be', 'VB'),
 ('and', 'CC'),
 ('the', 'DT'),
 ('blind', 'JJ'),
 ('ambitions', 'NNS'),
 ('of', 'IN'),
 ('a', 'DT'),
 ('mad', 'JJ'),
 ('King', 'NNP'),
 (',', ','),
 ('Robert', 'NNP'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('house', 'NN'),
 ('Baratheon', 'NNP'),
 ('(', '('),
 ('Mark', 'NNP'),
 ('Addy', 'NNP'),
 (')', ')'),
 ('sits', 'VBZ'),
 ('on', 'IN'),
 ('the', 'DT'),
 ('much', 'RB'),
 ('desired', 'VBN'),
 ('Iron', 'NNP'),
 ('Throne', 'NNP'),
 ('.', '.'),
 ('In', 'IN'),
 ('the', 'DT'),
 ('mythical', 'JJ'),
 ('land', 'NN'),
 ('of', 'IN'),
 ('Westeros', 'NNP'),
 (',', ','),
 ('nine', 'CD'),
 ('noble', 'JJ'),
 ('families', 'NNS'),
 ('fight', 'VBP'),
 ('for', 'IN'),
 ('every', 'DT'),
 ('inch', 'NN'),
 ('of', 'IN'),
 ('control', 'NN'),
 ('and', 'CC'),
 ('every', 'DT'),
 ('drop', 'NN'),
 ('of', 'IN'),
 ('power', 'NN'),
 ('.', '.')]

### Stop Words Removal

In [None]:
# Fetch the stop-word corpus read through nltk.corpus.stopwords.
nltk.download(info_or_id='stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords

In [None]:
# English stop words held in a set for fast membership checks.
stop_words = {*stopwords.words('english')}

In [None]:
# Drop stop words from the token list.
# The comparison is done on the lower-cased token: a case-sensitive test lets
# capitalised stop words such as 'In' slip through the filter.
cleaned_text = [x for x in word_tokens if x.lower() not in stop_words]
cleaned_text

['Years',
 'rebellion',
 'spurred',
 'stolen',
 'bride',
 'blind',
 'ambitions',
 'mad',
 'King',
 ',',
 'Robert',
 'house',
 'Baratheon',
 '(',
 'Mark',
 'Addy',
 ')',
 'sits',
 'much',
 'desired',
 'Iron',
 'Throne',
 '.',
 'In',
 'mythical',
 'land',
 'Westeros',
 ',',
 'nine',
 'noble',
 'families',
 'fight',
 'every',
 'inch',
 'control',
 'every',
 'drop',
 'power',
 '.']

### Stemming

In [None]:
from nltk.stem import PorterStemmer
# Single stemmer instance reused by the stemming cells that follow.
porter_stem = PorterStemmer()

In [None]:
# Show each stop-word-filtered token next to its Porter stem.
for token in cleaned_text:
    stemmed = porter_stem.stem(token)
    print(token, ' : ', stemmed)

Years  :  year
rebellion  :  rebellion
spurred  :  spur
stolen  :  stolen
bride  :  bride
blind  :  blind
ambitions  :  ambit
mad  :  mad
King  :  king
,  :  ,
Robert  :  robert
house  :  hous
Baratheon  :  baratheon
(  :  (
Mark  :  mark
Addy  :  addi
)  :  )
sits  :  sit
much  :  much
desired  :  desir
Iron  :  iron
Throne  :  throne
.  :  .
In  :  In
mythical  :  mythic
land  :  land
Westeros  :  westero
,  :  ,
nine  :  nine
noble  :  nobl
families  :  famili
fight  :  fight
every  :  everi
inch  :  inch
control  :  control
every  :  everi
drop  :  drop
power  :  power
.  :  .


In [None]:
tokens = [porter_stem.stem(w) for w in word_tokens]

In [None]:
tokens

['year',
 'after',
 'a',
 'rebellion',
 'spur',
 'by',
 'a',
 'stolen',
 'bride',
 'to',
 'be',
 'and',
 'the',
 'blind',
 'ambit',
 'of',
 'a',
 'mad',
 'king',
 ',',
 'robert',
 'of',
 'the',
 'hous',
 'baratheon',
 '(',
 'mark',
 'addi',
 ')',
 'sit',
 'on',
 'the',
 'much',
 'desir',
 'iron',
 'throne',
 '.',
 'In',
 'the',
 'mythic',
 'land',
 'of',
 'westero',
 ',',
 'nine',
 'nobl',
 'famili',
 'fight',
 'for',
 'everi',
 'inch',
 'of',
 'control',
 'and',
 'everi',
 'drop',
 'of',
 'power',
 '.']

### Lemmatization

In [None]:
# Fetch the WordNet corpus that WordNetLemmatizer looks words up in.
nltk.download(info_or_id='wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
from nltk.stem import WordNetLemmatizer
# Lemmatizer instance used by the example lookups in the next cells.
lemma = WordNetLemmatizer()

In [None]:
# Lemmatize as a noun (the lemmatizer's default part of speech).
lemma.lemmatize('rebellion', pos='n')

'rebellion'

In [None]:
# Plural noun example: lemmatized as a noun (the default part of speech).
lemma.lemmatize('thrones', pos='n')

'throne'

### Term Frequency

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counter with default settings; fitted on the tokens in the next cell.
vector = CountVectorizer()

In [None]:
# Each entry of word_tokens is passed as a separate document, so the result is
# a sparse count matrix with one row per token and one column per vocabulary term.
transformed_df = vector.fit_transform(raw_documents=word_tokens)

In [None]:
# Total occurrences of every vocabulary term across all rows of the count matrix.
# Summing over axis 0 of the sparse matrix gives a 1 x n_terms matrix;
# np.asarray(...).ravel() flattens it into a plain 1-D array of counts.
# (np.ravel(transformed_df).sum() does not produce per-term totals, and
# get_feature_names() is no longer available in current scikit-learn.)
term_counts = np.asarray(transformed_df.sum(axis=0)).ravel()
for term, count in zip(vector.get_feature_names_out(), term_counts):
    print(term, ' ', count)

addy     (0, 40)	1
after     (0, 1)	1
ambitions   
and     (0, 31)	1
baratheon     (0, 34)	1
be     (0, 8)	1
blind   
bride     (0, 35)	1
by     (0, 7)	1
control     (0, 38)	1
desired     (0, 5)	1
drop     (0, 3)	1
every     (0, 36)	1
families     (0, 6)	1
fight     (0, 2)	1
for     (0, 28)	1
house   
in     (0, 22)	1
inch     (0, 20)	1
iron   
king     (0, 32)	1
land     (0, 28)	1
mad     (0, 36)	1
mark     (0, 16)	1
much     (0, 4)	1
mythical   
nine     (0, 23)	1
noble     (0, 0)	1
of   
on     (0, 33)	1
power     (0, 29)	1
rebellion     (0, 36)	1
robert     (0, 24)	1
sits     (0, 10)	1
spurred     (0, 19)	1
stolen     (0, 37)	1
the   
throne     (0, 17)	1
to     (0, 36)	1
westeros     (0, 25)	1
years     (0, 21)	1




### Inverse Document Frequency

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with default settings; fitted on the tokens in the next cell.
tfidf = TfidfVectorizer()

In [None]:
# Fit TF-IDF with every token treated as its own document, then list the
# learned inverse-document-frequency weight of each vocabulary term.
# NOTE(review): IDF is computed over single-token "documents" here; confirm
# that this is intended rather than fitting on the sentences.
result = tfidf.fit_transform(raw_documents=word_tokens)
idf_by_term = zip(tfidf.get_feature_names_out(), tfidf.idf_)
for term, idf_weight in idf_by_term:
    print(term, ' ', idf_weight)

addy   4.401197381662156
after   4.401197381662156
ambitions   4.401197381662156
and   3.995732273553991
baratheon   4.401197381662156
be   4.401197381662156
blind   4.401197381662156
bride   4.401197381662156
by   4.401197381662156
control   4.401197381662156
desired   4.401197381662156
drop   4.401197381662156
every   3.995732273553991
families   4.401197381662156
fight   4.401197381662156
for   4.401197381662156
house   4.401197381662156
in   4.401197381662156
inch   4.401197381662156
iron   4.401197381662156
king   4.401197381662156
land   4.401197381662156
mad   4.401197381662156
mark   4.401197381662156
much   4.401197381662156
mythical   4.401197381662156
nine   4.401197381662156
noble   4.401197381662156
of   3.302585092994046
on   4.401197381662156
power   4.401197381662156
rebellion   4.401197381662156
robert   4.401197381662156
sits   4.401197381662156
spurred   4.401197381662156
stolen   4.401197381662156
the   3.4849066497880004
throne   4.401197381662156
to   4.4011973816