# Text_Representation_NLP_Assessment

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('reviews.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,review,sentiment
0,0,One of the other reviewers has mentioned that ...,positive
1,1,A wonderful little production. <br /><br />The...,positive
2,2,I thought this was a wonderful way to spend ti...,positive
3,3,Basically there's a family where a little boy ...,negative
4,4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df.sample(4)

Unnamed: 0.1,Unnamed: 0,review,sentiment
3085,3085,Not only is this a great African-American clas...,positive
4385,4385,In this film I prefer Deacon Frost. He's so se...,positive
2253,2253,The Twins Effect - Chinese Action/Comedy - (Ch...,positive
1548,1548,"Wow,this is in my opinion the best sitcom sinc...",positive


In [5]:
df.shape

(5000, 3)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5000 non-null   int64 
 1   review      5000 non-null   object
 2   sentiment   5000 non-null   object
dtypes: int64(1), object(2)
memory usage: 117.3+ KB


In [7]:
df.isnull().sum()

Unnamed: 0    0
review        0
sentiment     0
dtype: int64

# Text Preprocessing

## 1. Lowercasing

In [8]:
df['review'] = df['review'].str.lower()

In [9]:
df.head()

Unnamed: 0.1,Unnamed: 0,review,sentiment
0,0,one of the other reviewers has mentioned that ...,positive
1,1,a wonderful little production. <br /><br />the...,positive
2,2,i thought this was a wonderful way to spend ti...,positive
3,3,basically there's a family where a little boy ...,negative
4,4,"petter mattei's ""love in the time of money"" is...",positive


## 2. Removing  HTML Tags

In [10]:
import re

In [11]:
df['review'][0]

"one of the other reviewers has mentioned that after watching just 1 oz episode you'll be hooked. they are right, as this is exactly what happened with me.<br /><br />the first thing that struck me about oz was its brutality and unflinching scenes of violence, which set in right from the word go. trust me, this is not a show for the faint hearted or timid. this show pulls no punches with regards to drugs, sex or violence. its is hardcore, in the classic use of the word.<br /><br />it is called oz as that is the nickname given to the oswald maximum security state penitentary. it focuses mainly on emerald city, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. em city is home to many..aryans, muslims, gangstas, latinos, christians, italians, irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />i would say the main appeal of the show is due to the fa

In [12]:
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

In [13]:
df['review'] = df['review'].apply(remove_html_tags)

In [14]:
df['review'][0]

"one of the other reviewers has mentioned that after watching just 1 oz episode you'll be hooked. they are right, as this is exactly what happened with me.the first thing that struck me about oz was its brutality and unflinching scenes of violence, which set in right from the word go. trust me, this is not a show for the faint hearted or timid. this show pulls no punches with regards to drugs, sex or violence. its is hardcore, in the classic use of the word.it is called oz as that is the nickname given to the oswald maximum security state penitentary. it focuses mainly on emerald city, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. em city is home to many..aryans, muslims, gangstas, latinos, christians, italians, irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.i would say the main appeal of the show is due to the fact that it goes where other shows wo

## 3. Removing Punctuation

In [15]:
df['review'][3000]

'i bought a set of 4 dvds for 10 bucks at my local suncoast, which contained this movie and three other trashy horror flicks (including its sequel "witchcraft xi"). so basically i paid the rock bottom price of $2.50 for this movie, if you do the math. i can\'t exactly say i was ripped off. i have a thing for trashy horror movies, but this is the kind of trash that gives trash a bad name. the budget couldn\'t be over $1,000 (though it appears as if they spent a total of $1.50). i know it\'s a low-budget film, but that\'s no excuse for totally uninspired camerawork. the film "blood cult," though not very good, was made for an extremely low budget and still had fairly good camerawork and acting. the acting in this movie is the definition of "effortless," especially from that muscular guy with the texas accent. everyone is pretty much reading their lines off the page. you can take that figuratively or literally. i wouldn\'t be surprised if the script was off-camera as they were performing.

In [16]:
import string

In [17]:
exclude = string.punctuation

def remove_punc(text):
    return text.translate(str.maketrans('', '', exclude))

In [18]:
df['review'] = df['review'].apply(remove_punc)

In [19]:
df['review'][3000]

'i bought a set of 4 dvds for 10 bucks at my local suncoast which contained this movie and three other trashy horror flicks including its sequel witchcraft xi so basically i paid the rock bottom price of 250 for this movie if you do the math i cant exactly say i was ripped off i have a thing for trashy horror movies but this is the kind of trash that gives trash a bad name the budget couldnt be over 1000 though it appears as if they spent a total of 150 i know its a lowbudget film but thats no excuse for totally uninspired camerawork the film blood cult though not very good was made for an extremely low budget and still had fairly good camerawork and acting the acting in this movie is the definition of effortless especially from that muscular guy with the texas accent everyone is pretty much reading their lines off the page you can take that figuratively or literally i wouldnt be surprised if the script was offcamera as they were performing i said before that ive never seen a bad engli

## 4. Stemming

In [20]:
from nltk.stem.porter import PorterStemmer

In [21]:
ps = PorterStemmer()
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

In [22]:
df['review'] = df['review'].apply(stem_words)

In [23]:
df['review'][0]

'one of the other review ha mention that after watch just 1 oz episod youll be hook they are right as thi is exactli what happen with meth first thing that struck me about oz wa it brutal and unflinch scene of violenc which set in right from the word go trust me thi is not a show for the faint heart or timid thi show pull no punch with regard to drug sex or violenc it is hardcor in the classic use of the wordit is call oz as that is the nicknam given to the oswald maximum secur state penitentari it focus mainli on emerald citi an experiment section of the prison where all the cell have glass front and face inward so privaci is not high on the agenda em citi is home to manyaryan muslim gangsta latino christian italian irish and moreso scuffl death stare dodgi deal and shadi agreement are never far awayi would say the main appeal of the show is due to the fact that it goe where other show wouldnt dare forget pretti pictur paint for mainstream audienc forget charm forget romanceoz doesnt 

## 5. Tokenization

In [24]:
import spacy

In [25]:
nlp = spacy.load('en_core_web_sm')

def tokenize_review(review):
    doc = nlp(review)
    tokens = [token.text for token in doc if not token.is_space]
    
    return tokens

In [26]:
df['review'] = df['review'].apply(tokenize_review)

In [27]:
df.head()

Unnamed: 0.1,Unnamed: 0,review,sentiment
0,0,"[one, of, the, other, review, ha, mention, tha...",positive
1,1,"[a, wonder, littl, product, the, film, techniq...",positive
2,2,"[i, thought, thi, wa, a, wonder, way, to, spen...",positive
3,3,"[basic, there, a, famili, where, a, littl, boy...",negative
4,4,"[petter, mattei, love, in, the, time, of, mone...",positive


In [28]:
df['review'][0]

['one',
 'of',
 'the',
 'other',
 'review',
 'ha',
 'mention',
 'that',
 'after',
 'watch',
 'just',
 '1',
 'oz',
 'episod',
 'you',
 'll',
 'be',
 'hook',
 'they',
 'are',
 'right',
 'as',
 'thi',
 'is',
 'exactli',
 'what',
 'happen',
 'with',
 'meth',
 'first',
 'thing',
 'that',
 'struck',
 'me',
 'about',
 'oz',
 'wa',
 'it',
 'brutal',
 'and',
 'unflinch',
 'scene',
 'of',
 'violenc',
 'which',
 'set',
 'in',
 'right',
 'from',
 'the',
 'word',
 'go',
 'trust',
 'me',
 'thi',
 'is',
 'not',
 'a',
 'show',
 'for',
 'the',
 'faint',
 'heart',
 'or',
 'timid',
 'thi',
 'show',
 'pull',
 'no',
 'punch',
 'with',
 'regard',
 'to',
 'drug',
 'sex',
 'or',
 'violenc',
 'it',
 'is',
 'hardcor',
 'in',
 'the',
 'classic',
 'use',
 'of',
 'the',
 'wordit',
 'is',
 'call',
 'oz',
 'as',
 'that',
 'is',
 'the',
 'nicknam',
 'given',
 'to',
 'the',
 'oswald',
 'maximum',
 'secur',
 'state',
 'penitentari',
 'it',
 'focus',
 'mainli',
 'on',
 'emerald',
 'citi',
 'an',
 'experiment',
 'section

# Text Representation

### 1. Find out the number of words in the entire corpus and also the total number of unique words(vocabulary)

In [29]:
# Combine all text from the 'review' column into a single corpus string
corpus = ' '.join(df['review'].astype(str))

# Tokenize the corpus into words (assuming words are separated by spaces)
words = corpus.split()

# Calculate the total number of words in the corpus
total_words = len(words)

# Find the unique words (vocabulary) in the corpus
unique_words = set(words)
vocab_size = len(unique_words)

# Print the results
print(f"Total number of words in the corpus: {total_words}")
print(f"Total number of unique words (vocabulary) in the corpus: {vocab_size}")

Total number of words in the corpus: 1140659
Total number of unique words (vocabulary) in the corpus: 43904


### 2. Apply bag words and find the vocabulary also find the times each word has occured

In [30]:
from sklearn.feature_extraction.text import CountVectorizer

In [31]:
# Create a CountVectorizer object to tokenize the text and build the bag of words
vectorizer = CountVectorizer()

# Fit the CountVectorizer on your review column and transform it into a bag of words representation
X = vectorizer.fit_transform(df['review'].astype(str))

# Get the vocabulary (unique words) from the CountVectorizer
vocabulary = vectorizer.get_feature_names_out()

# Get the word frequencies from the bag of words representation
word_frequencies = X.toarray().sum(axis=0)

# Create a dictionary to store the vocabulary and word frequencies
bag_of_words = dict(zip(vocabulary, word_frequencies))

In [32]:
print(len(bag_of_words.keys()))

41250


In [33]:
# Display the vocabulary and word frequencies
print("Vocabulary:", list(bag_of_words.keys()))
print("Word Frequencies:")
for word, freq in bag_of_words.items():
    print(f"{word}: {freq}")

Word Frequencies:
00: 1
007: 10
010: 6
02: 3
0310: 1
05: 1
0510: 1
06: 2
07: 1
0815: 1
0clock: 1
0s: 1
0the: 1
10: 412
100: 83
1000: 16
10000: 2
100000: 6
10002000: 1
100after: 1
100if: 1
100minut: 1
100th: 3
100time: 1
101: 12
1010: 48
1010ay: 1
1010the: 1
1014: 1
1015: 5
101year: 1
102: 1
1020: 2
1020000: 1
103: 2
1030: 1
105: 5
106: 1
107: 2
1072000: 1
108: 1
1095: 1
10a: 1
10bad: 1
10dimension: 1
10dirbrad: 1
10diremmanuel: 1
10dirgeorg: 1
10dirjon: 1
10dirmick: 1
10fda: 1
10for: 1
10go: 1
10good: 1
10i: 2
10it: 1
10may: 1
10note: 1
10now: 1
10page: 1
10star: 1
10th: 5
10the: 1
10thi: 2
10well: 1
10what: 1
10which: 1
10x: 1
11: 32
110: 28
1100: 2
1100ad: 1
11072004: 1
111: 1
1110: 2
1111: 3
1130pm: 1
1133: 1
1138: 1
116: 1
1166: 1
116th: 1
117: 2
11th: 2
12: 95
120: 5
12000000: 2
1201: 1
1201pm: 1
120page: 1
1213: 1
12134: 1
1214: 1
1229: 1
123: 2
1230: 1
12638: 1
129: 1
12a: 1
12hour: 1
12it: 1
12th: 2
12thrateat: 1
12vintag: 1
12yearold: 4
13: 44
1300: 3
13000: 2
133: 1
1331: 1
1

In [34]:
# Display the vocabulary separately
print("Vocabulary:")
for word in bag_of_words.keys():
    print(word)

Vocabulary:
00
007
010
02
0310
05
0510
06
07
0815
0clock
0s
0the
10
100
1000
10000
100000
10002000
100after
100if
100minut
100th
100time
101
1010
1010ay
1010the
1014
1015
101year
102
1020
1020000
103
1030
105
106
107
1072000
108
1095
10a
10bad
10dimension
10dirbrad
10diremmanuel
10dirgeorg
10dirjon
10dirmick
10fda
10for
10go
10good
10i
10it
10may
10note
10now
10page
10star
10th
10the
10thi
10well
10what
10which
10x
11
110
1100
1100ad
11072004
111
1110
1111
1130pm
1133
1138
116
1166
116th
117
11th
12
120
12000000
1201
1201pm
120page
1213
12134
1214
1229
123
1230
12638
129
12a
12hour
12it
12th
12thrateat
12vintag
12yearold
13
1300
13000
133
1331
1335i
134
135
1371sound
13848
13but
13far
13th
13thversion
13thwhat
13yearold
14
140
140am
1415
1416
142cuz
1470
1473
1489
14goingon9
14th
14yrold
15
150
1500
150000
1500000
1500sthe
1516yearsold
1517
1520
15bu
15minut
15th
15the
15year
15yearold
16
160
1617
163minut
169
1697
16hour
16th
17
1700
172003
1745
175
1750
177
1775
1790
1794
17th
18
180

gokermit
golanglobu
gold
goldact
goldberg
goldblum
goldcliché
golddig
golddigg
golden
goldeney
goldeneyethi
goldfing
goldfish
goldi
goldman
goldmask
goldmin
goldoni
goldsel
goldsmith
goldsworthi
goldth
goldthwait
golem
golf
golfyou
golino
golli
golmal
golthwait
golubeva
gomer
gomez
gon
gondek
gone
gonear
gonecassavet
goneh
goner
goneth
gong
gonorrheashould
gonzal
gonzalo
gonzo
gonzález
goo
goober
good
good310
good3internet
good4ghost
good7
good8
goodabout
goodakshay
goodal
goodand
goodbad
goodbut
goodby
goodchild
gooddoer
goodenjoy
goodentertain
goodfella
goodfor
goodgfella
goodgreat
goodheart
goodi
goodin
goodit
goodlook
goodman
goodmi
goodnatur
goodnessthat
goodnessthos
goodnight
goodnightw
goodohwait
goodon
goodplay
goodth
goodther
goodthi
goodthough
goodtim
gooduday
goodvirtu
goodwhil
goodwil
goodygoodi
goodyou
goodyun
gooey
goof
goofbal
goofi
goofiest
goofili
goofylook
goofysur
goofyunfortun
googi
googl
goon
gooni
goos
goosebump
gopal
gopalakrishnan
gopher
gor
goran
goranson
gorbu

In [35]:
# Display the word frequencies separately
print("\nWord Frequencies:")
for freq in bag_of_words.values():
    print(freq)


Word Frequencies:
1
10
6
3
1
1
1
2
1
1
1
1
1
412
83
16
2
6
1
1
1
1
3
1
12
48
1
1
1
5
1
1
2
1
2
1
5
1
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
1
1
1
1
1
1
5
1
2
1
1
1
1
32
28
2
1
1
1
2
3
1
1
1
1
1
1
2
2
95
5
2
1
1
1
1
1
1
1
2
1
1
1
1
1
1
2
1
1
4
44
3
2
1
1
1
2
1
1
1
1
1
23
1
1
1
25
2
1
2
2
1
1
1
1
1
4
1
104
6
1
1
2
1
1
1
3
1
1
1
1
1
1
21
2
1
1
1
1
1
2
17
1
1
1
1
1
1
1
1
2
1
21
4
8
1
1
1
1
2
1
2
2
3
1
4
1
2
2
1
1
1
3
1
1
2
6
1
11
1
1
3
2
6
1
1
1
1
2
1
2
2
1
1
1
1
1
1
14
5
6
1
1
1
1
2
4
2
1
34
6
1
10
10
1
7
4
16
9
8
12
1
40
1
1
3
1
9
1
8
8
1
14
1
5
1
11
9
8
1
1
60
2
1
1
1
1
1
1
9
9
9
8
1
9
1
1
4
1
1
1
13
3
10
40
1
1
6
5
1
2
9
1
2
11
8
1
16
1
13
1
59
1
1
1
11
16
1
1
12
13
1
9
1
1
14
9
13
10
1
1
52
14
1
1
12
17
1
18
1
9
10
12
1
1
1
1
14
10
3
24
3
1
4
8
14
18
11
10
1
1
1
13
1
1
26
1
1
1
13
1
1
1
2
1
1
1
1
1
1
1
1
2
1
31
1
1
1
141
9
34
3
1
1
1
1
32
1
23
21
1
14
1
31
21
1
19
1
6
1
1
5
1
1
1
1
1
1
1
2
4
1
1
1
1
1
24
1
7
18
1
1
16
1
16
1
1
1
2
1
20
1
1
1
6
1
19
1
1
45
8
1
2
1
1
1
1
3
9
3
1
2
1
8
1
1
2

1
2
5
1
1
1
2
37
5
19
2
1
1
1
14
2
1
2
3
7
5
13
24
1
1
12
1
1
1
1
2
1
2
29
5
3
1
7
28
1
2
1
1
1
1
6
2
2
14
1
1
14
1
1
1
1
323
1
1
2
1
1
2
1
1
1
1
1
1
1
1
1
2
1
1
1
1
1
1
1
2
2
11
1
28
1
5
1
5
2
4
11
24
2
1
8
29
1
1
4
29
1
2
1
4
1
5
13
1
29
3
6
226
7
1
1
2
1
16
1
1
1
42
21
1
1
2
1
3
2
10
5
1
9
7
3
2
21
3
1
1
1
2
1
5
13
4
29
5
3
1
2
18
14
1
3
10
15
1
1
2
1
12
8
1
1
5
2
2
3
2
2
7
1
405
6
1
1
1
1
1
1
2
21
1
2
9
1
2
7
3
19
16
1
1
3
1
24
1
13
1
5
4
2
2
1
1
1
11
88
1
12
1
1
1
6
5
2
1
5
2
1
1
13
1
17
2
2
2
5
38
4
1
13
3
5
34
1
10
1
9
1
5
19
5
1
1
14
1
2
5
321
1
1
1
1
1
6
1
1
1
1
1
2
6
2
13
4
1
22
14
23
2
1
1
27
1
1
1
41
22
1
2
4
3
1
49
1
1
5
1
1
1
6
1
1
3
1
10
9
5
1
6
2
20
1
1
1
5
3
71
1
1
1
1
1
34
3
2
1
1
1740
1
1
1
2
4
1
1
1
1
30
1
1
4
9
1
1
1
1
1
2
1
2
1
1
1
7
1
1
1
3
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
31
11
1
1
1
1
2
5
2
1
1
1
1
1
1
1
1
1
7
1
1
1
1
35
1
1
1
1
1
51
1
2
2
13
1
71
1
1
1
4
1
15
102
1
1
1
36
1
51
1
1
1
1
220
1
3
1
5
1
1
3
1
14
1
16
1
4
34
1
3
9
6
810
1
1
1
1
1
1
1
1
1
1
10
1
11
1


1
6
1
1
2
359
3
8
8
7
3
6
2
2
1
1
1
70
1
4
1
1
2
5
3
3
27
1
1
1
4
1
1
9
4
3
7
1
8
3
1
1
1
2454
1
2
4
1
9
3
1
12
19
1
2
5
1
5
1
2
1
3
1
1
2
1
19
1
2
1
1
1
209
1
1
1
17
7
1
3
2
1
3
6
1
1
1
1
4
42
12
2
6
4
2
1
1
6
2
6
3
1
1
1
1
2
1
1
1
1
2
3
31
1
1
18
4
4
1
2
1
1
2
1
4
1
1
7
3
1
813
1
42
1
5
8
1
1
1
1
1
1
1
1
1
1
1023
1
1
1
1
1
1
15
1
41
1
1
1
1
1
1
4
1
1
1
1
2
1
6
2
1
1
6
1
400
1
1
2
3
1
2
8
1
1
1
1
1
19
4
3
1
51
95
1
1
9
1
1
20
1
1
2
2
8
47
1
1
1
3
3
1
2
10
1
1
3
3
3
30
1
2
1
13
1
2
16
2
1
12
3
2
10
3
3
2
1
10
2
2
3
15
1
1
1
1
1
1
2
1
1
195
9
1
2
1
2
1
1
5
136
1
1
1
1
1
1
1
1
1
1
1
81
6
1
1
8
1
1
1
1
1
1
1
7
1
1
3
5
22
1
10
2
2
1
3
6
1
1
1
6
19
2
5
19
46
1
1
3
136
2
1
1
2
1
1
1
1
1
18
22
1
2
1
2
10
1
4
4
4
12
1
1
1
11
1
678
1
1
5
1
23
1
1
7
1
1
3
1
2
55
1
1
7
3
11
1
1
2
1
3
1
1
1
6
1
6
1
1
2
1
1
3
1
1
6
3
2
1
1
4
4
9
21
1
1
1
2
13
7
1
3
1
3
1
1
29
4
14
1
9
2
6
1
19
3
1
1
4
1
1
9
1
10
13
1
2797
1
1
2
3
1
5
2
2
1
5
1
1
1
2
1
8
2
1
11
13
1
1
1
16
442
1
1
1
1
1
1
22
1
5
2
1
1
1
1
4
1
3
1
14

### 3. Apply bag of bi-gram and bag of tri-gram and write down your observation about the dimensionality of the vocabulary

#### Bi-Gram

In [36]:
# Create a CountVectorizer object for bag of bi-grams
bi_gram_vectorizer = CountVectorizer(ngram_range=(2, 2))

# Fit and transform the CountVectorizer for bi-grams on your review column
bi_gram_X = bi_gram_vectorizer.fit_transform(df['review'].astype(str))

# Get the vocabulary for bi-grams
bi_gram_vocabulary = bi_gram_vectorizer.get_feature_names_out()

# Print the dimensionality of the bi-gram vocabulary
print("Dimensionality of Bag of Bi-grams Vocabulary:", len(bi_gram_vocabulary))

Dimensionality of Bag of Bi-grams Vocabulary: 397927


In [37]:
bi_gram_vocabulary

array(['00 and', '007 agent', '007 debut', ..., 'ïn america',
       'önsjön and', 'überwoman the'], dtype=object)

#### Tri-Gram

In [38]:
# Create a CountVectorizer object for bag of tri-grams
tri_gram_vectorizer = CountVectorizer(ngram_range=(3, 3))

# Fit and transform the CountVectorizer for tri-grams on your review column
tri_gram_X = tri_gram_vectorizer.fit_transform(df['review'].astype(str))

# Get the vocabulary for tri-grams
tri_gram_vocabulary = tri_gram_vectorizer.get_feature_names_out()

# Print the dimensionality of the tri-gram vocabulary
print("Dimensionality of Bag of Tri-grams Vocabulary:", len(tri_gram_vocabulary))

Dimensionality of Bag of Tri-grams Vocabulary: 831859


In [39]:
tri_gram_vocabulary

array(['00 and as', '007 agent and', '007 debut in', ...,
       'ïn america with', 'önsjön and it', 'überwoman the snobbi'],
      dtype=object)

### 4. Apply TF-IDF and find out the idf scores of words, also find out the vocabulary

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [41]:
# Create a TfidfVectorizer object
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the TfidfVectorizer on your review column
tfidf_matrix = tfidf_vectorizer.fit_transform(df['review'].astype(str))

# Get the vocabulary from the TfidfVectorizer
vocabulary = tfidf_vectorizer.get_feature_names_out()

# Get the IDF scores from the TfidfVectorizer
idf_scores = tfidf_vectorizer.idf_

# Create a dictionary to store the vocabulary and IDF scores
idf_dict = dict(zip(vocabulary, idf_scores))

In [42]:
# Display the vocabulary
print("Vocabulary:", vocabulary)

Vocabulary: ['00' '007' '010' ... 'ïn' 'önsjön' 'überwoman']


In [43]:
# Display the IDF scores for each word
print("\nIDF Scores:")
for word, idf in idf_dict.items():
    print(f"{word}: {idf}")


IDF Scores:
00: 8.824245990858959
007: 7.571483022363591
010: 7.571483022363591
02: 8.131098810299013
0310: 8.824245990858959
05: 8.824245990858959
0510: 8.824245990858959
06: 8.824245990858959
07: 8.824245990858959
0815: 8.824245990858959
0clock: 8.824245990858959
0s: 8.824245990858959
0the: 8.824245990858959
10: 3.6651906916444292
100: 5.147945318951882
1000: 6.684179827362688
10000: 8.418780882750795
100000: 7.725633702190849
10002000: 8.824245990858959
100after: 8.824245990858959
100if: 8.824245990858959
100minut: 8.824245990858959
100th: 8.131098810299013
100time: 8.824245990858959
101: 7.1194978986205335
1010: 5.625572873308277
1010ay: 8.824245990858959
1010the: 8.824245990858959
1014: 8.824245990858959
1015: 7.725633702190849
101year: 8.824245990858959
102: 8.824245990858959
1020: 8.418780882750795
1020000: 8.824245990858959
103: 8.418780882750795
1030: 8.824245990858959
105: 7.907955258984804
106: 8.824245990858959
107: 8.418780882750795
1072000: 8.824245990858959
108: 8.82424