In [42]:
import numpy as np
from nltk.tokenize import word_tokenize
import math
import pandas as pd

In [43]:
text = [
    "The Cricket World Cup is a major event in the world of sports.\n"+
    "Cricket players from different countries participate in the World Cup.\n"+
    "The final match of the World Cup was an exciting game.\n"+
    "Batsmen scored heavily in the tournament.\n"+
    "The World Cup trophy is awarded to the winning team.\n"
]

Added stop_words which occur frequently and carry no useful meaning. Else tf-idf can assign higher values to these words affecting its overall performance. These will not be included in the word set.

In [44]:
stop_words=['the','is','a','in','of','from','was','an','to']

In [45]:
sentences = []
word_set = set()

In [46]:
for sent in text:
    words = [word.lower() for word in word_tokenize(sent) if word.isalpha() and word not in stop_words]
    sentences.append(words)
    word_set.update(words)

In [47]:
print(word_set)
print(sentences)

{'scored', 'cup', 'game', 'participate', 'batsmen', 'different', 'major', 'countries', 'cricket', 'exciting', 'sports', 'final', 'tournament', 'winning', 'event', 'world', 'the', 'players', 'team', 'match', 'trophy', 'awarded', 'heavily'}
[['the', 'cricket', 'world', 'cup', 'major', 'event', 'world', 'sports', 'cricket', 'players', 'different', 'countries', 'participate', 'world', 'cup', 'the', 'final', 'match', 'world', 'cup', 'exciting', 'game', 'batsmen', 'scored', 'heavily', 'tournament', 'the', 'world', 'cup', 'trophy', 'awarded', 'winning', 'team']]


In [48]:
index_dict = {word: i for i, word in enumerate(word_set)}

In [49]:
word_count = {word: 0 for word in word_set}
for word in word_set:
    for sent in sentences:
        if word in sent:
            word_count[word] += 1
total_documents = len(sentences)

added smoothing (+1) to prevent divison by 0 in the calculation of idf

In [50]:
def tf_idf(sentence):
    tf_idf_vec = np.zeros((len(word_set),))
    for word in sentence:
        tf = sentence.count(word) / len(sentence)  
        idf = math.log(total_documents / (word_count[word] + 1)) 
        tf_idf_value = tf * idf
        tf_idf_vec[index_dict[word]] = tf_idf_value
    return tf_idf_vec

normalized the tf-idf values by dividing by the euclidean norm i.e. the root of the sum of squares of its elements

In [51]:
vectors = [tf_idf(sent) for sent in sentences]
vectors = [vec / np.linalg.norm(vec) for vec in vectors]

In [52]:
print(vectors[0])

[-0.11704115 -0.46816459 -0.11704115 -0.11704115 -0.11704115 -0.11704115
 -0.11704115 -0.11704115 -0.23408229 -0.11704115 -0.11704115 -0.11704115
 -0.11704115 -0.11704115 -0.11704115 -0.58520574 -0.35112344 -0.11704115
 -0.11704115 -0.11704115 -0.11704115 -0.11704115 -0.11704115]


In [53]:
embedding1=pd.DataFrame({'word':list(word_set),'embedding':vectors[0]})

In [54]:
print(embedding1)

           word  embedding
0        scored  -0.117041
1           cup  -0.468165
2          game  -0.117041
3   participate  -0.117041
4       batsmen  -0.117041
5     different  -0.117041
6         major  -0.117041
7     countries  -0.117041
8       cricket  -0.234082
9      exciting  -0.117041
10       sports  -0.117041
11        final  -0.117041
12   tournament  -0.117041
13      winning  -0.117041
14        event  -0.117041
15        world  -0.585206
16          the  -0.351123
17      players  -0.117041
18         team  -0.117041
19        match  -0.117041
20       trophy  -0.117041
21      awarded  -0.117041
22      heavily  -0.117041


In [55]:
embedding1.to_csv("embedding1.csv",index=False)