In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
import glob
from pathlib import Path
import pandas as pd

In [3]:
directory_path = "./"
text_files = glob.glob(f"{directory_path}/texts/*.txt")
print(text_files)
text_titles = [Path(text).stem for text in text_files]
print(text_titles)

['.//texts/Trump Immigration Speech 8-31-16.txt', './/texts/Trump National Prayer Breakfast.txt', './/texts/Trump Congressional Address.txt', './/texts/Trump Police Chiefs Speech.txt', './/texts/Trump Black History Month Speech.txt', './/texts/Trump Inauguration Speech.txt', './/texts/Trump CIA Speech.txt', './/texts/Trump Nomination Speech.txt', './/texts/Trump Florida Rally 2-18-17.txt', './/texts/Trump Response to Healthcare Bill Failure.txt', './/texts/Trump CPAC Speech.txt']
['Trump Immigration Speech 8-31-16', 'Trump National Prayer Breakfast', 'Trump Congressional Address', 'Trump Police Chiefs Speech', 'Trump Black History Month Speech', 'Trump Inauguration Speech', 'Trump CIA Speech', 'Trump Nomination Speech', 'Trump Florida Rally 2-18-17', 'Trump Response to Healthcare Bill Failure', 'Trump CPAC Speech']


In [4]:
tfidf_vectorizer = TfidfVectorizer(input='filename', stop_words="english")
tfidf_vector = tfidf_vectorizer.fit_transform(text_files)
print("vector is: ", (tfidf_vector.shape))
# shape: (11, 3474), no of docs x vocabulary words

vector is:  (11, 3474)


In [5]:
words = tfidf_vectorizer.get_feature_names_out()
# print(tfidf_vectorizer.idf_)
print(words)

['000' '10' '100' ... 'zero' 'zone' 'zones']


In [6]:
tfidf_df = pd.DataFrame(tfidf_vector.toarray(), index=text_titles, columns=tfidf_vectorizer.get_feature_names_out())
tfidf_df

Unnamed: 0,000,10,100,100th,11,113,116,1182,12,13,...,years,yes,yesterday,young,youngest,youth,zeke,zero,zone,zones
Trump Immigration Speech 8-31-16,0.045362,0.020493,0.020493,0.0,0.057764,0.011263,0.0,0.0,0.008467,0.022526,...,0.057011,0.0,0.006209,0.0,0.0,0.0,0.0,0.03851,0.011263,0.019255
Trump National Prayer Breakfast,0.0,0.018028,0.0,0.0,0.0,0.0,0.0,0.0,0.022345,0.0,...,0.046296,0.0,0.016387,0.032773,0.0,0.0,0.0,0.0,0.0,0.0
Trump Congressional Address,0.017691,0.0,0.010656,0.017571,0.015019,0.0,0.013208,0.0,0.0,0.0,...,0.047889,0.011804,0.0,0.029058,0.0,0.021312,0.0,0.0,0.0,0.0
Trump Police Chiefs Speech,0.011344,0.0,0.013666,0.0,0.0,0.0,0.0,0.022533,0.0,0.0,...,0.017547,0.0,0.012421,0.037264,0.0,0.013666,0.0,0.01926,0.0,0.0
Trump Black History Month Speech,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.021274,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Trump Inauguration Speech,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.033039,0.028502,0.0,0.023388,0.0,0.0,0.0,0.0,0.0,0.0
Trump CIA Speech,0.115546,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.011171,0.0,0.063262,0.126524,0.0,0.0,0.057379,0.0,0.0,0.0
Trump Nomination Speech,0.038515,0.0116,0.0116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.044682,0.0,0.0,0.021087,0.019126,0.0116,0.0,0.0,0.0,0.0
Trump Florida Rally 2-18-17,0.016089,0.009691,0.0,0.0,0.0,0.0,0.012012,0.0,0.012012,0.0,...,0.03733,0.010735,0.017618,0.0,0.0,0.009691,0.0,0.0,0.0,0.054634
Trump Response to Healthcare Bill Failure,0.0,0.039125,0.078249,0.0,0.0,0.0,0.024247,0.0,0.0,0.0,...,0.0,0.021669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
query_path = "./query/query.txt"
query_vector = tfidf_vectorizer.transform([query_path])

query_df = pd.DataFrame(query_vector.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
print(query_vector.toarray())
print(tfidf_vectorizer.get_feature_names_out())


[[0. 0. 0. ... 0. 0. 0.]]
['000' '10' '100' ... 'zero' 'zone' 'zones']


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarities = cosine_similarity(query_df, tfidf_df)
print(text_files[cosine_similarities.argmax()])
print(cosine_similarities)



.//texts/Trump Inauguration Speech.txt
[[0.04133176 0.0291042  0.10933071 0.03025857 0.04713956 0.1199501
  0.04523411 0.09078907 0.05392253 0.05616625 0.08864281]]
0 [0.04133176 0.0291042  0.10933071 0.03025857 0.04713956 0.1199501
 0.04523411 0.09078907 0.05392253 0.05616625 0.08864281]


In [None]:
for idx, val in range(len(cosine_similarities[0])):
    print(idx)

0
1
2
3
4
5
6
7
8
9
10


In [48]:
doc_and_similarity = []

for idx, val in enumerate(cosine_similarities[0]):
    print("Text file: ", text_titles[idx], " , Similarity: ", val)
    doc_and_similarity.append((text_titles[idx], round(val, 3)))

Text file:  Trump Immigration Speech 8-31-16  , Similarity:  0.041331760413373234
Text file:  Trump National Prayer Breakfast  , Similarity:  0.029104196110069668
Text file:  Trump Congressional Address  , Similarity:  0.10933070585808205
Text file:  Trump Police Chiefs Speech  , Similarity:  0.03025857207314158
Text file:  Trump Black History Month Speech  , Similarity:  0.04713955905442806
Text file:  Trump Inauguration Speech  , Similarity:  0.11995010220187391
Text file:  Trump CIA Speech  , Similarity:  0.045234114663388586
Text file:  Trump Nomination Speech  , Similarity:  0.09078906556252624
Text file:  Trump Florida Rally 2-18-17  , Similarity:  0.053922530556413655
Text file:  Trump Response to Healthcare Bill Failure  , Similarity:  0.05616624749241182
Text file:  Trump CPAC Speech  , Similarity:  0.08864281287085676


In [49]:
print(doc_and_similarity)
doc_and_similarity.sort(key= lambda x: x[1], reverse=True)

[('Trump Immigration Speech 8-31-16', np.float64(0.041)), ('Trump National Prayer Breakfast', np.float64(0.029)), ('Trump Congressional Address', np.float64(0.109)), ('Trump Police Chiefs Speech', np.float64(0.03)), ('Trump Black History Month Speech', np.float64(0.047)), ('Trump Inauguration Speech', np.float64(0.12)), ('Trump CIA Speech', np.float64(0.045)), ('Trump Nomination Speech', np.float64(0.091)), ('Trump Florida Rally 2-18-17', np.float64(0.054)), ('Trump Response to Healthcare Bill Failure', np.float64(0.056)), ('Trump CPAC Speech', np.float64(0.089))]


In [60]:
for (file, score) in doc_and_similarity:
    print(f"File: ", file, ' :::: Similarity Score: ', score)

File:  Trump Inauguration Speech  :::: Similarity Score:  0.12
File:  Trump Congressional Address  :::: Similarity Score:  0.109
File:  Trump Nomination Speech  :::: Similarity Score:  0.091
File:  Trump CPAC Speech  :::: Similarity Score:  0.089
File:  Trump Response to Healthcare Bill Failure  :::: Similarity Score:  0.056
File:  Trump Florida Rally 2-18-17  :::: Similarity Score:  0.054
File:  Trump Black History Month Speech  :::: Similarity Score:  0.047
File:  Trump CIA Speech  :::: Similarity Score:  0.045
File:  Trump Immigration Speech 8-31-16  :::: Similarity Score:  0.041
File:  Trump Police Chiefs Speech  :::: Similarity Score:  0.03
File:  Trump National Prayer Breakfast  :::: Similarity Score:  0.029
