## Text vectorization

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Three toy documents; the individual names d1..d3 stay bound alongside the list
docs = [
    "information information data train",
    "computer information cpu computer",
    "computer retrieval information",
]
d1, d2, d3 = docs

# Fit the vectorizer on the documents and get one TF-IDF row per document
tfidf_vec = TfidfVectorizer()
tfidf_matrix = tfidf_vec.fit_transform(docs)

# Cosine similarity of every document against every document
cosine = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Report the dense TF-IDF rows first, then the similarity matrix
print("TF-IDF Vectors:", tfidf_matrix.toarray(), sep="\n")
print("\nCosine Similarities:", cosine, sep="\n")


TF-IDF Vectors:
[[0.         0.         0.54270061 0.64105545 0.         0.54270061]
 [0.7948031  0.52253528 0.         0.30861775 0.         0.        ]
 [0.54783215 0.         0.         0.42544054 0.72033345 0.        ]]

Cosine Similarities:
[[1.         0.19784109 0.27273098]
 [0.19784109 1.         0.5667172 ]
 [0.27273098 0.5667172  1.        ]]


In [2]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK resources this cell relies on: the "punkt" tokenizer models
# used by nltk.word_tokenize and the "stopwords" corpus read by
# stopwords.words("english"). Without the second download, a machine that has
# never fetched the stop-word list fails with LookupError during preprocessing.
# NOTE(review): recent NLTK releases load tokenizer data from "punkt_tab"
# instead of "punkt" — confirm against the installed NLTK version.
nltk.download("punkt")
nltk.download("stopwords")

# Load the corpus file: each line of corpus.txt is treated as one document.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used — confirm the file's actual encoding.
with open("corpus.txt", "r") as file:
    corpuss = file.read().splitlines()

# Preprocess each document before vectorizing: tokenize it, keep only the
# alphanumeric tokens, lowercase them, and remove English stop words.
def text(txt_file, stop_w=None):
    """Return one document cleaned up as a single space-separated string.

    Parameters
    ----------
    txt_file : str
        Raw text of one document (here, one line of the corpus file).
    stop_w : collection of str, optional
        Lowercase words to drop. When omitted, NLTK's English stop-word list
        is loaded for this call, which matches the original behavior.

    Returns
    -------
    str
        The surviving lowercase tokens joined by single spaces.
    """
    if stop_w is None:
        stop_w = set(stopwords.words("english"))
    # isalnum() discards punctuation tokens and any token that contains a
    # non-alphanumeric character
    tok = [w.lower() for w in nltk.word_tokenize(txt_file) if w.isalnum()]
    return " ".join(w for w in tok if w not in stop_w)


# The stop-word list is the same for every document, so build the set once
# and share it instead of re-reading it from NLTK on every call.
english_stop_w = set(stopwords.words("english"))
corpus = [text(doc, english_stop_w) for doc in corpuss]

def _print_rows(heading, matrix):
    """Print `heading`, then each row of the sparse `matrix` as a dense array."""
    print(heading)
    for row_no in range(matrix.shape[0]):
        print(f"Document {row_no + 1}: {matrix[row_no].toarray()}")


# Raw term counts for the preprocessed documents
count_vect = CountVectorizer()
freq_vect = count_vect.fit_transform(corpus)

# TF-IDF weights for the same documents
tfidf_vect = TfidfVectorizer()
tfidf = tfidf_vect.fit_transform(corpus)

# Show the count rows first, then the TF-IDF rows
_print_rows("Frequency vectors:", freq_vect)
_print_rows("\nTFIDF vectors:", tfidf)


Frequency vectors:
Document 1: [[1 0 0 1 0 0 1 0 1 0 0 1 1 0 1 0 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0
  0 0 1 1 0 0 1 0 0 0 1 0 1 0 0 0 1 1 0 0 1 0 0 1 0 1 1 1 0 0 1 0 0 0 0 0
  0 0 0 1 1 1 0 1 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 1 0 0 0 0 0]]
Document 2: [[0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
Document 3: [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0]]
Document 4: [[0 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0
  1 0 0 0 1 0 0 1 0 1 0 0 0 1 0 1 0 0 1 1 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 1
  0 0 1 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 1]]
Document 5: [[0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\annaj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Document-by-document cosine similarity matrix computed from the TF-IDF rows
sims = cosine_similarity(tfidf)

# Report each unordered pair of documents exactly once (i < j)
print("Cosine Similarities between Documents:")
doc_count = len(corpus)
pairs = ((a, b) for a in range(doc_count) for b in range(a + 1, doc_count))
for first, second in pairs:
    print(f"Document {first + 1} and Document {second + 1}: {sims[first][second]:.4f}")


Cosine Similarities between Documents:
Document 1 and Document 2: 0.0000
Document 1 and Document 3: 0.0000
Document 1 and Document 4: 0.0000
Document 1 and Document 5: 0.0000
Document 1 and Document 6: 0.0000
Document 2 and Document 3: 0.0000
Document 2 and Document 4: 0.0000
Document 2 and Document 5: 0.0647
Document 2 and Document 6: 0.0000
Document 3 and Document 4: 0.0644
Document 3 and Document 5: 0.0000
Document 3 and Document 6: 0.0000
Document 4 and Document 5: 0.0000
Document 4 and Document 6: 0.0000
Document 5 and Document 6: 0.1009
