In [1]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Documents
doc1 = """A computer is a complex electronic device designed to process and store data. It operates on the principles of binary code, using combinations of ones and zeros to represent information. The evolution of computers has been remarkable, starting from room-sized mainframes to today's powerful and compact devices. Modern computers consist of hardware components like processors, memory, storage, and input/output devices, complemented by software that enables various applications and functionalities. The computer has become an integral part of daily life, influencing fields such as communication, entertainment, education, and research."""

doc2 = """Artificial Intelligence, or AI, refers to the development of computer systems that can perform tasks that typically require human intelligence. This includes learning, reasoning, problem-solving, perception, and language understanding. Machine learning, a subset of AI, enables computers to improve their performance on a task through experience, without being explicitly programmed. AI applications range from virtual personal assistants and recommendation systems to complex tasks like image recognition, natural language processing, and autonomous vehicles. As AI continues to advance, ethical considerations and responsible development are crucial to ensure its positive impact on society."""

# Step 1: Preprocess the documents
# (No explicit preprocessing needed for this example)

# Step 2: Vectorize the documents
vectorizer = CountVectorizer()
vector = vectorizer.fit_transform([doc1, doc2])

# Step 3: Find cosine similarity
cos_sim = cosine_similarity(vector)

print("Cosine Similarity Without Embedding:")
print(cos_sim)


Cosine Similarity Without Embedding:
[[1.         0.42588623]
 [0.42588623 1.        ]]


In [3]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Step 1: Preprocess the documents
# Tokenize the documents
tokenized_doc1 = word_tokenize(doc1.lower())
tokenized_doc2 = word_tokenize(doc2.lower())

# Step 2: Use Word2Vec model for vector embedding
model = Word2Vec([tokenized_doc1, tokenized_doc2], vector_size=100, window=5, min_count=1, workers=4)

# Step 3: Find the cosine similarity
# Calculate the mean vector for each document
vec_doc1 = np.mean([model.wv[word] for word in tokenized_doc1], axis=0)
vec_doc2 = np.mean([model.wv[word] for word in tokenized_doc2], axis=0)

# Reshape vectors for cosine similarity calculation
vec_doc1 = vec_doc1.reshape(1, -1)
vec_doc2 = vec_doc2.reshape(1, -1)

# Calculate cosine similarity
cos_sim_embedding = cosine_similarity(vec_doc1, vec_doc2)

print("\nCosine Similarity With Embedding:")
print(cos_sim_embedding)



Cosine Similarity With Embedding:
[[0.7351834]]
