In [1]:
import numpy as np
from gensim.models import FastText
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.data import find
import string

In [2]:
# Download NLTK data if not already available.
# Each resource is probed on its own: a machine that already has the stop-word
# corpus but not the tokenizer data must still fetch the tokenizer, otherwise
# word_tokenize() raises LookupError later in the notebook.
NLTK_RESOURCES = {
    'corpora/stopwords': 'stopwords',
    'tokenizers/punkt': 'punkt',
    # NOTE(review): recent NLTK releases (3.9+) load the Punkt tokenizer from
    # 'punkt_tab' instead of 'punkt' - confirm against the installed version.
    'tokenizers/punkt_tab': 'punkt_tab',
}
for resource_path, package_id in NLTK_RESOURCES.items():
    try:
        find(resource_path)
    except LookupError:
        nltk.download(package_id)

In [3]:
# Toy corpus: two short sentences used for every later step of the notebook
sentences = [
    "I love learning about natural language processing",
    "Natural language processing is an interesting field",
]

In [4]:
# Tokenize and preprocess sentences
_STOP_WORDS_CACHE = {}


def preprocess(text, stop_words=None, tokenizer=None):
    """Lower-case ``text``, tokenize it, and keep only alphabetic non-stop-words.

    Parameters
    ----------
    text : str
        Raw sentence to clean.
    stop_words : collection of str, optional
        Words to drop. Defaults to NLTK's English stop-word list, which is
        loaded once and cached so repeated calls do not re-read the corpus.
    tokenizer : callable, optional
        Function mapping a string to an iterable of tokens. Defaults to
        ``nltk.tokenize.word_tokenize``.

    Returns
    -------
    list of str
        Lower-cased tokens, in their original order, for which
        ``str.isalpha()`` is true and which are not in ``stop_words``.
    """
    if stop_words is None:
        # Build the default set only on first use; it never changes afterwards.
        if 'english' not in _STOP_WORDS_CACHE:
            _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOP_WORDS_CACHE['english']
    if tokenizer is None:
        tokenizer = word_tokenize

    return [
        word
        for word in tokenizer(text.lower())
        if word.isalpha() and word not in stop_words
    ]

In [5]:
# Clean every raw sentence, keeping the original sentence order
tokenized_sentences = list(map(preprocess, sentences))

In [6]:
# Show the cleaned token list of each sentence, numbered from 1
for position, tokens in enumerate(tokenized_sentences, start=1):
    print(f"Sentence {position}: {tokens}")

Sentence 1: ['love', 'learning', 'natural', 'language', 'processing']
Sentence 2: ['natural', 'language', 'processing', 'interesting', 'field']


In [7]:
# Train FastText model for word embeddings (skip-gram, 50-dimensional vectors).
# The seed is spelled out (1 is gensim's default) and training is pinned to a
# single worker thread: with several workers the order of updates depends on
# thread scheduling, so the vectors and the similarity printed below would
# differ from one run to the next.
model = FastText(
    sentences=tokenized_sentences,
    vector_size=50,
    window=5,
    min_count=1,
    sg=1,
    seed=1,
    workers=1,
)

In [8]:
# Keep a handle on the trained keyed vectors and list the words they index
word_vectors = model.wv
vocab = [*word_vectors.index_to_key]

In [9]:
# Show the embedding stored for every vocabulary word
print("\nWord Vectors:")
for token in vocab:
    embedding = word_vectors[token]
    print(f"{token}: {embedding}")


Word Vectors:
processing: [-1.4125750e-03  2.5246246e-03  4.3762542e-04  1.5069450e-03
  1.2998371e-03  3.4674790e-03 -1.4368900e-03  2.9371914e-03
 -2.4109876e-03  3.3887534e-04 -5.9384544e-04 -4.2074872e-04
 -3.4072454e-04 -6.3888740e-04  8.5927331e-04 -2.2283827e-03
  1.5533551e-03 -1.8594170e-04  2.9580118e-04 -3.2064372e-03
  3.5882583e-03  1.7079883e-03 -2.3575849e-03  1.7934639e-03
  2.6873202e-04  3.1212226e-03  1.5573500e-03  9.0156816e-04
 -3.1312986e-04  2.2482355e-04 -3.7807488e-04 -1.0531490e-03
  2.8874140e-04  1.7811084e-03 -2.1276380e-04 -1.5895528e-03
  5.3072785e-04  2.9739205e-04  9.6521614e-04  2.9506106e-03
  3.3989849e-03 -2.0648264e-03  1.8246549e-03  9.3141251e-05
 -3.4122440e-04  1.2844171e-03 -3.6812918e-03 -9.8966528e-04
  1.7976073e-03 -1.1527844e-05]
language: [-4.6082996e-03  3.7044389e-03  5.6315638e-04 -3.9709252e-03
 -3.2009007e-03  2.5712312e-03 -4.0966831e-04 -4.3883172e-04
 -2.2495789e-03 -1.6280501e-05  1.6985426e-03 -3.5934447e-04
  1.4138073e-04 

In [10]:
# Create document vectors by averaging word vectors
def get_document_vector(tokens, keyed_vectors=None):
    """Return the element-wise mean of the vectors of ``tokens``.

    Parameters
    ----------
    tokens : iterable of str
        Words of one document.
    keyed_vectors : optional
        Object supporting ``word in kv``, ``kv[word]`` and ``kv.vector_size``.
        Defaults to the notebook-level ``word_vectors``.

    Returns
    -------
    numpy.ndarray
        Mean of the vectors of all tokens found in ``keyed_vectors``; a
        float32 zero vector of length ``vector_size`` when none are found.
    """
    if keyed_vectors is None:
        keyed_vectors = word_vectors

    # NOTE(review): gensim documents that FastText keyed vectors report every
    # word as present when character n-grams are enabled, so this filter only
    # drops tokens for other vector types - confirm for the model in use.
    vectors = [keyed_vectors[word] for word in tokens if word in keyed_vectors]
    if not vectors:
        # float32 matches the dtype gensim stores its vectors in, so stacking
        # this fallback with real document vectors does not upcast the array.
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

In [11]:
# Stack one averaged vector per tokenized sentence, one row per sentence
doc_vectors = np.array(list(map(get_document_vector, tokenized_sentences)))

# Print document vectors
print("\nDocument Vectors:")
for doc_number, doc_vector in enumerate(doc_vectors, start=1):
    print(f"Document {doc_number}: {doc_vector}")


Document Vectors:
Document 1: [-1.7187942e-03  1.6956100e-03 -6.5716135e-04 -1.3682388e-03
 -8.3187374e-04  1.5386581e-04  5.8824650e-04  6.2160881e-04
 -2.7294751e-04 -7.1306730e-04 -4.5078783e-04 -1.5462168e-03
 -1.3213302e-04  3.1905880e-04  7.9224643e-04 -7.1973167e-04
  7.1140687e-04  1.2630358e-03 -1.1556464e-03  2.1705995e-04
  2.1446631e-03  1.2289801e-03 -1.4593254e-03  1.2116141e-03
  2.2710320e-04  9.2734536e-04 -1.8309545e-03 -1.4281613e-03
  3.7703087e-04  8.0546987e-04  1.7832678e-03  5.9319871e-05
 -1.6232359e-03  2.1623871e-03 -1.1628784e-03 -1.2030079e-03
  3.6635515e-04 -5.3122122e-04 -3.3186251e-04  2.0470582e-03
  4.9868098e-04  1.9761336e-03  1.2762714e-03  1.1561913e-03
 -3.6271365e-05 -6.8534492e-04 -1.7192587e-03 -1.9773419e-03
 -4.4275998e-04  1.1091233e-03]
Document 2: [-2.81073409e-03  7.89127022e-04  1.62449389e-04 -4.31706663e-04
  8.26908567e-04  2.21354747e-03 -8.46342824e-04  8.29081342e-04
 -1.12846994e-03  1.52801757e-03 -3.41372506e-04  1.21711244e-0

In [12]:
# Compute cosine similarity
# Called with a single argument, so every document vector is compared with
# every document vector (itself included); for the two sentences above the
# result is a 2x2 matrix.
similarity = cosine_similarity(doc_vectors)

In [13]:
# Print the heading and the pairwise similarity matrix on separate lines
print("\nCosine Similarity:", similarity, sep="\n")


Cosine Similarity:
[[1.0000001  0.5043084 ]
 [0.5043084  0.99999994]]
