Importing Libraries

In [1]:
import json
import numpy as np
import gensim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# from gensim.models import FastText, Word2Vec, KeyedVectors
# from gensim.models.word2vec import Word2Vec
# from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.linear_model import LogisticRegression

Discovering the models available:

In [2]:
import gensim.downloader

print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


Loadind & discovering the data

In [3]:
# Load the data
data = []
with open(r'D:\4th year 2nd semester\NLP\Assignment-2 Common Sense Reasoning\train_rand_split.jsonl', 'r') as f:
    for line in f:
        json_object = json.loads(line)
        data.append(json_object)

In [4]:
questions = [item['question']['question_concept'] for item in data]

In [5]:
topics = [item['question']['question_concept'] for item in data]
print(len((topics)))
print(len(set(topics)))

9741
2151


Baseline

In [6]:
import random

def baseline_random_guessing(data):
    # List to store predicted answers
    predicted_answers = []
    
    # Iterate over each question in the dataset
    for item in data:
        # Generate a random guess (choose a random choice)
        random_guess = random.choice(item['question']['choices'])
        # Append the random guess to the list of predicted answers
        predicted_answers.append(random_guess['label'])
    
    return predicted_answers

def calculate_accuracy(predictions, true_labels):
    # Count the number of correct predictions
    num_correct = sum(1 for pred, true in zip(predictions, true_labels) if pred == true)
    # Calculate accuracy
    accuracy = num_correct / len(predictions)
    return accuracy

true_labels=[item['answerKey'] for item in data]
# Get random predictions
random_predictions = baseline_random_guessing(data)
# Calculate accuracy
accuracy = calculate_accuracy(random_predictions, true_labels)
print("Baseline (Random Guessing) Accuracy:", accuracy)


Baseline (Random Guessing) Accuracy: 0.1999794682270814


Seeking the topics 

In [7]:

unique_items_set=((set(topics)))

for item in unique_items_set:
    print(item)


cooking
possible
ventilation system
television
cleaning house
indian restaurant
microphone boom
shampoo
copulate
punish
playing games
wine list
teakettle
shoes
clouds
sheep
return to work
escape
work
cashing in
killing
need
socialising
art
taking final exams
harmonica
niece
check
vessel
shed
young
coin
heifer
movies
product
party
lover
harp
opening business
small dog
blood
disk
faith
steel cable
urinating
playing violin
island
god
clavichord
volume
potatoes
teenagers
die
jogging
reading
kingdom
running after ball
sleep
fast
noise
subway stop
fitting room
paper clip
rosebush
stand in line
fast food restaurant
blowfish
drinking
having sex
live life
watch tv
ruler
mighty
centavo
rubber
kleenex
cannon
bowl
bible
listen
talking
antibiotic
piccolo
litter
going jogging
twist
hate
eat breakfast
laser
ski
remember
have lunch
leaves
dishes
cheap
rest
billfold
greed
luggage
dentist
stick
trumpet
paperwork
helium
saucer
court
go to school
laconic
solidity
wheat
making friends
perfect
inch
knight
p

Choosing & downloading the models:

In [8]:
# Download the 'glove-wiki-gigaword-300' embeddings
glove_vectors = gensim.downloader.load('glove-wiki-gigaword-300')

# Download the 'fasttext-wiki-news-subwords-300' embeddings
fasttext_vectors = gensim.downloader.load('fasttext-wiki-news-subwords-300')

# Download the  'word2vec-google-news-300' embeddings
word2vec_vectors = gensim.downloader.load('word2vec-google-news-300')

Exploring the data with embeddings

In [9]:
def get_word_embedding(word, embedding_model):
    if embedding_model == 'glove':
        if word in glove_vectors:
            return glove_vectors[word]
        else:
            return None
    elif embedding_model == 'fasttext':
        if word in fasttext_vectors:
            return fasttext_vectors[word]
        else:
            return None
    elif embedding_model == 'word2vec':
        if word in word2vec_vectors:
            return word2vec_vectors[word]
        else:
            return None
    else:
        print("Invalid embedding model name")
        return None

'Testing stop words Existence' 

----

Results:

1. Glove & fasttext have all the stopwords

2. word2vec does not have (a,and,of,to) but have (A, And, Of, To)

In [10]:
# def test_stop_words_embedding(stop_words, embedding_model):
#     missing_embeddings = []
#     for word in stop_words:
#         embedding = get_word_embedding(word, embedding_model)
#         if embedding is None:
#             missing_embeddings.append(word)
#     return missing_embeddings

def test_stop_words_embedding(stop_words, embedding_model):
    missing_embeddings = []
    for word in stop_words:
        # Check for lowercase version
        embedding = get_word_embedding(word.lower(), embedding_model)
        if embedding is None:
            # If not found, check for uppercase version
            embedding = get_word_embedding(word.upper(), embedding_model)
            if embedding is None:
                missing_embeddings.append(word)
    return missing_embeddings

stop_words = ["a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will" , "with"]
embedding_model = ["word2vec",'fasttext','glove']  
for model in embedding_model:
    missing_embeddings = test_stop_words_embedding(stop_words, model)
    if missing_embeddings:
        print("Stop words without embeddings in", model, ":", missing_embeddings)
    else:
        print("All stop words have embeddings in", model)

All stop words have embeddings in word2vec
All stop words have embeddings in fasttext
All stop words have embeddings in glove


Discovering the words' existence in the embeddings:

In [11]:
def test_word_embedding(text_list, embedding_model):
    missing_embeddings = []
    for text in text_list:
        for word in text.split():
            # Check for lowercase version
            embedding = get_word_embedding(word.lower(), embedding_model)
            if embedding is None:
                # If not found, check for uppercase version
                word_upper_first = word[0].upper() + word[1:].lower()
                embedding = get_word_embedding(word_upper_first, embedding_model)
                if embedding is None:
                    missing_embeddings.append(word)
    return missing_embeddings

In [12]:
questions = [item['question']['stem'] for item in data]
embedding_model = ["word2vec",'fasttext','glove']  
for model in embedding_model:
    missing_embeddings = test_word_embedding(questions, model)
    if missing_embeddings:
        print("Words without embeddings in", model, ":", missing_embeddings)
    else:
        print("All words in questions have embeddings in", model)



In [13]:
def test_choices_embedding(questions, embedding_model):
    missing_embeddings = []
    for question in questions:
        for choice in question:  # Iterate over choices in the question
            for word in choice['text'].split():  # Split the choice text into words
                # Check for lowercase version
                embedding = get_word_embedding(word.lower(), embedding_model)
                if embedding is None:
                    # If not found, check for uppercase version
                    word_upper_first = word[0].upper() + word[1:].lower()
                    embedding = get_word_embedding(word_upper_first, embedding_model)
                    if embedding is None:
                        missing_embeddings.append(word)
    return missing_embeddings

In [14]:
choices = [item['question']['choices'] for item in data]
for model in embedding_model:
    missing_embeddings_ch = test_choices_embedding(choices, model)
    if missing_embeddings_ch:
        print("Words without embeddings in", model, ":", missing_embeddings_ch)
    else:
        print("All words in questions have embeddings in", model)

Words without embeddings in word2vec : ['flowers.', "calligrapher's", "neighbor's", 'i.q', 'dispare', "skyscraper's", 'thought.', 'backery', 'dreamworker', "friend's", "neighbor's", "carpenter's", 'exhiliration', 'forceless', 'bargemaster', 'multi-coloured', "friend's", "friend's", "person's", "farmer's", "friend's", "farmer's", "neighbor's", 'things.', 'conservadox', 'sandplain', 'undestroyable', 'puzzel', 'puxxle', 'puxxle', "blacksmith's", 'mnasiu', "neighbor's", 'appointements', "neighbor's", "farmer's", '911', "artist's", "carpenter's", 'resthold', "lady's", 'imprevist', 'diaphram', "friend's", 'fileing', 'cabnet', 'store.', "neighbor's", 'worrie', 'pollenate', 'day.', "penguin's", 'library.', "chemist's", "lincoln's", "friend's", 'netwok', 'bullbleep', 'disengenious', "granddad's", "brother's", "friend's", 'restaurant.', "ship's", "friend's", 'restorand', "friend's", 'driver.', 'in-between', "neighbor's", "friend's", 'vadsø', 'gathhering', 'glassess', "friend's", "neighbor's", "r

Cleaning questions & choices

In [15]:
from text_cleaner import clean_text

In [16]:

# Function to remove specific punctuation marks
def remove_specific_punctuation(text):
    punctuation_marks = "“/!.,?;\""
    for mark in punctuation_marks:
        text = text.replace(mark, "")
    return text
# Extract stems and choices
stems =  [remove_specific_punctuation(item['question']['stem']) for item in data]
choices=[remove_specific_punctuation(choice['text'])for item in data for choice in item['question']['choices']]


In [17]:
# Clean stems and choices
cleaned_stems = [clean_text(stem) for stem  in stems]
cleaned_choices = [clean_text(choice) for choice in choices]


In [18]:
embedding_model = ["word2vec",'fasttext','glove']  

for model in embedding_model:
    missing_embeddings_ch = test_word_embedding(cleaned_stems, model)
    if missing_embeddings_ch:
        print("Words without embeddings in", model, ":", missing_embeddings_ch)
    else:
        print("All words in questions have embeddings in", model)

Words without embeddings in word2vec : ['locomote', 'outoing', 'neating', 'Hobokenn', 'omnidiciplinarian', 'fantasied', 'Moraceae', 'weating', 'weating', 'ttown', 'contemn', 'weating', 'DDT', 'Sahmbi', 'Neuroepithelium', 'Meditorranean', 'Potatosol', 'weating', 'weating', 'weating', 'Yooperland', 'weating', 'feating', 'weating', 'opposingly', 'appeating', 'árbol', 'overbeating', 'DRC', 'whaat']
Words without embeddings in fasttext : ['outoing', 'neating', 'Hobokenn', 'omnidiciplinarian', 'weating', 'weating', 'ttown', 'weating', 'Sahmbi', 'Meditorranean', 'Potatosol', 'weating', 'weating', 'weating', 'Yooperland', 'weating', 'unjammed', 'feating', 'weating', 'opposingly', 'appeating', 'overbeating']
Words without embeddings in glove : ['locomote', 'outoing', 'neating', 'everyones', 'Hobokenn', 'sleighing', 'omnidiciplinarian', 'cogitating', 'fantasied', 'weating', 'weating', 'cogitating', 'debaucherous', 'ttown', 'contemn', 'weating', 'phoneless', 'apiarist', 'Sahmbi', 'heshe', 'cogita

In [19]:
for model in embedding_model:
    missing_embeddings_ch = test_word_embedding(cleaned_choices, model)
    if missing_embeddings_ch:
        print("Words without embeddings in", model, ":", missing_embeddings_ch)
    else:
        print("All words in questions have embeddings in", model)

Words without embeddings in word2vec : ['sandplain', 'undestoryable', 'ttown', 'imprevisto', 'weating', 'vadso', 'weating', 'hexachord', 'nightown', 'deliriousness', 'controvertible', 'garderobe', 'uneccentric', 'centimetre', 'nightown', 'squarial', 'straightforth', 'country’s', 'sitfast', 'folkest', 'comfortness', 'toothrow', 'swamplife', 'cobbed', 'slopewash', 'fotograph', 'transmutate', 'twerk', 'asomatous', 'icbm', 'choit', 'potence', 'deepfelt', 'evercookie', 'evercookie', 'alpenstock', 'waterpoint', 'stanine', 'autolock', 'mailcatcher', 'potence', 'thundershock', 'tryhard', 'undivine', 'schlitztown', 'teetotaller', 'topfull', 'straightforth', 'straightforth', 'vacationgoer', 'racecation', 'azawakh', 'texmex', 'texaphyrin', 'quarterlight', 'sayhi', 'electrobus', 'handglide', 'webvancom', 'quarterlight', 'contemn', 'turkeycock', 'stanine', 'datahub', 'potence', 'aforewritten', 'datahub', 'nonpositive', 'aforewritten', 'choit', 'southernwort', 'indeterminism', 'longplay', 'contemn',

Obtaining Vectors

In [20]:
# Function to get word embeddings with handling missing words
def get_word_embedding_with_handling_missing_words(word, embedding_model):
    if word in embedding_model:
        return embedding_model[word]
    else:
        # Try lowercase
        lower_word = word.lower()
        if lower_word in embedding_model:
            return embedding_model[lower_word]
        
        # Try uppercase
        upper_word = word.upper()
        if upper_word in embedding_model:
            return embedding_model[upper_word]
        
        # Try finding a similar synonym
        try:
            synonyms = embedding_model.most_similar(word)
            for synonym, _ in synonyms:
                if synonym in embedding_model:
                    return embedding_model[synonym]
        except KeyError:
            pass
        
        # If no embedding found, return zero vector
        return np.zeros(embedding_model.vector_size)


In [21]:
# Represent questions using word embeddings
def represent_questions(data, cleaned_stems, cleaned_choices, embedding_model):
    representations = []
    answers = []
    for i, item in enumerate(data):
        stem = cleaned_stems[i]
        choices = [cleaned_choices[j] for j in range(i * 5, (i + 1) * 5)]
        question_representation = []
        for word in stem.split():
            embedding = get_word_embedding_with_handling_missing_words(word, embedding_model)
            if embedding is not None:
                question_representation.append(embedding)
        for choice in choices:
            for word in choice.split():
                embedding = get_word_embedding_with_handling_missing_words(word, embedding_model)
                if embedding is not None:
                    question_representation.append(embedding)
        if len(question_representation) > 0:
            representations.append(np.mean(question_representation, axis=0))
            answers.append(item['answerKey'])
    return np.array(representations), np.array(answers)


In [22]:

# Split data into train and test sets
X_g, y_g = represent_questions(data, cleaned_stems, cleaned_choices, glove_vectors)

X_train_g, X_test_g, y_train_g, y_test_g = train_test_split(X_g, y_g, test_size=0.2, random_state=0)

# Train logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_g, y_train_g)

# Evaluate model
y_pred_g = clf.predict(X_test_g)
accuracy = accuracy_score(y_test_g, y_pred_g)
print("Glove Accuracy:", accuracy)


# Split data into train and test sets
X_w, y_w = represent_questions(data, cleaned_stems, cleaned_choices, word2vec_vectors)

X_train_w, X_test_w, y_train_w, y_test_w = train_test_split(X_w, y_w, test_size=0.2, random_state=0)

# Train logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_w, y_train_w)

# Evaluate model
y_pred_w = clf.predict(X_test_w)
accuracy = accuracy_score(y_test_w, y_pred_w)
print("word2vec Accuracy:", accuracy)



# Split data into train and test sets
X_f, y_f = represent_questions(data, cleaned_stems, cleaned_choices, fasttext_vectors)

X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(X_f, y_f, test_size=0.2, random_state=0)

# Train logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_f, y_train_f)

# Evaluate model
y_pred_f = clf.predict(X_test_f)
accuracy = accuracy_score(y_test_f, y_pred_f)
print("fasttext Accuracy:", accuracy)



Glove Accuracy: 0.2180605438686506
word2vec Accuracy: 0.20266803488968702
fasttext Accuracy: 0.19189327860441252


---

N-grams

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

# Represent questions using word embeddings and n-grams
def represent_questions_with_ngrams(data, cleaned_stems, cleaned_choices, embedding_model, ngram_range=(1, 1)):
    representations = []
    answers = []
    for i, item in enumerate(data):
        stem = cleaned_stems[i]
        choices = [cleaned_choices[j] for j in range(i * 5, (i + 1) * 5)]
        
        # Combine stem and choices into text for vectorization
        text = stem + ' '.join(choices)
        
        # Vectorize text using n-grams
        vectorizer = CountVectorizer(ngram_range=ngram_range)
        X = vectorizer.fit_transform([text])
        vocab = vectorizer.get_feature_names_out()
        
        # Create representations using word embeddings
        question_representation = []
        for word in vocab:
            embedding = get_word_embedding_with_handling_missing_words(word, embedding_model)
            if embedding is not None:
                question_representation.append(embedding)
        
        if len(question_representation) > 0:
            question_representation = np.mean(question_representation, axis=0)
            representations.append(question_representation)
            answers.append(item['answerKey'])
    return representations, answers

Uni + Bi

In [24]:
# Call represent_questions_with_ngrams function
X, y = represent_questions_with_ngrams(data, cleaned_stems, cleaned_choices, glove_vectors, ngram_range=(1, 2))

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Evaluate model
accuracy = clf.score(X_test, y_test)
print("GloVe Accuracy:", accuracy)



#Split data into train and test sets
X_w, y_w =represent_questions_with_ngrams(data, cleaned_stems, cleaned_choices, word2vec_vectors, ngram_range=(1, 2))

X_train_w, X_test_w, y_train_w, y_test_w = train_test_split(X_w, y_w, test_size=0.2,random_state=0)

# Train logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_w, y_train_w)

# Evaluate model
y_pred_w = clf.predict(X_test_w)
accuracy = accuracy_score(y_test_w, y_pred_w)
print("word2vec Accuracy:", accuracy)


# Split data into train and test sets
X_f, y_f = represent_questions_with_ngrams(data, cleaned_stems, cleaned_choices, fasttext_vectors, ngram_range=(1, 2))

X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(X_f, y_f, test_size=0.2, random_state=0)

# Train logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_f, y_train_f)

# Evaluate model
y_pred_f = clf.predict(X_test_f)
accuracy = accuracy_score(y_test_f, y_pred_f)
print("fasttext Accuracy:", accuracy)


GloVe Accuracy: 0.22267829656233967
word2vec Accuracy: 0.1944586967675731
fasttext Accuracy: 0.19189327860441252


Uni + Bi + Tri

In [25]:


# Call represent_questions_with_ngrams function
X, y = represent_questions_with_ngrams(data, cleaned_stems, cleaned_choices, glove_vectors, ngram_range=(1, 3))

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Evaluate model
accuracy = clf.score(X_test, y_test)
print("GloVe Accuracy:", accuracy)


# Split data into train and test sets
X_w, y_w =represent_questions_with_ngrams(data, cleaned_stems, cleaned_choices, word2vec_vectors, ngram_range=(1, 3))

X_train_w, X_test_w, y_train_w, y_test_w = train_test_split(X_w, y_w, test_size=0.2,random_state=0)

# Train logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_w, y_train_w)

# Evaluate model
y_pred_w = clf.predict(X_test_w)
accuracy = accuracy_score(y_test_w, y_pred_w)
print("word2vec Accuracy:", accuracy)


# Split data into train and test sets
X_f, y_f = represent_questions_with_ngrams(data, cleaned_stems, cleaned_choices, fasttext_vectors, ngram_range=(1, 3))

X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(X_f, y_f, test_size=0.2, random_state=0)

# Train logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_f, y_train_f)

# Evaluate model
y_pred_f = clf.predict(X_test_f)
accuracy = accuracy_score(y_test_f, y_pred_f)
print("fasttext Accuracy:", accuracy)


GloVe Accuracy: 0.21600820933812212
word2vec Accuracy: 0.20112878399179066
fasttext Accuracy: 0.19035402770651616


Uni + Bi + Tri + Quad

In [31]:


# Call represent_questions_with_ngrams function
X, y = represent_questions_with_ngrams(data, cleaned_stems, cleaned_choices, glove_vectors, ngram_range=(1, 4))

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Evaluate model
accuracy = clf.score(X_test, y_test)
print("GloVe Accuracy:", accuracy)


# Split data into train and test sets
X_w, y_w =represent_questions_with_ngrams(data, cleaned_stems, cleaned_choices, word2vec_vectors, ngram_range=(1, 4))

X_train_w, X_test_w, y_train_w, y_test_w = train_test_split(X_w, y_w, test_size=0.2,random_state=0)

# Train logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_w, y_train_w)

# Evaluate model
y_pred_w = clf.predict(X_test_w)
accuracy = accuracy_score(y_test_w, y_pred_w)
print("word2vec Accuracy:", accuracy)


# Split data into train and test sets
X_f, y_f = represent_questions_with_ngrams(data, cleaned_stems, cleaned_choices, fasttext_vectors, ngram_range=(1, 4))

X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(X_f, y_f, test_size=0.2, random_state=0)

# Train logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_f, y_train_f)

# Evaluate model
y_pred_f = clf.predict(X_test_f)
accuracy = accuracy_score(y_test_f, y_pred_f)
print("fasttext Accuracy:", accuracy)


GloVe Accuracy: 0.21600820933812212
word2vec Accuracy: 0.2072857875833761
fasttext Accuracy: 0.18624935864545922


---

Exploring gensim more ...

In [27]:
# access the 'word2vec-google-news-300' embeddings as KeyedVectors
model = word2vec_vectors

# Get the list of words
words = model.index_to_key

# Print a sample of words
sample_size = 100
print("Sample Words:")
for word in words[:sample_size]:
    print(word)

Sample Words:
</s>
in
for
that
is
on
##
The
with
said
was
the
at
not
as
it
be
from
by
are
I
have
he
will
has
####
his
an
this
or
their
who
they
but
$
had
year
were
we
more
###
up
been
you
its
one
about
would
which
out
can
It
all
also
two
after
first
He
do
time
than
when
We
over
last
new
other
her
people
into
In
our
there
A
she
could
just
years
some
U.S.
three
million
them
what
But
so
no
like
if
only
percent
get
did
him
game
back
because
now
#.#
before


In [28]:
# Get the list of words
words = word2vec_vectors.index_to_key

# Print the word vectors for a sample of words
sample_size = 10  
print("Word Vectors:")
for word in words[:sample_size]:
    print(f"{word}: {word2vec_vectors[word]}")

Word Vectors:
</s>: [ 1.1291504e-03 -8.9645386e-04  3.1852722e-04  1.5335083e-03
  1.1062622e-03 -1.4038086e-03 -3.0517578e-05 -4.1961670e-04
 -5.7601929e-04  1.0757446e-03 -1.0223389e-03 -6.1798096e-04
 -7.5531006e-04  1.4038086e-03 -1.6403198e-03 -6.3323975e-04
  1.6326904e-03 -1.0070801e-03 -1.2664795e-03  6.5231323e-04
 -4.1580200e-04 -1.0757446e-03  1.5258789e-03 -2.7465820e-04
  1.4019012e-04  1.5716553e-03  1.3580322e-03 -8.3160400e-04
 -1.4038086e-03  1.5792847e-03  2.5367737e-04 -7.3242188e-04
 -1.0538101e-04 -1.1672974e-03  1.5792847e-03  6.5612793e-04
 -6.5994263e-04  2.9206276e-06  1.1291504e-03  4.2724609e-04
 -3.7002563e-04 -1.1520386e-03  1.2664795e-03 -3.5166740e-06
  2.6512146e-04 -4.0245056e-04  1.4114380e-04 -3.3617020e-05
  7.5912476e-04 -5.1879883e-04 -7.1048737e-05  6.0272217e-04
 -5.0735474e-04 -1.6250610e-03 -4.3678284e-04 -9.9182129e-04
 -1.2207031e-03 -3.2234192e-04  6.8664551e-05 -1.1672974e-03
 -5.1116943e-04  1.4114380e-03  3.3569336e-04 -4.7492981e-04
 -1.

In [29]:
word = "student"
similar_words = model.most_similar(word)
print(f"Words similar to '{word}': {similar_words}")


Words similar to 'student': [('students', 0.7294867038726807), ('Student', 0.6706662774085999), ('teacher', 0.6301366090774536), ('stu_dent', 0.6240993142127991), ('faculty', 0.6087332963943481), ('school', 0.6055627465248108), ('undergraduate', 0.6020305752754211), ('university', 0.600540041923523), ('undergraduates', 0.5755698680877686), ('semester', 0.573759913444519)]


In [30]:
word1 = "car"
word2 = "vehicle"
similarity = model.similarity(word1, word2)
print(f"Similarity between '{word1}' and '{word2}': {similarity}")


Similarity between 'car' and 'vehicle': 0.7821096777915955
