In [11]:
import numpy
import scipy
import gensim
import pandas as pd
from gensim.utils import simple_preprocess
import gensim.models.word2vec as w2v

# Reaching this line means every import above completed without raising.
print("All libraries loaded successfully!")


All libraries loaded successfully!


In [29]:
# Read the IMDB review dataset from a UTF-8 encoded CSV and preview its first rows.
df = pd.read_csv(
    "imdb.csv",
    encoding='utf-8',
)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [14]:
# Tokenise every review with gensim's simple_preprocess and keep the token
# lists in a new 'processed_text' column. The function is passed directly;
# wrapping it in a lambda adds nothing.
df['processed_text'] = df['review'].apply(simple_preprocess)
df.head()

Unnamed: 0,review,sentiment,processed_text
0,One of the other reviewers has mentioned that ...,positive,"[one, of, the, other, reviewers, has, mentione..."
1,A wonderful little production. <br /><br />The...,positive,"[wonderful, little, production, br, br, the, f..."
2,I thought this was a wonderful way to spend ti...,positive,"[thought, this, was, wonderful, way, to, spend..."
3,Basically there's a family where a little boy ...,negative,"[basically, there, family, where, little, boy,..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[petter, mattei, love, in, the, time, of, mone..."


In [15]:
# Baseline Word2Vec model that the later variants are compared against:
# 100-dimensional vectors, context window of 5, min_count of 5,
# trained with 4 worker threads.
model_default = w2v.Word2Vec(
    sentences=df['processed_text'],
    vector_size=100,
    window=5,
    min_count=5,
    workers=4,
)

In [17]:
# Similarity between selected word pairs in the baseline model.
# The pairs are evaluated in the listed order and unpacked into one name each.
similarity_good_bad, similarity_love_hate, similarity_action_drama = (
    model_default.wv.similarity(first, second)
    for first, second in [('good', 'bad'), ('love', 'hate'), ('action', 'drama')]
)

# One line per pair, in the same order as above.
print(
    f"Similarity between 'good' and 'bad': {similarity_good_bad}",
    f"Similarity between 'love' and 'hate': {similarity_love_hate}",
    f"Similarity between 'action' and 'drama': {similarity_action_drama}",
    sep="\n",
)


Similarity between 'good' and 'bad': 0.7932754755020142
Similarity between 'love' and 'hate': 0.5891191959381104
Similarity between 'action' and 'drama': 0.4825916588306427


In [20]:
# Nearest neighbours of a few probe words in the baseline model.
#
# Results are keyed by the word that was actually queried. The previous
# version stored the 'actor' and 'flop' results in variables named
# similar_government / similar_technology, which did not match their contents.
probe_words = ['actor', 'business', 'flop']

# Run every lookup before printing anything, so a word missing from the
# vocabulary fails the cell before partial output is produced.
similar_by_word = {
    probe: model_default.wv.most_similar(probe) for probe in probe_words
}

for position, word in enumerate(probe_words):
    heading = f"Most similar words to '{word}':"
    # A blank line separates the entries, but none precedes the first one.
    print(heading if position == 0 else "\n" + heading)
    print(similar_by_word[word])


Most similar words to 'actor':
[('actress', 0.7281460762023926), ('performer', 0.6677981615066528), ('comedian', 0.6600736379623413), ('role', 0.6250587105751038), ('performance', 0.6173135042190552), ('artist', 0.5689354538917542), ('villain', 0.5622029900550842), ('actors', 0.5360922813415527), ('talent', 0.5237036943435669), ('cast', 0.5188909769058228)]

Most similar words to 'business':
[('plans', 0.6111611127853394), ('practice', 0.5979730486869812), ('gambling', 0.5899010300636292), ('property', 0.5881303548812866), ('cattle', 0.5774372816085815), ('estate', 0.5543216466903687), ('debt', 0.551666796207428), ('medicine', 0.53650963306427), ('organization', 0.5364367961883545), ('protection', 0.5353608131408691)]

Most similar words to 'flop':
[('dud', 0.6641660332679749), ('turkey', 0.6538041830062866), ('mistake', 0.6060372591018677), ('disaster', 0.5967321395874023), ('disgrace', 0.579796552658081), ('disappointment', 0.5756376385688782), ('travesty', 0.5734865069389343), ('let

In [21]:
# Train three variants that differ only in (window, min_count); vector size
# and worker count stay fixed. Models are trained in the listed order:
#   model_var1 -> window=3,  min_count=3
#   model_var2 -> window=7,  min_count=2
#   model_var3 -> window=10, min_count=1
model_var1, model_var2, model_var3 = (
    w2v.Word2Vec(
        sentences=df['processed_text'],
        vector_size=100,
        window=window_size,
        min_count=minimum_count,
        workers=4,
    )
    for window_size, minimum_count in [(3, 3), (7, 2), (10, 1)]
)


In [27]:
# Similarity of each probe word to 'horror' under the three model variants.

words = ['love', 'comedy', 'money']

# One list per model, aligned position-by-position with `words`.
similarities_var1 = [model_var1.wv.similarity(probe, 'horror') for probe in words]
similarities_var2 = [model_var2.wv.similarity(probe, 'horror') for probe in words]
similarities_var3 = [model_var3.wv.similarity(probe, 'horror') for probe in words]

# Report the three scores for each word, with a blank line before each group.
for word, score_var1, score_var2, score_var3 in zip(
    words, similarities_var1, similarities_var2, similarities_var3
):
    print(f"\nSimilarity between '{word}' and 'horror' for model_var1: {score_var1}")
    print(f"Similarity between '{word}' and 'horror' for model_var2: {score_var2}")
    print(f"Similarity between '{word}' and 'horror' for model_var3: {score_var3}")



Similarity between 'love' and 'horror' for model_var1: 0.24042664468288422
Similarity between 'love' and 'horror' for model_var2: 0.12078578025102615
Similarity between 'love' and 'horror' for model_var3: 0.0757390633225441

Similarity between 'comedy' and 'horror' for model_var1: 0.5023741126060486
Similarity between 'comedy' and 'horror' for model_var2: 0.3636772632598877
Similarity between 'comedy' and 'horror' for model_var3: 0.2753165066242218

Similarity between 'money' and 'horror' for model_var1: 0.10779979079961777
Similarity between 'money' and 'horror' for model_var2: 0.08055546879768372
Similarity between 'money' and 'horror' for model_var3: 0.018825985491275787


In [28]:
# Nearest neighbours of each probe word under the three model variants.

words = ['love', 'comedy', 'money']

# One list per model, aligned position-by-position with `words`.
most_similar_var1 = [model_var1.wv.most_similar(probe) for probe in words]
most_similar_var2 = [model_var2.wv.most_similar(probe) for probe in words]
most_similar_var3 = [model_var3.wv.most_similar(probe) for probe in words]

# For every word, show the neighbour list from each model in turn.
for word, neighbours_var1, neighbours_var2, neighbours_var3 in zip(
    words, most_similar_var1, most_similar_var2, most_similar_var3
):
    print(f"\nMost similar words to '{word}' for model_var1:")
    print(neighbours_var1)
    print(f"\nMost similar words to '{word}' for model_var2:")
    print(neighbours_var2)
    print(f"\nMost similar words to '{word}' for model_var3:")
    print(neighbours_var3)



Most similar words to 'love' for model_var1:
[('hate', 0.5944747924804688), ('enjoy', 0.5409210920333862), ('loved', 0.5307765007019043), ('romance', 0.5108920931816101), ('dislike', 0.4863249361515045), ('romantic', 0.472919762134552), ('asleep', 0.4710962176322937), ('friendship', 0.45817291736602783), ('loves', 0.4544978141784668), ('passion', 0.45173168182373047)]

Most similar words to 'love' for model_var2:
[('hate', 0.5829949975013733), ('romance', 0.5433490872383118), ('loved', 0.5007721185684204), ('friendship', 0.5001460313796997), ('romantic', 0.4934869706630707), ('enjoy', 0.47780007123947144), ('loves', 0.47187551856040955), ('bermuda', 0.4674067199230194), ('passion', 0.4581300914287567), ('adore', 0.4392508268356323)]

Most similar words to 'love' for model_var3:
[('hate', 0.5727032423019409), ('romance', 0.5708507895469666), ('romantic', 0.5518801212310791), ('friendship', 0.5400466322898865), ('loved', 0.5146439671516418), ('passion', 0.496574729681015), ('loves', 0.4

**Result Discussion**
---


*   A **smaller window size**, as in model_var1 (window=3), captures more local context, i.e. specific relationships between words that appear close to each other. For example, the similarity between "love" and "hate" is highest in this model, because it focuses on words that are directly related within a limited context and that are frequently used together.

*   A **larger window size**, as in model_var3 (window=10), broadens the context and captures more general relationships. However, a larger window can also associate words that are not actually related but merely tend to appear in the same sentences.


*   **Higher min_count** values, as in model_var1 (min_count=3, the highest of the three variants), remove infrequent words, leaving behind words that occur more often. This tends to result in stronger, more reliable word embeddings that better reflect common relationships. In this case, frequent words like "money" are more likely to have stronger and more consistent relationships with other common words.

*   **Lower min_count** values, as in model_var3 (min_count=1, which keeps every word), include rare words that might not have enough context to form meaningful embeddings and, as a result, can lead to less precise relationships. For example, rare words in movie reviews might have weak or inconsistent similarities.




