In [1]:
import nltk
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

In [2]:
import gensim.downloader as api

# Load the pretrained word2vec-google-news-300 model
model = api.load("word2vec-google-news-300")

In [3]:
# Pick 5 words and find similar words
words = ["music", "movie", "computer", "food", "travel"]

for word in words:
    similar_words = model.most_similar(word, topn=10)
    print(f"Top 10 words similar to '{word}':")
    for similar_word, similarity in similar_words:
        print(f"{similar_word}: {similarity}")
    print("\n")

# Test analogies
analogies = [
    ("king", "man", "woman"),
    ("paris", "france", "italy"),
    ("apple", "fruit", "vegetable")
]

for word1, word2, word3 in analogies:
    result = model.most_similar(positive=[word1, word3], negative=[word2])
    print(f"Analogy: {word1} - {word2} + {word3} = {result[0][0]} (similarity: {result[0][1]})")


Top 10 words similar to 'music':
classical_music: 0.7197794318199158
jazz: 0.683463990688324
Music: 0.6595721244812012
Without_Donny_Kirshner: 0.6416222453117371
songs: 0.6396344900131226
musicians: 0.6336299180984497
tunes: 0.6330114603042603
musical: 0.6186030507087708
Logue_typed: 0.6150090098381042
musics: 0.6148059368133545


Top 10 words similar to 'movie':
film: 0.8676770329475403
movies: 0.8013108372688293
films: 0.7363012433052063
moive: 0.683036208152771
Movie: 0.6693680882453918
horror_flick: 0.6577848792076111
sequel: 0.6577792763710022
Guy_Ritchie_Revolver: 0.6509752869606018
romantic_comedy: 0.6413198709487915
flick: 0.6321909427642822


Top 10 words similar to 'computer':
computers: 0.7979379296302795
laptop: 0.6640493273735046
laptop_computer: 0.6548868417739868
Computer: 0.647333562374115
com_puter: 0.6082080006599426
technician_Leonard_Luchko: 0.566274881362915
mainframes_minicomputers: 0.5617720484733582
laptop_computers: 0.5585449934005737
PC: 0.5539618730545044
mak

# IMDB

In [2]:
df = pd.read_csv("IMDB Dataset.csv")

In [3]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [9]:
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = BeautifulSoup(text, "html.parser").get_text()  # Remove HTML tags
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
    text = re.sub(r'\S+@\S+', '', text)  # Remove email addresses
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.lower()  # Convert to lowercase
    words = text.split()
    words = [word for word in words if word not in stop_words]  # Remove stop words
    words = [lemmatizer.lemmatize(word) for word in words]  # Lemmatize
    return ' '.join(words)

df['cleaned_reviews'] = df['review'].apply(preprocess_text)

# Display the cleaned reviews
print(df.head())


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aryan_zingade/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aryan_zingade/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/aryan_zingade/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


                                              review sentiment  \
0  One of the other reviewers has mentioned that ...  positive   
1  A wonderful little production. <br /><br />The...  positive   
2  I thought this was a wonderful way to spend ti...  positive   
3  Basically there's a family where a little boy ...  negative   
4  Petter Mattei's "Love in the Time of Money" is...  positive   

                                     cleaned_reviews  
0  one reviewer mentioned watching oz episode you...  
1  wonderful little production filming technique ...  
2  thought wonderful way spend time hot summer we...  
3  basically there family little boy jake think t...  
4  petter matteis love time money visually stunni...  


### CBOW

In [10]:
sentences = [review.split() for review in df['cleaned_reviews']]
cbow_model =Word2Vec(sentences, vector_size=100, window=5, workers=4, epochs=10, min_count=5, sg=0)
cbow_model.save("cbow_word2vec.model")

In [11]:
sgram_model =Word2Vec(sentences, vector_size=100, window=5, workers=4, epochs=10, min_count=5, sg=1)
sgram_model.save("sgram_word2vec.model")

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

def get_vector(rev, model, vector_size):
    vector = np.zeros(vector_size)
    count = 0
    for word in rev.split():
        if word in model.wv:
            vector += model.wv[word]
            count += 1
    if count != 0:
        vector /= count
    return vector

# Generate feature vectors using different models
vector_size = 100  # Update based on model's vector size
df['skipgram_vector'] = df['cleaned_reviews'].apply(lambda x: get_vector(x, sgram_model, vector_size))
df['cbow_vector'] = df['cleaned_reviews'].apply(lambda x: get_vector(x, cbow_model, vector_size))

# Convert vectors to lists for training
X_skipgram = np.vstack(df['skipgram_vector'].to_numpy())
X_cbow = np.vstack(df['cbow_vector'].to_numpy())

y = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

# Split the data
X_train_sg, X_test_sg, y_train_sg, y_test_sg = train_test_split(X_skipgram, y, test_size=0.2, random_state=42)
X_train_cb, X_test_cb, y_train_cb, y_test_cb = train_test_split(X_cbow, y, test_size=0.2, random_state=42)

# Train and evaluate the model for Skip-gram vectors
model_sg = LogisticRegression()
model_sg.fit(X_train_sg, y_train_sg)
y_pred_sg = model_sg.predict(X_test_sg)
print("Skip-gram Model Performance:")
print(classification_report(y_test_sg, y_pred_sg))

# Train and evaluate the model for CBoW vectors
model_cb = LogisticRegression()
model_cb.fit(X_train_cb, y_train_cb)
y_pred_cb = model_cb.predict(X_test_cb)
print("CBoW Model Performance:")
print(classification_report(y_test_cb, y_pred_cb))



Skip-gram Model Performance:
              precision    recall  f1-score   support

           0       0.88      0.87      0.87      4961
           1       0.87      0.88      0.87      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000

CBoW Model Performance:
              precision    recall  f1-score   support

           0       0.87      0.86      0.86      4961
           1       0.86      0.87      0.87      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000

