In [1]:
import gensim.downloader as api
import gensim
import numpy as np

In [4]:
# Prefer the locally cached KeyedVectors file. Only when it is missing do we
# download the large pretrained model through gensim-data, and we then save it
# under the file name that the local-load cell below expects, so later runs can
# memory-map it from disk instead of downloading again.
try:
    w_pretrained = gensim.models.KeyedVectors.load("word2vec-google-news-300.kv", mmap='r')
except FileNotFoundError:
    w_pretrained = api.load("word2vec-google-news-300")
    w_pretrained.save("word2vec-google-news-300.kv")



In [2]:
# Load the saved KeyedVectors from disk; mmap="r" maps the vector array
# read-only instead of reading it fully into memory.
w_pretrained = gensim.models.KeyedVectors.load(
    "word2vec-google-news-300.kv",
    mmap="r",
)

In [3]:
# Nearest neighbours of a combined query: the five words in `positive`
# contribute positively and 'professor' contributes negatively.
w_pretrained.most_similar(
    positive=["computer", "university", "beautiful", "fast", "car"],
    negative=["professor"],
)

[('laptop', 0.4873006343841553),
 ('slingshot_dragster', 0.47504204511642456),
 ('LH_Yeah', 0.4735804498195648),
 ('KNYAF', 0.466823935508728),
 ('bike', 0.4662119746208191),
 ('Fabio_complimented', 0.46549883484840393),
 ('cars', 0.4605005085468292),
 ('Audi_S4_Cabriolet', 0.45188385248184204),
 ('GREG_BIFFLE_Yeah', 0.45104363560676575),
 ('Dan_Schieler_Senior', 0.45039424300193787)]

In [4]:
# Top-5 nearest neighbours for each probe word.
words = ["computer", "university", "beautiful", "fast", "car"]
similar_words = {
    term: w_pretrained.most_similar(term, topn=5)
    for term in words
}

# Perform analogy tests: each (a, b, c) triple is evaluated as a - b + c
analogy_tests = [
    ("king", "man", "woman"),  # Expected to be close to 'queen'
    ("paris", "france", "germany"),  # Expected to be close to 'berlin'
    ("apple", "fruit", "vegetable"),  # Expected to be close to 'carrot' or similar
]

# Keep only the single best match per analogy, keyed by a readable label.
analogy_results = {
    f"{a} - {b} + {c}": w_pretrained.most_similar(positive=[a, c], negative=[b], topn=1)
    for a, b, c in analogy_tests
}

In [5]:
# Show the best match found for each analogy query.
analogy_results

{'king - man + woman': [('queen', 0.7118193507194519)],
 'paris - france + germany': [('berlin', 0.48413652181625366)],
 'apple - fruit + vegetable': [('potato', 0.5865277647972107)]}

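The cells below show that pretrained embeddings should not be relied on blindly: swapping `germany` for `india` in the capital-city analogy returns `chennai`, which is not the capital.

One possible explanation is that the pretrained vectors have not captured the context of these words well.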

In [6]:
# Top-5 nearest neighbours for each probe word (same probes as before).
words = ["computer", "university", "beautiful", "fast", "car"]
similar_words = {word: w_pretrained.most_similar(word, topn=5) for word in words}

# Perform analogy tests: each (w1, w2, w3) triple is evaluated as w1 - w2 + w3.
# Only the second test differs from the earlier run: 'germany' -> 'india'.
# NOTE(review): all query words are lowercase; if this vocabulary is
# case-sensitive, 'Paris'/'France'/'India' may behave differently -- confirm.
analogy_tests = [
    ("king", "man", "woman"),  # Expected to be close to 'queen'
    ("paris", "france", "india"),  # Expected to be close to 'delhi' (the capital)
    ("apple", "fruit", "vegetable"),  # Expected to be close to 'carrot' or similar
]

# Keep only the single best match per analogy, keyed by a readable label.
analogy_results = {f"{w1} - {w2} + {w3}": w_pretrained.most_similar(positive=[w1, w3], negative=[w2], topn=1) for w1, w2, w3 in analogy_tests}

In [7]:
# Show the best match for each analogy after swapping 'germany' for 'india'.
analogy_results

{'king - man + woman': [('queen', 0.7118193507194519)],
 'paris - france + india': [('chennai', 0.5442505478858948)],
 'apple - fruit + vegetable': [('potato', 0.5865277647972107)]}

In [8]:
import pandas as pd

# Load the dataset
# NOTE(review): absolute, machine-specific path -- consider a configurable
# data directory so the notebook runs on other machines.
df = pd.read_csv("C:\\SNLP\\IMDB_Dataset.csv")

# Basic EDA
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only adds a stray "None" line.
df.info()
print(df.head())
# Class balance of the target column
print(df['sentiment'].value_counts())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB
None
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
sentiment
positive    25000
negative    25000
Name: count, dtype: int64


In [9]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK resources used below: the stopword lists and the
# tokenizer models behind word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer models from 'punkt_tab' instead of
# the pickled 'punkt' package; fetching both keeps word_tokenize working on
# older and newer installations.
nltk.download('punkt_tab')

# A set gives O(1) membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Turn a raw review string into a list of lowercase, stopword-free tokens.

    Steps: replace HTML tags with a space, replace every character that is
    not an ASCII letter with a space, tokenize with ``word_tokenize``,
    lowercase each token, and drop tokens found in the module-level
    ``stop_words`` set.

    Args:
        text: Raw review text (``str``).

    Returns:
        list[str]: The remaining lowercase tokens, in their original order.
    """
    # Swap tags for a space rather than deleting them; otherwise the words on
    # either side of a tag such as "<br />" are fused into a single token.
    text = re.sub(r'<.*?>', ' ', text)
    # Keep ASCII letters only; digits and punctuation become spaces.
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Lowercase each token once, then filter against the stopword set.
    lowered = (token.lower() for token in word_tokenize(text))
    return [token for token in lowered if token not in stop_words]

# Clean and tokenize every review, storing the token lists in a new column
# next to the raw text.
df['cleaned_review'] = df['review'].map(clean_text)
print(df.head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\akash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\akash\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                              review sentiment  \
0  One of the other reviewers has mentioned that ...  positive   
1  A wonderful little production. <br /><br />The...  positive   
2  I thought this was a wonderful way to spend ti...  positive   
3  Basically there's a family where a little boy ...  negative   
4  Petter Mattei's "Love in the Time of Money" is...  positive   

                                      cleaned_review  
0  [one, reviewers, mentioned, watching, oz, epis...  
1  [wonderful, little, production, filming, techn...  
2  [thought, wonderful, way, spend, time, hot, su...  
3  [basically, family, little, boy, jake, thinks,...  
4  [petter, mattei, love, time, money, visually, ...  


In [10]:
from gensim.models import Word2Vec

# Training corpus: one token list per review
tokenized_reviews = list(df['cleaned_review'])

# Skip-gram variant (sg=1)
skipgram_model = Word2Vec(
    sentences=tokenized_reviews,
    vector_size=100,
    window=5,
    sg=1,
    min_count=2,
)

# CBoW variant (sg=0), otherwise trained with the same settings
cbow_model = Word2Vec(
    sentences=tokenized_reviews,
    vector_size=100,
    window=5,
    sg=0,
    min_count=2,
)

In [11]:
def get_review_vector(model, tokens):
    """Average the embeddings of the tokens that ``model`` knows.

    Args:
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` lookup by
            word) and ``vector_size``.
        tokens: Iterable of word strings for one review.

    Returns:
        The element-wise mean of the vectors of all in-vocabulary tokens, or
        a zero vector of length ``model.vector_size`` when none are known.
    """
    known = [model.wv[token] for token in tokens if token in model.wv]
    # Guard clause: nothing to average, fall back to an all-zero vector.
    if not known:
        return np.zeros(model.vector_size)
    return np.mean(known, axis=0)

# One averaged embedding per review, computed with each trained model.
df['skipgram_vectors'] = df['cleaned_review'].apply(
    lambda tokens: get_review_vector(skipgram_model, tokens)
)
df['cbow_vectors'] = df['cleaned_review'].apply(
    lambda tokens: get_review_vector(cbow_model, tokens)
)


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# Prepare the data: stack the per-review vectors into 2-D feature matrices
X_skipgram = np.array(df['skipgram_vectors'].tolist())
X_cbow = np.array(df['cbow_vectors'].tolist())
# Binary labels: 1 for 'positive', 0 for anything else (vectorized comparison
# instead of a per-row Python lambda)
y = (df['sentiment'] == 'positive').astype('int64')

# Split both feature matrices and the labels in a single call, so all three
# are partitioned by the same row indices. Two separate calls only stay
# aligned with y_train / y_test as long as their random_state values match.
X_train_sg, X_test_sg, X_train_cb, X_test_cb, y_train, y_test = train_test_split(
    X_skipgram, X_cbow, y, test_size=0.2, random_state=42
)

# Train and evaluate models
# Maps a display name to the (train, test) feature matrices for that embedding.
models = {
    'Skip-gram': (X_train_sg, X_test_sg),
    'CBoW': (X_train_cb, X_test_cb)
}

# Fit one logistic-regression classifier per embedding type and score it on
# the shared held-out labels.
results = []
for model_name, (X_train, X_test) in models.items():
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results.append((model_name, accuracy, f1))

# Display the results
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'F1 Score'])
print(results_df)


       Model  Accuracy  F1 Score
0  Skip-gram    0.8768  0.878044
1       CBoW    0.8644  0.866509


In [13]:
# Display the results again (the same `results_df` built in the previous cell)
print(results_df)


       Model  Accuracy  F1 Score
0  Skip-gram    0.8768  0.878044
1       CBoW    0.8644  0.866509
