In [None]:
from gensim.models import Word2Vec
import numpy as np
import re
import kagglehub
from kagglehub import KaggleDatasetAdapter
from sklearn.model_selection import train_test_split

In [None]:
# ----------------------
# Dataset
# ----------------------

df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "lakshmi25npathi/imdb-dataset-of-50k-movie-reviews",
  "IMDB Dataset.csv",
  pandas_kwargs={"encoding": "latin-1"}
  # Provide any additional arguments like
  # sql_query or pandas_kwargs. See the
  # documenation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)
review = df["review"].to_numpy()
rating = df["sentiment"].map({"positive": 1, "negative": 0}).to_numpy()

sentences = [
    ("The movie was amazing and full of heart", 1),
    ("A boring plot with terrible acting", 0),
    ("I loved the characters but hated the ending", 0),
    ("The film was not good at all", 0),
    ("Surprisingly fun and well written", 1),
    ("I expected more it was disappointing", 0),
    ("Absolutely fantastic experience", 1),
    ("The story was dull and predictable", 0)
]

sentences += list(zip(review, rating))
print(len(sentences))

  df = kagglehub.load_dataset(


Using Colab cache for faster access to the 'imdb-dataset-of-50k-movie-reviews' dataset.
50008


In [None]:
# ----------------------
# Preprocessing
# ----------------------
def tokenize(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)
    return text.split()

X = [s[0] for s in sentences]
y = [s[1] for s in sentences]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Train size: {len(X_train)}")
print(f"Test size:  {len(X_test)}")

Train size: 40006
Test size:  10002


In [22]:
corpus = [tokenize(text) for text in X_train]#
#print(corpus)

In [None]:
# ----------------------
# Train Word2Vec
# ----------------------
model = Word2Vec(corpus, vector_size=50, window=4, min_count=1, sg=1)

In [None]:
# ----------------------
# Sentence vector
# ----------------------
def sentence_vector(tokens):
    vecs = [model.wv[w] for w in tokens if w in model.wv]
    if not vecs:
        return np.zeros(model.vector_size)
    return np.mean(vecs, axis=0)

In [None]:
X = np.array([sentence_vector(tokens) for tokens in corpus])
y = np.array(y_train)

In [None]:
# ----------------------
# Simple sentiment prototypes
# ----------------------
pos_vec = np.mean(X[y == 1], axis=0)
neg_vec = np.mean(X[y == 0], axis=0)

In [None]:
def cosine(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def predict(sentence):
    v = sentence_vector(tokenize(sentence))
    return "positive" if cosine(v, pos_vec) > cosine(v, neg_vec) else "negative"

In [21]:
# ----------------------
# Try it
# ----------------------
tests = [
    ["great acting and wonderful story", 1],
    ["painfully slow and boring", 0],
    ["I loved the visuals", 1]
]
tests += list(zip(X_test, y_test))
correct = 0
display = 10
for t in tests:
    guess = predict(t[0])
    answer = "positive" if t[1] == 1 else "negative"
    if display > 0:
      print(t, "→", guess, "; Correct: ", answer)
      display -= 1
    if guess == answer: correct += 1
print("Accuracy: ", correct / len(tests))

['great acting and wonderful story', 1] → positive ; Correct:  positive
['painfully slow and boring', 0] → negative ; Correct:  negative
['I loved the visuals', 1] → positive ; Correct:  positive
("I hope Robert Redford continues to make more films like this. Hillerman's books are wonderful, and as a young child raised in the Southwest his stories hit home! Adam Beach is a highly under rated and under used actor. Wake up Hollywood, not everyone thinks that your Mel Gibson's are cool! Many movie goer's today want to see films that make you think. I have seen all of the Redford/Hillerman series. They are thoughtful, scenic and have great plots. I'm hoping that if enough people write to Robert Redford he may decide to make a few more! Thank you Adam Beach and Tony Hillerman for great entertainment! If anyone get's a chance to read Tony Hillerman's latest book do so! It's great. I also recommend traveling through Arizona, New Mexico, Utah and Colorado. Stop at every view site and feel the 