<a href="https://colab.research.google.com/github/26aharikrishnan/Sentiment-Analysis-Project/blob/main/Sentiment_Analysis_of_Movie_Reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
from gensim.models import Word2Vec
import numpy as np
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer


tfidf = TfidfVectorizer(tokenizer=tokenize, min_df=5, max_df=0.8)
tfidf.fit(X_train_texts)

idf = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))



In [25]:
# ----------------------
# Dataset
# ----------------------

# Load IMDB dataset safely
df = pd.read_csv(
    "IMDB Dataset.csv",
    engine="python",
    on_bad_lines="skip"
)


# Convert sentiment labels
df["label"] = df["sentiment"].map({"positive": 1, "negative": 0})

texts = df["review"].values
labels = df["label"].values

X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.30,
    random_state=42,
    stratify=labels
)


In [26]:
# ----------------------
# Preprocessing
# ----------------------

def tokenize(text):
    # Lowercase
    text = text.lower()

    # Remove HTML line breaks
    text = re.sub(r"<br\s*/?>", " ", text)

    # Remove punctuation and numbers
    text = re.sub(r"[^a-z\s]", "", text)

    tokens = text.split()

    # Handle negation (e.g., "not good" → "NOT_good")
    negation_words = {"not", "no", "never"}
    result = []
    negate = False

    for word in tokens:
        if word in negation_words:
            negate = True
            continue

        if word in ENGLISH_STOP_WORDS:
            continue

        if negate:
            result.append("NOT_" + word)
            negate = False
        else:
            result.append(word)

    return result


In [27]:
train_corpus = [tokenize(text) for text in X_train_texts]
test_corpus = [tokenize(text) for text in X_test_texts]


In [28]:
# ----------------------
# Train Word2Vec
# ----------------------

model = Word2Vec(
    train_corpus[:15000],
    vector_size=100,
    window=8,
    min_count=5,
    sg=1,
    negative=10,
    epochs=5,
    workers=8
)


In [29]:
# ----------------------
# Sentence vector
# ----------------------

def sentence_vector(tokens):
    vecs, weights = [], []

    for w in tokens:
        if w in model.wv and w in idf:
            vecs.append(model.wv[w])
            weights.append(idf[w])

    if not vecs:
        return np.zeros(model.vector_size)

    return np.average(vecs, axis=0, weights=weights)

In [30]:
def normalize(v):
    return v / (np.linalg.norm(v) + 1e-9)

X_train = np.array([normalize(sentence_vector(t)) for t in train_corpus])
X_test = np.array([normalize(sentence_vector(t)) for t in test_corpus])


In [31]:
# ----------------------
# Simple sentiment prototypes
# ----------------------
pos_vec = np.mean(X_train[y_train == 1], axis=0)
neg_vec = np.mean(X_train[y_train == 0], axis=0)


In [32]:
def cosine(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def predict(sentence):
    tokens = tokenize(sentence)
    v = sentence_vector(tokens)
    return 1 if cosine(v, pos_vec) > cosine(v, neg_vec) else 0


In [33]:
# ----------------------
# Try it
# ----------------------

predictions = [predict(text) for text in X_test_texts]

accuracy = np.mean(predictions == y_test)
print("Test Accuracy:", accuracy)

tests = [
    "great acting and wonderful story",
    "painfully slow and boring",
    "not bad but not great",
    "I loved the visuals"
]

for t in tests:
    label = predict(t)
    print(t, "→", "positive" if label == 1 else "negative")


Test Accuracy: 0.7665333333333333
great acting and wonderful story → positive
painfully slow and boring → negative
not bad but not great → negative
I loved the visuals → positive


In [23]:
pip install gensim


