<a href="https://colab.research.google.com/github/EkansTCG/AI-hw-sentiment-analysis/blob/main/sentiment%20Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m63.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [3]:
import kagglehub
import pandas as pd
import os
import re
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# ----------------------
# 1. Download & Load Dataset
# ----------------------
print("Downloading dataset...")
path = kagglehub.dataset_download("andrewmvd/steam-reviews")
print("Path to dataset files:", path)

# Find the CSV file
csv_file = [f for f in os.listdir(path) if f.endswith('.csv')][0]
full_file_path = os.path.join(path, csv_file)

# Load data (Limiting to 500,000 rows for speed)
df = pd.read_csv(full_file_path, nrows=500000)

# Drop rows with missing reviews
df = df.dropna(subset=['review_text'])

# ----------------------
# UPDATED SECTION: Handle 'review_score' column
# ----------------------
# Your columns are: app_id, app_name, review_text, review_score, review_votes
# Usually: 1 = Positive, -1 = Negative.
# We map this to: 1 = Positive, 0 = Negative (for binary classification)
df['label'] = df['review_score'].apply(lambda x: 1 if x > 0 else 0)

print(f"Total reviews loaded: {len(df)}")
print("Label distribution:", df['label'].value_counts().to_dict())

# ----------------------
# 2. Preprocessing
# ----------------------
def tokenize(text):
    if not isinstance(text, str): return []
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)
    return text.split()

# Create corpus from the 'review_text' column
df['tokens'] = df['review_text'].apply(tokenize)
corpus = df['tokens'].tolist()

# ----------------------
# 3. Train Word2Vec
# ----------------------
print("Training Word2Vec model (this may take a moment)...")
model = Word2Vec(corpus, vector_size=100, window=5, min_count=2, sg=1)

# ----------------------
# 4. Sentence Vectorization
# ----------------------
def sentence_vector(tokens):
    vecs = [model.wv[w] for w in tokens if w in model.wv]
    if not vecs:
        return np.zeros(model.vector_size)
    return np.mean(vecs, axis=0)

# Create feature matrix X and label vector y
print("Vectorizing sentences...")
X = np.array([sentence_vector(tokens) for tokens in corpus])
y = df['label'].values

# ----------------------
# 5. Train/Test Split
# ----------------------
# Split data: 80% to train, 20% to test accuracy
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ----------------------
# 6. Build Sentiment Prototypes (Train Data Only)
# ----------------------
print("Building sentiment prototypes...")
pos_vec = np.mean(X_train[y_train == 1], axis=0)
neg_vec = np.mean(X_train[y_train == 0], axis=0)

def cosine_sim(a, b):
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if norm_a == 0 or norm_b == 0:
        return 0
    return np.dot(a, b) / (norm_a * norm_b)

def predict_label(vec):
    score_pos = cosine_sim(vec, pos_vec)
    score_neg = cosine_sim(vec, neg_vec)
    return 1 if score_pos > score_neg else 0

# ----------------------
# 7. Calculate Accuracy
# ----------------------
print("Evaluating model accuracy...")
y_pred = [predict_label(v) for v in X_test]

acc = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {acc * 100:.2f}%")
print("-" * 50)
print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))

# ----------------------
# 8. Manual Test
# ----------------------
tests = [
    "This game is an absolute masterpiece with great mechanics",
    "Terrible performance and boring gameplay"
]
print("\n--- Manual Tests ---")
for t in tests:
    v = sentence_vector(tokenize(t))
    pred = predict_label(v)
    print(f"'{t}' → {'Positive' if pred == 1 else 'Negative'}")

Downloading dataset...
Downloading from https://www.kaggle.com/api/v1/datasets/download/andrewmvd/steam-reviews?dataset_version_number=3...


100%|██████████| 685M/685M [00:09<00:00, 74.2MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/andrewmvd/steam-reviews/versions/3
Total reviews loaded: 499072
Label distribution: {1: 432333, 0: 66739}
Training Word2Vec model (this may take a moment)...
Vectorizing sentences...
Building sentiment prototypes...
Evaluating model accuracy...

Model Accuracy: 76.05%
------------------------------
              precision    recall  f1-score   support

    Negative       0.33      0.78      0.46     13339
    Positive       0.96      0.76      0.85     86476

    accuracy                           0.76     99815
   macro avg       0.64      0.77      0.66     99815
weighted avg       0.87      0.76      0.79     99815


--- Manual Tests ---
'This game is an absolute masterpiece with great mechanics' → Positive
'Terrible performance and boring gameplay' → Negative
