In [1]:
from gensim.models import Word2Vec
import numpy as np
import re
import pandas as pd

In [2]:
# ============================================================================
# SECTION 1: LOAD AND PREPARE THE DATASET
# ============================================================================

df = pd.read_csv('steam_reviews.csv')
reviews = df['review'].tolist()

# Convert "Recommended" to 1 and "Not Recommended" to 0
raw_labels = df['recommendation'].tolist()
labels = []
for label in raw_labels:
    if isinstance(label, str):
        if 'Recommended' in label and 'Not' not in label:
            labels.append(1)
        else:
            labels.append(0)
    elif isinstance(label, bool):
        labels.append(1 if label else 0)
    else:
        labels.append(1 if label else 0)

sentences = [(text, label) for text, label in zip(reviews, labels) 
             if isinstance(text, str) and len(text.strip()) > 0]

print(f"Loaded {len(sentences)} reviews")

Loaded 19931 reviews


In [3]:
# ============================================================================
# SECTION 2: TEXT PREPROCESSING
# ============================================================================

def tokenize(text):
    if not isinstance(text, str):
        return []
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)
    return text.split()

corpus = []
valid_sentences = []

for text, label in sentences:
    tokens = tokenize(text)
    if len(tokens) > 0:
        corpus.append(tokens)
        valid_sentences.append((text, label))

sentences = valid_sentences

print(f"Valid reviews after tokenization: {len(sentences)}")
print(f"Sample review: {sentences[0][0][:100]}...")
print(f"Tokenized: {corpus[0][:10]}...")

Valid reviews after tokenization: 19601
Sample review: Pretty much same controls and buggy mess as all the other battle royale games....
Tokenized: ['pretty', 'much', 'same', 'controls', 'and', 'buggy', 'mess', 'as', 'all', 'the']...


In [4]:
# ============================================================================
# SECTION 3: TRAIN WORD2VEC MODEL
# ============================================================================

model = Word2Vec(corpus, vector_size=50, window=4, min_count=1, sg=1, epochs=10)
print(f"Word2Vec trained with {len(model.wv)} words")

Word2Vec trained with 33241 words


In [5]:
# ============================================================================
# SECTION 4: CONVERT SENTENCES TO VECTORS
# ============================================================================

def sentence_vector(tokens):
    vecs = [model.wv[w] for w in tokens if w in model.wv]
    if not vecs:
        return np.zeros(model.vector_size)
    return np.mean(vecs, axis=0)

X = np.array([sentence_vector(tokens) for tokens in corpus])
y = np.array([label for _, label in sentences])

In [6]:
# ============================================================================
# SECTION 5: CREATE SENTIMENT PROTOTYPES
# ============================================================================

print(f"\nLabel distribution:")
print(f"  Positive (1): {sum(y == 1)}")
print(f"  Negative (0): {sum(y == 0)}")

if sum(y == 1) == 0:
    print("ERROR: No positive reviews found!")
if sum(y == 0) == 0:
    print("ERROR: No negative reviews found!")

pos_vec = np.mean(X[y == 1], axis=0)
neg_vec = np.mean(X[y == 0], axis=0)


Label distribution:
  Positive (1): 13750
  Negative (0): 5851


In [7]:
# ============================================================================
# SECTION 6: PREDICTION FUNCTION
# ============================================================================

def cosine(a, b):
    norm = np.linalg.norm(a) * np.linalg.norm(b)
    if norm == 0:
        return 0
    return np.dot(a, b) / norm

def predict(sentence):
    v = sentence_vector(tokenize(sentence))
    return 1 if cosine(v, pos_vec) > cosine(v, neg_vec) else 0

In [8]:
# ============================================================================
# SECTION 7: CALCULATE ACCURACY
# ============================================================================

predictions = [predict(text) for text, _ in sentences]
correct = sum([1 for pred, true in zip(predictions, y) if pred == true])
accuracy = correct / len(y)

print(f"\n{'='*50}")
print(f"RESULTS")
print(f"{'='*50}")
print(f"Total reviews: {len(y)}")
print(f"Correct predictions: {correct}")
print(f"Accuracy: {accuracy:.2%}")

print(f"\nSample predictions (first 5):")
for i in range(min(5, len(sentences))):
    text, true_label = sentences[i]
    pred = predictions[i]
    print(f"  Review: {text[:60]}...")
    print(f"  True: {true_label}, Predicted: {pred}, {'✓' if pred == true_label else '✗'}")
    
print(f"{'='*50}\n")


RESULTS
Total reviews: 19601
Correct predictions: 13308
Accuracy: 67.89%

Sample predictions (first 5):
  Review: Pretty much same controls and buggy mess as all the other ba...
  True: 0, Predicted: 1, ✗
  Review: game cant even find stone...
  True: 0, Predicted: 0, ✓
  Review: EVERY DRAGON NEED A BLOOD BAR !!!!!!!! I spend 40 minutes ch...
  True: 0, Predicted: 0, ✓
  Review: Great game. Just not into the whole open world type gameplay...
  True: 0, Predicted: 1, ✗
  Review: People compare this game to dark souls but it is not very si...
  True: 0, Predicted: 1, ✗



In [9]:
# ============================================================================
# SECTION 8: EXAMPLES
# ============================================================================

tests = [
    "amazing game love it",
    "terrible waste of money",
    "pretty good overall",
    "buggy and broken"
]

print("TEST PREDICTIONS:")
for t in tests:
    pred = predict(t)
    sentiment = 'positive' if pred == 1 else 'negative'
    print(f"{t} → {sentiment}")

TEST PREDICTIONS:
amazing game love it → positive
terrible waste of money → negative
pretty good overall → positive
buggy and broken → positive
