<a href="https://colab.research.google.com/github/27abernal/Adv_AI/blob/main/Sentiment_Analysis_of_Amazon_Reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [103]:
pip install gensim



In [104]:
# ----------------------
# Imports
# ----------------------
# Data manipulation
import pandas as pd
import numpy as np
import re

# Word embedding model
from gensim.models import Word2Vec

# Machine learning utilities
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [105]:
# ----------------------
# Load Dataset
# ----------------------
df = pd.read_csv("hf://datasets/hugginglearners/amazon-reviews-sentiment-analysis/amazon_reviews.csv")

# Keep only relevant columns
df = df[["reviewText", "overall"]]

# Drop rows with missing review text
df = df.dropna(subset=["reviewText"])

In [106]:
# ----------------------
# Sentiment Labels
# ----------------------
# Here we keep neutral reviews (3-class sentiment)

def get_sentiment_label(rating):
    """
    Convert star rating into sentiment class:
    0 = Negative
    1 = Neutral
    2 = Positive
    """
    if rating <= 2:
        return 0
    elif rating == 3:
        return 1
    else:
        return 2

# Apply labeling
df["sentiment"] = df["overall"].apply(get_sentiment_label)

In [107]:
# ----------------------
# Preprocessing
# ----------------------

def tokenize(text):
    """
    Lowercase text, remove punctuation and numbers,
    and split into word tokens.
    """
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)
    return text.split()

# Pair reviews with sentiment labels
sentences = list(zip(df["reviewText"], df["sentiment"]))

# Tokenize all review texts
corpus = [tokenize(text) for text, _ in sentences]

# Inspect tokenized corpus
print(corpus[:5])

[['no', 'issues'], ['purchased', 'this', 'for', 'my', 'device', 'it', 'worked', 'as', 'advertised', 'you', 'can', 'never', 'have', 'too', 'much', 'phone', 'memory', 'since', 'i', 'download', 'a', 'lot', 'of', 'stuff', 'this', 'was', 'a', 'no', 'brainer', 'for', 'me'], ['it', 'works', 'as', 'expected', 'i', 'should', 'have', 'sprung', 'for', 'the', 'higher', 'capacity', 'i', 'think', 'its', 'made', 'a', 'bit', 'cheesier', 'than', 'the', 'earlier', 'versions', 'the', 'paint', 'looks', 'not', 'as', 'clean', 'as', 'before'], ['this', 'think', 'has', 'worked', 'out', 'greathad', 'a', 'diff', 'bran', 'gb', 'card', 'and', 'if', 'went', 'south', 'after', 'monthsthis', 'one', 'has', 'held', 'up', 'pretty', 'well', 'since', 'i', 'had', 'my', 's', 'now', 'on', 'my', 'note', 'update', 'ive', 'had', 'this', 'for', 'a', 'few', 'months', 'and', 'have', 'had', 'zero', 'issues', 'since', 'it', 'was', 'transferred', 'from', 'my', 's', 'to', 'my', 'note', 'and', 'into', 'a', 'note', 'this', 'card', 'is',

In [108]:
# ----------------------
# Train Word2Vec Model
# ----------------------

model = Word2Vec(
    sentences=corpus,    # tokenized reviews
    vector_size=100,     # dimensionality of embeddings
    window=5,            # context window size
    min_count=2,         # ignore rare words
    workers=4,           # parallel training
    sg=1,                # skip-gram architecture
    epochs=50            # number of training epochs
)

In [109]:
# ----------------------
# Sentence Vector Function
# ----------------------
def sentence_vector(tokens):
    """
    Convert a list of tokens into a single sentence vector
    by averaging Word2Vec word embeddings.
    """
    vectors = []

    for word in tokens:
        if word in model.wv:
            vectors.append(model.wv[word])

    # Handle sentences with no known words
    if len(vectors) == 0:
        return np.zeros(model.vector_size)

    # Average all word vectors
    return np.mean(vectors, axis=0)

In [110]:
# ----------------------
# Build Feature Matrix
# ----------------------
# Convert each review into a sentence vector
X = np.array([sentence_vector(tokenize(text)) for text, _ in sentences])

# Extract sentiment labels
y = np.array([label for _, label in sentences])


In [111]:
# ----------------------
# Train / Validation / Test Split
# ----------------------
# Split into training set and temporary set
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

# Split temporary set into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp
)

In [112]:
# ----------------------
# Logistic Regression Classifier
# ----------------------
# Initialize multinomial logistic regression
clf = LogisticRegression(
    max_iter=1000,
    multi_class="multinomial"
)

# Train classifier on sentence vectors
clf.fit(X_train, y_train)



In [113]:
# ----------------------
# Validation Evaluation
# ----------------------
# Predict on validation set
y_val_pred = clf.predict(X_val)

# Compute validation accuracy
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))

# Detailed validation metrics
print("\nValidation Classification Report:\n")
print(classification_report(
    y_val,
    y_val_pred,
    target_names=["Negative", "Neutral", "Positive"]
))

Validation Accuracy: 0.9240162822252375

Validation Classification Report:

              precision    recall  f1-score   support

    Negative       0.73      0.33      0.46        48
     Neutral       0.00      0.00      0.00        22
    Positive       0.93      1.00      0.96       667

    accuracy                           0.92       737
   macro avg       0.55      0.44      0.47       737
weighted avg       0.89      0.92      0.90       737



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [114]:
# ----------------------
# Test Evaluation
# ----------------------
# Predict on test set
y_test_pred = clf.predict(X_test)

# Compute test accuracy
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))

# Detailed test metrics
print("\nTest Classification Report:\n")
print(classification_report(
    y_test,
    y_test_pred,
    target_names=["Negative", "Neutral", "Positive"]
))


Test Accuracy: 0.9119241192411924

Test Classification Report:

              precision    recall  f1-score   support

    Negative       0.53      0.37      0.43        49
     Neutral       0.00      0.00      0.00        21
    Positive       0.93      0.98      0.95       668

    accuracy                           0.91       738
   macro avg       0.49      0.45      0.46       738
weighted avg       0.88      0.91      0.89       738



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [115]:
# ----------------------
# Final Prediction Function
# ----------------------
def predict(sentence):
    """
    Predict sentiment using logistic regression
    on Word2Vec sentence vectors.
    """
    tokens = tokenize(sentence)
    vec = sentence_vector(tokens).reshape(1, -1)
    pred = clf.predict(vec)[0]

    if pred == 0:
        return "Negative"
    elif pred == 1:
        return "Neutral"
    else:
        return "Positive"

In [116]:
# ----------------------
# Example Predictions
# ----------------------
examples = [
    "This product is amazing and works perfectly",
    "It's fine, nothing special",
    "Terrible quality and waste of money"
]

for s in examples:
    print(s, "→", predict(s))


This product is amazing and works perfectly → Positive
It's fine, nothing special → Positive
Terrible quality and waste of money → Positive
