<a href="https://colab.research.google.com/github/27ABH/AI-Data-Science-2025-26/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [51]:
import pandas as pd
import numpy as np
import re
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split

# 1. Load Data
# The reviews live in a single parquet file referenced by an hf:// path.
# NOTE(review): hf:// paths presumably need `huggingface_hub` installed so
# pandas/fsspec can resolve them — confirm in a fresh environment.
REVIEWS_PARQUET_URL = "hf://datasets/hanshan1988/customers-reviews-on-banks-sampled-pt25/data/train-00000-of-00001.parquet"
df = pd.read_parquet(REVIEWS_PARQUET_URL)

In [52]:
# 2. Setup Columns (finding them automatically)
def _find_column(columns, keywords):
    """Return the first column whose lower-cased name contains any keyword.

    Raises:
        LookupError: if no column matches, listing the available columns so
            the failure is actionable (a bare ``next(...)`` would raise an
            empty ``StopIteration`` instead).
    """
    for col in columns:
        lowered = col.lower()
        if any(keyword in lowered for keyword in keywords):
            return col
    raise LookupError(
        f"No column name contains any of {keywords!r}; "
        f"available columns: {list(columns)}"
    )


text_col = _find_column(df.columns, ('text', 'body'))
score_col = _find_column(df.columns, ('score', 'star'))

# Rows without review text or without a score cannot be labelled or embedded.
df = df.dropna(subset=[text_col, score_col])
# Define 4-5 stars as positive (1), 1-3 stars as negative (0).
# Vectorised comparison instead of a per-row Python lambda.
df['label'] = (df[score_col] >= 4).astype(int)

In [53]:
# 3. Preprocessing
def tokenize(text):
    """Split ``text`` into lower-case, letters-only word tokens.

    The input is coerced with ``str()``, lower-cased, stripped of every
    character that is not ``a``-``z`` or whitespace, and then split on
    whitespace.

    Args:
        text: The value to tokenize; any object is accepted.

    Returns:
        A list of token strings (empty if nothing survives the filtering).
    """
    lowered = str(text).lower()
    # Characters are deleted, not replaced by a space, so "don't" -> "dont".
    letters_and_spaces = re.sub(r"[^a-z\s]", "", lowered)
    return letters_and_spaces.split()

In [54]:
# 4. Train-Test Split
# The dataframe is split before anything is trained, so the 20% held-out
# reviews can be used for evaluation later.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Tokenized training reviews, in the same row order as train_df.
train_corpus = list(map(tokenize, train_df[text_col]))

In [55]:
# 5. Train Word2Vec
print("Training Word2Vec...")
# Word embeddings are fitted on the training reviews only.
model = Word2Vec(
    sentences=train_corpus,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # max distance between the current and predicted word
    min_count=2,      # ignore words that occur fewer than 2 times
    workers=4,        # number of training threads
    epochs=40,        # passes over the corpus
)


Training Word2Vec...


In [56]:
# 6. Sentence Vector function
def sentence_vector(tokens, wv=None):
    """Average the word vectors of ``tokens`` into one sentence vector.

    Args:
        tokens: Iterable of word strings (e.g. the output of ``tokenize``).
        wv: Optional word-vector lookup supporting ``in``, ``[]`` and a
            ``vector_size`` attribute. Defaults to ``model.wv``, the vectors
            of the notebook-level Word2Vec model.

    Returns:
        A 1-D array of length ``wv.vector_size``: the element-wise mean of
        the vectors of all in-vocabulary tokens, or all zeros when no token
        is in the vocabulary.
    """
    if wv is None:
        # Fall back to the notebook's trained model when no lookup is given.
        wv = model.wv
    vecs = [wv[w] for w in tokens if w in wv]
    if not vecs:
        # float32 matches the dtype gensim uses for word vectors, so stacking
        # sentence vectors does not silently upcast the whole matrix.
        return np.zeros(wv.vector_size, dtype=np.float32)
    return np.mean(vecs, axis=0)


In [57]:
# 7. Create Training Vectors and Prototypes
# One averaged embedding per training review, stacked into a 2-D array.
train_sentence_vecs = [sentence_vector(tokens) for tokens in train_corpus]
X_train = np.array(train_sentence_vecs)
y_train = train_df['label'].values

# Class prototypes: the mean vector of all positive / all negative
# training reviews. Predictions later compare a review against these two.
positive_rows = y_train == 1
negative_rows = y_train == 0
pos_vec = X_train[positive_rows].mean(axis=0)
neg_vec = X_train[negative_rows].mean(axis=0)

In [58]:
# 8. Evaluation Logic
def cosine_sim(a, b):
    """Return the cosine similarity between vectors ``a`` and ``b``.

    Args:
        a: First vector.
        b: Second vector.

    Returns:
        ``dot(a, b) / (|a| * |b|)``, or ``0`` when the product of the two
        norms is zero (i.e. at least one vector has zero length).
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator != 0:
        return np.dot(a, b) / denominator
    # Similarity is undefined for a zero-length vector; report 0 instead.
    return 0

def predict(text):
    """Classify a review as positive (1) or negative (0).

    The text is tokenized and averaged into a sentence vector, which is then
    compared by cosine similarity against the positive and negative class
    prototypes (``pos_vec`` / ``neg_vec``).

    Args:
        text: The raw review text.

    Returns:
        ``1`` if the review is strictly more similar to ``pos_vec`` than to
        ``neg_vec``, otherwise ``0`` (ties therefore count as negative).
    """
    review_vec = sentence_vector(tokenize(text))
    positive_score = cosine_sim(review_vec, pos_vec)
    negative_score = cosine_sim(review_vec, neg_vec)
    if positive_score > negative_score:
        return 1
    return 0

In [59]:
# 9. Run Evaluation on Test Set
test_reviews = test_df[text_col].tolist()
test_labels = test_df['label'].tolist()

# Classify every held-out review, then count exact label matches.
predictions = list(map(predict, test_reviews))
n_correct = sum(pred == truth for pred, truth in zip(predictions, test_labels))
accuracy = n_correct / len(test_labels)

print(f"\nModel Accuracy on Test Split: {accuracy:.2%}")


Model Accuracy on Test Split: 84.90%


In [60]:
# 10. Try it
# Spot-check the classifier on a few hand-written example reviews.
print("\n--- RESULTS ---")
tests = [
    "The staff was very helpful and the interest rates are great",
    "I am very disappointed with the hidden fees",
    "The mobile app is intuitive and fast",
    "Terrible customer service, they put me on hold for an hour"
]

for review in tests:
    predicted_positive = predict(review) == 1
    if predicted_positive:
        pred_label = "positive"
    else:
        pred_label = "negative"
    print(f"Review: {review} -> {pred_label}")


--- RESULTS ---
Review: The staff was very helpful and the interest rates are great -> positive
Review: I am very disappointed with the hidden fees -> positive
Review: The mobile app is intuitive and fast -> positive
Review: Terrible customer service, they put me on hold for an hour -> negative
