In [10]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import logging

logging.basicConfig(level=logging.INFO)

def load_reviews(csv_path: str) -> pd.DataFrame:
    """Read the reviews CSV and drop the rows that have no review text.

    Args:
        csv_path: Location of the CSV file; handed straight to ``pd.read_csv``.

    Returns:
        A DataFrame holding only the rows whose ``Review`` value is not
        missing. The index is not reset, so surviving rows keep the labels
        they were read with.

    Raises:
        ValueError: If any of the ``Review``, ``Rating`` or ``Bank`` columns
            is absent. Every absent column is named in the message, in the
            order listed here.
    """
    required_columns = ['Review', 'Rating', 'Bank']
    df = pd.read_csv(csv_path)

    # Gather all absent columns before failing, so a single run reveals the
    # full mismatch instead of one column per fix-and-rerun cycle. Column
    # matching is exact (case-sensitive).
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        raise ValueError(f"Missing column in dataset: {', '.join(missing)}")

    # Return a filtered frame rather than mutating in place; the result holds
    # the same rows as the in-place drop did.
    return df.dropna(subset=["Review"])


def apply_vader_sentiment(df: pd.DataFrame) -> pd.DataFrame:
    """Score every review with VADER and attach the score plus a label.

    Two columns are written onto ``df`` itself; the frame is modified in
    place and the very same object is returned:

    * ``vader_score`` - the ``"compound"`` entry of ``polarity_scores``.
    * ``vader_label`` - ``"positive"`` when the score is >= 0.05,
      ``"negative"`` when it is <= -0.05, and ``"neutral"`` otherwise.

    Args:
        df: Frame with a ``Review`` column. Values are converted with
            ``astype(str)`` before scoring, so non-string entries are scored
            through their string form rather than skipped.

    Returns:
        ``df``, now carrying the ``vader_score`` and ``vader_label`` columns.
    """
    analyzer = SentimentIntensityAnalyzer()

    def label_for(score: float) -> str:
        # Thresholds are inclusive on both sides: exactly +/-0.05 is not neutral.
        if score >= 0.05:
            return "positive"
        if score <= -0.05:
            return "negative"
        return "neutral"

    # Run the analyzer once per review and derive the label from the stored
    # score, rather than scoring every text a second time for the label.
    scores = df["Review"].astype(str).map(
        lambda text: analyzer.polarity_scores(text)["compound"]
    )
    df["vader_score"] = scores
    df["vader_label"] = scores.map(label_for)
    return df


def aggregate_sentiment_by_bank_rating(df: pd.DataFrame) -> pd.DataFrame:
    """Build a table of sentiment-label shares per (Bank, Rating) pair.

    Args:
        df: Frame with ``Bank``, ``Rating`` and ``vader_label`` columns.

    Returns:
        A frame indexed by (``Bank``, ``Rating``) with one column per distinct
        ``vader_label`` value. Each cell is the fraction of that group's rows
        carrying the label; labels that never occur in a group are 0.
    """
    labels_by_group = df.groupby(["Bank", "Rating"])["vader_label"]
    # normalize=True yields within-group proportions instead of raw counts.
    label_shares = labels_by_group.value_counts(normalize=True)
    # Pivot the label level into columns; absent combinations come out as NaN.
    share_table = label_shares.unstack()
    return share_table.fillna(0)


def save_results(df: pd.DataFrame, path: str):
    """Write ``df`` to ``path`` as CSV and log where it was saved.

    Args:
        df: Frame to persist. Its index is not written to the file.
        path: Destination of the CSV file.
    """
    df.to_csv(path, index=False)
    message = f"Saved sentiment-labeled data to: {path}"
    logging.info(message)


In [11]:
import os
import sys
# Add the parent of the current working directory to the module search path.
# Presumably this is so a project module one level up can be imported; the
# only such import below is currently commented out.
sys.path.append(os.path.abspath("../"))

# Import left disabled: the four helpers are defined directly in this notebook.
# from vader_sentiment_analysis import load_reviews, apply_vader_sentiment, aggregate_sentiment_by_bank_rating, save_results

# Step 1: Load cleaned reviews.
# load_reviews raises ValueError when a required column (Review, Rating, Bank)
# is absent; the path is relative to the current working directory.
df_reviews = load_reviews("cleaned_reviews.csv")

# Step 2: Apply VADER sentiment analysis.
# apply_vader_sentiment writes vader_score / vader_label onto the frame it is
# given and returns that same frame, so df_reviews and df_with_sentiment refer
# to one object after this line.
df_with_sentiment = apply_vader_sentiment(df_reviews)

# Step 3: Save the reviews together with their sentiment columns (CSV, no index).
save_results(df_with_sentiment, "reviews_with_vader_sentiment.csv")

# Step 4: Aggregate by bank and rating: share of each sentiment label per (Bank, Rating).
agg_sentiment = aggregate_sentiment_by_bank_rating(df_with_sentiment)

# Step 5: View the result
print(agg_sentiment)

ValueError: Missing column in dataset: Review