In [None]:
import numpy as np
import pandas as pd 
import torch
import random
import shap
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm 
import pickle
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score 

In [None]:
# Helper: collapse a star rating into a binary sentiment label
def map_stars_to_sentiment(stars):
    """Return "POSITIVE" for a rating of 4 stars or more, otherwise "NEGATIVE".

    Any value for which ``stars >= 4`` is false (including NaN) maps to
    "NEGATIVE".
    """
    return "POSITIVE" if stars >= 4 else "NEGATIVE"
    

In [None]:
# --------------------------
# 1. Setup the model and tokenizer
# --------------------------
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

# Prefer the GPU, but fall back to the CPU when PyTorch cannot see a CUDA
# device so the notebook does not crash on CPU-only machines.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")

# Load tokenizer and model, and move the model to the selected device
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.to(device)
model.eval()  # Set model to evaluation mode

# Create the Hugging Face sentiment analysis pipeline.
# Truncation is not configured here; it is requested per call (truncation=True).
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=0 if use_gpu else -1  # 0 = first GPU, -1 = CPU
)

In [None]:
# --------------------------
# 2. Define a prediction function for SHAP
# --------------------------
def model_predict(texts):
    """
    Score a batch of texts with the sentiment model and return raw logits.

    Parameters
    ----------
    texts : iterable
        Items to classify. Each item is converted with ``str()`` first, so
        non-string values (e.g. NumPy string scalars) are accepted.

    Returns
    -------
    numpy.ndarray
        The model's logits, copied to host memory.
    """
    # Ensure all inputs are plain Python strings
    texts = [str(t) for t in texts]
    # Send the encoded batch to whichever device the model currently lives on,
    # rather than assuming CUDA, so inputs and weights can never be mismatched.
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.logits.detach().cpu().numpy()

# Text masker built on the model's tokenizer, used by SHAP to hide parts of the input
masker = shap.maskers.Text(tokenizer)

# SHAP explainer wrapping the custom prediction function; output_names labels
# the two columns returned by model_predict.
explainer = shap.Explainer(
    model_predict,
    masker=masker,
    output_names=["Negative", "Positive"],
)


In [None]:

# --------------------------
# 3. Process Your Reviews from CSV in Batches and Save Sentiment Predictions
# --------------------------
# file_path = "reviews_sample.csv"  # Alternative input; the review text is read from the "text" column
file_path = "pre_covid_reviews.csv"
chunk_size = 2000  # Rows read from the CSV per chunk; adjust based on your system's memory
batch_size = 64  # Texts per pipeline batch
shap_sample_size = 10  # Reviews per chunk that receive a SHAP explanation
shap_sample_seed = 42

# Dedicated, seeded RNG so the reviews picked for SHAP are the same on every run.
shap_rng = random.Random(shap_sample_seed)

all_prediction_dfs = []
all_shap_explanations = []

# Process the CSV file in chunks
chunk_reader = pd.read_csv(file_path, chunksize=chunk_size)
for counter, chunk in enumerate(tqdm(chunk_reader, desc="Processing CSV chunks"), start=1):
    if counter % 100 == 0:
        print(f"Processed {counter} chunks")

    # Drop rows without review text once, so `texts` and the chunk rows stay aligned.
    chunk = chunk.dropna(subset=["text"]).copy()
    texts = [str(t) for t in chunk["text"].tolist()]
    if not texts:
        # Nothing to classify or explain in this chunk.
        continue

    # Run sentiment analysis in batches; truncation is requested explicitly.
    predictions = sentiment_pipeline(texts, batch_size=batch_size, truncation=True)

    # Attach the predicted label and its score to the chunk.
    chunk["prediction"] = [p["label"] for p in predictions]
    chunk["score"] = [p["score"] for p in predictions]
    all_prediction_dfs.append(chunk)

    # For SHAP explanations, select a small random sample from the current chunk.
    sample_texts = shap_rng.sample(texts, k=min(shap_sample_size, len(texts)))
    shap_expl = explainer(sample_texts)
    all_shap_explanations.append(shap_expl)

# Combine all chunk prediction dataframes into one.
# Fail with a specific message rather than pandas' generic "No objects to concatenate".
if not all_prediction_dfs:
    raise ValueError("No predictions were produced; check the input CSV and its 'text' column")
predictions_df = pd.concat(all_prediction_dfs, ignore_index=True)

# Apply the mapping to get "true" sentiment labels (POSITIVE if stars >= 4)
predictions_df["true_sentiment"] = predictions_df["stars"].apply(map_stars_to_sentiment)

# Create a signed sentiment value:
# POSITIVE predictions keep their score; every other prediction gets -score.
# np.where evaluates this column-wise instead of calling a lambda per row.
predictions_df['sentiment_value'] = np.where(
    predictions_df['prediction'] == "POSITIVE",
    predictions_df['score'],
    -predictions_df['score'],
)

# NOTE(review): later cells read "./pre_covid/sentiment_predictions.csv"; this
# file is written to the working directory — confirm it is moved there manually.
predictions_df.to_csv("sentiment_predictions.csv", index=False)
print("Sentiment predictions saved to sentiment_predictions.csv")

In [None]:
import pickle

# Persist the collected SHAP explanations so later cells can reload them
with open("all_shap_explanations.pkl", mode="wb") as pkl_out:
    pickle.dump(all_shap_explanations, pkl_out)
print("SHAP explanations saved to all_shap_explanations.pkl")


In [None]:
# --------------------------
# 4. Aggregate SHAP Values to Extract Top Positive and Top Negative Tokens
# --------------------------
# Each token occurrence carries one SHAP value per class: [neg_value, pos_value].
# Its net contribution is pos_value - neg_value.
# Occurrences are bucketed by the sign of that net value, so the same token can
# appear in both the positive and the negative table.
positive_contributions = defaultdict(list)
negative_contributions = defaultdict(list)

for explanation in all_shap_explanations:
    # explanation.data holds the tokens of each sample;
    # explanation.values holds the matching per-token SHAP values.
    for sample_tokens, sample_values in zip(explanation.data, explanation.values):

        for token, shap_val in zip(sample_tokens, sample_values):
            # SHAP's text masker yields raw substrings of the review: they can
            # carry surrounding whitespace, and special tokens come back empty.
            # Normalise so "good " and "good" share one key, and skip empties.
            token = str(token).strip()
            if not token:
                continue
            net_contrib = shap_val[1] - shap_val[0]
            if net_contrib >= 0:
                positive_contributions[token].append(net_contrib)
            else:
                negative_contributions[token].append(net_contrib)

# Compute average net contribution per token.
# Lists only exist once a value was appended, so none of them is empty.
avg_positive = {token: np.mean(vals) for token, vals in positive_contributions.items()}
avg_negative = {token: np.mean(vals) for token, vals in negative_contributions.items()}

# Strongest positive first / strongest negative first
sorted_positive = sorted(avg_positive.items(), key=lambda x: x[1], reverse=True)
sorted_negative = sorted(avg_negative.items(), key=lambda x: x[1])


pd.DataFrame(sorted_positive, columns=["token", "avg_net_shap"]).to_csv("top_positive_terms.csv", index=False)
pd.DataFrame(sorted_negative, columns=["token", "avg_net_shap"]).to_csv("top_negative_terms.csv", index=False)

print("Top positive terms saved to top_positive_terms.csv")
print("Top negative terms saved to top_negative_terms.csv")


### Evaluation

In [None]:
# Optional: uncomment to reload previously saved predictions instead of
# re-running inference in this session.
# predictions_df = pd.read_csv("./pre_covid/sentiment_predictions.csv")
predictions_df.head()

In [None]:
# Sanity check: both columns should only contain the labels used in the evaluation below.
print("Unique true sentiment labels:", predictions_df["true_sentiment"].unique())
print("Unique predicted sentiment labels:", predictions_df["prediction"].unique())

In [None]:
# Compare star-derived labels with the model's predictions
y_true = predictions_df["true_sentiment"]
y_pred = predictions_df["prediction"]
label_order = ["NEGATIVE", "POSITIVE"]

print(f"Accuracy {accuracy_score(y_true, y_pred)}")

# Confusion matrix with rows = actual label, columns = predicted label
cm = confusion_matrix(y_true, y_pred, labels=label_order)
cm_df = pd.DataFrame(cm, index=label_order, columns=label_order)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_df, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title("Pre-Covid Confusion Matrix")
ax.set_ylabel("Actual")
ax.set_xlabel("Predicted")
plt.show()

print("Pre-Covid Classification Report")
print(classification_report(y_true, y_pred, labels=label_order))

### SHAP summary plot

In [None]:
# import pickle
# with open("./pre_covid/all_shap_explanations.pkl", "rb") as f:
#     all_shap_explanations = pickle.load(f)
# print("SHAP explanations loaded from all_shap_explanations.pkl")

In [None]:
# Per-token lists of the SHAP values for each class, across all explained samples
positive_contributions = defaultdict(list)
negative_contributions = defaultdict(list)

# Loop over each SHAP Explanation object (each may contain several samples)
for explanation in all_shap_explanations:
    # explanation.data: tokens per sample
    # explanation.values: per-token SHAP values, one entry per class
    for sample_tokens, sample_values in zip(explanation.data, explanation.values):
        # For a two-class model, assume index 1 corresponds to positive and index 0 to negative.
        for token, token_values in zip(sample_tokens, sample_values):
            # SHAP's text masker yields raw substrings of the review: they can
            # carry surrounding whitespace, and special tokens come back empty.
            # Normalise so "good " and "good" share one key, and skip empties.
            token = str(token).strip()
            if not token:
                continue
            positive_contributions[token].append(token_values[1])
            negative_contributions[token].append(token_values[0])

# Compute the average contribution per token for positive and negative
avg_positive = {token: np.mean(vals) for token, vals in positive_contributions.items()}
avg_negative = {token: np.mean(vals) for token, vals in negative_contributions.items()}

# Net contribution per token: (average positive contribution) minus (average negative contribution).
# Both dicts are filled in the same loop iteration, so they share the same keys.
net_contributions = {token: avg_positive[token] - avg_negative[token] for token in avg_positive}

sorted_net_positive = sorted(net_contributions.items(), key=lambda x: x[1], reverse=True)
sorted_net_negative = sorted(net_contributions.items(), key=lambda x: x[1])

print("Top tokens driving positive predictions (net contribution):")
for token, net in sorted_net_positive[:20]:
    print(f"{token}: {net:.4f}")

print("\nTop tokens driving negative predictions (net contribution):")
for token, net in sorted_net_negative[:20]:
    print(f"{token}: {net:.4f}")

    
# One bar chart per direction, each showing the first 20 entries of its ranking.
top_token_charts = (
    (sorted_net_positive, "Top Tokens Driving Positive Predictions (Net Contribution) [Pre-Covid]"),
    (sorted_net_negative, "Top Tokens Driving Negative Predictions (Net Contribution) [Pre-Covid]"),
)

for ranked_tokens, chart_title in top_token_charts:
    if not ranked_tokens:
        continue  # nothing to draw for this direction
    bar_labels, bar_heights = zip(*ranked_tokens[:20])
    plt.figure(figsize=(10, 5))
    plt.bar(bar_labels, bar_heights)
    plt.title(chart_title)
    plt.xlabel("Token")
    plt.ylabel("Net SHAP Value")
    plt.xticks(rotation=45)
    plt.show()


In [None]:
# --- Box Plot: Distribution of Sentiment Values per Star Rating ---
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x="stars", y="sentiment_value", data=predictions_df, ax=ax)
ax.set_title("Distribution of Sentiment Values by Star Rating")
ax.set_xlabel("Stars")
ax.set_ylabel("Sentiment Value (Positive = Higher, Negative = Lower)")
plt.show()

# --- Line Plot: Average Sentiment Value per Star Rating ---
# Mean signed sentiment value for each star rating, as a two-column frame
avg_sentiment = predictions_df.groupby("stars")["sentiment_value"].mean().reset_index()

fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(x="stars", y="sentiment_value", data=avg_sentiment, marker="o", ax=ax)
ax.set_title("Average Sentiment Value by Star Rating")
ax.set_xlabel("Stars")
ax.set_ylabel("Average Sentiment Value")
plt.show()


In [None]:
# Keep only rows whose prediction is one of the two expected labels
has_binary_label = predictions_df["prediction"].isin(["POSITIVE", "NEGATIVE"])
binary_df = predictions_df[has_binary_label]

# --- Box Plot: Distribution of Star Ratings by Sentiment ---
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x="prediction", y="stars", data=binary_df, ax=ax)
ax.set_title("Star Rating Distribution by Sentiment Prediction [Pre-Covid]")
ax.set_xlabel("Sentiment Prediction")
ax.set_ylabel("Star Rating")
plt.show()

# --- Bar Plot: Average Star Rating for Each Sentiment ---
# Mean star rating per predicted label
avg_stars = binary_df.groupby("prediction")["stars"].mean().reset_index()

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x="prediction", y="stars", data=avg_stars, ax=ax)
ax.set_title("Average Star Rating for Positive vs Negative Reviews")
ax.set_xlabel("Sentiment Prediction")
ax.set_ylabel("Average Star Rating")
plt.show()

In [None]:
# Scatter plot with regression fit for each sentiment label.
# sns.lmplot is a figure-level function that creates its own figure, so no
# plt.figure() call precedes it (that would only emit an extra, empty figure).
sns.lmplot(x="stars", y="sentiment_value", hue="prediction", data=binary_df,
           markers=["o", "x"], aspect=1.5, ci=None)
# The pyplot calls below act on the current axes, i.e. the plot lmplot just drew.
plt.title("Relationship Between Star Rating and Sentiment Value")
plt.xlabel("Star Rating")
plt.ylabel("Sentiment Value")
plt.show()


In [None]:
# Split violins: for each star rating, one half per predicted label
fig, ax = plt.subplots(figsize=(12, 6))
sns.violinplot(
    x="stars",
    y="sentiment_value",
    hue="prediction",
    data=binary_df,
    split=True,
    ax=ax,
)
ax.set_title("Distribution of Sentiment Values by Star Rating")
ax.set_xlabel("Star Rating")
ax.set_ylabel("Sentiment Value")
plt.show()


In [None]:
# Box plot of the signed sentiment value per star rating, grouped by predicted label
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    x="stars",
    y="sentiment_value",
    hue="prediction",
    data=binary_df,
    ax=ax,
)
ax.set_title("Box Plot of Sentiment Value by Star Rating and Sentiment")
ax.set_xlabel("Star Rating")
ax.set_ylabel("Sentiment Value")
plt.show()


# Aggregate number of reviews for binary classification

In [None]:
# Load the saved prediction tables for both periods
pre_covid_path = "./pre_covid/sentiment_predictions.csv"
post_covid_path = "./post_covid/sentiment_predictions.csv"
pre_covid = pd.read_csv(pre_covid_path)
post_covid = pd.read_csv(post_covid_path)

# Report (rows, columns) of each table, pre-Covid first
for period_df in (pre_covid, post_covid):
    print(period_df.shape)
post_covid.head()

In [None]:
# Reviews per business in each period; the length of each result is the
# number of distinct business IDs.
pre_businesses = pre_covid['business_id'].value_counts()
post_businesses = post_covid['business_id'].value_counts()
for per_business_counts in (pre_businesses, post_businesses):
    print(len(per_business_counts))

In [None]:
# Flag each review by its predicted label, then sum the flags per business.
# numeric_only=True restricts the sum to numeric/boolean columns; without it,
# recent pandas versions also try to "sum" text columns such as the review
# body, which concatenates strings (slow, memory-hungry) or raises.
pre_covid['is_positive'] = pre_covid['prediction'] == 'POSITIVE'
pre_covid['is_negative'] = pre_covid['prediction'] == 'NEGATIVE'
pre_grouped = pre_covid.groupby("business_id").sum(numeric_only=True)

post_covid['is_positive'] = post_covid['prediction'] == 'POSITIVE'
post_covid['is_negative'] = post_covid['prediction'] == 'NEGATIVE'
post_grouped = post_covid.groupby("business_id").sum(numeric_only=True)

In [None]:
# Keep only the per-business label counts and give them period-specific names
flag_columns = ['is_positive', 'is_negative']
pre_covid_agg = pre_grouped[flag_columns].rename(
    columns={'is_positive': 'pre_cov_num_pos', 'is_negative': 'pre_cov_num_neg'}
)
post_covid_agg = post_grouped[flag_columns].rename(
    columns={'is_positive': 'post_cov_num_pos', 'is_negative': 'post_cov_num_neg'}
)
print(pre_covid_agg.shape, post_covid_agg.shape)
pre_covid_agg.head()

In [None]:
# Outer merge keeps businesses that appear in only one period; their missing
# counts come back as NaN, which turns the count columns into floats.
# Fill those gaps with 0 and cast back to integers so counts stay whole numbers.
merged = (
    pre_covid_agg
    .merge(post_covid_agg, on='business_id', how='outer')
    .fillna(0)
    .astype(int)
)
print(merged.shape)
merged

In [None]:
# Totals across both periods: positives, negatives, and their sum
merged['total_pos_reviews'] = merged['pre_cov_num_pos'].add(merged['post_cov_num_pos'])
merged['total_neg_reviews'] = merged['pre_cov_num_neg'].add(merged['post_cov_num_neg'])
merged['total_num_reviews'] = merged['total_pos_reviews'].add(merged['total_neg_reviews'])
merged.head()

In [None]:
# Write the per-business review counts to disk. No index=False is passed, so
# the frame's index is written as well (presumably business_id — verify).
merged.to_csv("business_review_counts.csv")