In [2]:
# Import necessary libraries for data manipulation and causal inference.
import pandas as pd
import numpy as np
from scipy import stats

# Rationale: This notebook simulates a causal inference study to evaluate the impact of the new "GenShuffle" feature.
# Since we cannot run a perfect A/B test on all users, this method allows us to approximate the effect
# of the new feature by comparing a "treatment" group (users exposed to the AI)
# with a similar "control" group (users not yet exposed).

# --- Section 1: Data Loading and Preprocessing ---
# We will use the same mock data files generated in Phase 1 to ensure a consistent analysis.
try:
    user_behavior_df = pd.read_csv('user_behavior.csv')
    qualitative_ratings_df = pd.read_csv('qualitative_ratings.csv')
    print("✅ Data loaded successfully.")
except FileNotFoundError:
    print("Error: Please ensure 'user_behavior.csv' and 'qualitative_ratings.csv' are uploaded to the Colab environment.")
    raise

# --- Section 2: Simulating Causal Analysis with T-Test ---
# Method: We will identify a group of users who were "treated" (exposed to the AI feature)
# and a control group. We then compare the average stream duration between these two groups
# using a statistical test (a t-test).

# Let's assume users with an even-numbered ID were a test group for the AI feature.
# This simulates a feature rollout to a specific segment of users.
user_behavior_df['exposed_to_ai_feature'] = user_behavior_df['user_id'].apply(lambda x: 1 if int(x.split('_')[1]) % 2 == 0 else 0)

# Isolate the stream durations for the control and treatment groups.
treatment_group_durations = user_behavior_df[user_behavior_df['exposed_to_ai_feature'] == 1]['stream_duration_sec']
control_group_durations = user_behavior_df[user_behavior_df['exposed_to_ai_feature'] == 0]['stream_duration_sec']

print("\n--- Causal Impact Simulation: T-Test Analysis ---")
print(f"Average stream duration for AI-exposed users: {treatment_group_durations.mean():.2f} seconds")
print(f"Average stream duration for non-AI users: {control_group_durations.mean():.2f} seconds")

# Perform an independent t-test to check for a statistically significant difference.
t_stat, p_value = stats.ttest_ind(treatment_group_durations, control_group_durations, equal_var=False)

print(f"\nIndependent T-test Results:")
print(f"T-statistic: {t_stat:.4f}")
print(f"P-value: {p_value:.4f}")

# Interpretation for stakeholders:
alpha = 0.05
if p_value < alpha:
    print(f"Conclusion: The p-value ({p_value:.4f}) is less than the significance level (0.05).")
    print("This suggests a statistically significant difference in stream duration between the two user groups.")
    print("The new 'GenShuffle' feature likely had a measurable impact on user engagement.")
else:
    print(f"Conclusion: The p-value ({p_value:.4f}) is greater than the significance level (0.05).")
    print("There is no statistically significant evidence to conclude a difference in stream duration.")
    print("Further analysis or a longer test period may be needed to detect an impact.")

# --- Section 3: LLM-as-a-Judge Evaluation Simulation ---
# Method: We will use the 'qualitative_ratings' data to simulate a common technique in AI evaluation.
# We'll compare the simulated ratings of a human expert with a Large Language Model used as a judge.
# The goal is to see if our LLM judge's ratings are consistent with human judgment.

print("\n--- LLM-as-a-Judge Simulation: Comparing Human vs. AI Ratings ---")

# We will measure the consistency using a simple correlation coefficient.
correlation, _ = stats.pearsonr(qualitative_ratings_df['human_coherence_rating'], qualitative_ratings_df['llm_coherence_rating'])

print(f"Pearson Correlation between Human and LLM ratings: {correlation:.4f}")

# Interpretation for stakeholders:
if correlation > 0.8:
    print(f"Conclusion: A high correlation ({correlation:.4f}) indicates that the LLM judge is highly consistent with human ratings.")
    print("This suggests that we can confidently use this LLM for automated quality assessment of commentary.")
elif correlation > 0.5:
    print(f"Conclusion: A moderate correlation ({correlation:.4f}) suggests some alignment, but the LLM judge may require further tuning.")
else:
    print(f"Conclusion: A low correlation ({correlation:.4f}) suggests a significant disconnect between the LLM and human judgment.")
    print("We should not use the LLM as a judge until its rating methodology is improved.")

✅ Data loaded successfully.

--- Causal Impact Simulation: T-Test Analysis ---
Average stream duration for AI-exposed users: 134.66 seconds
Average stream duration for non-AI users: 134.16 seconds

Independent T-test Results:
T-statistic: 2.0628
P-value: 0.0391
Conclusion: The p-value (0.0391) is less than the significance level (0.05).
This suggests a statistically significant difference in stream duration between the two user groups.
The new 'GenShuffle' feature likely had a measurable impact on user engagement.

--- LLM-as-a-Judge Simulation: Comparing Human vs. AI Ratings ---
Pearson Correlation between Human and LLM ratings: -0.9682
Conclusion: A low correlation (-0.9682) suggests a significant disconnect between the LLM and human judgment.
We should not use the LLM as a judge until its rating methodology is improved.
