In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

# --- Configuration ---
CLEAN_DATA_PATH = r"D:\Intelligent Review Rating Prediction Using AI and LLMs\data\clean_combined_data.csv"
DATA_LIMIT = 200000 # Use the same downsampled size as the final model
RANDOM_SEED = 42
EDA_OUTPUT_FOLDER = 'eda_charts_advanced'

# --- Setup ---
if not os.path.exists(EDA_OUTPUT_FOLDER):
    os.makedirs(EDA_OUTPUT_FOLDER)

# --- 1. Load and Sample Data (Matching Training Setup) ---
print("--- 1. Loading and Sampling Data ---")
try:
    df = pd.read_csv(CLEAN_DATA_PATH)
    
    # Downsample the data to match the training size
    if len(df) > DATA_LIMIT:
        df = df.sample(n=DATA_LIMIT, random_state=RANDOM_SEED).reset_index(drop=True)
        print(f"Successfully downsampled to {DATA_LIMIT} rows for analysis.")
    
    # --- FIX: Calculate User Frequency and Log Frequency (MISSING COLUMNS) ---
    # The 'user_frequency' column was not saved in the clean CSV, so we recalculate it.
    user_counts = df['user_id'].value_counts()
    df['user_frequency'] = df['user_id'].map(user_counts)
    
    # Create the 'review_length' feature for analysis
    df['review_length'] = df['consolidated_text'].apply(lambda x: len(str(x).split()))
    
    # Log transform user frequency for correlation analysis (used in plotting)
    df['user_frequency_log'] = np.log1p(df['user_frequency'])
    
    print(f"Data ready. Shape: {df.shape}")

except FileNotFoundError:
    print(f"Error: Clean data file not found at {CLEAN_DATA_PATH}.")
    exit()

# --- 2. Advanced Distribution Analysis ---
print("\n--- 2. Advanced Distribution Analysis ---")

## 2.1 Star Rating Distribution
plt.figure(figsize=(8, 5))
sns.countplot(x='rating', data=df, palette='viridis')
plt.title('Distribution of Target Variable (Star Ratings)')
plt.xlabel('Star Rating')
plt.ylabel('Count')
plt.grid(axis='y', alpha=0.5)
plt.savefig(os.path.join(EDA_OUTPUT_FOLDER, '01_rating_distribution.png'))
plt.close()
print("Saved 01_rating_distribution.png")

## 2.2 Review Length vs. Rating
# Reviews tend to be longer for 1-star (detailed complaints) and 5-star (enthusiastic praise).
plt.figure(figsize=(10, 6))
# Showfliers=False removes extreme outliers for a cleaner look
sns.boxplot(x='rating', y='review_length', data=df, showfliers=False, palette='coolwarm') 
plt.title('Review Length Distribution by Star Rating (Outliers Removed)')
plt.xlabel('Star Rating')
plt.ylabel('Review Length (Word Count)')
plt.savefig(os.path.join(EDA_OUTPUT_FOLDER, '02_length_vs_rating_boxplot.png'))
plt.close()
print("Saved 02_length_vs_rating_boxplot.png")

## 2.3 User Activity (Frequency) Distribution
plt.figure(figsize=(8, 5))
# Use the newly calculated 'user_frequency' column
sns.histplot(df['user_frequency'], bins=50, log_scale=True, kde=True, color='skyblue')
plt.title('Distribution of User Review Frequency (Log Scale)')
plt.xlabel('Review Count per User')
plt.ylabel('Number of Users')
plt.savefig(os.path.join(EDA_OUTPUT_FOLDER, '03_user_frequency.png'))
plt.close()
print("Saved 03_user_frequency.png")


# --- 3. Feature Correlation Analysis ---
print("\n--- 3. Feature Correlation Analysis ---")

## 3.1 Correlation Matrix of Numerical Features
# Correlation between the target (rating) and the engineered features (log-transformed frequency, length, avg_rating).
numerical_features = df[['rating', 'average_rating', 'user_frequency_log', 'review_length']].copy()
correlation_matrix = numerical_features.corr()

plt.figure(figsize=(7, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5, linecolor='black')
plt.title('Correlation Matrix of Numerical Features')
plt.savefig(os.path.join(EDA_OUTPUT_FOLDER, '04_correlation_matrix.png'))
plt.close()
print("Saved 04_correlation_matrix.png")

# Key Takeaway
print("\nKey Feature Correlation Summary:")
print(f"Correlation (Rating vs. Avg. Rating): {correlation_matrix.loc['rating', 'average_rating']:.2f}")
print(f"Correlation (Rating vs. Log User Frequency): {correlation_matrix.loc['rating', 'user_frequency_log']:.2f}")

print("\nAdvanced EDA Complete. Charts saved in the 'eda_charts_advanced' folder.")


--- 1. Loading and Sampling Data ---
Successfully downsampled to 200000 rows for analysis.
Data ready. Shape: (200000, 9)

--- 2. Advanced Distribution Analysis ---



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x='rating', data=df, palette='viridis')


Saved 01_rating_distribution.png



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='rating', y='review_length', data=df, showfliers=False, palette='coolwarm')


Saved 02_length_vs_rating_boxplot.png
Saved 03_user_frequency.png

--- 3. Feature Correlation Analysis ---
Saved 04_correlation_matrix.png

Key Feature Correlation Summary:
Correlation (Rating vs. Avg. Rating): 0.36
Correlation (Rating vs. Log User Frequency): 0.02

Advanced EDA Complete. Charts saved in the 'eda_charts_advanced' folder.
