In [None]:
# ===== UNIVARIATE DISTRIBUTION PLOTS =====
# Draws one histogram per key variable on a 2x3 grid, annotates every panel
# with the feature's mean and standard deviation, and saves the figure as PNG.
print("\n" + "="*60)
print("UNIVARIATE DISTRIBUTION ANALYSIS - CREDIT CARD")
print("="*60)

# Create univariate plots for key variables
fig, axes = plt.subplots(2, 3, figsize=(16, 10))
axes = axes.flatten()

# Plot key features
features_to_plot = ['Amount', 'Amount_log', 'V1', 'V2', 'V14', 'V17']

for idx, feature in enumerate(features_to_plot):
    ax = axes[idx]

    if feature not in cc_features.columns:
        # Hide the panel rather than leaving an empty frame in the saved figure
        ax.set_visible(False)
        continue

    # Select the column once; it feeds both the histogram and the statistics
    values = cc_features[feature]

    # Plot distribution (missing values are dropped before binning)
    ax.hist(values.dropna(), bins=50, alpha=0.7,
            color='#2E75B6', edgecolor='black')
    ax.set_title(f'{feature} Distribution', fontsize=11, fontweight='bold')
    ax.set_xlabel(feature, fontsize=9)
    ax.set_ylabel('Frequency', fontsize=9)
    ax.grid(True, alpha=0.3)

    # Add statistics in the top-left corner, positioned in axes coordinates
    stats_text = f'Mean: {values.mean():.4f}\nStd: {values.std():.4f}'
    ax.text(0.02, 0.98, stats_text, transform=ax.transAxes,
            fontsize=8, verticalalignment='top',
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

plt.suptitle('Univariate Distribution of Key Variables (Credit Card)',
             fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('../results/univariate_distributions_creditcard.png', dpi=300, bbox_inches='tight')
plt.show()

# ===== BIVARIATE TARGET-VS-FEATURE ANALYSIS =====
# Draws one violin plot per selected PCA feature, split by the 'Class' column
# (0 = non-fraud, 1 = fraud), annotates each panel with the per-class means,
# and saves the figure as PNG.
print("\n" + "="*60)
print("BIVARIATE ANALYSIS: FRAUD VS NON-FRAUD - CREDIT CARD")
print("="*60)

# Select top PCA features for analysis
top_pca_features = ['V14', 'V17', 'V12', 'V10', 'V16', 'V3']

# Create violin plots for comparison
fig, axes = plt.subplots(2, 3, figsize=(16, 10))
axes = axes.flatten()

for idx, feature in enumerate(top_pca_features):
    ax = axes[idx]

    if feature not in cc_features.columns:
        # Hide the panel rather than leaving an empty frame in the saved figure
        ax.set_visible(False)
        continue

    # Create violin plot
    # NOTE(review): seaborn >= 0.13 emits a FutureWarning when `palette` is
    # passed without `hue`; migrating to hue='Class' with legend=False needs
    # the project's minimum seaborn version to be confirmed first.
    sns.violinplot(x='Class', y=feature, data=cc_features, ax=ax,
                   palette={0: '#2E75B6', 1: '#C00000'}, inner='quartile')

    ax.set_title(f'{feature} by Fraud Status', fontsize=11, fontweight='bold')
    ax.set_xlabel('Fraud (1) / Non-Fraud (0)', fontsize=9)
    ax.set_ylabel(feature, fontsize=9)
    ax.grid(True, alpha=0.3, axis='y')

    # Calculate and display statistics; the class column is read once and
    # rows are selected with .loc instead of filtering the whole frame twice
    class_labels = cc_features['Class']
    fraud_mean = cc_features.loc[class_labels == 1, feature].mean()
    non_fraud_mean = cc_features.loc[class_labels == 0, feature].mean()

    stats_text = f'Non-Fraud Mean: {non_fraud_mean:.4f}\nFraud Mean: {fraud_mean:.4f}'
    ax.text(0.02, 0.98, stats_text, transform=ax.transAxes,
            fontsize=8, verticalalignment='top',
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

plt.suptitle('Bivariate Analysis: PCA Features by Fraud Status (Credit Card)',
             fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('../results/bivariate_analysis_creditcard.png', dpi=300, bbox_inches='tight')
plt.show()

# ===== DISTRIBUTION COMPARISON FOR KEY FEATURES =====
# For every key feature draws one row of two panels: a kernel density
# comparison (left) and an empirical CDF comparison (right) of the fraud
# (Class == 1) and non-fraud (Class == 0) observations, then saves the figure.
print("\n" + "="*60)
print("DISTRIBUTION COMPARISON: FRAUD VS NON-FRAUD")
print("="*60)

# Imported once here instead of on every loop iteration
from statsmodels.distributions.empirical_distribution import ECDF

# Select key features for detailed comparison
key_features = ['V14', 'V17', 'Amount_log']

# squeeze=False keeps `axes` two-dimensional even when only one feature is
# listed, so the axes[row, col] indexing below always works
fig, axes = plt.subplots(len(key_features), 2, figsize=(14, 4*len(key_features)),
                         squeeze=False)

for idx, feature in enumerate(key_features):
    density_ax, ecdf_ax = axes[idx, 0], axes[idx, 1]

    if feature not in cc_features.columns:
        # Hide the whole row rather than leaving two empty frames in the figure
        density_ax.set_visible(False)
        ecdf_ax.set_visible(False)
        continue

    # Split the feature by class once. Missing values are dropped so the
    # density and the ECDF are computed over the same observed values;
    # statsmodels' ECDF does no NaN filtering of its own.
    non_fraud_values = cc_features.loc[cc_features['Class'] == 0, feature].dropna()
    fraud_values = cc_features.loc[cc_features['Class'] == 1, feature].dropna()

    # Left: Density plot comparison
    sns.kdeplot(data=non_fraud_values,
                ax=density_ax, label='Non-Fraud', color='#2E75B6', fill=True, alpha=0.5)
    sns.kdeplot(data=fraud_values,
                ax=density_ax, label='Fraud', color='#C00000', fill=True, alpha=0.5)
    density_ax.set_title(f'{feature} - Density Comparison', fontsize=11, fontweight='bold')
    density_ax.set_xlabel(feature, fontsize=9)
    density_ax.set_ylabel('Density', fontsize=9)
    density_ax.legend()
    density_ax.grid(True, alpha=0.3)

    # Right: ECDF plot
    # Calculate ECDF for both classes
    ecdf_non_fraud = ECDF(non_fraud_values)
    ecdf_fraud = ECDF(fraud_values)

    # Evaluate both ECDFs on the same grid: every observed value, ascending
    x_sorted = np.sort(np.concatenate([non_fraud_values, fraud_values]))

    ecdf_ax.plot(x_sorted, ecdf_non_fraud(x_sorted),
                 color='#2E75B6', label='Non-Fraud', linewidth=2)
    ecdf_ax.plot(x_sorted, ecdf_fraud(x_sorted),
                 color='#C00000', label='Fraud', linewidth=2)
    ecdf_ax.set_title(f'{feature} - ECDF Comparison', fontsize=11, fontweight='bold')
    ecdf_ax.set_xlabel(feature, fontsize=9)
    ecdf_ax.set_ylabel('Cumulative Probability', fontsize=9)
    ecdf_ax.legend()
    ecdf_ax.grid(True, alpha=0.3)

plt.suptitle('Detailed Distribution Comparison: Fraud vs Non-Fraud (Credit Card)',
             fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('../results/distribution_comparison_creditcard.png', dpi=300, bbox_inches='tight')
plt.show()

# ===== STATISTICAL SIGNIFICANCE TESTING =====
# For every numeric feature except 'Class', compares fraud (Class == 1) with
# non-fraud (Class == 0) observations using a two-sided Mann-Whitney U test,
# Welch's t-test and Cohen's d, prints the ten rows with the smallest
# Mann-Whitney p-values and writes the full table to CSV.
print("\n" + "="*60)
print("STATISTICAL SIGNIFICANCE TESTING")
print("="*60)

from scipy.stats import mannwhitneyu, ttest_ind
import pandas as pd

# Single definition of the output location, used for both saving and reporting
results_path = '../results/statistical_significance_creditcard.csv'

# Explicit column list so the table keeps its schema even when no feature
# qualifies; otherwise sorting an empty, column-less frame raises KeyError
result_columns = [
    'Feature', 'Fraud_Mean', 'Non_Fraud_Mean', 'Mean_Difference',
    'MW_p_value', 'TT_p_value', 'Cohens_d', 'Significant',
]

# Create significance test results table
significance_results = []

# Class masks do not depend on the feature, so build them once
is_fraud = cc_features['Class'] == 1
is_non_fraud = cc_features['Class'] == 0

for feature in cc_features.select_dtypes(include=[np.number]).columns:
    if feature == 'Class':
        continue

    fraud_data = cc_features.loc[is_fraud, feature].dropna()
    non_fraud_data = cc_features.loc[is_non_fraud, feature].dropna()

    # Both groups need at least two observations for the tests and for std()
    if len(fraud_data) <= 1 or len(non_fraud_data) <= 1:
        continue

    # Mann-Whitney U test (non-parametric)
    stat_mw, p_mw = mannwhitneyu(fraud_data, non_fraud_data, alternative='two-sided')

    # T-test (parametric); equal_var=False selects Welch's unequal-variance form
    stat_tt, p_tt = ttest_ind(fraud_data, non_fraud_data, equal_var=False)

    # Calculate effect size (Cohen's d)
    # NOTE(review): the denominator is the unweighted average of the two
    # variances; with very different group sizes the sample-size-weighted
    # pooled standard deviation gives a different value. Confirm which is intended.
    fraud_mean = fraud_data.mean()
    non_fraud_mean = non_fraud_data.mean()
    mean_diff = fraud_mean - non_fraud_mean
    pooled_std = np.sqrt((fraud_data.std()**2 + non_fraud_data.std()**2) / 2)
    cohens_d = mean_diff / pooled_std if pooled_std != 0 else 0

    significance_results.append({
        'Feature': feature,
        'Fraud_Mean': fraud_mean,
        'Non_Fraud_Mean': non_fraud_mean,
        'Mean_Difference': mean_diff,
        'MW_p_value': p_mw,
        'TT_p_value': p_tt,
        'Cohens_d': cohens_d,
        # NOTE(review): per-feature 0.05 threshold, no multiple-testing correction
        'Significant': p_mw < 0.05
    })

# Convert to DataFrame and sort by significance
significance_df = pd.DataFrame(significance_results, columns=result_columns)
significance_df = significance_df.sort_values('MW_p_value')

print("\nStatistical Significance Test Results (Top 10 Most Significant):")
print("-" * 100)
print(significance_df.head(10).to_string(index=False))

# Save results to CSV
significance_df.to_csv(results_path, index=False)
print(f"\nFull results saved to: {results_path}")