In [None]:
# Target variable distribution
# Drop missing values once up front. matplotlib's boxplot does not skip NaN
# (the box statistics become NaN and nothing is drawn), and the per-feature
# histograms in this notebook already plot `.dropna()`-ed data.
target_values = df[target_col].dropna()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Histogram
axes[0].hist(target_values, bins=50, edgecolor='black', alpha=0.7)
axes[0].set_xlabel(target_col, fontsize=12)
axes[0].set_ylabel('Frequency', fontsize=12)
axes[0].set_title(f'Distribution of {target_col}', fontsize=14, fontweight='bold')
axes[0].grid(True, alpha=0.3)

# Boxplot
axes[1].boxplot(target_values)
axes[1].set_ylabel(target_col, fontsize=12)
axes[1].set_title(f'Boxplot of {target_col}', fontsize=14, fontweight='bold')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# Distribution of all numeric features
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# Only the first 20 numeric features are plotted, on a grid 4 panels wide.
max_features = 20
plot_cols = numeric_cols[:max_features]

n_cols = 4
# Ceil-divide to get the number of rows. Keep at least one row so that
# plt.subplots() is never asked for a zero-row grid (which it rejects) when
# the frame has no numeric columns.
n_rows = max(1, (len(plot_cols) + n_cols - 1) // n_cols)

# squeeze=False always returns a 2-D array of Axes, so flatten() yields one
# Axes per panel even for a single-row grid. (Wrapping the 1-D array returned
# for a single row in a list would make axes[0] an array, not an Axes.)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, n_rows*3), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, plot_cols):
    ax.hist(df[col].dropna(), bins=30, edgecolor='black', alpha=0.7)
    ax.set_title(col, fontsize=10, fontweight='bold')
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    ax.grid(True, alpha=0.3)

# Hide unused subplots
for ax in axes[len(plot_cols):]:
    ax.set_visible(False)

plt.suptitle('Distribution of Numeric Features', fontsize=16, fontweight='bold', y=1.00)
plt.tight_layout()
plt.show()


## 5. Correlation Analysis


In [None]:
# Correlation matrix
# Select numeric/boolean columns explicitly before correlating. Since
# pandas 2.0 `DataFrame.corr()` no longer drops non-numeric columns by
# default and raises if the frame contains e.g. string columns.
correlation_matrix = df.select_dtypes(include=["number", "bool"]).corr()

# Plot correlation heatmap
fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', center=0,
            square=True, linewidths=0.5, cbar_kws={"shrink": 0.8}, ax=ax)
ax.set_title('Correlation Heatmap of All Features', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()


In [None]:
# Correlation with target variable
target_correlations = correlation_matrix[target_col].drop(target_col).sort_values(ascending=False)

# sort_values() places NaN entries last, so features whose correlation is
# undefined would otherwise be listed under "Bottom 20 ... Negative
# Correlation". Drop them and report how many were removed.
undefined_count = int(target_correlations.isna().sum())
target_correlations = target_correlations.dropna()

print(f"Top 20 Features Most Correlated with {target_col}:")
print("=" * 60)
print(target_correlations.head(20))

print(f"\nBottom 20 Features (Negative Correlation with {target_col}):")
print("=" * 60)
print(target_correlations.tail(20))

if undefined_count:
    print(f"\nNote: {undefined_count} feature(s) with undefined (NaN) correlation were omitted.")


In [None]:
# Visualize correlations with target
# Drop undefined (NaN) correlations before selecting: an ascending
# sort_values() puts NaN last, so tail(30) would pick those entries instead
# of the 30 largest correlation values.
largest_correlations = target_correlations.dropna().sort_values().tail(30)

fig, ax = plt.subplots(figsize=(10, 12))
largest_correlations.plot(kind='barh', ax=ax)
ax.set_xlabel('Correlation Coefficient', fontsize=12)
ax.set_ylabel('Features', fontsize=12)
ax.set_title(f'Top 30 Feature Correlations with {target_col}', fontsize=14, fontweight='bold')
# Reference line separating negative from positive correlations
ax.axvline(x=0, color='black', linestyle='--', linewidth=0.8)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()


In [None]:
# Rank features by the magnitude of their correlation with the target and
# keep the strongest `top_n`.
top_n = 15
abs_ranked = target_correlations.abs().sort_values(ascending=False)
top_features = abs_ranked.head(top_n)

print(f"\nTop {top_n} Features by Absolute Correlation:")
print("=" * 60)
# Ranking uses |r|, but the signed coefficient is what gets printed.
for feature in top_features.index:
    signed_r = target_correlations[feature]
    print(f"{feature:.<50} {signed_r:>8.4f}")


## 6. Outlier Detection


In [None]:
# Detect outliers using IQR method
def detect_outliers_iqr(data, column, factor=1.5):
    """Count the rows of ``data`` whose ``column`` value is an IQR outlier.

    A value is an outlier when it lies strictly below ``Q1 - factor * IQR``
    or strictly above ``Q3 + factor * IQR``, where Q1/Q3 are the 25th/75th
    percentiles of the column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column to inspect.
    column : hashable
        Label of the column to inspect.
    factor : float, default 1.5
        Multiplier applied to the IQR to obtain the fence distance.

    Returns
    -------
    tuple of (int, float)
        The number of outlier rows and that number as a percentage of
        ``len(data)``. An empty frame yields ``(0, 0.0)``.

    Notes
    -----
    Comparisons against NaN are False, so missing values are never counted
    as outliers; they are still included in the percentage denominator
    because it is the total row count.
    """
    total_rows = len(data)
    if total_rows == 0:
        # Nothing to inspect; also avoids dividing by zero below.
        return 0, 0.0

    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # Count with a boolean mask rather than building a filtered DataFrame.
    outlier_mask = (values < lower_bound) | (values > upper_bound)
    outlier_count = int(outlier_mask.sum())
    return outlier_count, (outlier_count / total_rows) * 100

print("Outlier Detection (IQR Method):")
print("=" * 60)

# Run the IQR check on the first 20 numeric columns and collect one record
# per column.
checked_cols = numeric_cols[:20]
iqr_results = [detect_outliers_iqr(df, name) for name in checked_cols]
outlier_summary = [
    {'Feature': name, 'Outlier Count': n_outliers, 'Percentage': pct}
    for name, (n_outliers, pct) in zip(checked_cols, iqr_results)
]

# Columns with the most outliers first; show the 15 worst.
outlier_df = pd.DataFrame(outlier_summary)
outlier_df = outlier_df.sort_values('Outlier Count', ascending=False)
print(outlier_df.head(15))


## 7. Scatter Plots - Top Features vs Target


In [None]:
# Scatter plots for top 6 correlated features
# Features are ranked by absolute correlation with the target.
top_6_features = target_correlations.abs().sort_values(ascending=False).head(6).index

fig, axes = plt.subplots(2, 3, figsize=(16, 10))
axes = axes.flatten()

for ax, feature in zip(axes, top_6_features):
    ax.scatter(df[feature], df[target_col], alpha=0.5, s=10)
    ax.set_xlabel(feature, fontsize=10)
    ax.set_ylabel(target_col, fontsize=10)
    # Panel title carries the signed correlation coefficient
    ax.set_title(f'{feature} vs {target_col}\n(r = {target_correlations[feature]:.3f})',
                 fontsize=11, fontweight='bold')
    ax.grid(True, alpha=0.3)

# Hide panels left empty when fewer than six features are available
for ax in axes[len(top_6_features):]:
    ax.set_visible(False)

plt.suptitle('Top 6 Features vs Target Variable', fontsize=16, fontweight='bold', y=1.00)
plt.tight_layout()
plt.show()


## 8. Feature Selection Recommendations


In [None]:
# Keep every feature whose absolute correlation with the target exceeds the
# threshold.
correlation_threshold = 0.1

passes_threshold = target_correlations.abs() > correlation_threshold
recommended_features = target_correlations[passes_threshold]

report_lines = [
    "\nFeature Selection Recommendations:",
    "=" * 60,
    f"Total features: {len(df.columns) - 1}",
    f"Recommended features (|correlation| > {correlation_threshold}): {len(recommended_features)}",
    "\nRecommended features list:",
]
# List at most the first 30 recommended features, in the order they appear
# in `recommended_features`.
report_lines.extend(
    f"  - {feature} (r = {recommended_features[feature]:.4f})"
    for feature in recommended_features.index[:30]
)
print("\n".join(report_lines))


## 9. Data Split Analysis (70% Train / 30% Test)


In [None]:
# Positional train/test split: the leading `split_ratio` share of rows is the
# training set and the remaining rows are the test set (row order is kept).
split_ratio = 0.7
n_total = len(df)
split_index = int(n_total * split_ratio)

train_data, test_data = df.iloc[:split_index], df.iloc[split_index:]
n_train, n_test = len(train_data), len(test_data)

print("Train-Test Split Analysis:")
print("=" * 60)
print(f"Total samples: {n_total}")
print(f"Training samples: {n_train} ({n_train/n_total*100:.1f}%)")
print(f"Test samples: {n_test} ({n_test/n_total*100:.1f}%)")

# Compare the target's mean and spread between the two partitions.
train_target = train_data[target_col]
test_target = test_data[target_col]
print("\nTarget variable statistics:")
print(f"Train mean: {train_target.mean():.4f}")
print(f"Test mean: {test_target.mean():.4f}")
print(f"Train std: {train_target.std():.4f}")
print(f"Test std: {test_target.std():.4f}")


In [None]:
# Visualize train-test distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Use one set of bin edges for both histograms. Letting each call pick its
# own 30 bins gives train and test different bin widths, so the overlaid bar
# heights would not be comparable. Missing values are dropped before binning.
target_values = df[target_col].dropna()
shared_bins = np.histogram_bin_edges(target_values, bins=30)

axes[0].hist(train_data[target_col].dropna(), bins=shared_bins, alpha=0.7, label='Train', edgecolor='black')
axes[0].hist(test_data[target_col].dropna(), bins=shared_bins, alpha=0.7, label='Test', edgecolor='black')
axes[0].set_xlabel(target_col, fontsize=12)
axes[0].set_ylabel('Frequency', fontsize=12)
axes[0].set_title('Train vs Test Distribution', fontsize=14, fontweight='bold')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Target plotted against row position, with the split boundary marked.
axes[1].plot(range(len(df)), df[target_col], linewidth=0.5, alpha=0.7)
axes[1].axvline(x=split_index, color='red', linestyle='--', linewidth=2, label='Train/Test Split')
axes[1].set_xlabel('Sample Index', fontsize=12)
axes[1].set_ylabel(target_col, fontsize=12)
axes[1].set_title('Temporal View of Data Split', fontsize=14, fontweight='bold')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


## 10. Summary and Conclusions


In [None]:
# Assemble the closing report as a list of lines and emit it in one call.
rule = "=" * 60
target_series = df[target_col]

summary_lines = [
    "\n" + rule,
    " " * 15 + "EDA SUMMARY",
    rule,
    "\n1. Dataset Overview:",
    f"   - Total samples: {len(df)}",
    f"   - Total features: {len(df.columns) - 1}",
    f"   - Target variable: {target_col}",
    "\n2. Data Quality:",
    f"   - Missing values: {df.isnull().sum().sum()}",
    # `duplicates` is computed in an earlier cell of the notebook
    f"   - Duplicate rows: {duplicates}",
    "\n3. Target Variable:",
    f"   - Mean: {target_series.mean():.4f}",
    f"   - Std: {target_series.std():.4f}",
    f"   - Range: [{target_series.min():.4f}, {target_series.max():.4f}]",
    "\n4. Feature Selection:",
    f"   - Recommended features: {len(recommended_features)}",
    f"   - Correlation threshold: {correlation_threshold}",
    "\n5. Train-Test Split:",
    # Percentages here are the nominal split ratio, not measured sizes
    f"   - Training: {len(train_data)} samples ({split_ratio*100:.0f}%)",
    f"   - Testing: {len(test_data)} samples ({(1-split_ratio)*100:.0f}%)",
    "\n" + rule,
]
print("\n".join(summary_lines))
