In [None]:
# %% [markdown]
# # Data Exploration
# 
# This notebook explores the dataset used for the ML pipeline.

# %%
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification

# %%
# Build a synthetic two-class dataset. The fixed random_state makes the
# generated values identical on every run.
X, y = make_classification(
    n_samples=1000,
    n_features=5,
    n_classes=2,
    random_state=42,
)
# Columns are named feature_0 .. feature_{k-1}, one per column of X; the
# class labels are appended as the last column, 'target'.
df = pd.DataFrame(
    X,
    columns=[f'feature_{idx}' for idx in range(X.shape[1])],
).assign(target=y)

# %%
# Quick orientation: frame dimensions, then a preview of the leading rows.
print("Dataset shape:", df.shape)
# One print call with a newline separator emits the heading and the preview
# on consecutive lines.
print("\nFirst 5 rows:", df.head(5), sep="\n")

# %%
# Count of null entries per column (heading first, counts underneath).
print("Missing values:", df.isna().sum(), sep="\n")

# %%
# Descriptive statistics for every numeric column, printed under a heading.
print("Statistical summary:", df.describe(), sep="\n")

# %%
# Distribution of target variable
# Bar chart of how many rows fall into each target class; the figure is
# written to ../plots (relative to the working directory) and then shown.
# savefig does not create missing directories, so make sure the output
# folder exists before writing.
os.makedirs('../plots', exist_ok=True)
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='target', data=df, ax=ax)
ax.set_title('Distribution of Target Variable')
fig.savefig('../plots/target_distribution.png')
plt.show()

# %%
# Correlation matrix
# Pairwise correlations between all columns of df (features and target),
# drawn as an annotated heatmap with the colour scale centred on zero.
# savefig does not create missing directories, so make sure the output
# folder exists before writing.
os.makedirs('../plots', exist_ok=True)
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix')
fig.savefig('../plots/correlation_matrix.png')
plt.show()

# %%
# Feature distributions
# One 20-bin histogram per feature column; the target column is left out.
# savefig does not create missing directories, so make sure the output
# folder exists before writing.
os.makedirs('../plots', exist_ok=True)
# DataFrame.hist opens its own figure, which becomes the current figure that
# suptitle/savefig below operate on.
df.drop(columns='target').hist(bins=20, figsize=(12, 10))
plt.suptitle('Feature Distributions')
plt.savefig('../plots/feature_distributions.png')
plt.show()