In [None]:
# EDA + Preprocessing Starter Code

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw bank-marketing table into a DataFrame
df = pd.read_csv("../data/bank_marketing.csv")  # Update path if needed

# Structural summary: dimensions, column index, per-column null counts,
# and how the target labels are balanced
print("Shape:", df.shape)
print("\nColumns:\n", df.columns)
missing_per_column = df.isnull().sum()
print("\nMissing values:\n", missing_per_column)
label_counts = df['y'].value_counts()
print("\nClass distribution:\n", label_counts)

# Render the first rows (display is the notebook-provided helper)
display(df.head())

# Recode the target column: 'yes' -> 1, 'no' -> 0.
# Series.map leaves NaN for any value that is not a key of the mapping.
target_codes = {'yes': 1, 'no': 0}
df['y'] = df['y'].map(target_codes)

# Partition column names by dtype. This runs after the recode above,
# so 'y' is classified by its mapped dtype rather than its original one.
object_columns = df.select_dtypes(include='object').columns
numeric_columns = df.select_dtypes(include=np.number).columns
categorical = object_columns.tolist()
numerical = numeric_columns.tolist()

print("\nCategorical Columns:", categorical)
print("\nNumerical Columns:", numerical)

# --- Visualization ---
# Target variable distribution (bar per class of 'y')
sns.countplot(x='y', data=df)
plt.title("Target Variable Distribution")
plt.show()

# Categorical variable impact: one count plot per categorical column,
# with bars split by the target value
for col in categorical:
    plt.figure(figsize=(6, 3))
    sns.countplot(x=col, hue='y', data=df)
    plt.title(f"Target vs {col}")
    plt.xticks(rotation=45)  # category labels can be long; tilt to avoid overlap
    plt.tight_layout()
    plt.show()

# Correlation heatmap
# Correlate only the numeric columns. Any string columns in df are still
# unencoded at this point, and DataFrame.corr() on pandas >= 2.0 raises on
# non-numeric data instead of silently dropping it.
plt.figure(figsize=(10, 8))
sns.heatmap(df[numerical].corr(), annot=True, fmt=".2f", cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()

# --- Basic Preprocessing ---
from sklearn.preprocessing import LabelEncoder

# Encode categorical features on a copy so the original frame is untouched
df_encoded = df.copy()

# Fit a separate encoder for every column and keep each one. Re-fitting a
# single shared encoder replaces its learned classes on every iteration, so
# only the last column's mapping would remain available for inverse_transform
# or for encoding new data consistently.
label_encoders = {}
for col in categorical:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le

# Final processed data
print("\nEncoded Data Preview:\n")
display(df_encoded.head())

# Save processed version (index omitted so the CSV holds only data columns)
df_encoded.to_csv("../data/processed_marketing.csv", index=False)
