In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# --- Load Raw Data (for human-readable analysis) ---
# All paths are relative to the 'notebooks' directory, so '../' climbs to the
# project root before descending into 'data/'.
print("Loading RAW data...")
try:
    raw_df = pd.read_csv('../data/raw/bank-additional-full.csv', sep=';')
except FileNotFoundError as err:
    # Re-raise as FileNotFoundError (still an Exception subclass) so callers can
    # tell a missing file apart from other failures; `from err` keeps the
    # original OS-level error attached as the explicit cause.
    raise FileNotFoundError("RAW data file not found. Please ensure it's in ../data/raw/") from err

# Numeric copy of the target ('yes' -> 1, 'no' -> 0) so group means can be read as rates.
raw_df['y_numeric'] = raw_df['y'].map({'yes': 1, 'no': 0})
print(f"Raw data loaded successfully. Shape: {raw_df.shape}")

# --- Load Processed Data (to see the transformations) ---
print("\nLoading PROCESSED data...")
try:
    X_train_proc = pd.read_csv('../data/processed_target_encoding/X_train_processed.csv')
    y_train_proc = pd.read_csv('../data/processed_target_encoding/y_train.csv')
except FileNotFoundError as err:
    raise FileNotFoundError("PROCESSED data files not found. Please run the pipeline script first.") from err

# Combine features and target column-wise for easier analysis
processed_train_df = pd.concat([X_train_proc, y_train_proc], axis=1)
print(f"Processed training data loaded successfully. Shape: {processed_train_df.shape}")

Loading RAW data...
Raw data loaded successfully. Shape: (41188, 22)

Loading PROCESSED data...
Processed training data loaded successfully. Shape: (32950, 39)


CELL 2: Study 1 - The Impact of Feature Engineering (Before vs. After)
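This study visualizes how our pipeline changed the raw features. We'll look at the distribution of `age` before and after outlier capping and scaling. Note that the raw histogram covers the full dataset, while the processed histogram covers only the training split, so compare the shapes of the two distributions rather than the bar heights.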

In [3]:
# --- Study 1: Visualizing the Impact of Preprocessing ---

print("--- Comparing 'age' distribution before and after processing ---")

# Side-by-side layout: raw age on the left, processed age on the right
panel_titles = (
    '<b>Raw Data</b>: Age Distribution',
    '<b>Processed Data</b>: Scaled Age Distribution',
)
fig = make_subplots(rows=1, cols=2, subplot_titles=panel_titles)

# One (values, trace name) pair per panel, listed in left-to-right column order
age_panels = [
    (raw_df['age'], 'Raw Age'),
    (processed_train_df['num__age'], 'Processed Age'),
]
for col_idx, (age_values, trace_name) in enumerate(age_panels, start=1):
    # Same bin count on both panels so the histogram shapes are comparable
    age_hist = go.Histogram(x=age_values, name=trace_name, nbinsx=30)
    fig.add_trace(age_hist, row=1, col=col_idx)

# Centered title; the legend is hidden because the subplot titles already label each panel
fig.update_layout(
    title_text='<b>Impact of Outlier Capping and StandardScaler on the "age" Feature</b>',
    title_x=0.5,
    showlegend=False
)
fig.show()

--- Comparing 'age' distribution before and after processing ---


CELL 3: Study 2 - Exploring Key Feature Relationships (Raw Data)
This study looks for relationships in the raw data to understand why some features were so predictive. We'll explore the relationship between job, education, and the subscription outcome.

In [4]:
# --- Study 2: Exploring Relationships in the Raw Data ---

print("--- Analyzing Subscription Rate by Job and Education ---")

# Mean of 'y_numeric' per category, sorted so the highest-rate segments come first
job_success_rate = raw_df.groupby('job')['y_numeric'].mean().sort_values(ascending=False)
education_success_rate = raw_df.groupby('education')['y_numeric'].mean().sort_values(ascending=False)

# Create a figure with two subplots
fig = make_subplots(rows=1, cols=2, subplot_titles=('Subscription Rate by <b>Job</b>', 'Subscription Rate by <b>Education</b>'))

# Plot 1: Bar chart of success rate by job
fig.add_trace(
    go.Bar(x=job_success_rate.index, y=job_success_rate.values, name='Job'),
    row=1, col=1
)

# Plot 2: Bar chart of success rate by education
fig.add_trace(
    go.Bar(x=education_success_rate.index, y=education_success_rate.values, name='Education'),
    row=1, col=2
)

fig.update_layout(
    title_text='<b>Which Customer Segments Are Most Likely to Subscribe?</b>',
    title_x=0.5,
    showlegend=False
)
# Format BOTH y-axes as percentages. The layout-level `yaxis=` key only targets
# the first subplot's axis, which left the Education panel showing raw
# fractions; update_yaxes without a row/col selector applies to every subplot.
fig.update_yaxes(tickformat=".0%")
# Label only the left-most axis; both panels share the same quantity
fig.update_yaxes(title_text='Subscription Rate', row=1, col=1)
fig.show()

--- Analyzing Subscription Rate by Job and Education ---


CELL 4: Study 3 - The Power of the Economic Context
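This study visualizes the impact of the powerful economic indicator features that were added to this dataset. We'll look at `euribor3m` (the Euribor 3-month rate) and compare its distribution for customers who subscribed versus those who did not.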

In [5]:
# --- Study 3: Analyzing Economic Indicators ---

print("--- Visualizing the Impact of the Euribor 3-Month Rate ---")

# Human-readable axis/legend labels for the raw column names
axis_labels = {
    'y': 'Subscribed to Term Deposit?',
    'euribor3m': 'Euribor 3-Month Rate',
}

# One box per outcome ('yes' / 'no'), coloured by outcome, comparing the rate distributions
box_options = dict(
    x='y',
    y='euribor3m',
    color='y',
    title='<b>Distribution of Euribor 3-Month Rate by Subscription Outcome</b>',
    labels=axis_labels,
    points="all",  # draw every observation alongside its box
)
fig = px.box(raw_df, **box_options)

# Center the figure title
fig.update_layout(title_x=0.5)
fig.show()

--- Visualizing the Impact of the Euribor 3-Month Rate ---
