# Setup

In [None]:
import os
import pandas as pd
import numpy as np
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import average_precision_score, make_scorer
from sklearn.preprocessing import LabelEncoder
import matplotlib.ticker as mticker
from IPython.display import display

# Directory that receives every artifact written by this notebook
# (the EDA/feature-importance figures and the propensity CSV produced in main()).
OUTPUT_FIGURES_DIR = os.path.join("reports", "figures")
# Apply the 'ggplot' matplotlib style to the figures created below.
plt.style.use('ggplot')
# Create the output directory up front; exist_ok=True keeps re-runs from failing.
os.makedirs(OUTPUT_FIGURES_DIR, exist_ok=True)

# Data ingestion

Upload the data file and update the `file_path` value in `load_bank_data` with the path to that file.

In [None]:
def load_bank_data(file_path: str = 'bank-full.csv') -> pd.DataFrame:
    """
    Load the bank marketing dataset from a semicolon-separated CSV file.

    Args:
        file_path: Location of the CSV file. Defaults to 'bank-full.csv' in the
            current working directory, which is what the notebook used before
            the path became configurable.

    Returns:
        The loaded DataFrame, or an empty DataFrame when the file is missing or
        cannot be parsed. Callers detect failure via ``df.empty``.
    """
    print(f"Attempting to load data from local file: {file_path}")
    # Keep the try body limited to the one call that can actually fail.
    try:
        df = pd.read_csv(file_path, sep=';')
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found. Please ensure it's in the correct directory.")
        return pd.DataFrame()
    except Exception as e:
        # Best-effort loader: report the problem and hand back an empty frame
        # so the workflow can exit cleanly instead of crashing.
        print(f"An error occurred during data loading: {e}")
        return pd.DataFrame()

    print(f"Data loaded successfully from {file_path}. Shape: {df.shape}")
    return df

In [None]:
def prepare_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a cleaned copy of the raw frame.

    The 'day' column (when present) is renamed to 'day_of_month', and every
    column holding a single distinct value is dropped. An empty input is
    returned unchanged. The input frame itself is never modified.
    """
    if df.empty:
        return df

    cleaned = df.copy()

    if 'day' in cleaned.columns:
        cleaned = cleaned.rename(columns={'day': 'day_of_month'})
        print("Renamed 'day' to 'day_of_month' for clarity.")

    # No NaN handling is performed here; the message below is purely informational.
    print("No NaN values found in df.info(), so skipping NaN filling step.")

    single_valued = [name for name in cleaned.columns if cleaned[name].nunique() == 1]
    if single_valued:
        cleaned = cleaned.drop(columns=single_valued)
        print(f"Dropped columns with constant values: {single_valued}")

    return cleaned

# Feature Engineering

In [None]:
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Derive additional descriptive features on a copy of the dataset.

    The engineered columns are used for EDA/insight purposes; the final model
    in this notebook is trained on the original features only.

    Args:
        df: Prepared frame containing at least 'housing', 'default', 'age',
            'job', 'balance', 'pdays', 'month', 'day_of_month', 'previous'
            and 'campaign'. A string-valued 'y' column is mapped to 0/1.

    Returns:
        A new DataFrame with the extra columns listed in the final log message.
        An empty input is returned unchanged.
    """
    if df.empty:
        return df
    df_engineered = df.copy()
    print("--- Performing Feature Engineering for Insights ---")
    if 'y' in df_engineered.columns and df_engineered['y'].dtype == 'object':
        df_engineered['y'] = df_engineered['y'].map({'no': 0, 'yes': 1})

    # NOTE(review): despite its name this flag only reflects the 'housing'
    # column; the personal 'loan' column is not counted. Confirm the intent.
    df_engineered['num_existing_loans'] = (df_engineered['housing'] == 'yes').astype(int).astype('category')
    df_engineered['has_any_loan_or_default'] = (
        (df_engineered['num_existing_loans'] == 1) | (df_engineered['default'] == 'yes')
    ).astype(int).astype('category')

    # The last bin is open-ended so that every age >= 65 lands in '65+'
    # (an upper edge of 100 would turn ages of 100 and above into NaN).
    bins_age = [0, 25, 35, 45, 55, 65, np.inf]
    labels_age = ['<25', '25-34', '35-44', '45-54', '55-64', '65+']
    df_engineered['age_group'] = pd.cut(df_engineered['age'], bins=bins_age, labels=labels_age, right=False)

    df_engineered['is_no_fixed_income'] = (
        df_engineered['job'].isin(['student', 'retired', 'unemployed'])
    ).astype(int).astype('category')
    df_engineered['balance_group'] = pd.qcut(
        df_engineered['balance'] + 0.001, q=5,
        labels=['Bal_Q1', 'Bal_Q2', 'Bal_Q3', 'Bal_Q4', 'Bal_Q5'],
    )
    df_engineered['is_negative_balance'] = (df_engineered['balance'] < 0).astype(int).astype('category')
    # pdays == -1 is treated as "never contacted before".
    df_engineered['was_previously_contacted'] = (df_engineered['pdays'] != -1).astype(int).astype('category')
    spring_summer_months = ['apr', 'may', 'jun', 'jul', 'aug']
    df_engineered['is_spring_summer_contact'] = df_engineered['month'].isin(spring_summer_months).astype(int).astype('category')
    df_engineered['total_contacts'] = df_engineered['previous'] + df_engineered['campaign']

    month_mapping = {
        'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
        'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
    }
    # Build real calendar dates in an assumed year (the data carries no year)
    # and read the weekday from them. Previously the weekday was derived from
    # "1970 epoch + month*30 + day" days, which does not correspond to any
    # actual calendar date. Impossible dates (e.g. 30 Feb) or unknown month
    # labels become NaN instead of a fabricated weekday.
    assumed_year = 2023
    month_text = df_engineered['month'].map(month_mapping).astype('Int64').astype(str).str.zfill(2)
    day_text = df_engineered['day_of_month'].astype(str).str.zfill(2)
    contact_dates = pd.to_datetime(
        f"{assumed_year}-" + month_text + '-' + day_text,
        format='%Y-%m-%d', errors='coerce',
    )
    df_engineered['day_of_week'] = contact_dates.dt.day_name()

    print("Engineered new features: 'num_existing_loans', 'has_any_loan_or_default', 'age_group', 'is_no_fixed_income', 'balance_group', 'is_negative_balance', 'was_previously_contacted', 'is_spring_summer_contact', 'total_contacts', and 'day_of_week'.")
    return df_engineered




# EDA

In [None]:
def perform_eda(df: pd.DataFrame, output_dir: str) -> None:
    """
    Generate exploratory plots for the given frame and save them as PNG files.

    Works on a copy of ``df``. Every numeric (int64/float64) and every
    object/category column present in the frame is plotted, so when main() in
    this file passes the engineered frame, the engineered columns are included
    as well (the log messages still say "original").

    Args:
        df: Frame with a target column 'y' ('no'/'yes' strings or 0/1).
        output_dir: Existing directory that receives the PNG files.

    Files written to ``output_dir``:
        target_distribution.png
        all_categorical_features_vs_target_percentage.png (if any categorical column)
        all_numerical_features_correlation_matrix.png (if any numerical column)
        duration_distribution.png (if a 'duration' column exists)
    """
    print("\n--- Performing Exploratory Data Analysis (EDA) on Original Features ---")
    df_copy = df.copy()

    # Normalise a string target to 0/1 so means and correlations can be computed.
    if df_copy['y'].dtype == 'object':
        df_copy['y'] = df_copy['y'].map({'no': 0, 'yes': 1})

    numerical_cols = df_copy.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = df_copy.select_dtypes(include=['object', 'category']).columns.tolist()

    # The target is never treated as a feature.
    if 'y' in numerical_cols:
        numerical_cols.remove('y')
    if 'y' in categorical_cols:
        categorical_cols.remove('y')

    # --- Plot 1: Target Variable Distribution (Pie Chart) ---
    target_counts = df_copy['y'].value_counts()
    plt.figure(figsize=(6, 6))
    plt.pie(target_counts, labels=target_counts.index, autopct='%1.1f%%', startangle=90, colors=['skyblue', 'lightcoral'])
    plt.title('Distribution of Target Variable (y)')
    plt.axis('equal')
    plt.savefig(os.path.join(output_dir, 'target_distribution.png'))
    plt.close()
    print("Saved target distribution plot.")

    # Plot 2: Categorical Features vs. Target (Percentage Plots)
    # One subplot per categorical column, laid out on a grid three columns wide.
    if categorical_cols:
        num_plots = len(categorical_cols)
        fig_rows = int(np.ceil(num_plots / 3))
        plt.figure(figsize=(12, 4 * fig_rows))
        for i, col in enumerate(categorical_cols):
            # Share of y=0 / y=1 within each category value, in percent.
            # observed=False is set explicitly to suppress the pandas FutureWarning.
            temp_df = df_copy.groupby(col, observed=False)['y'].value_counts(normalize=True).mul(100).rename('percentage').reset_index()

            plt.subplot(fig_rows, 3, i + 1)
            ax = sns.barplot(x=col, y='percentage', hue='y', data=temp_df, palette=['lightcoral', 'skyblue'])

            # Add percentage labels on top of the bars
            for p in ax.patches:
                height = p.get_height()
                ax.text(p.get_x() + p.get_width() / 2., height + 1,
                        f'{height:.1f}%', ha="center", fontsize=8)

            plt.title(f'Subscription Rate by {col}')
            plt.xlabel(col)
            plt.ylabel('Percentage (%)')
            plt.xticks(rotation=45, ha='right')
            ax.set_ylim(0, 100) # Fixed 0-100 range so subplots are directly comparable
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'all_categorical_features_vs_target_percentage.png'))
        plt.close()
        print(f"Saved percentage plots for {len(categorical_cols)} original categorical features vs. target.")
    else:
        print("No categorical features to plot against target.")

    # Plot 3: Correlation Matrix Heatmap for ALL Numerical Features
    # 'y' was removed from numerical_cols above; it is appended again here so
    # the heatmap also shows each feature's correlation with the target.
    numerical_cols_for_corr = [col for col in numerical_cols if col in df_copy.columns and df_copy[col].dtype in ['int64', 'float64']]
    if 'y' not in numerical_cols_for_corr:
        if 'y' in df_copy.columns and df_copy['y'].dtype in ['int64', 'float64']:
             numerical_cols_for_corr.append('y')

    if numerical_cols_for_corr:
        correlation_matrix = df_copy[numerical_cols_for_corr].corr()
        plt.figure(figsize=(10, 8))
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
        plt.title('Correlation Matrix of All Original Numerical Features')
        plt.savefig(os.path.join(output_dir, 'all_numerical_features_correlation_matrix.png'))
        plt.close()
        print("Saved correlation matrix heatmap for all original numerical features.")
    else:
        print("No numerical features found for correlation matrix plot.")

    # Plot 4: Duration Distribution vs. Subscription Status
    # Per-class density of call duration, with a dashed vertical line at each class mean.
    if 'duration' in df_copy.columns and 'y' in df_copy.columns:
        df_copy_for_plot = df_copy.copy()
        df_copy_for_plot['subscribed_status'] = df_copy_for_plot['y'].map({0: 'Not Subscribed', 1: 'Subscribed'})
        # NOTE(review): this looks like a global seaborn style change that is
        # not reverted afterwards - confirm that later plots are meant to inherit it.
        sns.set_style("whitegrid")
        plt.figure(figsize=(8, 5))
        # common_norm=False: each class density is normalised on its own.
        sns.kdeplot(data=df_copy_for_plot, x='duration', hue='subscribed_status', fill=True, common_norm=False, palette=['lightcoral', 'skyblue'])
        mean_duration_no = df_copy_for_plot[df_copy_for_plot['y'] == 0]['duration'].mean()
        mean_duration_yes = df_copy_for_plot[df_copy_for_plot['y'] == 1]['duration'].mean()
        plt.axvline(x=mean_duration_no, color='darkred', linestyle='--', label=f'Avg. Not Subscribed: {mean_duration_no:.2f}s')
        plt.axvline(x=mean_duration_yes, color='darkblue', linestyle='--', label=f'Avg. Subscribed: {mean_duration_yes:.2f}s')
        plt.title('Distribution of Last Contact Duration by Subscription Status', fontsize=16)
        plt.xlabel('Last Contact Duration (seconds)', fontsize=12)
        plt.ylabel('Density', fontsize=12)
        plt.legend()
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'duration_distribution.png'))
        plt.close()
        print("Saved duration distribution plot with mean lines.")
    else:
        print("Skipping duration distribution plot: 'duration' or 'y' column not found.")


Based on the plots generated -

**Insights**

1. **Target Audience Focus:** The target distribution plot shows that only a small percentage (11.7%) of customers subscribe to a term deposit. This highlights the need to carefully target marketing efforts towards segments with a higher propensity to subscribe.

2. **Identify Promising Segments (from Categorical Plots):**

    a. Customers who are retired or students have significantly higher subscription
    rates. Marketing efforts could be specifically tailored to these groups.

    b. Customers with a tertiary education or who are single also show relatively higher subscription rates.

    c. The balance_group plot suggests that customers in higher balance quantiles might be more likely to subscribe due to sufficient disposable income.

    d. The poutcome plot clearly shows that previous success in a marketing campaign is a strong indicator of future subscription. Focusing on customers with a 'success' in the previous campaign outcome is likely to be highly effective.

    e. Certain months (e.g., March, September, October, December) appear to have higher subscription rates, suggesting seasonality in campaign effectiveness.

    f. The day_of_week plot can help in optimizing the timing of contact.

3. **Avoid Less Promising Segments (from Categorical Plots):**

    a. Customers who have defaulted on loans have a very low subscription rate.

    b. Using 'unknown' contact methods or contacting people with an 'unknown' previous outcome is associated with lower success rates.

4. **Duration Matters (from Duration Plot):**

    The duration distribution plot clearly shows that the duration of the last contact is strongly correlated with subscription. Longer calls are more likely to result in a subscription. This suggests that call center scripts and agent training should emphasize engaging customers for a sufficient duration. However, duration is only known *after the call, so it's not a feature that can be used for predictive targeting before the call.* It's more of an insight into the nature of successful interactions.

5. **Beware of Correlation (from Correlation Matrix):**

    The correlation matrix only captures linear relationships between numerical features. Within that limit, 'campaign' and 'previous' are correlated, which is intuitive, as more campaigns might lead to more previous contacts.


In summary, the plots suggest that targeting retired individuals, students, those with tertiary education, and those with a previously successful campaign outcome, and focusing on specific months and days of the week, could improve marketing campaign effectiveness. The duration of contact is also a key factor in successful subscriptions, so training call centre executives to give a clear explanation of the benefits of subscribing would help.

# Encode features

In [None]:
def encode_features(df: pd.DataFrame) -> tuple[pd.DataFrame, list]:
    """
    Label-encode the original categorical columns for LightGBM.

    Returns a tuple of (encoded copy of ``df``, names of the columns that were
    encoded). A string-valued 'y' is mapped to 0/1. For an empty input the
    frame is returned unchanged together with an empty list.
    """
    if df.empty:
        return df, []

    encoded = df.copy()
    target_is_text = encoded['y'].dtype == 'object'
    if target_is_text:
        encoded['y'] = encoded['y'].map({'no': 0, 'yes': 1})

    # Only the original (non-engineered) categorical features are encoded,
    # and only those actually present in the frame.
    candidate_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
    present_columns = [name for name in candidate_columns if name in encoded.columns]

    for name in present_columns:
        # A fresh encoder per column; values are stringified before fitting.
        as_text = encoded[name].astype(str)
        encoded[name] = LabelEncoder().fit_transform(as_text)

    return encoded, present_columns

# Recommendation mechanism

We live in a world of hyperpersonalization. I intend to build something that can recommend the best time to contact someone and if we had product data, also recommend which product to offer a customer.

1. Optimal Contact Timing:
To achieve this, I have created 'segments' of customers based on their job, marital status, education, poutcome, and age_group attributes. For each segment, the month with the highest conversion rate and day of the month with highest conversion rate is recommended as the best time to contact.

2. Contact frequency guidance:
Additionally, customers who have already been contacted many times may not be inclined to subscribe. Hence, based on the total number of times they've been contacted before (past and current campaign), I recommend how many more times we likely need to contact them to persuade them to subscribe based on the average number of times people in that cohort need to be contacted before they subscribe.

NOTE -

1. If we know product details, we can recommend specific products to customers.
2. We can tailor offerings/marketing messages based on age/job/marital status etc.

In [None]:
def bin_age(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` with an 'age_group' categorical column added.

    Ages are binned into left-closed intervals:
    [0, 25) '<25', [25, 35) '25-34', [35, 45) '35-44', [45, 55) '45-54',
    [55, 65) '55-64', and [65, inf) '65+'. An empty frame is returned unchanged.
    """
    if df.empty:
        return df
    df_with_age_bin = df.copy()
    # The top bin is open-ended so it matches its '65+' label; with an upper
    # edge of 100 (and right=False) ages of 100 and above would become NaN and
    # silently drop out of every segment-level groupby/merge.
    bins = [0, 25, 35, 45, 55, 65, np.inf]
    labels = ['<25', '25-34', '35-44', '45-54', '55-64', '65+']
    df_with_age_bin['age_group'] = pd.cut(df_with_age_bin['age'], bins=bins, labels=labels, right=False)
    return df_with_age_bin

def calculate_segment_averages(df_train_subscribers: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate the average total number of contacts per customer segment.

    A segment is defined by job, marital status, education, poutcome and
    age group. Total contacts are 'previous' + 'campaign'.

    Args:
        df_train_subscribers: Rows of successful (subscribed) training
            customers, already carrying an 'age_group' column.

    Returns:
        One row per segment with the segment columns plus 'avg_total_contacts'.
        For an empty input, an empty frame with those columns.
    """
    segment_columns = ['job', 'marital', 'education', 'poutcome', 'age_group']
    if df_train_subscribers.empty:
        return pd.DataFrame(columns=segment_columns + ['avg_total_contacts'])

    # assign() works on a copy, so the caller's frame is no longer modified
    # (the helper column used to be written into the input in place).
    with_totals = df_train_subscribers.assign(
        total_contacts=df_train_subscribers['previous'] + df_train_subscribers['campaign']
    )
    segment_averages = (
        with_totals.groupby(segment_columns, observed=False)['total_contacts']
        .mean()
        .reset_index()
    )
    return segment_averages.rename(columns={'total_contacts': 'avg_total_contacts'})


def _best_value_per_segment(df: pd.DataFrame, segment_columns: list, value_column: str, output_column: str) -> pd.DataFrame:
    """Return, per segment, the `value_column` value with the highest mean of 'y'."""
    # observed=True keeps only segment/value combinations that actually occur,
    # and NaN rates are dropped, so idxmax never sees an empty or all-NaN
    # group (which newer pandas versions warn about or reject).
    rates = (
        df.groupby(segment_columns + [value_column], observed=True)['y']
        .mean()
        .reset_index()
        .dropna(subset=['y'])
    )
    if rates.empty:
        return pd.DataFrame(columns=segment_columns + [output_column])
    # idxmax keeps the first row on ties; rows are ordered by the group keys.
    best_rows = rates.groupby(segment_columns, observed=True)['y'].idxmax()
    best = rates.loc[best_rows, segment_columns + [value_column]]
    return best.rename(columns={value_column: output_column})


def calculate_best_contact_info(df_train_all: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate the best month and day_of_month for each segment based on conversion rates.

    Args:
        df_train_all: Training rows (subscribers and non-subscribers) with a
            numeric 0/1 'y' column plus 'age', 'month', 'day_of_month' and the
            segment columns job, marital, education and poutcome.

    Returns:
        One row per segment with the segment columns, 'best_month' and
        'best_day'. For an empty input, an empty frame with those columns.
    """
    segment_columns = ['job', 'marital', 'education', 'poutcome', 'age_group']
    if df_train_all.empty:
        return pd.DataFrame(columns=segment_columns + ['best_month', 'best_day'])

    df_with_age_bin = bin_age(df_train_all)

    best_months = _best_value_per_segment(df_with_age_bin, segment_columns, 'month', 'best_month')
    best_days = _best_value_per_segment(df_with_age_bin, segment_columns, 'day_of_month', 'best_day')

    # Both frames hold at most one row per segment, so the left join is one-to-one.
    return pd.merge(best_months, best_days, on=segment_columns, how='left')



def create_actionable_recommendations_optimized(results_df: pd.DataFrame, segment_averages: pd.DataFrame, best_contact_info: pd.DataFrame) -> pd.Series:
    """
    Build one recommendation sentence per customer.

    Each customer row is joined to its segment's average contact count and best
    contact month/day, then turned into text combining the propensity score,
    a timing hint and a contact-frequency hint.

    Returns a Series of strings indexed like the age-binned ``results_df``.
    """
    segment_keys = ['job', 'marital', 'education', 'poutcome', 'age_group']
    results_df = bin_age(results_df)

    enriched = results_df.merge(segment_averages, on=segment_keys, how='left')
    enriched = enriched.merge(best_contact_info, on=segment_keys, how='left')

    # Segments unseen in training fall back to the mean of the segment averages.
    if segment_averages.empty:
        fallback_average = 0
    else:
        fallback_average = segment_averages['avg_total_contacts'].mean()
    enriched['avg_total_contacts'] = enriched['avg_total_contacts'].fillna(fallback_average)

    contact_gap = enriched['avg_total_contacts'] - enriched['total_contacts']
    enriched['additional_contacts_needed'] = contact_gap.apply(lambda gap: max(0, np.ceil(gap)))

    def _compose(row) -> str:
        """Assemble the three-part recommendation text for a single customer row."""
        # NOTE(review): the opening sentence calls every customer a "strong
        # candidate" regardless of how low the propensity score is.
        opening = f"Based on a propensity score of {row['propensity_to_subscribe']:.2f}, this customer is a strong candidate for a call."

        contacts_so_far = row['total_contacts']
        segment_average = row['avg_total_contacts']
        still_needed = row['additional_contacts_needed']
        if still_needed > 0:
            frequency = f"They have had fewer contacts ({contacts_so_far}) than the average successful customer in their cohort ({segment_average:.2f}). Consider making approximately {int(still_needed)} more contact attempts."
        else:
            frequency = f"They have already had enough contacts ({contacts_so_far}) to be considered in line with the average successful customer in their cohort ({segment_average:.2f})."

        month, day = row['best_month'], row['best_day']
        # Segments without training history have no best month/day after the left joins.
        if pd.isna(month) or pd.isna(day):
            timing = "Timing recommendations are not available for this customer's segment based on the training data."
        else:
            timing = f"Based on historical data for customers like them, the best time to contact would be in {month.capitalize()} on day {int(day)} of the month."

        return f"{opening} {timing} {frequency}"

    messages = [_compose(row) for _, row in enriched.iterrows()]
    return pd.Series(messages, index=results_df.index)

# Model training and evaluation

We perform hyperparameter tuning with cross-validation to find the optimal hyperparameters.

In [None]:
def plot_feature_importance(model, features, output_dir, file_suffix, top_n: int = 10):
    """
    Display and plot the top N feature importances of a trained model as percentages.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        features: Feature names, in the same order as ``feature_importances_``.
        output_dir: Directory that receives the PNG file.
        file_suffix: Label used in the log messages, plot title and file name.
        top_n: Number of highest-ranked features to show (default 10).

    The bar chart is saved as ``feature_importance_<file_suffix>.png``.
    """
    print(f"\n--- Plotting Feature Importance for {file_suffix} Model ---")
    importance_df = pd.DataFrame({
        'feature': features,
        'importance': model.feature_importances_
    }).sort_values(by='importance', ascending=False)

    # Express each importance as a share of the total. A model without any
    # recorded importance would otherwise yield NaN percentages (0 / 0).
    total_importance = importance_df['importance'].sum()
    if total_importance > 0:
        importance_df['importance_percent'] = (importance_df['importance'] / total_importance) * 100
    else:
        importance_df['importance_percent'] = 0.0

    top_features = importance_df.head(top_n)

    # Show the top features with their percentage values in the notebook.
    display(top_features[['feature', 'importance_percent']])

    plt.figure(figsize=(10, 6))
    sns.barplot(x='importance_percent', y='feature', data=top_features)
    plt.title(f'Top {top_n} Feature Importances ({file_suffix} Model)')
    plt.xlabel('Importance (%)')
    plt.ylabel('Feature')
    plt.gca().xaxis.set_major_formatter(mticker.PercentFormatter())  # Format x-axis as percentage
    plt.tight_layout()
    output_path = os.path.join(output_dir, f'feature_importance_{file_suffix}.png')
    plt.savefig(output_path)
    plt.close()
    print(f"Feature importance plot saved to '{output_path}'.")


In [None]:
def train_model(X_train_val, X_test, y_train_val, y_test, X_test_original, output_dir, categorical_features_lgbm, model_name_suffix: str):
    """
    Tune, train and evaluate a LightGBM classifier.

    The train/validation data is split 75/25. A randomized hyperparameter
    search with 5-fold stratified CV runs on the training part only; the
    validation part is used exclusively as the early-stopping eval set.

    Args:
        X_train_val, y_train_val: Features/target available for training and validation.
        X_test, y_test: Held-out test features/target.
        X_test_original: Not used by this function; kept for interface compatibility.
        output_dir: Directory for the feature-importance plot.
        categorical_features_lgbm: Names of the label-encoded categorical columns.
        model_name_suffix: Label used in log messages and output file names.

    Returns:
        (best fitted model, test-set positive-class probabilities, test-set AUC-PR)
    """
    print(f"\n--- Starting Model Training and Evaluation for {model_name_suffix} Model ---")
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val
    )
    print(f"Data split: Train={X_train.shape}, Val={X_val.shape}, Test={X_test.shape}")
    lgb_clf = lgb.LGBMClassifier(objective='binary', random_state=42, n_jobs=-1, force_col_wise=True, verbose=-1)

    # Negative-to-positive ratio of the data the model is actually fitted on.
    class_counts = y_train.value_counts()
    imbalance_ratio = class_counts[0] / class_counts[1]
    # NOTE(review): 'class_weight' and 'scale_pos_weight' are sampled
    # independently, so some candidates re-weight the positive class twice.
    param_distributions = {
        'n_estimators': [500, 800, 1000], 'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'num_leaves': [20, 31, 40, 50], 'max_depth': [5, 8, 10], 'min_child_samples': [10, 20, 30, 50, 100],
        'subsample': [0.6, 0.8, 1.0], 'colsample_bytree': [0.6, 0.8, 1.0], 'reg_alpha': [0, 0.1, 0.5, 1.0],
        'reg_lambda': [0, 0.1, 0.5, 1.0], 'class_weight': [None, 'balanced'],
        'scale_pos_weight': [1, imbalance_ratio, imbalance_ratio * 0.5, imbalance_ratio * 1.5]
    }
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    # The built-in 'average_precision' scorer ranks by predicted scores.
    # make_scorer(average_precision_score) with default settings would feed it
    # hard 0/1 predictions, which is not a precision-recall AUC.
    random_search = RandomizedSearchCV(
        estimator=lgb_clf, param_distributions=param_distributions,
        n_iter=15, scoring='average_precision', cv=skf, verbose=1, random_state=42
    )
    print("Starting RandomizedSearchCV for LightGBM tuning with Early Stopping...")
    early_stopping_rounds = 20
    # Fit on X_train only: the early-stopping set (X_val) must not be part of
    # the data being fitted, otherwise stopping is judged on rows the model
    # has already seen.
    random_search.fit(X_train, y_train,
                      eval_set=[(X_val, y_val)],
                      eval_metric='average_precision',
                      callbacks=[lgb.early_stopping(early_stopping_rounds, verbose=False)],
                      categorical_feature=categorical_features_lgbm)
    best_lgb_model = random_search.best_estimator_
    print("\nBest hyperparameters found:", random_search.best_params_)
    print("Best cross-validation AUC-PR score:", random_search.best_score_)
    y_pred_proba_test = best_lgb_model.predict_proba(X_test)[:, 1]
    test_aucpr = average_precision_score(y_test, y_pred_proba_test)
    print(f"\nFinal Test Set AUC-PR for {model_name_suffix} model: {test_aucpr:.4f}")

    # Plot feature importance (column names are identical for X_train and X_train_val).
    plot_feature_importance(best_lgb_model, X_train_val.columns.tolist(), output_dir, model_name_suffix)

    return best_lgb_model, y_pred_proba_test, test_aucpr


# Create propensity file with recommendation

In [None]:
def create_propensity_list(model, X_test, y_test, y_pred_proba, X_test_original, output_dir, file_suffix, segment_averages, best_contact_info):
    """
    Write the scored test set, with a recommendation per customer, to CSV.

    The un-encoded test rows are combined with the predicted probability
    ('y_pred'), the actual outcome ('y_test'), the age group, the total
    contact count and a recommendation text. Rows are sorted by 'y_pred' in
    descending order and saved as ``customer_propensity_list_<file_suffix>.csv``
    in ``output_dir``. ``model`` and ``X_test`` are accepted but not used here.
    """
    print(f"\n--- Generating Customer Propensity List ({file_suffix}) ---")

    # y_pred_proba is assigned positionally; y_test is a Series and aligns on the index.
    scored = X_test_original.copy()
    scored['propensity_to_subscribe'] = y_pred_proba
    scored['y_actual'] = y_test
    scored = bin_age(scored)
    scored['total_contacts'] = scored['previous'] + scored['campaign']
    scored['recommendation'] = create_actionable_recommendations_optimized(
        scored, segment_averages, best_contact_info
    )

    # Final column names for the exported file, highest propensity first.
    scored = scored.rename(columns={'propensity_to_subscribe': 'y_pred', 'y_actual': 'y_test'})
    scored = scored.sort_values(by='y_pred', ascending=False)

    # Put the three key columns first, followed by everything else in existing order.
    leading = ['y_pred', 'recommendation', 'y_test']
    trailing = [name for name in scored.columns if name not in leading]
    scored = scored[leading + trailing]

    output_path = os.path.join(output_dir, f'customer_propensity_list_{file_suffix}.csv')
    scored.to_csv(output_path, index=False)
    print(f"Propensity list with actionable recommendations saved to '{output_path}'.")
    print("The CSV file contains the entire test set, ordered from highest to lowest propensity to subscribe, with a tailored marketing recommendation for each customer.")

# Execution

In [None]:
def main() -> None:
    """
    Run the full workflow: load, prepare, explore, train, and export.

    Steps, in order: load the CSV, clean it, engineer features and run EDA on
    them, encode the original features (without 'duration'), split off a 20%
    test set, train the LightGBM model, derive segment-level contact
    statistics from the training rows, and write the propensity list for the
    test rows. Returns early when no data could be loaded.
    """
    print("--- Starting Customer Propensity Model Workflow ---")
    data = load_bank_data()
    if data.empty:
        print("Data loading failed. Exiting.")
        return

    prepared_df = prepare_data(data)

    # Engineered features are computed on a copy and only feed the EDA plots;
    # the model below is trained on the original columns.
    engineered_df_for_eda = engineer_features(prepared_df.copy())

    # Execute EDA on the engineered data, which still contains 'duration' for this step
    perform_eda(engineered_df_for_eda.copy(), OUTPUT_FIGURES_DIR)

    # --- Now, prepare data for model training by dropping 'duration' to prevent data leakage ---
    # ('duration' is only known after a call has taken place.)
    print("\n--- Preparing Data for Model Training ---")
    final_df_for_model = prepared_df.drop(columns=['duration'])

    # Encode only the original categorical features
    df_encoded, categorical_features_lgbm = encode_features(final_df_for_model)

    X = df_encoded.drop(columns='y')
    y = df_encoded['y']

    # Data Split: 80% train+validation, 20% test, stratified on the target
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Train the model on original features.
    # X_test is passed twice: the second occurrence fills the X_test_original parameter.
    model, y_pred_proba, aucpr = train_model(
        X_train_val, X_test, y_train_val, y_test, X_test,
        OUTPUT_FIGURES_DIR, categorical_features_lgbm, "Original"
    )

    # Re-create the un-encoded training rows for the segment analysis.
    # Despite the variable name, this frame holds ALL train/validation rows,
    # not just subscribers; the subscriber filter is applied further below.
    df_train_subscribers_orig = prepared_df.loc[y_train_val.index].copy()

    # Convert 'y' column to numerical (0/1) for calculations
    df_train_subscribers_orig['y'] = df_train_subscribers_orig['y'].map({'no': 0, 'yes': 1})

    # Explicitly fill NaN values in segmentation columns
    # ('age_group' is not listed here; it is derived later by bin_age).
    segment_columns = ['job', 'marital', 'education', 'poutcome']
    for col in segment_columns:
        if df_train_subscribers_orig[col].isnull().any():
            print(f"Warning: NaN values found in '{col}'. Filling with 'unknown' for segmentation.")
            df_train_subscribers_orig[col] = df_train_subscribers_orig[col].fillna('unknown')

    # Calculate segment averages and best contact times from the training data.
    # Segment averages use subscribers only (y == 1, i.e. 'yes');
    # best contact timing uses all training rows, since it needs conversion rates.
    df_train_subscribers = df_train_subscribers_orig[df_train_subscribers_orig['y'] == 1].copy()
    segment_averages = calculate_segment_averages(bin_age(df_train_subscribers))
    best_contact_info = calculate_best_contact_info(df_train_subscribers_orig)

    # Re-create the un-encoded test set for the final output and recommendations
    X_test_original_for_final_list = prepared_df.loc[X_test.index].drop(columns=['duration'])

    # Also fill NaNs in the test set's segmentation columns to match the training data
    for col in segment_columns:
        if X_test_original_for_final_list[col].isnull().any():
            X_test_original_for_final_list[col] = X_test_original_for_final_list[col].fillna('unknown')

    # Generate the final propensity list.
    # The CSV is written to the same directory as the figures (OUTPUT_FIGURES_DIR).
    create_propensity_list(
        model, X_test, y_test, y_pred_proba,
        X_test_original_for_final_list, OUTPUT_FIGURES_DIR, "original", segment_averages, best_contact_info
    )

    print("\n--- Workflow Complete ---")

# Run the workflow only when this file is executed directly.
if __name__ == '__main__':
    main()


--- Starting Customer Propensity Model Workflow ---
Attempting to load data from local file: bank-full.csv
Data loaded successfully from bank-full.csv. Shape: (45211, 17)
Renamed 'day' to 'day_of_month' for clarity.
No NaN values found in df.info(), so skipping NaN filling step.
--- Performing Feature Engineering for Insights ---
Engineered new features: 'num_existing_loans', 'has_any_loan_or_default', 'age_group', 'is_no_fixed_income', 'balance_group', 'is_negative_balance', 'was_previously_contacted', 'is_spring_summer_contact', 'total_contacts', and 'day_of_week'.

--- Performing Exploratory Data Analysis (EDA) on Original Features ---
Saved target distribution plot.
Saved percentage plots for 18 original categorical features vs. target.
Saved correlation matrix heatmap for all original numerical features.
Saved duration distribution plot with mean lines.

--- Preparing Data for Model Training ---

--- Starting Model Training and Evaluation for Original Model ---
Data split: Train=(

Unnamed: 0,feature,importance_percent
5,balance,21.182004
0,age,16.740616
9,day_of_month,14.486645
12,pdays,9.042506
10,month,8.683113
1,job,7.782412
11,campaign,7.715858
2,marital,3.598367
13,previous,2.551247
8,contact,2.280593


Feature importance plot saved to 'reports/figures/feature_importance_Original.png'.

--- Generating Customer Propensity List (original) ---
Propensity list with actionable recommendations saved to 'reports/figures/customer_propensity_list_original.csv'.
The CSV file contains the entire test set, ordered from highest to lowest propensity to subscribe, with a tailored marketing recommendation for each customer.

--- Workflow Complete ---


Note on importance percentage -

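balance	21.182004 - This means that about 21% of all the splits in the model's trees were made on the customer's balance (LightGBM's default "split" importance). It shows how often the model relies on this feature when deciding whether a customer will subscribe to a term deposit; it is not a direct measure of the share of predictive power attributable to the feature.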

**MARKETING STRATEGY RECOMMENDATIONS (based on feature importance)**

The features with the highest importance indicate the factors that most significantly influence a customer's propensity to subscribe. Focusing marketing efforts on these areas can lead to more effective campaigns.

1. Target Customers Based on Financial Balance:
    * Insight: balance is the most important feature. Customers with higher balances might be more receptive or have more disposable income/savings.
    * Recommendation: Prioritize customers with higher account balances for marketing efforts. Tailor offers to resonate with their financial standing.
2. Strategic Timing of Campaigns (Day and Month):
    * Insight: day_of_month and month are highly influential. The specific time a customer is contacted plays a crucial role.
    * Recommendation: Analyze historical data to identify the most effective days of the month and months for outreach. Schedule campaigns to align with these optimal times to maximize response rates.
3. Age-Based Segmentation:
    * Insight: age is a significant demographic factor.
    * Recommendation: Segment the customer base by age groups and develop age-specific marketing messages and product offerings that resonate with different generations.
4. Leverage Past Contact History (pdays):
    * Insight: pdays (number of days since last contact from previous campaign) is important.
    * Recommendation: Understand the optimal re-engagement window. Customers who were contacted a certain number of days ago might be more or less receptive. Tailor follow-up strategies based on their prior interaction recency.
5. Customize by Occupation (job) and Education:
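    * Insight: job is an important demographic feature. Education does not appear among the top 10 features shown above, but it is still one of the attributes used to define customer segments.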
    * Recommendation: Develop targeted marketing messages and product positioning that align with the specific needs, financial situations, and educational backgrounds of different professional groups.
6. Optimize Campaign Contact Frequency (campaign):
    * Insight: campaign (number of contacts during the current campaign) is relevant. While more contacts can sometimes increase propensity, there's often a point of diminishing returns or even negative impact (customer annoyance).
    * Recommendation: Carefully monitor and optimize the number of times a customer is contacted within a single campaign. Avoid over-contacting, as it can lead to customer fatigue and negative sentiment.
7. Evaluate Contact Channel Effectiveness (contact):
    * Insight: contact method (e.g., cellular, telephone) is important.
    * Recommendation: Analyze which communication channels yield the best results for different customer segments and allocate resources accordingly.



In [None]:
# Optional cleanup cell, disabled by wrapping it in a string literal.
# When enabled (quotes removed) it deletes the reports/figures directory and
# everything in it, or reports that the directory does not exist.
'''
import os
import shutil

OUTPUT_FIGURES_DIR = os.path.join("reports", "figures")

if os.path.exists(OUTPUT_FIGURES_DIR):
    shutil.rmtree(OUTPUT_FIGURES_DIR)
    print(f"Deleted directory: {OUTPUT_FIGURES_DIR}")
else:
    print(f"Directory not found: {OUTPUT_FIGURES_DIR}")

'''

'\nimport os\nimport shutil\n\nOUTPUT_FIGURES_DIR = os.path.join("reports", "figures")\n\nif os.path.exists(OUTPUT_FIGURES_DIR):\n    shutil.rmtree(OUTPUT_FIGURES_DIR)\n    print(f"Deleted directory: {OUTPUT_FIGURES_DIR}")\nelse:\n    print(f"Directory not found: {OUTPUT_FIGURES_DIR}")\n\n'