# <center> Bank Telemarketing deposit </center>
# <center>Optimizing Long-Term Deposit Subscription Campaigns: Analyzing and Predicting potential clients </center>

## Summary:

#### 1. Importation of the necessary libraries to run the code
#### 2. Loading the dataset
#### 3. Exploratory Data Analysis (EDA)
#### 4. Imputation of the missing values
#### 5. Encoding categorical variables using pd.get_dummies
#### 6. Application of SMOTE method to rebalance the classes
#### 7. Selection of important features using Random Forest
#### 8. Selection of important features using PCA
#### 9. Model development and evaluation using ROC AUC and confusion matrix
#### 10. Comparison of results between PCA (ACP) & Random Forest

In [None]:
# 1. Importation of the necessary libraries to run the code

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, roc_curve
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBClassifier

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation / future warnings
# emitted by the libraries used below — consider narrowing it.
warnings.filterwarnings('ignore')

# Configuration of visual styles
# The seaborn theme is set first; the matplotlib 'ggplot' style sheet is applied after it.
sns.set(style="whitegrid", font_scale=1.2)
plt.style.use('ggplot')

In [None]:
# 2. Loading the dataset

def load_data(filepath):
    """
    Read a semicolon-separated CSV file into a DataFrame.

    Any exception raised while reading is caught and reported on stdout;
    in that case ``None`` is returned instead of a DataFrame.

    :param filepath: Path to the CSV file.
    :return: The loaded DataFrame, or None when reading failed.
    """
    try:
        frame = pd.read_csv(filepath, sep=';')
    except Exception as err:
        # Best-effort load: report the problem and let the caller deal with None.
        print(f"Error while loading the dataset : {err}")
        return None
    else:
        print("Dataset loaded successfully")
        return frame

# Path to the data file
# NOTE(review): absolute path on one developer's machine — adjust it (or make it
# configurable) before running this notebook elsewhere.
filepath = "/Users/abdel_merhom/Desktop/bank_marketing.csv"

# Loading the dataset
# load_data returns None when reading fails, so `data` may be None from here on.
data = load_data(filepath)

In [None]:
# 3. Exploratory Data Analysis (EDA)
def exploratory_data_analysis(df, target_col='y'):
    """
    Print summary tables and draw plots describing ``df`` and its relation to the target.

    The function reports, in order: the DataFrame info, descriptive statistics,
    null counts per column, the number of 'unknown' entries per column, the
    class distribution of the target (in percent), and then, for a fixed list
    of categorical and numerical columns, a table and a plot against the target.
    Nothing is returned; the DataFrame is only read.

    The categorical and numerical column names are hard-coded below, so ``df``
    must contain all of them (a missing column raises ``KeyError``).
    ``display`` is the IPython/Jupyter helper, so this is meant to run in a notebook.

    :param df: DataFrame.
    :param target_col: Name of the target value (Response value).
    """
    print("\n---Exploratory Data Analysis---")

    # General informations
    print("\nInformations about the DataFrame :")
    # DataFrame.info() prints its report itself and returns None; wrapping it in
    # display() only added a stray "None" to the output.
    df.info()

    # Descriptive statistics
    print("\nDescriptive statistics for Numerical variables :")
    display(df.describe())

    # Checking for missing values
    print("\nMissing values per column :")
    missing_values = df.isnull().sum()
    display(missing_values)

    # Count of the occurences of "unknown"
    print("\nCounting occurences of 'unknown' :")
    unknown_counts = (df == 'unknown').sum()
    display(unknown_counts)

    # Distribution of the target variable (share of each class, in percent)
    print(f"\nDistribution of the target variable '{target_col}' :")
    target_distribution = df[target_col].value_counts(normalize=True) * 100
    display(target_distribution)

    plt.figure(figsize=(6, 4))
    sns.countplot(data=df, x=target_col)
    plt.title(f'Distribution of the target variable ({target_col})')
    plt.show()

    # Analysis of categorical variables in relation to the target
    categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', "poutcome"]
    for col in categorical_cols:
        print(f"\nRépartition de '{col}' par rapport à '{target_col}':")
        # Row-normalised crosstab: share of each target class within every category.
        # Scale to percent first and round afterwards, so two decimals are kept and
        # no float artefacts (e.g. 56.99999999999999) are displayed.
        pivot_table = (pd.crosstab(df[col], df[target_col], normalize='index') * 100).round(2)
        display(pivot_table)

        plt.figure(figsize=(8, 5))
        sns.countplot(data=df, x=col, hue=target_col, palette='Set3')
        plt.title(f"Répartition de '{col}' par rapport à '{target_col}'")
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

    # Analysis of numerical variables in relation to the target
    numeric_cols = ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'euribor3m']
    for col in numeric_cols:
        print(f"\nMoyenne de '{col}' pour chaque classe de '{target_col}':")
        display(df.groupby(target_col)[col].mean())

        plt.figure(figsize=(8, 5))
        sns.boxplot(data=df, x=target_col, y=col, palette='Set2')
        plt.title(f"Relation entre '{col}' et '{target_col}'")
        plt.xlabel(target_col.capitalize())
        plt.ylabel(col.capitalize())
        plt.show()

# Run the EDA on the loaded dataset
exploratory_data_analysis(data, target_col='y')

In [None]:
# 4. Imputation of the missing values
def impute_missing_values(df):
    """
    Impute missing values: 'unknown' labels in categorical variables are replaced
    by the most frequent known label, NaN in numerical variables by the column mean.

    The column lists are hard-coded, so ``df`` must contain all of them
    (a missing column raises ``KeyError``). The DataFrame is modified in place
    and also returned.

    :param df: DataFrame.
    :return: DataFrame after imputation (the same object as ``df``).
    """
    # Categorical variables
    categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
    for col in categorical_cols:
        # The mode is computed on known labels only: if 'unknown' itself were the
        # most frequent value, replacing 'unknown' by the mode would change nothing.
        known_modes = df.loc[df[col] != 'unknown', col].mode()
        if known_modes.empty:
            # Every value is 'unknown' (or missing): nothing to impute from.
            continue
        df[col] = df[col].replace('unknown', known_modes.iloc[0])

    # Numerical variables
    numeric_cols = ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
    for col in numeric_cols:
        mean_value = df[col].mean()
        # Assign the result back instead of calling fillna(inplace=True) on the
        # selected column: the chained in-place form is deprecated and does not
        # update the parent DataFrame under pandas Copy-on-Write.
        df[col] = df[col].fillna(mean_value)

    print("Missing values imputed successfully..")
    return df

# Apply imputation of the missing values ('unknown' labels and NaN) to the dataset
data = impute_missing_values(data)

In [None]:
# 5. Encoding categorical variables using pd.get_dummies
def encode_categorical_variables_with_dummies(df, target_col='y'):
    """
    Encoding categorical variables using pd.get_dummies (One-Hot Encoding).

    Every text column except the target is expanded into 0/1 indicator columns
    (all levels kept, no reference level dropped). The 'duration' column is
    removed when present, and the target is mapped from 'no'/'yes' to 0/1.
    The input DataFrame is not modified.

    :param df: DataFrame.
    :param target_col: Name of the target value (Response value).
    :return: Encoded DataFrame.
    """
    # Separation of categorical and numerical variables.
    # Both object columns and pandas string-dtype columns are treated as
    # categorical: comparing the dtype to 'object' alone would leave
    # string-dtype columns unencoded.
    categorical_cols = [
        col for col in df.columns
        if col != target_col
        and (pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col]))
    ]

    # Encoding categorical variables
    df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=False, dtype=int)

    # Remove 'duration' variable to prevent overfitting
    # NOTE(review): presumably because the call duration is only known once the
    # call is over — confirm against the dataset description.
    if 'duration' in df_encoded.columns:
        df_encoded = df_encoded.drop(columns=['duration'])
        print("'Duration' variable removed to prevent overfitting.")

    # Conversion of the target value into numeric (0 for 'no', 1 for 'yes').
    # Any label other than 'no'/'yes' is not in the mapping and becomes NaN.
    df_encoded[target_col] = df_encoded[target_col].map({'no': 0, 'yes': 1})

    print("Categorical variables successfully encoded (One-Hot Encoding).")
    return df_encoded

# Apply one-hot encoding to the categorical variables (the target 'y' is mapped to 0/1)
data_encoded = encode_categorical_variables_with_dummies(data, target_col = 'y')


In [None]:
# 6. Application of SMOTE method to rebalance the classes.
from imblearn.over_sampling import SMOTE

def apply_smote(X, y):
    """
    Oversample the data with SMOTE and show the class counts before and after.

    The class counts are printed and drawn as two side-by-side count plots.

    :param X: Training data (features).
    :param y: Target values (response); must provide ``value_counts()``.
    :return: Tuple (resampled features, resampled target).
    """
    sampler = SMOTE(random_state=42)
    X_balanced, y_balanced = sampler.fit_resample(X, y)

    # Class counts before and after oversampling
    print("\nDistribution before SMOTE :")
    print(y.value_counts())

    print("\nDistribution after SMOTE :")
    print(pd.Series(y_balanced).value_counts())

    # One figure, two panels: original target on the left, resampled on the right
    plt.figure(figsize=(8, 4))
    panels = ((1, y, "before SMOTE"), (2, y_balanced, "after SMOTE"))
    for position, labels, caption in panels:
        plt.subplot(1, 2, position)
        sns.countplot(x=labels)
        plt.title(caption)

    plt.tight_layout()
    plt.show()

    return X_balanced, y_balanced

# Apply class rebalancing with SMOTE
# The target column is dropped from the feature matrix: passing data_encoded as-is
# would feed 'y' to SMOTE as a feature and leave it inside X_resampled.
# NOTE(review): the resampling runs on the whole dataset (no train/test split yet),
# and X_resampled / y_resampled are not referenced again in the visible cells.
X_resampled, y_resampled = apply_smote(data_encoded.drop(columns=["y"]), data_encoded["y"])

In [None]:
# 7. Selection of important features using Random Forest
def select_important_features_random_forest(X, y):
    """
    Rank features with a Random Forest and keep the above-average ones.

    A RandomForestClassifier (random_state=42) is fitted on ``X``/``y``; a
    feature is kept when its importance is strictly greater than the mean
    importance over all features. All importances are also drawn as a bar plot.

    :param X: Training data (features) as a DataFrame; column names are used as feature names.
    :param y: Target values (response).
    :return: List of selected feature names.
    """
    forest = RandomForestClassifier(random_state=42)
    forest.fit(X, y)
    scores = forest.feature_importances_

    # Keep the features scoring strictly above the mean importance
    cutoff = np.mean(scores)
    important_features = [name for name, score in zip(X.columns, scores) if score > cutoff]

    # Bar plot of every feature's importance (in column order)
    plt.figure(figsize=(20, 12))
    sns.barplot(x=scores, y=X.columns, palette='viridis')
    plt.title("Importance des Variables selon Random Forest")
    plt.xlabel("Importance")
    plt.ylabel("Variables")
    plt.show()

    print("Features importantes selon Random Forest :", important_features)
    return important_features

# Separation of features and target
X = data_encoded.drop(columns=['y'])
y = data_encoded['y']

# Apply selection of important features using Random Forest.
# NOTE(review): the selector is fitted on the full dataset, i.e. before the
# train/test split performed further down, so held-out rows influence which
# features are kept.
important_features_rf = select_important_features_random_forest(X, y)
# Feature matrix restricted to the columns kept by the Random Forest selector
X_rf = X[important_features_rf]

In [None]:
# 8. Selection of important features using PCA
def select_important_features_pca(X, explained_variance_threshold=0.95, top_n_per_component=3):
    """
    Use PCA to reduce dimensionality and identify important variables.

    The features are standardised, a PCA is fitted with
    ``n_components=explained_variance_threshold``, and for every retained
    component the ``top_n_per_component`` variables with the largest absolute
    loading are collected. Duplicates are removed while keeping the order in
    which the variables were first selected.

    :param X: Training data (features) as a DataFrame; column names are used as variable names.
    :param explained_variance_threshold: Explained variance threshold, passed to PCA as ``n_components``.
    :param top_n_per_component: Number of variables to select per principal component.
    :return: List of selected variables, DataFrame of variable weights
             (one row per component ``PC1``..``PCk``, one column per variable).
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    pca = PCA(n_components=explained_variance_threshold)
    X_pca = pca.fit_transform(X_scaled)

    print(f"Number of retained principal components : {X_pca.shape[1]}")
    print(f"Cumulative explained variance percentage : {np.sum(pca.explained_variance_ratio_ * 100):.2f}%")

    # Loadings: rows are components, columns are the original variables
    component_weights = pd.DataFrame(pca.components_, columns=X.columns, index=[f"PC{i+1}" for i in range(pca.n_components_)])

    selected_variables = []
    for pc in component_weights.index:
        top_variables = component_weights.loc[pc].abs().nlargest(top_n_per_component).index.tolist()
        selected_variables.extend(top_variables)
        print(f"\nTop {top_n_per_component} variables pour {pc}: {top_variables}")

    # Order-preserving de-duplication. list(set(...)) returned the names in an
    # order that can change between interpreter runs, which changed the column
    # order of the feature matrix built from this list and made downstream
    # results non-reproducible.
    selected_variables = list(dict.fromkeys(selected_variables))
    return selected_variables, component_weights

# Selection of the important variables with PCA (3 variables kept per component)
selected_variables_pca, component_weights = select_important_features_pca(X, top_n_per_component=3)
print("\nList of variables selected by PCA :")
print(selected_variables_pca)

In [None]:
# 9. Model development and evaluation using ROC AUC and confusion matrix
def develop_and_evaluate_models(X_train, X_test, y_train, y_test, feature_set_label="ACP"):
    """
    Fit three classifiers and evaluate each of them on the test set.

    Logistic Regression, Random Forest and XGBoost are trained on
    ``X_train``/``y_train``. For every model the accuracy and ROC-AUC on the
    test set are computed, and a confusion-matrix heatmap and a ROC curve are drawn.

    :param X_train: Training features.
    :param X_test: Test features.
    :param y_train: Training target.
    :param y_test: Test target.
    :param feature_set_label: Name of the feature-selection method shown in the
        plot titles. Defaults to "ACP", the label that used to be hard-coded,
        so existing calls keep producing the same titles.
    :return: Dict mapping model name to {"Accuracy": float, "ROC-AUC": float}.
    """
    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Random Forest": RandomForestClassifier(random_state=42),
        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    }

    results = {}
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # Probability of the positive class (second column), used for ROC / AUC
        y_prob = model.predict_proba(X_test)[:, 1]

        # Metrics calculation
        accuracy = accuracy_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_prob)

        # Confusion matrix
        cm = confusion_matrix(y_test, y_pred)
        plt.figure(figsize=(6, 4))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
        plt.title(f"Confusion Matrix - {model_name} ({feature_set_label})")
        plt.xlabel("Predicted values")
        plt.ylabel("Actual values")
        plt.show()

        # ROC curve, with the diagonal of a random classifier as reference
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        plt.figure(figsize=(6, 4))
        plt.plot(fpr, tpr, label=f"{model_name} (AUC = {roc_auc:.2f})")
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel("False Positif Rate")
        plt.ylabel("True Positif Rate")
        plt.title(f"Courbe ROC - {model_name} ({feature_set_label})")
        plt.legend()
        plt.show()

        results[model_name] = {"Accuracy": accuracy, "ROC-AUC": roc_auc}

    return results

# Split data into training and testing sets.
# Both calls use the same y, test_size and random_state, so they select the same
# rows; that is why the target split of the second call can be discarded.
X_train_pca, X_test_pca, y_train, y_test = train_test_split(X[selected_variables_pca], y, test_size=0.2, random_state=42)
X_train_rf, X_test_rf, _, _ = train_test_split(X[important_features_rf], y, test_size=0.2, random_state=42)

# Model evaluation using ACP
print("\nModel evaluation using ACP :")
results_pca = develop_and_evaluate_models(X_train_pca, X_test_pca, y_train, y_test, feature_set_label="ACP")

# Model evaluation using Random Forest
# The label is passed explicitly so these plots are no longer titled "(ACP)".
print("\nModel evaluation using Random Forest :")
results_rf = develop_and_evaluate_models(X_train_rf, X_test_rf, y_train, y_test, feature_set_label="Random Forest")

In [None]:
# 10. Comparison of results between ACP & Random Forest
print("\n--- Comparison of results ---")

# "\R" was an invalid escape sequence (it printed a literal backslash and triggers
# a syntax warning on recent Python versions); a leading newline was intended,
# as in the Random Forest header below.
print("\nResults with ACP :")
for model, metrics in results_pca.items():
    print(f"{model}: Accuracy = {metrics['Accuracy']:.4f}, ROC-AUC = {metrics['ROC-AUC']:.4f}")

print("\nResults with Random Forest :")
for model, metrics in results_rf.items():
    print(f"{model}: Accuracy = {metrics['Accuracy']:.4f}, ROC-AUC = {metrics['ROC-AUC']:.4f}")

# Identification of the best model and features selection method:
# best model (highest ROC-AUC) within each feature set first, then the two winners are compared.
best_model_pca = max(results_pca, key=lambda k: results_pca[k]['ROC-AUC'])
best_model_rf = max(results_rf, key=lambda k: results_rf[k]['ROC-AUC'])

# Strict comparison: on a tie the Random Forest feature set is reported.
if results_pca[best_model_pca]['ROC-AUC'] > results_rf[best_model_rf]['ROC-AUC']:
    print(f"\nBest features selection method: ACP with {best_model_pca} (ROC-AUC = {results_pca[best_model_pca]['ROC-AUC']:.4f})")
else:
    print(f"\nBest features selection method: Random Forest with {best_model_rf} (ROC-AUC = {results_rf[best_model_rf]['ROC-AUC']:.4f})")