## Importing Libraries & Loading Data

In [173]:
import pandas as pd
import sklearn

df = pd.read_csv("/Users/alejandrolondono/Downloads/Churn_Modelling.csv")

In [174]:
df.head() # taking a look at the structure of our data

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


## Data Cleaning

In [175]:
column_names = df.columns.tolist() # returning column names as a list of values
column_names

['RowNumber',
 'CustomerId',
 'Surname',
 'CreditScore',
 'Geography',
 'Gender',
 'Age',
 'Tenure',
 'Balance',
 'NumOfProducts',
 'HasCrCard',
 'IsActiveMember',
 'EstimatedSalary',
 'Exited']

#### Dropping Irrelevant Columns

In [176]:
df.drop(['RowNumber','CustomerId','Surname'], axis=1, inplace=True) # These columns are irrelevant to our prediction
print(df)

      CreditScore Geography  Gender  Age  Tenure    Balance  NumOfProducts  \
0             619    France  Female   42       2       0.00              1   
1             608     Spain  Female   41       1   83807.86              1   
2             502    France  Female   42       8  159660.80              3   
3             699    France  Female   39       1       0.00              2   
4             850     Spain  Female   43       2  125510.82              1   
...           ...       ...     ...  ...     ...        ...            ...   
9995          771    France    Male   39       5       0.00              2   
9996          516    France    Male   35      10   57369.61              1   
9997          709    France  Female   36       7       0.00              1   
9998          772   Germany    Male   42       3   75075.31              2   
9999          792    France  Female   28       4  130142.79              1   

      HasCrCard  IsActiveMember  EstimatedSalary  Exited  
0   

#### One Hot Encoding

In [177]:
df = pd.get_dummies(df, columns=['Geography','Gender'], drop_first=True) # One Hot encoding categorical features
df.head() # displaying first few rows to verify encoding

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,619,42,2,0.0,1,1,1,101348.88,1,False,False,False
1,608,41,1,83807.86,1,0,1,112542.58,0,False,True,False
2,502,42,8,159660.8,3,1,0,113931.57,1,False,False,False
3,699,39,1,0.0,2,0,0,93826.63,0,False,False,False
4,850,43,2,125510.82,1,1,1,79084.1,0,False,True,False


#### Converting Booleans to Binary

In [178]:
df['Geography_Germany'] = df['Geography_Germany'].astype(int)
df['Geography_Spain'] = df['Geography_Spain'].astype(int)
df['Gender_Male'] = df['Gender_Male'].astype(int)
print(df.dtypes) # displaying the data types to verify conversion
df.head() # displaying first few rows

CreditScore            int64
Age                    int64
Tenure                 int64
Balance              float64
NumOfProducts          int64
HasCrCard              int64
IsActiveMember         int64
EstimatedSalary      float64
Exited                 int64
Geography_Germany      int64
Geography_Spain        int64
Gender_Male            int64
dtype: object


Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,619,42,2,0.0,1,1,1,101348.88,1,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,1,0
2,502,42,8,159660.8,3,1,0,113931.57,1,0,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,0,1,0


# Our Models

## 1st Algorithm | Logistic Regression

In [179]:
# Importing classes from the sklearn library
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

### Splitting Train/Test set

In [180]:
y = df['Exited'] # target variable
X = df.drop(columns=['Exited'], axis=1) # features, dropped target variable

X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.20, random_state=67, stratify=y) # Splitting Training and Testing data (80/20

### Splitting Train/Valid set

In [181]:
# Again we split data now into training and validation. another 80/20
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.20, random_state=67, stratify=y_temp)

### Scaling

In [182]:
scaler = StandardScaler() # Initialize
X_train_scaled = scaler.fit_transform(X_train) # Fitting/Transforming our training data
# Transforming Validation and Test Data
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

### Training the Logistic Regression Model

In [183]:
log_reg = LogisticRegression(random_state=67) # Initialization
log_reg.fit(X_train_scaled, y_train) # Training data. Fitting

In [184]:
# Will list biggest risk and retention factors

# Get the coefficients
lr_coefficients = log_reg.coef_[0]

# Getting the feature names (the columns of X)
feature_names = X.columns

# Combining both names and numbers
feature_impact = pd.Series(lr_coefficients, index=feature_names)

# Sorting to find the highest positive and negative impacts

# Most Likely to Churn (Highest Positive Impact)
print("Top 3 Risk Factors (Positive Impact):\n", feature_impact.nlargest(3))

# Most Likely to Stay (Highest Negative Impact)
print("\nTop 3 Retention Factors (Negative Impact):\n", feature_impact.nsmallest(3))

Top 3 Risk Factors (Positive Impact):
 Age                  0.762387
Geography_Germany    0.321354
Balance              0.211343
dtype: float64

Top 3 Retention Factors (Negative Impact):
 IsActiveMember   -0.540419
Gender_Male      -0.281427
Tenure           -0.055570
dtype: float64


### Making Predictions

In [185]:
y_val_pred = log_reg.predict(X_val_scaled) # Predicting on the validation set (scaled)
y_val_proba = log_reg.predict_proba(X_val_scaled) # Predicting probabilities on the validation set (scaled)

In [186]:
log_reg_cm = confusion_matrix(y_val, y_val_pred) # confusion matrix | Actual values, and predicted values
print(log_reg_cm)

[[1216   58]
 [ 253   73]]


In [187]:
log_reg_cr = classification_report(y_val, y_val_pred) # creating the full classification report
print("\nLogistic Regression:")
print(log_reg_cr)


Logistic Regression:
              precision    recall  f1-score   support

           0       0.83      0.95      0.89      1274
           1       0.56      0.22      0.32       326

    accuracy                           0.81      1600
   macro avg       0.69      0.59      0.60      1600
weighted avg       0.77      0.81      0.77      1600



## 2nd Algorithm | Naive Bayes

In [188]:
from sklearn.naive_bayes import GaussianNB

In [189]:
y = df.Exited # target variable
X = df.drop('Exited', axis=1) # features, dropping the target variable

### Training and Predicting

In [190]:
gnb = GaussianNB() # Creating an instance
gnb.fit(X_train_scaled, y_train) # training
y_val_pred_nb = gnb.predict(X_val_scaled) # making predictions on new data
y_val_proba_nb = gnb.predict_proba(X_val_scaled) # Predicting probabilities

### Evaluation

In [191]:
nb_cm = confusion_matrix(y_val, y_val_pred_nb) # creating the confusion matrix
print(nb_cm)

[[1190   84]
 [ 208  118]]


In [192]:
nb_cr = classification_report(y_val, y_val_pred_nb) # creating the classification report
print("\nNaive Bayes:")
print(nb_cr)


Naive Bayes:
              precision    recall  f1-score   support

           0       0.85      0.93      0.89      1274
           1       0.58      0.36      0.45       326

    accuracy                           0.82      1600
   macro avg       0.72      0.65      0.67      1600
weighted avg       0.80      0.82      0.80      1600



## 3rd Algorithm | Neural Network

In [193]:
from sklearn.neural_network import MLPClassifier

In [194]:
# Model
mlp = MLPClassifier(hidden_layer_sizes=(10, 5), max_iter=500, random_state=67) # setting up the architecture of our model
mlp.fit(X_train_scaled, y_train) # we will fit the model on the scaled training data

In [195]:
y_val_pred_nn = mlp.predict(X_val_scaled) # making predictions on new data
y_val_proba_nn = mlp.predict_proba(X_val_scaled) # making predictions on new data

### Evaluation

In [196]:
mlp_cm = confusion_matrix(y_val, y_val_pred_nn)
print("\nNeural Network Untuned:")
print(mlp_cm)
print("\nNeural Network Untuned:")
print(classification_report(y_val, y_val_pred_nn))


Neural Network Untuned:
[[1216   58]
 [ 164  162]]

Neural Network Untuned:
              precision    recall  f1-score   support

           0       0.88      0.95      0.92      1274
           1       0.74      0.50      0.59       326

    accuracy                           0.86      1600
   macro avg       0.81      0.73      0.75      1600
weighted avg       0.85      0.86      0.85      1600



##### I've decided to tune the model because despite a high overall Accuracy (0.86), the model shows a severe bias toward the majority class (non-churn=0). This is shown by the low F1-Score for the minority class (Churn=1). This bias is due to the class imbalance in the banking dataset, where non-churners outnumber churners by far.

### Tune the model

In [197]:
# Importing the necessary over-sampling library
from imblearn.over_sampling import SMOTE
from sklearn.neural_network import MLPClassifier

##### The untuned model had a low F1-Score (0.59) for the minority class. This was likely due to the bias from the data imbalance. I used SMOTE (Synthetic Minority Over-sampling Technique) to artificially balance the training data. Which forces the model to learn the patterns of the 'Churn'(minority) class effectively.

In [198]:
# Initializing SMOTE
smote = SMOTE(random_state=67)

# Applying SMOTE only to the training data to make it a balanced set. We should then have a 50/50 class distribution.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Initializing the (new)MLP model using the same parameters that we used in the original.
mlp_tuned = MLPClassifier(
    hidden_layer_sizes=(12, 6),
    max_iter=500,
    random_state=67
)

# I fitted the model using the balanced training data. This way the model can learn from an equal number of churn and no-churn examples.
mlp_tuned.fit(X_train_resampled, y_train_resampled)

# Now we are making predictions on the original (imbalanced) validation data.
y_val_pred_tuned = mlp_tuned.predict(X_val_scaled)

### Evaluate the Tuned

In [199]:
# Confusion Matrix (using the tuned prediction)
mlp_cm_tuned = confusion_matrix(y_val, y_val_pred_tuned)
print("Confusion Matrix (Tuned):\n", mlp_cm_tuned)

# 2. Classification Report (Using the same tuned prediction)
print("\nNeural Network Classification Report (Tuned):\n", classification_report(y_val, y_val_pred_tuned))

Confusion Matrix (Tuned):
 [[1028  246]
 [  82  244]]

Neural Network Classification Report (Tuned):
               precision    recall  f1-score   support

           0       0.93      0.81      0.86      1274
           1       0.50      0.75      0.60       326

    accuracy                           0.80      1600
   macro avg       0.71      0.78      0.73      1600
weighted avg       0.84      0.80      0.81      1600



# Test Set Evaluation

In [200]:
# We are making predictions on the unseen Test Set
y_test_pred_final = mlp_tuned.predict(X_test_scaled) # Predicts the class labels (0 or 1)

# Printing the final, unbiased Confusion Matrix
print("\nFINAL TEST SET CONFUSION MATRIX")
final_cm = confusion_matrix(y_test, y_test_pred_final)
print(final_cm) # Providing the final TP, TN, FP, FN numbers

# Printing the final Classification Report
print("FINAL TEST SET CLASSIFICATION REPORT")
print(classification_report(y_test, y_test_pred_final))


FINAL TEST SET CONFUSION MATRIX
[[1329  264]
 [ 119  288]]
FINAL TEST SET CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       0.92      0.83      0.87      1593
           1       0.52      0.71      0.60       407

    accuracy                           0.81      2000
   macro avg       0.72      0.77      0.74      2000
weighted avg       0.84      0.81      0.82      2000

