In [None]:
# Standard libraries
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline

# Other libraries and methods used in the code
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
def adjusting_non_numerical_attributes(column, df):
    """Label-encode one column of ``df`` in place.

    A fresh ``LabelEncoder`` is fitted on ``df[column]`` and the column is
    overwritten with the resulting integer codes. Nothing is returned and the
    fitted encoder is discarded.

    Parameters
    ----------
    column
        Label of the column to encode.
    df
        Table whose ``column`` entry is replaced by its encoded values.
    """
    # fit_transform learns the classes and encodes the values in a single call.
    df[column] = preprocessing.LabelEncoder().fit_transform(df[column])

In [None]:
def apply_the_split(X, y, test_size=0.2, random_state=None):
    """Split features and target into stratified train and test sets.

    Parameters
    ----------
    X
        Feature table.
    y
        Target labels; also used as the stratification key so both parts
        keep the class proportions of ``y``.
    test_size
        Forwarded to ``train_test_split``. Defaults to 0.2, the value that
        was previously hard-coded.
    random_state
        Seed forwarded to ``train_test_split``. The default ``None`` keeps
        the previous unseeded behaviour; pass an integer to make the split
        (and every metric derived from it) reproducible between runs.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        stratify=y,
        random_state=random_state,
    )
    return (X_train, X_test, y_train, y_test)

In [None]:
def train_the_model_and_make_predictions(X_train, y_train, X_test):
    """Fit a Gaussian Naive Bayes model and predict labels for ``X_test``.

    Parameters
    ----------
    X_train, y_train
        Training features and their labels.
    X_test
        Features to predict labels for.

    Returns
    -------
    The predictions of the fitted model for ``X_test``. The model itself is
    local to this function and is not returned.
    """
    model = GaussianNB()
    model.fit(X_train, y_train)
    return model.predict(X_test)

In [None]:
# Load the data file and discard the two columns whose names start with
# 'Naive_Bayes_Classifier_' before any further processing
unused_columns = [
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
]
df = pd.read_csv('BankChurners.csv').drop(columns=unused_columns)
df.info()

In [None]:
# Label-encode every column whose dtype is object; other columns are left as they are
for column in df.columns:
    if df[column].dtype != 'object':
        continue
    adjusting_non_numerical_attributes(column, df)
df.info()

In [None]:
# Count the missing values in each column.
# NOTE(review): in notebook order this runs after the object columns were
# label-encoded, so it presumably can only reveal gaps in the columns that
# were numeric from the start - confirm, or move this check before the encoding.
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# Annotated correlation heatmap of all attributes, used to spot highly correlated pairs
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(20, 20))
sn.heatmap(data=corr_matrix, annot=True, ax=ax)
fig.savefig("heatmap_nb.png")

In [None]:
# Visualizing the data distribution of an example of uncorrelated attributes.
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so use whichever name is installed.
whitegrid_style = ('seaborn-v0_8-whitegrid'
                   if 'seaborn-v0_8-whitegrid' in plt.style.available
                   else 'seaborn-whitegrid')
plt.style.use(whitegrid_style)
fig, ax = plt.subplots(figsize=(10, 6))

# Plot the data, colouring each point by its target value
scatter = ax.scatter(x=df["Months_on_book"],
                     y=df["Marital_Status"],
                     c=df["Attrition_Flag"],
                     cmap='PiYG',
                     marker='*')

# Customize the plot
ax.set(title="Dataset attributes",
       xlabel="Months_on_book",
       ylabel="Marital_Status")

# Add a legend with one entry per target value
ax.legend(*scatter.legend_elements(), title="Target")
fig.savefig("relation_btw_uncorrelated_attributes.png")

In [None]:
# Visualizing the data distribution of an example of correlated attributes.
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so use whichever name is installed.
whitegrid_style = ('seaborn-v0_8-whitegrid'
                   if 'seaborn-v0_8-whitegrid' in plt.style.available
                   else 'seaborn-whitegrid')
plt.style.use(whitegrid_style)
fig, ax = plt.subplots(figsize=(10, 6))

# Plot the data, colouring each point by its target value
scatter = ax.scatter(x=df["Credit_Limit"],
                     y=df["Avg_Open_To_Buy"],
                     c=df["Attrition_Flag"],
                     cmap='PiYG',
                     marker='*')

# Customize the plot
ax.set(title="Dataset attributes",
       xlabel="Credit_Limit",
       ylabel="Avg_Open_To_Buy")

# Add a legend with one entry per target value
ax.legend(*scatter.legend_elements(), title="Target")
fig.savefig("relation_btw_correlated_attributes_2.png")

In [None]:
# Visualizing the data distribution of another example of correlated attributes.
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so use whichever name is installed.
whitegrid_style = ('seaborn-v0_8-whitegrid'
                   if 'seaborn-v0_8-whitegrid' in plt.style.available
                   else 'seaborn-whitegrid')
plt.style.use(whitegrid_style)
fig, ax = plt.subplots(figsize=(10, 6))

# Plot the data, colouring each point by its target value
scatter = ax.scatter(x=df["Months_on_book"],
                     y=df["Customer_Age"],
                     c=df["Attrition_Flag"],
                     cmap='PiYG',
                     marker='*')

# Customize the plot
ax.set(title="Dataset attributes",
       xlabel="Months_on_book",
       ylabel="Customer_Age")

# Add a legend with one entry per target value
ax.legend(*scatter.legend_elements(), title="Target")
fig.savefig("relation_btw_correlated_attributes.png")

In [None]:
# Discarding highly correlated attributes with many possible values for each
# (original note: high correlation with Months_on_book and many possible values)
correlated_columns = ['Customer_Age', 'Total_Trans_Ct', 'Total_Revolving_Bal', 'Credit_Limit']
df = df.drop(columns=correlated_columns)

In [None]:
# Separate the target column from the features, then build the train and test sets.
# NOTE(review): every remaining column is used as a feature; if the CSV carries
# a row-identifier column it is presumably still included here - confirm.
y = df['Attrition_Flag']
X = df.drop(columns=['Attrition_Flag'])
X_train, X_test, y_train, y_test = apply_the_split(X, y)

In [None]:
# Baseline: train on the raw (unscaled, unresampled) training split and
# report the metrics obtained on the test split
y_pred = train_the_model_and_make_predictions(X_train=X_train, y_train=y_train, X_test=X_test)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Analyzing the Confusion Matrix of the model trained on the raw data.
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so use whichever name is installed.
white_style = ('seaborn-v0_8-white'
               if 'seaborn-v0_8-white' in plt.style.available
               else 'seaborn-white')
plt.style.use(white_style)
fig, ax = plt.subplots(figsize=(10, 6))

# The fitted classifier only exists inside train_the_model_and_make_predictions
# (no `clf` is defined at notebook level), so the matrix is built from the
# held-out labels and the predictions already computed for them. This also
# keeps the matrix consistent with the classification report, instead of
# scoring the model on rows it was trained on.
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_pred, ax=ax, cmap='gist_ncar')
#fig.savefig("cm_w_warning.png")

In [None]:
# Trying the oversampling method to see how this affects the metrics.
# Split first and oversample ONLY the training part: resampling before the
# split would put copies of the same minority rows in both train and test
# sets, which leaks information and inflates the reported metrics.
X_train, X_test, y_train, y_test = apply_the_split(X, y)
oversample = RandomOverSampler(sampling_strategy='minority')
X_over, y_over = oversample.fit_resample(X_train, y_train)

# Retraining the model on the balanced training data and predicting the
# untouched test set
y_pred = train_the_model_and_make_predictions(X_over, y_over, X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Trying the undersampling method to see how this affects the metrics.
# Split first and undersample ONLY the training part, so the test set keeps
# the original class proportions and the metrics describe real-world data.
X_train, X_test, y_train, y_test = apply_the_split(X, y)
undersample = RandomUnderSampler(sampling_strategy='majority')
X_under, y_under = undersample.fit_resample(X_train, y_train)

# Retraining the model on the balanced training data and predicting the
# untouched test set
y_pred = train_the_model_and_make_predictions(X_under, y_under, X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Normalizing the data (by feature) to the same scale, analyzing if this
# improves the accuracy and the other metrics.
# axis=0 makes normalize() rescale each feature column rather than each row.
d = preprocessing.normalize(X, axis=0)
# Reuse X's index as well as its columns so scaled_X stays row-aligned with y
# even if X does not carry the default 0..n-1 index.
scaled_X = pd.DataFrame(d, columns=X.columns, index=X.index)
scaled_X.info()

In [None]:
# Splitting the data into train and test sets, this time with the normalized X
X_train, X_test, y_train, y_test = apply_the_split(X=scaled_X, y=y)

# Retrain on the normalized training split and report the test-set metrics
y_pred = train_the_model_and_make_predictions(X_train=X_train, y_train=y_train, X_test=X_test)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Analyzing the Confusion Matrix of the model trained on the normalized data.
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so use whichever name is installed.
white_style = ('seaborn-v0_8-white'
               if 'seaborn-v0_8-white' in plt.style.available
               else 'seaborn-white')
plt.style.use(white_style)
fig, ax = plt.subplots(figsize=(10, 6))

# The fitted classifier only exists inside train_the_model_and_make_predictions
# (no `clf_norm` is defined at notebook level), so the matrix is built from
# the held-out labels and the predictions already computed for them. This
# also keeps the matrix consistent with the classification report, instead
# of scoring the model on rows it was trained on.
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_pred, ax=ax, cmap='gist_ncar')
fig.savefig("cm_wo_warning.png")