In [129]:
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go

import pickle
import gzip

from imblearn.over_sampling import RandomOverSampler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split

In [130]:
churners_df = pd.read_csv("../../data/BankChurners.csv", index_col='CLIENTNUM', sep= ',')

churners_df.shape

# no missing values --> some 'Unknown' values
churners_df.isnull().sum()

attrited = churners_df[churners_df['Attrition_Flag'] == "Attrited Customer"].value_counts().sum()
existing = churners_df[churners_df['Attrition_Flag'] == "Existing Customer"].value_counts().sum()

print(f'Amount of attrited clients: {attrited}')
print(f'Amount of existing clients: {existing}')

print(churners_df.columns)

# # Assuming 'df' is your DataFrame containing the dataset
# current_month = 2024 * 12 + 5  # Convert reference date to months
# df['Customer_Tenure'] = current_month - df['Months_on_book']

Amount of attrited clients: 1627
Amount of existing clients: 8500
Index(['Attrition_Flag', 'Customer_Age', 'Gender', 'Dependent_count',
       'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category',
       'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon',
       'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
       'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
       'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',
       'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
       'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'],
      dtype='object')


In [131]:
def oversampling(X,y):
    ros = RandomOverSampler(sampling_strategy='not majority', random_state=42)
    X,y = ros.fit_resample(X,y)
    return X,y

In [132]:
def cleaning_data(df):
    df.drop(columns=["Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1",
                     "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2"], 
                     inplace=True)
    # no missing values --> some 'Unknown' values
    df.isnull().sum()
    
        ## Handling Outliers
    Q1 = df['Months_on_book'].quantile(0.25) # Calculate Q1
    Q3 = df['Months_on_book'].quantile(0.75) # Calculate Q3

    IQR = Q3 - Q1 # Calculate IQ range

    lower_bound = Q1 - 1.5 * IQR # Calculate the lower bound
    upper_bound = Q3 + 1.5 * IQR # Calculate the upper bound

    df = df[(df['Months_on_book'] >= lower_bound) & (df['Months_on_book'] <= upper_bound)]

    #change 'Unknown' values in columns to None or Nan
    df = df.applymap(lambda x: None if x == 'Unknown' else x)
    print(df.isnull().sum())
    return df

In [133]:
def adding_features(df):
    return df

In [134]:
def imputer(train, test):
    imputer_cat = SimpleImputer(strategy='most_frequent')
    categorical_features = train.select_dtypes(include=['object']).columns
    train[categorical_features] = imputer_cat.fit_transform(train[categorical_features])
    test[categorical_features] = imputer_cat.transform(test[categorical_features])

    imputer_num = SimpleImputer(strategy='median')
    numerical_features = train.select_dtypes(include=['int64', 'float64']).columns
    train[numerical_features] = imputer_num.fit_transform(train[numerical_features])
    test[numerical_features] = imputer_num.transform(test[numerical_features])

    return train, test

In [135]:
def encode(train, test):
    # encode categorical features
    categorical_cols = train.select_dtypes('object').columns

    encoder = LabelEncoder()
    for col in categorical_cols:
        train[col] = encoder.fit_transform(train[col])
        
        #If a value is encountered in the test data that wasn't seen during training (i.e., not present in the label encoder's classes), 
        #it's assigned a placeholder value of -1. This ensures that unseen categorical values in the test data are handled appropriately 
        #during the encoding process.
        test[col] = test[col].map(lambda s: encoder.transform([s])[0] if s in encoder.classes_ else -1)
    
    return train, test

In [136]:
'''MinMaxScaler 
--> scales values linearly, without distorting the relative relationships between the variables.
--> brings all values in the range of 0 and 1, preventing any single feature form dominating the learning process
--> Less sensitive to outliers'''
def scaler(X_train, X_test):
    scaler = MinMaxScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train),columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test),columns=X_test.columns)
    
    return X_train, X_test

In [137]:
def preprocess(df):
    df = cleaning_data(df)
    df = adding_features(df)

    X = churners_df.drop('Attrition_Flag', axis=1)
    y = churners_df['Attrition_Flag']

    X,y = oversampling(X,y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    X_train, X_test= imputer(X_train, X_test)
    X_train, X_test= encode(X_train, X_test)
    X_train, X_test = scaler(X_train, X_test)

    # call def that exports preprocess models

    return X_train, X_test, y_train, y_test, 

In [138]:
X_train, X_test, y_train, y_test = preprocess(churners_df)
X_train.head()

Attrition_Flag                 0
Customer_Age                   0
Gender                         0
Dependent_count                0
Education_Level             1457
Marital_Status               727
Income_Category             1047
Card_Category                  0
Months_on_book                 0
Total_Relationship_Count       0
Months_Inactive_12_mon         0
Contacts_Count_12_mon          0
Credit_Limit                   0
Total_Revolving_Bal            0
Avg_Open_To_Buy                0
Total_Amt_Chng_Q4_Q1           0
Total_Trans_Amt                0
Total_Trans_Ct                 0
Total_Ct_Chng_Q4_Q1            0
Avg_Utilization_Ratio          0
dtype: int64


  df = df.applymap(lambda x: None if x == 'Unknown' else x)


Unnamed: 0,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,0.571429,0.0,0.2,0.666667,0.666667,0.2,0.0,0.767442,0.0,0.5,0.5,0.001654,0.370282,0.016168,0.13718,0.059708,0.255814,0.092623,0.624625
1,0.571429,0.0,0.2,0.333333,0.333333,0.8,0.0,0.534884,0.4,0.166667,0.333333,0.081768,0.750099,0.065251,0.165146,0.196168,0.496124,0.216747,0.456456
2,0.52381,0.0,0.8,0.666667,1.0,0.8,0.0,0.697674,0.6,0.166667,0.5,0.027774,0.58085,0.025845,0.216073,0.23014,0.503876,0.276521,0.620621
3,0.261905,0.0,0.4,0.833333,1.0,0.8,1.0,0.372093,0.8,0.333333,1.0,0.325437,0.0,0.35349,0.26494,0.462682,0.651163,0.226979,0.0
4,0.238095,0.0,0.4,1.0,0.666667,0.8,0.0,0.534884,0.2,0.5,0.5,0.140055,0.0,0.175818,0.268472,0.50489,0.379845,0.243134,0.0
