In [1]:

from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import randint
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.feature_selection import SelectFromModel
from sklearn import svm
from sklearn.compose import make_column_transformer
from scipy.stats import randint
import seaborn as sns
from sklearn.decomposition import KernelPCA
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve
#from xgboost import XGBClassifier
#from xgboost import XGBRegressor
#from mlxtend.preprocessing import DenseTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report
# MLP for Pima Indians Dataset with 10-fold cross validation via sklearn
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC

In [58]:
import tensorflow as tf
from tensorflow import keras

In [2]:
import pandas as pd
import numpy as np

df=pd.read_csv(r'D:\Python Tutorial\Churn_Modelling.csv')
df = df.groupby('Exited').apply(lambda x: x.sample(frac=1))

# EDA- Exploratory data analysis

In [3]:
df=df.drop(columns=['RowNumber','CustomerId'],axis=1)

In [4]:
# Remove rows with na
df=df.dropna()

In [5]:
##Convert all features with unique values less than 5 to object
def convert_to_object(data):
    for i in data.columns:
        if data[i].nunique()<5:
            data[i]=data[i].astype('str')

In [6]:
convert_to_object(df)

In [7]:
##Create a function to drop a variable if it is object but nunique>20,
##There is something called as "Pass by value" in python ad doing df= in the last statement will not alter df
##while the same df= in for loop will work
#refer https://stackoverflow.com/questions/38895768/python-pandas-dataframe-is-it-pass-by-value-or-pass-by-reference/38925257#38925257
#https://stackoverflow.com/questions/70902808/for-loop-inside-a-def-function-not-working/70902856#70902856
def drop_object_nunique(data):
    for i in data.columns:
        if data[i].dtype=='object' and data[i].nunique()>20:
            data.drop(columns=[i],axis=1,inplace=True)

In [8]:
drop_object_nunique(df)

In [None]:
#!pip install imbalanced-learn
#from imblearn.over_sampling import SMOTE

for i in df.columns:
    if df[i].dtype=='object' and df[i].nunique()>20:
        df=df.drop(columns=[i],axis=1)

In [9]:
X=df.drop(columns=['Exited'],axis=1)
y=df['Exited']

In [10]:
##There is imbalance in the class of the prediction variable
y.value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

Use SMOTENC to solve the problem of imbalance class where X has categorical variables + numerical variables

In [None]:
# define function to select and pass only cat variables index location to SMOTEC without the loc of y variable

In [11]:
def SMOTEC_Cat(data):
    cat_bool=list([data.dtypes==object][0])
    loc=[]
    for i in range(len(cat_bool)):
        if cat_bool[i]==True:
            loc.append(i)
    return loc[:len(loc)-1]
#This will return the index location of all cat vars except the last y var, since we dont want it for SMOTEC function
SMOTEC_Cat(df)

[1, 2, 6, 7, 8]

In [None]:
#smote = SMOTENC(categorical_features =[df.dtypes==object],sampling_strategy='minority',random_state=0)

In [12]:
smote = SMOTENC(categorical_features =SMOTEC_Cat(df),sampling_strategy='minority',random_state=0)

In [13]:
#sm = SMOTENC(random_state=42, categorical_features=loc)
X, y = smote.fit_resample(X, y)

In [14]:
print(X.shape,",",y.shape)

(15926, 10) , (15926,)


In [None]:
#train Test split
#X_train_full,X_test_full,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

In [None]:
#print(X_train_full.shape,y_train.shape)
#print(X_test_full.shape,y_test.shape)

In [15]:
##Lets create a function to select only numerical variables
numerical_variables=[]
def select_num_variables(data,numerical_variables):
    for i in data.columns:
        if data[i].dtype in ['float64','int64']:
            numerical_variables.append(i)
            

In [16]:
select_num_variables(X,numerical_variables)

In [17]:
numerical_variables

['CreditScore', 'Age', 'Tenure', 'Balance', 'EstimatedSalary']

In [18]:
##Lets create a function to select only categorical variables
categorical_variables=[]
def select_cat_variables(data,categorical_variables):
    for i in data.columns:
        if data[i].dtype=='object':
            categorical_variables.append(i)
    

In [19]:
select_cat_variables(X,categorical_variables)

In [20]:
categorical_variables

['Geography', 'Gender', 'NumOfProducts', 'HasCrCard', 'IsActiveMember']

In [21]:
##Keep Selected Columns
my_cols=numerical_variables+categorical_variables
my_cols

['CreditScore',
 'Age',
 'Tenure',
 'Balance',
 'EstimatedSalary',
 'Geography',
 'Gender',
 'NumOfProducts',
 'HasCrCard',
 'IsActiveMember']

In [22]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

# Preprocessing transformation of numerical and categorical data

In [23]:
# numerical transformation
numerical_transformer=Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="constant")),
    ('scaler',  StandardScaler())
     ])

In [24]:
#Categorical transformation
categorical_transformer=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy="constant")),
    ('encoeder',OneHotEncoder(handle_unknown="ignore"))
])

In [25]:
#Bundle preprocessing of numerical and categorical data using ColumnTransformer
preprocessor=ColumnTransformer(transformers=[
                                             ('num',numerical_transformer,numerical_variables),
                                             ('cat',categorical_transformer,categorical_variables)
])

# Preprocess the data

In [None]:
#preprocessor.fit_transform(X_train)

In [26]:
X_train.shape

(12740, 10)

In [27]:
##This gives me numpy array
X_train_transformed=preprocessor.fit_transform(X_train)
X_train_transformed.shape

(12740, 18)

In [28]:
#Convert it to DataFrame to extract and rename features
X_df=pd.DataFrame(X_train_transformed)

In [29]:
#This will give me the categorical variables transformed
cat_names=preprocessor.transformers_[1][1]['encoeder']\
.get_feature_names(categorical_variables)

In [30]:
##Rename cat columns
for i,j in zip(range(5,18), range(len(cat_names))):
    X_df=X_df.rename(columns={X_df.columns[i]:cat_names[j]})
    
##Rename numerical columns
for i,j in zip(range(0,5), range(len(numerical_variables))):
    X_df=X_df.rename(columns={X_df.columns[i]:numerical_variables[j]})

# Feature selection

from sklearn.svm import LinearSVC
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_train_transformed, y_train)
model_lsvc = SelectFromModel(lsvc, prefit=True)
X_new = model_lsvc.transform(X_train_transformed)
X_new.shape[1]

In [31]:
from sklearn.svm import LinearSVC
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_df, y_train)
model_lsvc = SelectFromModel(lsvc, prefit=True)
X_new = model_lsvc.transform(X_df)
X_new.shape[1]

14

In [32]:
#This will give the position of selected features in fporm of list [0, 1, 2, 3, 4, 6, 9, 10, 11, 12, 17]
select_var_True=list(model_lsvc.get_support([0]))

In [33]:
#This will subset the X_df dataset to select only the importnat features
X_df_selected_features=X_df.iloc[:,select_var_True]

In [34]:
X_df_selected_features.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,EstimatedSalary,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male,NumOfProducts_2,NumOfProducts_3,HasCrCard_1,IsActiveMember_0,IsActiveMember_1
0,0.429726,-0.403114,-0.306699,-0.069754,0.339267,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
1,0.702982,0.598733,-1.412969,0.218925,1.687104,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
2,1.194845,0.698918,-0.306699,0.238759,0.759504,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
3,0.145538,-1.104407,1.537085,-1.347315,0.998791,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0.189259,-0.002375,-1.044213,-1.347315,1.432141,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0


In [35]:
selected_features=list(X_df_selected_features.columns)
selected_features

['CreditScore',
 'Age',
 'Tenure',
 'Balance',
 'EstimatedSalary',
 'Geography_Germany',
 'Geography_Spain',
 'Gender_Female',
 'Gender_Male',
 'NumOfProducts_2',
 'NumOfProducts_3',
 'HasCrCard_1',
 'IsActiveMember_0',
 'IsActiveMember_1']

In [101]:
print(X_new.shape[1],",",X_df_selected_features.shape[1])

14 , 14


# Keras Model

In [77]:
# Function to create model, required for KerasClassifier
def create_model(optimizer='rmsprop', init='glorot_uniform'):
    # create model
    model = Sequential()
    model.add(Dense(14, input_dim=X_new.shape[1], kernel_initializer=init, activation='relu'))
    model.add(Dense(8, kernel_initializer=init, activation='relu'))
    model.add(Dense(1, kernel_initializer=init, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [78]:
# create model
model = KerasClassifier(build_fn=create_model,verbose=0)

In [None]:
##Define Gridsearch parameters

In [79]:
param_grid={'optimizer': ['adam','rmsprop'],
            'init':['uniform','glorot_uniform'],
            'epochs':[150,200],
            'batch_size':[50,100,500]}

In [80]:
grid=GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)

In [81]:
grid_result=grid.fit(X_df_selected_features,y_train)

In [82]:
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.833124 using {'batch_size': 100, 'epochs': 200, 'init': 'uniform', 'optimizer': 'rmsprop'}


In [83]:
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

0.827158 (0.000712) with: {'batch_size': 50, 'epochs': 150, 'init': 'uniform', 'optimizer': 'adam'}
0.829513 (0.000676) with: {'batch_size': 50, 'epochs': 150, 'init': 'uniform', 'optimizer': 'rmsprop'}
0.832417 (0.001975) with: {'batch_size': 50, 'epochs': 150, 'init': 'glorot_uniform', 'optimizer': 'adam'}
0.828257 (0.003253) with: {'batch_size': 50, 'epochs': 150, 'init': 'glorot_uniform', 'optimizer': 'rmsprop'}
0.829827 (0.003675) with: {'batch_size': 50, 'epochs': 200, 'init': 'uniform', 'optimizer': 'adam'}
0.832496 (0.002144) with: {'batch_size': 50, 'epochs': 200, 'init': 'uniform', 'optimizer': 'rmsprop'}
0.829906 (0.001750) with: {'batch_size': 50, 'epochs': 200, 'init': 'glorot_uniform', 'optimizer': 'adam'}
0.829827 (0.002070) with: {'batch_size': 50, 'epochs': 200, 'init': 'glorot_uniform', 'optimizer': 'rmsprop'}
0.828022 (0.004061) with: {'batch_size': 100, 'epochs': 150, 'init': 'uniform', 'optimizer': 'adam'}
0.828336 (0.001703) with: {'batch_size': 100, 'epochs': 150

#Linear SVC method of feature selection
from sklearn.svm import LinearSVC
my_pipeline=Pipeline(steps=[('preprocessor',preprocessor),
                            ('feature selection',SelectFromModel(LinearSVC(penalty="l1",dual=False))),
                             ('model',grid)])

# Test Data Set

In [84]:
X_test.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
10878,746,France,Male,34,5,149871.000675,1,1,0,179675.337123
13564,700,Germany,Male,45,4,118707.72924,1,1,0,65434.024057
3959,616,France,Male,29,9,0.0,1,1,1,166984.44
12402,665,France,Female,37,8,186564.781205,1,1,0,184262.85942
14151,748,Germany,Male,45,8,117795.503364,1,1,0,102002.38352


# Preprocess the test data¶
In the last section I have compressed all the test- preprocess steps into one function (Below codes will help from learning purpose in term sof steps involved to match the shape of test and train data

In [85]:
##This gives me numpy array
X_test_transformed=preprocessor.fit_transform(X_test)
X_test_transformed.shape

(3186, 18)

In [86]:
#Convert it to DataFrame to extract and rename features
X_test_df=pd.DataFrame(X_test_transformed)

In [87]:
X_test_df.head()
X_test_df.shape

(3186, 18)

In [88]:
#This will give me the categorical variables transformed
cat_names=preprocessor.transformers_[1][1]['encoeder']\
.get_feature_names(categorical_variables)

In [89]:
##Rename cat columns
for i,j in zip(range(5,18), range(len(cat_names))):
    X_test_df=X_test_df.rename(columns={X_test_df.columns[i]:cat_names[j]})
    
##Rename numerical columns
for i,j in zip(range(0,5), range(len(numerical_variables))):
    X_test_df=X_test_df.rename(columns={X_test_df.columns[i]:numerical_variables[j]})

In [90]:
X_test_df.head()
X_test_df.shape

(3186, 18)

In [None]:
##Now select the same features as that in training data

In [91]:
selected_features=list(X_df_selected_features.columns)
selected_features

['CreditScore',
 'Age',
 'Tenure',
 'Balance',
 'EstimatedSalary',
 'Geography_Germany',
 'Geography_Spain',
 'Gender_Female',
 'Gender_Male',
 'NumOfProducts_2',
 'NumOfProducts_3',
 'HasCrCard_1',
 'IsActiveMember_0',
 'IsActiveMember_1']

In [92]:
X_test_processed=X_test_df[selected_features]

In [93]:
X_test_processed.head()
X_test_processed.shape

(3186, 14)

In [None]:
help(X_test_processed.reindex)

A test dataset can have less classes for a categorical variable, hence there could be a possibility that the selected features in the train dataset might not be available in test dataset,

In that case we will have to create those features in test dataset with value=0

In [94]:
#we canuse reindex function to get the missing colums in test datatset with fill_value=0
#help(X_test_processed.reindex) for more details
X_test_processed=X_test_processed.reindex(columns=X_df_selected_features.columns,fill_value=0)

In [95]:
X_test_processed.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,EstimatedSalary,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male,NumOfProducts_2,NumOfProducts_3,HasCrCard_1,IsActiveMember_0,IsActiveMember_1
0,1.047415,-0.693467,0.0685,1.120198,1.320767,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
1,0.549147,0.415612,-0.292824,0.61214,-0.645129,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
2,-0.360734,-1.197593,1.513796,-1.323168,1.102379,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
3,0.17003,-0.390991,1.152472,1.718422,1.399711,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
4,1.069078,0.415612,1.152472,0.597267,-0.015851,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0


In [96]:
preds=grid.predict(X_test_processed)

In [97]:
preds.shape

(3186, 1)

In [98]:
matrix=classification_report(y_test,preds,labels=[1,0])
print('Classification report: \n',matrix)

Classification report: 
               precision    recall  f1-score   support

           1       0.83      0.82      0.83      1608
           0       0.82      0.83      0.83      1578

   micro avg       0.83      0.83      0.83      3186
   macro avg       0.83      0.83      0.83      3186
weighted avg       0.83      0.83      0.83      3186



# Create a function to preprocess and predict on test data

Above steps written in an efficient way looped under a function to directly give the test prediction results are preprocessing, feature reconciliation and fitting the Keras model

In [99]:
def predict_test(test_data):
    X_test_df=pd.DataFrame(preprocessor.fit_transform(X_test))
    
    cat_names=preprocessor.transformers_[1][1]['encoeder']\
    .get_feature_names(categorical_variables)
    
    ##Rename cat columns
    for i,j in zip(range(5,18), range(len(cat_names))):
        X_test_df=X_test_df.rename(columns={X_test_df.columns[i]:cat_names[j]})
    
    ##Rename numerical columns
    for i,j in zip(range(0,5), range(len(numerical_variables))):
        X_test_df=X_test_df.rename(columns={X_test_df.columns[i]:numerical_variables[j]})
    
    preds=grid.predict(X_test_df[selected_features])
    matrix=classification_report(y_test,preds,labels=[1,0])
    print('Classification report: \n',matrix)

In [100]:
predict_test(X_test)

Classification report: 
               precision    recall  f1-score   support

           1       0.83      0.82      0.83      1608
           0       0.82      0.83      0.83      1578

   micro avg       0.83      0.83      0.83      3186
   macro avg       0.83      0.83      0.83      3186
weighted avg       0.83      0.83      0.83      3186

