In [1]:

from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import randint
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.feature_selection import SelectFromModel
from sklearn import svm
from sklearn.compose import make_column_transformer
from scipy.stats import randint
import seaborn as sns
from sklearn.decomposition import KernelPCA
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve
#from xgboost import XGBClassifier
#from xgboost import XGBRegressor
#from mlxtend.preprocessing import DenseTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report
# MLP for Pima Indians Dataset with 10-fold cross validation via sklearn
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedKFold


In [131]:
import pandas as pd
import numpy as np

df=pd.read_csv(r'D:\Python Tutorial\Churn_Modelling.csv')
df = df.groupby('Exited').apply(lambda x: x.sample(frac=1))

# EDA- Exploratory data analysis

In [132]:
df=df.drop(columns=['RowNumber','CustomerId'],axis=1)

In [133]:
# Remove rows with na
df=df.dropna()

In [134]:
##Convert all features with unique values less than 5 to object
def convert_to_object(data):
    for i in data.columns:
        if data[i].nunique()<5:
            data[i]=data[i].astype('str')

In [135]:
convert_to_object(df)

In [136]:
##Create a function to drop a variable if it is object but nunique>20,
##There is something called as "Pass by value" in python ad doing df= in the last statement will not alter df
##while the same df= in for loop will work
#refer https://stackoverflow.com/questions/38895768/python-pandas-dataframe-is-it-pass-by-value-or-pass-by-reference/38925257#38925257
#https://stackoverflow.com/questions/70902808/for-loop-inside-a-def-function-not-working/70902856#70902856
def drop_object_nunique(data):
    for i in data.columns:
        if data[i].dtype=='object' and data[i].nunique()>20:
            data.drop(columns=[i],axis=1,inplace=True)

In [137]:
drop_object_nunique(df)

for i in df.columns:
    if df[i].dtype=='object' and df[i].nunique()>20:
        df=df.drop(columns=[i],axis=1)

In [138]:
X=df.drop(columns=['Exited'],axis=1)
y=df['Exited']

In [139]:
#train Test split
#X_train_full,X_test_full,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

In [140]:
#print(X_train_full.shape,y_train.shape)
#print(X_test_full.shape,y_test.shape)

In [141]:
##Lets create a function to select only numerical variables
numerical_variables=[]
def select_num_variables(data,numerical_variables):
    for i in data.columns:
        if data[i].dtype in ['float64','int64']:
            numerical_variables.append(i)
            

In [142]:
select_num_variables(X,numerical_variables)

In [143]:
numerical_variables

['CreditScore', 'Age', 'Tenure', 'Balance', 'EstimatedSalary']

In [144]:
##Lets create a function to select only categorical variables
categorical_variables=[]
def select_cat_variables(data,categorical_variables):
    for i in data.columns:
        if data[i].dtype=='object':
            categorical_variables.append(i)
    

In [145]:
select_cat_variables(X,categorical_variables)

In [146]:
categorical_variables

['Geography', 'Gender', 'NumOfProducts', 'HasCrCard', 'IsActiveMember']

In [147]:
##Keep Selected Columns
my_cols=numerical_variables+categorical_variables
my_cols

['CreditScore',
 'Age',
 'Tenure',
 'Balance',
 'EstimatedSalary',
 'Geography',
 'Gender',
 'NumOfProducts',
 'HasCrCard',
 'IsActiveMember']

In [148]:
X.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 10000 entries, (0, 3654) to (1, 7915)
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10000 non-null  int64  
 1   Geography        10000 non-null  object 
 2   Gender           10000 non-null  object 
 3   Age              10000 non-null  int64  
 4   Tenure           10000 non-null  int64  
 5   Balance          10000 non-null  float64
 6   NumOfProducts    10000 non-null  object 
 7   HasCrCard        10000 non-null  object 
 8   IsActiveMember   10000 non-null  object 
 9   EstimatedSalary  10000 non-null  float64
dtypes: float64(2), int64(3), object(5)
memory usage: 1.1+ MB


In [149]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

# Preprocessing transformation of numerical and categorical data

In [150]:
# numerical transformation
numerical_transformer=Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="constant")),
    ('scaler',  StandardScaler())
     ])

In [151]:
#Categorical transformation
categorical_transformer=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy="constant")),
    ('encoeder',OneHotEncoder(handle_unknown="ignore"))
])

In [152]:
#Bundle preprocessing of numerical and categorical data using ColumnTransformer
preprocessor=ColumnTransformer(transformers=[
                                             ('num',numerical_transformer,numerical_variables),
                                             ('cat',categorical_transformer,categorical_variables)
])

# Preprocess the data

In [None]:
#preprocessor.fit_transform(X_train)

In [153]:
##This gives me numpy array
X_train_transformed=preprocessor.fit_transform(X_train)
X_train_transformed.shape

(8000, 18)

In [154]:
#Convert it to DataFrame to extract and rename features
X_df=pd.DataFrame(X_train_transformed)

In [156]:
#This will give me the categorical variables transformed
cat_names=preprocessor.transformers_[1][1]['encoeder']\
.get_feature_names(categorical_variables)

In [157]:
##Rename cat columns
for i,j in zip(range(5,18), range(len(cat_names))):
    X_df=X_df.rename(columns={X_df.columns[i]:cat_names[j]})
    
##Rename numerical columns
for i,j in zip(range(0,5), range(len(numerical_variables))):
    X_df=X_df.rename(columns={X_df.columns[i]:numerical_variables[j]})

In [158]:
X_df.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male,NumOfProducts_1,NumOfProducts_2,NumOfProducts_3,NumOfProducts_4,HasCrCard_0,HasCrCard_1,IsActiveMember_0,IsActiveMember_1
0,-0.193178,-0.852208,-1.037856,-0.059334,1.430546,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,2.055417,0.963618,1.379879,0.907698,1.22766,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,-1.457366,-0.661068,1.379879,0.433516,-0.943857,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,-0.327887,-0.087649,-0.692465,0.318513,-0.024918,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,2.065779,-0.852208,-0.001684,0.751971,0.03045,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


# Feature selection

from sklearn.svm import LinearSVC
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_train_transformed, y_train)
model_lsvc = SelectFromModel(lsvc, prefit=True)
X_new = model_lsvc.transform(X_train_transformed)
X_new.shape[1]

In [159]:
from sklearn.svm import LinearSVC
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_df, y_train)
model_lsvc = SelectFromModel(lsvc, prefit=True)
X_new = model_lsvc.transform(X_df)
X_new.shape[1]

11

In [162]:
#This will give the position of selected features in fporm of list [0, 1, 2, 3, 4, 6, 9, 10, 11, 12, 17]
select_var_True=list(model_lsvc.get_support([0]))

In [164]:
#This will subset the X_df dataset to select only the importnat features
X_df_selected_features=X_df.iloc[:,select_var_True]

In [165]:
X_df_selected_features.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,EstimatedSalary,Geography_Germany,Gender_Male,NumOfProducts_1,NumOfProducts_2,NumOfProducts_3,IsActiveMember_1
0,-0.193178,-0.852208,-1.037856,-0.059334,1.430546,1.0,1.0,0.0,1.0,0.0,1.0
1,2.055417,0.963618,1.379879,0.907698,1.22766,1.0,0.0,1.0,0.0,0.0,0.0
2,-1.457366,-0.661068,1.379879,0.433516,-0.943857,0.0,1.0,1.0,0.0,0.0,1.0
3,-0.327887,-0.087649,-0.692465,0.318513,-0.024918,0.0,1.0,1.0,0.0,0.0,0.0
4,2.065779,-0.852208,-0.001684,0.751971,0.03045,1.0,1.0,0.0,1.0,0.0,1.0


In [166]:
selected_features=list(X_df_selected_features.columns)
selected_features

['CreditScore',
 'Age',
 'Tenure',
 'Balance',
 'EstimatedSalary',
 'Geography_Germany',
 'Gender_Male',
 'NumOfProducts_1',
 'NumOfProducts_2',
 'NumOfProducts_3',
 'IsActiveMember_1']

In [167]:
X_df_selected_features.shape

(8000, 11)

In [168]:
##X_new.shape[1] can be directly used for input_dim in Keras, but we will create a def to pass it
#Purely for learning purpose
def input_dim(data_train):
    X_train_transformed=preprocessor.fit_transform(X_train)
    lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_train_transformed, y_train)
    model_lsvc = SelectFromModel(lsvc, prefit=True)
    X_new = model_lsvc.transform(X_train_transformed)
    return X_new.shape[1]


In [169]:
input_dim(X_train)

11

In [170]:
##or Simply
X_df_selected_features.shape[1]

11

# Keras Model

In [171]:
# Function to create model, required for KerasClassifier
def create_model(optimizer='rmsprop', init='glorot_uniform'):
    # create model
    model = Sequential()
    model.add(Dense(12, input_dim=input_dim(X_train), kernel_initializer=init, activation='relu'))
    model.add(Dense(8, kernel_initializer=init, activation='relu'))
    model.add(Dense(1, kernel_initializer=init, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [172]:
# create model
model = KerasClassifier(build_fn=create_model,verbose=0)

In [128]:
##Define Gridsearch parameters

In [202]:
param_grid={'optimizer': ['adam','rmsprop'],
            'init':['uniform','glorot_uniform'],
            'epochs':[150,200],
            'batch_size':[50,100,500]}

In [203]:
grid=GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)

In [204]:
grid_result=grid.fit(X_df_selected_features,y_train)

In [205]:
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.864251 using {'batch_size': 50, 'epochs': 200, 'init': 'glorot_uniform', 'optimizer': 'rmsprop'}


In [206]:
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

0.862001 (0.007634) with: {'batch_size': 50, 'epochs': 150, 'init': 'uniform', 'optimizer': 'adam'}
0.862501 (0.004133) with: {'batch_size': 50, 'epochs': 150, 'init': 'uniform', 'optimizer': 'rmsprop'}
0.861751 (0.006517) with: {'batch_size': 50, 'epochs': 150, 'init': 'glorot_uniform', 'optimizer': 'adam'}
0.861501 (0.006701) with: {'batch_size': 50, 'epochs': 150, 'init': 'glorot_uniform', 'optimizer': 'rmsprop'}
0.860126 (0.005127) with: {'batch_size': 50, 'epochs': 200, 'init': 'uniform', 'optimizer': 'adam'}
0.860751 (0.006826) with: {'batch_size': 50, 'epochs': 200, 'init': 'uniform', 'optimizer': 'rmsprop'}
0.859001 (0.005597) with: {'batch_size': 50, 'epochs': 200, 'init': 'glorot_uniform', 'optimizer': 'adam'}
0.864251 (0.007347) with: {'batch_size': 50, 'epochs': 200, 'init': 'glorot_uniform', 'optimizer': 'rmsprop'}
0.863626 (0.006057) with: {'batch_size': 100, 'epochs': 150, 'init': 'uniform', 'optimizer': 'adam'}
0.861501 (0.007001) with: {'batch_size': 100, 'epochs': 150

#Linear SVC method of feature selection
from sklearn.svm import LinearSVC
my_pipeline=Pipeline(steps=[('preprocessor',preprocessor),
                            ('feature selection',SelectFromModel(LinearSVC(penalty="l1",dual=False))),
                             ('model',grid)])

# Test Data Set

In [None]:
X_test.head()

# Preprocess the test data¶
In the last section I have compressed all the test- preprocess steps into one function (Below codes will help from learning purpose in term sof steps involved to match the shape of test and train data

In [106]:
##This gives me numpy array
X_test_transformed=preprocessor.fit_transform(X_test)
X_test_transformed.shape

(600, 18)

In [107]:
#Convert it to DataFrame to extract and rename features
X_test_df=pd.DataFrame(X_test_transformed)

In [109]:
X_test_df.head()
X_test_df.shape

(600, 18)

In [110]:
#This will give me the categorical variables transformed
cat_names=preprocessor.transformers_[1][1]['encoeder']\
.get_feature_names(categorical_variables)

In [111]:
##Rename cat columns
for i,j in zip(range(5,18), range(len(cat_names))):
    X_test_df=X_test_df.rename(columns={X_test_df.columns[i]:cat_names[j]})
    
##Rename numerical columns
for i,j in zip(range(0,5), range(len(numerical_variables))):
    X_test_df=X_test_df.rename(columns={X_test_df.columns[i]:numerical_variables[j]})

In [113]:
X_test_df.head()
X_test_df.shape

(600, 18)

In [None]:
##Now select the same features as that in training data

In [94]:
selected_features=list(X_df_selected_features.columns)
selected_features

['Age',
 'Geography_Germany',
 'Gender_Male',
 'NumOfProducts_1',
 'NumOfProducts_2',
 'IsActiveMember_1']

In [120]:
X_test_processed=X_test_df[selected_features]

In [124]:
X_test_processed.head()
X_test_processed.shape

(600, 6)

In [119]:
help(X_test_processed.reindex)

Help on method reindex in module pandas.core.frame:

reindex(labels=None, index=None, columns=None, axis=None, method=None, copy=True, level=None, fill_value=nan, limit=None, tolerance=None) method of pandas.core.frame.DataFrame instance
    Conform Series/DataFrame to new index with optional filling logic.
    
    Places NA/NaN in locations having no value in the previous index. A new object
    is produced unless the new index is equivalent to the current one and
    ``copy=False``.
    
    Parameters
    ----------
    
    keywords for axes : array-like, optional
        New labels / index to conform to, should be specified using
        keywords. Preferably an Index object to avoid duplicating data.
    
    method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}
        Method to use for filling holes in reindexed DataFrame.
        Please note: this is only applicable to DataFrames/Series with a
        monotonically increasing/decreasing index.
    
        * None (defa

A test dataset can have less classes for a categorical variable, hence there could be a possibility that the selected features in the train dataset might not be available in test dataset,

In that case we will have to create those features in test dataset with value=0

In [122]:
#we canuse reindex function to get the missing colums in test datatset with fill_value=0
#help(X_test_processed.reindex) for more details
X_test_processed=X_test_processed.reindex(columns=X_df_selected_features.columns,fill_value=0)

In [123]:
X_test_processed.head()

Unnamed: 0,Age,Geography_Germany,Gender_Male,NumOfProducts_1,NumOfProducts_2,IsActiveMember_1
0,0.198989,1.0,1.0,0.0,1.0,1.0
1,0.002295,0.0,1.0,0.0,1.0,1.0
2,-0.1944,0.0,1.0,1.0,0.0,1.0
3,1.57585,0.0,0.0,0.0,0.0,0.0
4,-0.981177,1.0,0.0,1.0,0.0,1.0


In [125]:
preds=grid.predict(X_test_processed)

In [126]:
preds.shape

(600, 1)

In [127]:
matrix=classification_report(y_test,preds,labels=[1,0])
print('Classification report: \n',matrix)

Classification report: 
               precision    recall  f1-score   support

           1       0.71      0.42      0.53       101
           0       0.89      0.97      0.93       499

   micro avg       0.87      0.87      0.87       600
   macro avg       0.80      0.69      0.73       600
weighted avg       0.86      0.87      0.86       600



# Create a function to preprocess and predict on test data

Above steps written in an efficient way looped under a function to directly give the test prediction results are preprocessing, feature reconciliation and fitting the Keras model

In [207]:
def predict_test(test_data):
    X_test_df=pd.DataFrame(preprocessor.fit_transform(X_test))
    
    cat_names=preprocessor.transformers_[1][1]['encoeder']\
    .get_feature_names(categorical_variables)
    
    ##Rename cat columns
    for i,j in zip(range(5,18), range(len(cat_names))):
        X_test_df=X_test_df.rename(columns={X_test_df.columns[i]:cat_names[j]})
    
    ##Rename numerical columns
    for i,j in zip(range(0,5), range(len(numerical_variables))):
        X_test_df=X_test_df.rename(columns={X_test_df.columns[i]:numerical_variables[j]})
    
    preds=grid.predict(X_test_df[selected_features])
    matrix=classification_report(y_test,preds,labels=[1,0])
    print('Classification report: \n',matrix)

In [208]:
predict_test(X_test)

Classification report: 
               precision    recall  f1-score   support

           1       0.76      0.46      0.57       424
           0       0.87      0.96      0.91      1576

   micro avg       0.85      0.85      0.85      2000
   macro avg       0.81      0.71      0.74      2000
weighted avg       0.85      0.85      0.84      2000

