In [None]:
# Colab-only setup: mount Google Drive at /content/drive so files stored
# there can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Location of the churn CSV on the mounted Google Drive.
NUM_PATH = "/content/drive/MyDrive/Selected 1 pro/Churn_Modelling.csv"

# Read the CSV into a DataFrame; every later cell works from `dataset`.
dataset = pd.read_csv(filepath_or_buffer=NUM_PATH)


In [None]:
# Preview the first rows of the freshly loaded data as a sanity check.
dataset.head()

In [None]:
# Preview the last rows as well, to spot truncated or trailing junk records.
dataset.tail()

In [None]:
# Column dtypes as inferred by read_csv.
dataset.dtypes

In [None]:
# Per-column non-null counts, dtypes and memory usage.
dataset.info()

In [None]:
# Column names available in the raw data.
dataset.columns

In [None]:
# Summary statistics of the numeric columns.
dataset.describe()

In [None]:
# Frequency of each distinct CreditScore value.
dataset['CreditScore'].value_counts()

In [None]:
# True if CreditScore contains at least one missing value.
dataset['CreditScore'].isnull().any()

In [None]:
# Remove the CustomerId and Surname columns before modelling.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once the columns
# are already gone.
# NOTE(review): if the CSV also carries a row-counter column (e.g. RowNumber),
# it is still kept as a feature downstream — confirm whether it should go too.
dataset = dataset.drop(columns=['CustomerId', 'Surname'], errors='ignore')


In [None]:
# Confirm the two dropped columns no longer appear.
dataset.head()

In [None]:
# Category frequencies for Geography; dropna=False also counts missing values.
dataset['Geography'].value_counts(dropna=False)

In [None]:
# Category frequencies for Gender; dropna=False also counts missing values.
dataset['Gender'].value_counts(dropna=False)

In [None]:
# One-hot encode the two categorical columns into integer indicator columns.
# The mapping gives each source column its prefix (Geo_*, Gen_*); every
# category keeps its own column (no first-level drop, no NaN indicator).
categorical_prefixes = {'Geography': 'Geo', 'Gender': 'Gen'}
dataset_cleaned = pd.get_dummies(
    dataset,
    columns=list(categorical_prefixes),
    prefix=categorical_prefixes,
    dtype=int,
)

In [None]:
# Display the encoded frame to check the new Geo_* / Gen_* indicator columns.
dataset_cleaned

In [None]:
# Per-column flag: True where the column holds at least one missing value.
# Note this inspects `dataset` (pre-encoding), not `dataset_cleaned`.
dataset.isna().any()

In [None]:
# Pairwise correlation between the numeric columns.
# `dataset` still holds the un-encoded Geography and Gender columns; pandas >= 2.0
# raises on non-numeric columns unless numeric_only=True is passed.
dataset.corr(numeric_only=True)

In [None]:
# One histogram per column of the encoded (unscaled) frame.
hist_options = dict(bins=10, figsize=(20, 20), xrot=30)
dataset_cleaned.hist(**hist_options)

In [None]:
from sklearn import preprocessing

In [None]:
# Keep the column names: the scaler hands back a bare array without them.
labels = dataset_cleaned.columns
print(labels)

# Standardise every column of the encoded frame (this includes Exited, which
# is only removed from the scaled frame in a later cell).
scaler = preprocessing.StandardScaler()
scaled_dataset_cleaned = scaler.fit(dataset_cleaned).transform(dataset_cleaned)

In [None]:
# Wrap the scaled values in a DataFrame again and restore the column names.
scaled_dataset_cleaned = pd.DataFrame(scaled_dataset_cleaned).set_axis(labels, axis=1)

In [None]:
# Same histogram grid as before, now on the standardised values.
scaled_dataset_cleaned.hist(figsize=(20, 20), xrot=30, bins=10)

In [None]:
# Overlay a kernel-density estimate of every standardised column on one axis.
fig, ax = plt.subplots(1, 1, figsize=(20, 20))
for column in scaled_dataset_cleaned.columns:
    sns.kdeplot(scaled_dataset_cleaned[column],
                label=column,    # plain string (not a list) so the legend shows the name
                bw_method=1.5,   # replaces the deprecated `bw` keyword
                ax=ax)
# Without a legend the overlaid curves cannot be told apart.
ax.legend()
plt.show()

In [None]:
# Correlation matrix of the standardised columns; Exited is still part of the
# frame at this point, so its correlations with the features are included.
corr=scaled_dataset_cleaned.corr()

In [None]:
# Annotated heatmap of the correlation matrix on a red-yellow-green scale.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 10))
heatmap_kwargs = {'annot': True, 'cmap': 'RdYlGn', 'ax': ax}
sns.heatmap(corr, **heatmap_kwargs)

In [None]:
# Box plot of every standardised feature, split by the (standardised) Exited flag.
# The grid is sized from the actual feature list instead of a hard-coded 7x2 walk
# over the first 14 columns by position: that approach plotted Exited against
# itself, silently skipped any column past the 14th, and would raise IndexError
# with fewer than 14 columns.
nc = 2
feature_columns = [c for c in scaled_dataset_cleaned.columns if c != 'Exited']
nr = -(-len(feature_columns) // nc)  # ceiling division without importing math
fig, ax = plt.subplots(nrows=nr, ncols=nc, figsize=(20, 20), squeeze=False)
for axes, column in zip(ax.flat, feature_columns):
    sns.boxplot(x=scaled_dataset_cleaned['Exited'],
                y=scaled_dataset_cleaned[column],
                ax=axes)
# Hide the spare panel(s) when the feature count is not a multiple of nc.
for axes in ax.flat[len(feature_columns):]:
    axes.set_visible(False)

In [None]:
# Remove the target so only feature columns remain in the scaled frame.
scaled_dataset_cleaned = scaled_dataset_cleaned.drop(columns=['Exited'])

In [None]:
# Confirm that Exited is no longer among the feature columns.
scaled_dataset_cleaned.columns

In [None]:
from sklearn.decomposition import PCA

# Project the standardised features onto their first two principal components.
n_comp = 2
pca = PCA(n_comp)
principal_components = pca.fit_transform(scaled_dataset_cleaned)

# Number of projected rows (one per input row).
principal_components.shape[0]


In [None]:
# Wrap the projected array in a DataFrame with one named column per component.
# The rows are in the same order as `dataset`, so reuse dataset's index: the
# previous 1-based index did not match dataset.Exited's index, and combining the
# two by index attached every label to the wrong row.
pc_df = pd.DataFrame(principal_components,
                     columns=['principal_components_%s' % (i + 1) for i in range(n_comp)],
                     index=dataset.index)
print(pc_df)

In [None]:
# Features are the principal components; the target is the original Exited flag.
input_components = pc_df
output_components = dataset.Exited
print(input_components.shape, output_components.shape)

# Combine features and target row-by-row. pd.concat(axis=1) aligns on index
# labels, and the two inputs are not guaranteed to share an index, so both are
# given a fresh 0..n-1 index first. Without this, mismatched indexes attach
# labels to the wrong rows and introduce NaN rows.
assert len(input_components) == len(output_components)
final_df = pd.concat([input_components.reset_index(drop=True),
                      output_components.reset_index(drop=True)],
                     axis=1)

In [None]:
# Scatter of the first two principal components, coloured by the Exited flag.
fig, ax = plt.subplots(1, 1, figsize=(20, 20))
ax.set_xlabel('principal_components_1', fontsize=20)
ax.set_ylabel('principal_components_2', fontsize=20)
ax.set_title('Customers Exited on PC1 & PC2', fontsize=20)

Targets = [0, 1]
colors = ['r', 'k']

for target, color in zip(Targets, colors):
    # Boolean mask selecting the rows that belong to this class.
    index_no_target = final_df['Exited'] == target
    ax.scatter(final_df.loc[index_no_target, 'principal_components_1'],
               final_df.loc[index_no_target, 'principal_components_2'],
               c=color,
               label=str(target))

# Legend and grid are set once, after all groups are drawn. A bare ax.grid()
# toggles visibility, so calling it once per target switched the grid back off.
ax.legend(title='Exited')
ax.grid(True)

In [None]:
# Share of the total variance captured by each of the two components.
pca.explained_variance_ratio_

In [None]:
# Repeat the projection with ten components; these become the model inputs.
n_comp = 10
pca_10 = PCA(n_components=n_comp)
pca10_comp = pca_10.fit_transform(scaled_dataset_cleaned)

# Name the columns Principal_component_1..n and label the rows starting at 1.
component_names = ['Principal_component_%s' % k for k in range(1, n_comp + 1)]
row_labels = range(1, len(pca10_comp) + 1)
df_PCA_10 = pd.DataFrame(pca10_comp, columns=component_names, index=row_labels)
print(df_PCA_10)

In [None]:
# Total share of variance retained by the ten components together.
sum(pca_10.explained_variance_ratio_)

In [None]:
# Train/test split of the dataset
from sklearn.model_selection import train_test_split


# Hold out 20% of the rows for testing; shuffling is seeded for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    df_PCA_10,
    output_components,
    test_size=0.2,
    shuffle=True,
    random_state=44,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_curve





# Applying LogisticRegression Model
#
# penalty='l1' needs a solver that supports L1 regularisation ('liblinear' or
# 'saga'). The previous solver='' is not a valid option, so scikit-learn
# rejected the estimator at fit time. random_state pins liblinear's internal
# data shuffling so the fit is repeatable.
LogisticRegressionModel = LogisticRegression(penalty='l1', solver='liblinear', C=1.,
                                             max_iter=1000, random_state=44)

LogisticRegressionModel.fit(x_train, y_train)

print('LogisticRegressionModel Train Score is : ' , LogisticRegressionModel.score(x_train, y_train))
print('LogisticRegressionModel Test Score is : ' , LogisticRegressionModel.score(x_test, y_test))
print('LogisticRegressionModel Classes are : ' , LogisticRegressionModel.classes_)
print('LogisticRegressionModel No. of iteratios is : ' , LogisticRegressionModel.n_iter_)
print('----------------------------------------------------')

# Calculating predictions (hard labels and class probabilities) on the test set.
y_pred = LogisticRegressionModel.predict(x_test)
y_pred_prob = LogisticRegressionModel.predict_proba(x_test)
# Only a short preview is printed; the full arrays stay available in y_pred / y_pred_prob.
print('Predicted Value for LogisticRegressionModel is : ' , y_pred[:10])
print('Prediction Probabilities Value for LogisticRegressionModel is : ' , y_pred_prob[:10])

#----------------------------------------------------
# Accuracy: (TP + TN) / (TP + TN + FP + FN)
AccScore = accuracy_score(y_test, y_pred, normalize=True)
print('Accuracy Score is : ', AccScore)

#----------------------------------------------------
# Precision for the positive class (Exited == 1): TP / (TP + FP).
# average='binary' is used because on a two-class target average='micro' makes
# precision, recall and F1 all collapse to the accuracy value.
# zero_division=0 yields 0 instead of a warning when no positives are predicted.
PrecisionScore = precision_score(y_test, y_pred, average='binary', zero_division=0)
print('Precision Score is : ', PrecisionScore)

#----------------------------------------------------
# Recall for the positive class: TP / (TP + FN).
RecallScore = recall_score(y_test, y_pred, average='binary', zero_division=0)
print('Recall Score is : ', RecallScore)

#----------------------------------------------------
# F1 for the positive class: 2 * (precision * recall) / (precision + recall)
F1Score = f1_score(y_test, y_pred, average='binary', zero_division=0)
print('F1 Score is : ', F1Score)

#----------------------------------------------------
# Confusion matrix: rows are the true classes, columns the predicted classes.
CM = confusion_matrix(y_test, y_pred)
print('Confusion Matrix is : \n', CM)

# Draw the confusion matrix with the integer counts written in each cell.
# (`center` expects a numeric colormap midpoint, so center=True was dropped.)
cm_ax = sns.heatmap(CM, annot=True, fmt='d')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
plt.show()