In [3]:
from pathlib import Path

import pandas as pd
import sklearn

# Locate the dataset under the current user's Downloads folder rather than
# hard-coding one machine's absolute home directory, so the notebook can be
# re-run by other users who keep the file in the same relative place.
# NOTE(review): assumes the CSV still lives in ~/Downloads — adjust DATA_PATH if not.
DATA_PATH = Path.home() / "Downloads" / "Churn_Modelling.csv"
df = pd.read_csv(DATA_PATH)

In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


# Data Cleaning

In [8]:
# Collect the column labels into a plain Python list and display it.
column_names = list(df.columns)
column_names

['RowNumber',
 'CustomerId',
 'Surname',
 'CreditScore',
 'Geography',
 'Gender',
 'Age',
 'Tenure',
 'Balance',
 'NumOfProducts',
 'HasCrCard',
 'IsActiveMember',
 'EstimatedSalary',
 'Exited']

In [9]:
# Dropping Irrelevant Columns
# drop() returns a new frame that is bound back to `df`, and errors='ignore'
# skips any listed column that is already absent. Together these make the cell
# safe to re-run: the previous in-place drop raised KeyError on a second run
# because the columns had already been removed.
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')
print(df)

      CreditScore Geography  Gender  Age  Tenure    Balance  NumOfProducts  \
0             619    France  Female   42       2       0.00              1   
1             608     Spain  Female   41       1   83807.86              1   
2             502    France  Female   42       8  159660.80              3   
3             699    France  Female   39       1       0.00              2   
4             850     Spain  Female   43       2  125510.82              1   
...           ...       ...     ...  ...     ...        ...            ...   
9995          771    France    Male   39       5       0.00              2   
9996          516    France    Male   35      10   57369.61              1   
9997          709    France  Female   36       7       0.00              1   
9998          772   Germany    Male   42       3   75075.31              2   
9999          792    France  Female   28       4  130142.79              1   

      HasCrCard  IsActiveMember  EstimatedSalary  Exited  
0   

In [10]:
# One Hot Encoding
# Expand the two categorical columns into indicator columns. drop_first=True
# omits the first category of each, leaving Geography_Germany, Geography_Spain
# and Gender_Male.
df = pd.get_dummies(
    data=df,
    columns=['Geography', 'Gender'],
    drop_first=True,
)
df.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,619,42,2,0.0,1,1,1,101348.88,1,False,False,False
1,608,41,1,83807.86,1,0,1,112542.58,0,False,True,False
2,502,42,8,159660.8,3,1,0,113931.57,1,False,False,False
3,699,39,1,0.0,2,0,0,93826.63,0,False,False,False
4,850,43,2,125510.82,1,1,1,79084.1,0,False,True,False


In [12]:
# Converting Booleans to Binary
# Cast the three indicator columns to integers in a single astype call, then
# show the resulting dtypes and a preview of the frame.
df = df.astype({
    'Geography_Germany': int,
    'Geography_Spain': int,
    'Gender_Male': int,
})
print(df.dtypes)
df.head()

CreditScore            int64
Age                    int64
Tenure                 int64
Balance              float64
NumOfProducts          int64
HasCrCard              int64
IsActiveMember         int64
EstimatedSalary      float64
Exited                 int64
Geography_Germany      int64
Geography_Spain        int64
Gender_Male            int64
dtype: object


Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,619,42,2,0.0,1,1,1,101348.88,1,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,1,0
2,502,42,8,159660.8,3,1,0,113931.57,1,0,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,0,1,0


# 1st Algorithm | Logistic Regression

In [30]:
# Importing the classes and functions we need from the sklearn library
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

### Splitting Train/Test set

In [18]:
# Separate the target column from the feature columns.
y = df['Exited']
X = df.drop(columns=['Exited'])

# Hold out 20% of the rows as the test set (80/20 split), stratified on the
# target and seeded for repeatability.
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=67,
    stratify=y,
)

### Splitting Train/Validation set

In [19]:
# Split the non-test portion once more, again 80/20, into training and
# validation sets (stratified on the target, same seed as before).
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.20,
    random_state=67,
    stratify=y_temp,
)

### Scaling

In [20]:
# Fit the scaler on the training split only, then apply that same fitted
# scaler to the training, validation and test splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

### Training the Logistic Regression Model

In [22]:
# Create the logistic regression classifier with a fixed seed.
log_reg = LogisticRegression(random_state=67)
# Fit it on the scaled training features and the training labels.
log_reg.fit(X_train_scaled, y_train)

### Making Predictions

In [25]:
# Score the scaled validation set with the fitted model: class probabilities
# first, then the predicted class labels.
y_val_proba = log_reg.predict_proba(X_val_scaled)
y_val_pred = log_reg.predict(X_val_scaled)

In [28]:
# Confusion matrix for the validation set: actual labels vs. the logistic
# regression predictions.
log_reg_cm = confusion_matrix(y_true=y_val, y_pred=y_val_pred)
print(log_reg_cm)

[[1216   58]
 [ 253   73]]


In [31]:
# Per-class precision / recall / f1 report for the logistic regression
# predictions on the validation set.
log_reg_cr = classification_report(y_true=y_val, y_pred=y_val_pred)
print(log_reg_cr)

              precision    recall  f1-score   support

           0       0.83      0.95      0.89      1274
           1       0.56      0.22      0.32       326

    accuracy                           0.81      1600
   macro avg       0.69      0.59      0.60      1600
weighted avg       0.77      0.81      0.77      1600



# 2nd Algorithm | Naive Bayes

In [44]:
from sklearn.naive_bayes import GaussianNB

In [42]:
# Target and feature matrix taken from the encoded frame.
# NOTE(review): this repeats the X / y definition from the logistic regression
# section; the Naive Bayes training cell fits on X_train_scaled / y_train, so
# these two names are not read by it — confirm whether this cell is still needed.
y = df['Exited']
X = df.drop(columns=['Exited'])

### Training and Predicting

In [45]:
# Gaussian Naive Bayes classifier, fitted on the scaled training split.
gnb = GaussianNB()
gnb.fit(X_train_scaled, y_train)

# Score the scaled validation set: class probabilities and predicted labels.
y_val_proba_nb = gnb.predict_proba(X_val_scaled)
y_val_pred_nb = gnb.predict(X_val_scaled)

### Evaluation

In [48]:
# Confusion matrix for the validation set: actual labels vs. the Naive Bayes
# predictions.
nb_cm = confusion_matrix(y_true=y_val, y_pred=y_val_pred_nb)
print(nb_cm)

[[1190   84]
 [ 208  118]]


In [49]:
# Per-class precision / recall / f1 report for the Naive Bayes predictions on
# the validation set.
nb_cr = classification_report(y_true=y_val, y_pred=y_val_pred_nb)
print(nb_cr)

              precision    recall  f1-score   support

           0       0.85      0.93      0.89      1274
           1       0.58      0.36      0.45       326

    accuracy                           0.82      1600
   macro avg       0.72      0.65      0.67      1600
weighted avg       0.80      0.82      0.80      1600

