<a href="https://colab.research.google.com/github/ChristopherCrook/assign03-50-Crook/blob/main/churn_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#tables and visualizations
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#machine learning
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, StandardScaler, OrdinalEncoder
from sklearn import config_context
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn import svm
from sklearn.model_selection import cross_val_score

# Load the Data

In [3]:
data = pd.read_excel('https://github.com/ChristopherCrook/assign03-50-Crook/blob/main/IBM-HR-Data-Employee-Attrition.xlsx?raw=true')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

# Split the Data

In [4]:
random_seed = 6234
class_column = 'Over18'

X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=class_column), data[class_column],
                                                   test_size=0.25, random_state=random_seed, stratify=data[class_column])

## Do a quick shape test
print('On X train: ')
print('X train dimensions: ', X_train.shape)
display(X_train.head())

# X test
print('\nOn X test: ')
print('X test dimensions: ', X_test.shape)
display(X_test.head())

On X train: 
X train dimensions:  (1102, 34)


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
270,55,No,Travel_Rarely,452,Research & Development,1,3,Medical,1,374,...,3,80,0,37,2,3,36,10,4,13
1281,35,Yes,Travel_Rarely,303,Sales,27,3,Life Sciences,1,1797,...,4,80,0,10,2,3,10,7,7,7
381,30,No,Travel_Rarely,202,Sales,2,1,Technical Degree,1,508,...,1,80,1,1,3,3,1,0,0,0
679,31,No,Non-Travel,1188,Sales,20,2,Marketing,1,947,...,4,80,1,9,2,2,9,8,0,0
302,28,No,Travel_Rarely,1476,Research & Development,16,2,Medical,1,412,...,3,80,0,9,2,3,8,3,0,7



On X test: 
X test dimensions:  (368, 34)


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
556,53,No,Travel_Rarely,346,Research & Development,6,3,Life Sciences,1,769,...,4,80,0,19,4,3,2,2,2,2
579,34,No,Travel_Rarely,121,Research & Development,2,4,Medical,1,804,...,3,80,0,6,3,3,6,5,1,3
963,38,No,Travel_Rarely,1009,Sales,2,2,Life Sciences,1,1355,...,4,80,1,11,3,3,7,7,1,7
125,26,No,Travel_Rarely,841,Research & Development,6,3,Other,1,164,...,3,80,0,5,3,2,5,4,4,3
215,41,No,Travel_Rarely,896,Sales,6,3,Life Sciences,1,298,...,3,80,0,16,3,3,1,0,0,0


# Create Our Pipelines

In [14]:
#individual pipelines for differing datatypes
categoric = Pipeline(steps=[('cat_impute', SimpleImputer(missing_values=np.nan, strategy='constant')),
                               ('onehot_cat', OneHotEncoder(drop='if_binary'))])

numeric = Pipeline(steps=[('impute_num', SimpleImputer(missing_values=np.nan, strategy='mean')),
                               ('scale_num', StandardScaler())])
#establish preprocessing pipeline by columns
preproc = ColumnTransformer([('categoric_pipe', categoric, make_column_selector(dtype_include=object)),
                             ('numeric_pipe', numeric, make_column_selector(dtype_include=np.number))],
                             remainder='passthrough')
#generate the whole modeling pipeline with preprocessing
pipe = Pipeline(steps=[('preproc', preproc),
                       ('mdl', RandomForestClassifier())])

#visualization for steps
with config_context(display='diagram'):
    display(pipe)


# Cross Validation

In [15]:
tuning_grid = {'mdl__n_estimators' : [100, 200 ,300],
               'mdl__max_depth': [10, 15, 20] }
grid_search = GridSearchCV(pipe, param_grid = tuning_grid, cv = 5, return_train_score=True)
tuning_grid

{'mdl__max_depth': [10, 15, 20], 'mdl__n_estimators': [100, 200, 300]}

In [16]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preproc',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('categoric_pipe',
                                                                         Pipeline(steps=[('cat_impute',
                                                                                          SimpleImputer(strategy='constant')),
                                                                                         ('onehot_cat',
                                                                                          OneHotEncoder(drop='if_binary'))]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7f2fff09e490>),
                                                                        ('numeric_pipe',
                                               

In [17]:
print(grid_search.best_score_)
grid_search.best_params_

1.0


{'mdl__max_depth': 10, 'mdl__n_estimators': 100}

In [18]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_mdl__max_depth,param_mdl__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.14595,0.004595,0.017148,0.002093,10,100,"{'mdl__max_depth': 10, 'mdl__n_estimators': 100}",1.0,1.0,1.0,...,1.0,0.0,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1,0.273559,0.009257,0.02838,0.005649,10,200,"{'mdl__max_depth': 10, 'mdl__n_estimators': 200}",1.0,1.0,1.0,...,1.0,0.0,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
2,0.394201,0.009893,0.032712,0.000299,10,300,"{'mdl__max_depth': 10, 'mdl__n_estimators': 300}",1.0,1.0,1.0,...,1.0,0.0,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3,0.149795,0.002278,0.017249,0.001821,15,100,"{'mdl__max_depth': 15, 'mdl__n_estimators': 100}",1.0,1.0,1.0,...,1.0,0.0,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
4,0.26898,0.003319,0.024758,0.000595,15,200,"{'mdl__max_depth': 15, 'mdl__n_estimators': 200}",1.0,1.0,1.0,...,1.0,0.0,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
5,0.394832,0.009099,0.034321,0.00136,15,300,"{'mdl__max_depth': 15, 'mdl__n_estimators': 300}",1.0,1.0,1.0,...,1.0,0.0,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
6,0.142708,0.003452,0.015936,0.000257,20,100,"{'mdl__max_depth': 20, 'mdl__n_estimators': 100}",1.0,1.0,1.0,...,1.0,0.0,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
7,0.269797,0.00335,0.024405,0.00055,20,200,"{'mdl__max_depth': 20, 'mdl__n_estimators': 200}",1.0,1.0,1.0,...,1.0,0.0,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
8,0.390306,0.002244,0.036636,0.005418,20,300,"{'mdl__max_depth': 20, 'mdl__n_estimators': 300}",1.0,1.0,1.0,...,1.0,0.0,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
