##### Logistic Regression Testing and Evaluation Four

In [11]:
#import required libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split, RepeatedKFold
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report
#supress warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
#import CSV file as pandas data frame 
data = pd.read_csv('healthcare-dataset-stroke-data.csv')
#Create copy of orginal Dataframe
data_original = data.copy(deep = True)
#rename columns
data = data.rename(columns = {'id':'Identification Number','gender':'Gender','age':'Age',
                              'hypertension':'Hypertension','heart_disease':'Heart Disease', 
                              'ever_married':'Marriage Status','work_type':'Work Type','Residence_type':'Residence Type',
                             'avg_glucose_level':'Average Glucose Level','bmi':'Body Mass Index',
                              'smoking_status':'Smoking Status','stroke':'Stroke'})
#data type conversions
data['Age'] = data['Age'].astype('int')
data['Hypertension'] = data['Hypertension'].astype('object')
data['Stroke'] = data['Stroke'].astype('object')
# Replace the missing values with mean of bmi attribute
data['Body Mass Index'].fillna(np.round(data['Body Mass Index'].mean(), 1), inplace = True)
# Deleting attribute (high cardinality)
data.drop(['Identification Number'], axis = 1, inplace = True)
#dataset head
data.head()

Unnamed: 0,Gender,Age,Hypertension,Heart Disease,Marriage Status,Work Type,Residence Type,Average Glucose Level,Body Mass Index,Smoking Status,Stroke
0,Male,67,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61,0,0,Yes,Self-employed,Rural,202.21,28.9,never smoked,1
2,Male,80,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


###### Data Encoding for LR Algorithim

In [3]:
#one-hot encode the Data
one_hot_encoded = pd.get_dummies(data)
one_hot_encoded.head()

Unnamed: 0,Age,Heart Disease,Average Glucose Level,Body Mass Index,Gender_Female,Gender_Male,Gender_Other,Hypertension_0,Hypertension_1,Marriage Status_No,...,Work Type_Self-employed,Work Type_children,Residence Type_Rural,Residence Type_Urban,Smoking Status_Unknown,Smoking Status_formerly smoked,Smoking Status_never smoked,Smoking Status_smokes,Stroke_0,Stroke_1
0,67,1,228.69,36.6,0,1,0,1,0,0,...,0,0,0,1,0,1,0,0,0,1
1,61,0,202.21,28.9,1,0,0,1,0,0,...,1,0,1,0,0,0,1,0,0,1
2,80,1,105.92,32.5,0,1,0,1,0,0,...,0,0,1,0,0,0,1,0,0,1
3,49,0,171.23,34.4,1,0,0,1,0,0,...,0,0,0,1,0,0,0,1,0,1
4,79,0,174.12,24.0,1,0,0,0,1,0,...,1,0,1,0,0,0,1,0,0,1


##### Split data into train test partitions

In [4]:
#Performing our train test split on the data
train, test = train_test_split(one_hot_encoded,test_size = 0.25,random_state=42)
X_train = train.drop(['Heart Disease'],axis=1)
y_train = train.filter(['Heart Disease'])
X_test = test.drop(['Heart Disease'],axis=1)
y_test = test.filter(['Heart Disease'])
#print shape of x test
print('X_test shape:', X_test.shape)
print('X_train shape:', X_train.shape)
print('y_test shape:', y_test.shape)
print('y_train shape:', y_train.shape)

X_test shape: (1278, 23)
X_train shape: (3832, 23)
y_test shape: (1278, 1)
y_train shape: (3832, 1)


##### Building the Logistic Regression Model
Performing parameter grid searches

In [5]:
#LR object
Model_Four = LogisticRegression(random_state = 31, max_iter = 700)

#Logisict Regression Optimal Parameter Search Settings
LRparameter_grid = {
    'C' : [0.001, 0.01, 0.1, 1, 10, 100, 150, 200],
    'penalty' : ['l1','l2'],
    'max_iter' : list(range(100,800,1000)),
    'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

#Logisitc Regression Optimal Parameter Grid Search
LR_search = GridSearchCV(Model_Four, param_grid=LRparameter_grid, refit = True, verbose= 3, cv=5)

# fitting the model for grid search 
LR_search.fit(X_train , y_train)
LR_search.best_params_
# summarize
print('Mean Accuracy: %.3f' % LR_search.best_score_)
print('Config: %s' % LR_search.best_params_)

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[CV] C=0.001, max_iter=100, penalty=l1, solver=newton-cg .............
[CV]  C=0.001, max_iter=100, penalty=l1, solver=newton-cg, score=nan, total=   0.0s
[CV] C=0.001, max_iter=100, penalty=l1, solver=newton-cg .............
[CV]  C=0.001, max_iter=100, penalty=l1, solver=newton-cg, score=nan, total=   0.0s
[CV] C=0.001, max_iter=100, penalty=l1, solver=newton-cg .............
[CV]  C=0.001, max_iter=100, penalty=l1, solver=newton-cg, score=nan, total=   0.0s
[CV] C=0.001, max_iter=100, penalty=l1, solver=newton-cg .............
[CV]  C=0.001, max_iter=100, penalty=l1, solver=newton-cg, score=nan, total=   0.0s
[CV] C=0.001, max_iter=100, penalty=l1, solver=newton-cg .............
[CV]  C=0.001, max_iter=100, penalty=l1, solver=newton-cg, score=nan, total=   0.0s
[CV] C=0.001, max_iter=100, penalty=l1, solver=lbfgs .................
[CV]  C=0.001, max_iter=100, penalty=l1, solver=lbfgs, score=nan, total=   0.0s
[CV] C=0.001

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV]  C=0.001, max_iter=100, penalty=l1, solver=saga, score=0.944, total=   0.1s
[CV] C=0.001, max_iter=100, penalty=l1, solver=saga ..................
[CV]  C=0.001, max_iter=100, penalty=l1, solver=saga, score=0.945, total=   0.1s
[CV] C=0.001, max_iter=100, penalty=l1, solver=saga ..................
[CV]  C=0.001, max_iter=100, penalty=l1, solver=saga, score=0.945, total=   0.1s
[CV] C=0.001, max_iter=100, penalty=l1, solver=saga ..................
[CV]  C=0.001, max_iter=100, penalty=l1, solver=saga, score=0.945, total=   0.1s
[CV] C=0.001, max_iter=100, penalty=l2, solver=newton-cg .............
[CV]  C=0.001, max_iter=100, penalty=l2, solver=newton-cg, score=0.945, total=   0.1s
[CV] C=0.001, max_iter=100, penalty=l2, solver=newton-cg .............
[CV]  C=0.001, max_iter=100, penalty=l2, solver=newton-cg, score=0.944, total=   0.1s
[CV] C=0.001, max_iter=100, penalty=l2, solver=newton-cg .............
[CV]  C=0.001, max_iter=100, penalty=l2, solver=newton-cg, score=0.945, total=

[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:   22.7s finished


As we can see the best performing model from the extensive hyperparamter grid search returned:

- Mean Accuracy: 0.945
- Config: {'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}

##### Optimal Model from Hyperparamter Tune Grid Search

In [14]:
# Build a Logisitc Regression Model (Tuning Hyper-parameters)
Model_Four = LogisticRegression(random_state = 31, C = 0.1, max_iter = 100, solver='liblinear')

# fit the model with data (Training Model)
Model_Four.fit(X_train,y_train)

# Prediction using model (test Model)
Model_Predictions = Model_Four.predict(X_test)

###### Model Coefficient Analysis

In [15]:
#Print Model Coeffients
print(Model_Four.coef_)

[[ 0.0568388   0.00479633 -0.05531089 -0.86855507 -0.22615862  0.
  -0.75235436 -0.34235932 -0.45315441 -0.64155927 -0.3420426  -0.02046839
  -0.27139286 -0.22617307 -0.23463677 -0.48913488 -0.60557881 -0.38462403
  -0.24404809 -0.55621126  0.09016969 -0.81695969 -0.277754  ]]


In [16]:
##Tidy up Coefficients
#Get the original variable names
original_variables = list(X_train.columns)
#zip together the names and coefficients
zipped = list(zip(original_variables, Model_Four.coef_[0]))
coefs = [list (x) for x in zipped]
#Put into a dataframe wth column labels
coefficients_df = pd.DataFrame(coefs,columns = ['variables','coefficient'])
#Sort Values
coefficients_df.sort_values(by = ['coefficient'], axis = 0, inplace = True, ascending = False)
#Show dataframe
coefficients_df

Unnamed: 0,variables,coefficient
20,Smoking Status_smokes,0.09017
0,Age,0.056839
1,Average Glucose Level,0.004796
5,Gender_Other,0.0
11,Work Type_Never_worked,-0.020468
2,Body Mass Index,-0.055311
4,Gender_Male,-0.226159
13,Work Type_Self-employed,-0.226173
14,Work Type_children,-0.234637
18,Smoking Status_formerly smoked,-0.244048


###### Model Evaluation Metrics

In [17]:
#Confusion Matrix
confusion_matrix = metrics.confusion_matrix(y_test, Model_Predictions)
cm_df = pd.DataFrame(confusion_matrix, columns = ['True Positive','False Positive'],
                    index = ['False Negative','True Negative'])
cm_df

Unnamed: 0,True Positive,False Positive
False Negative,1211,2
True Negative,63,2


In [18]:
#classification report
target_names = ['class 0', 'class 1']
print(classification_report(y_test, Model_Predictions, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.95      1.00      0.97      1213
     class 1       0.50      0.03      0.06        65

    accuracy                           0.95      1278
   macro avg       0.73      0.51      0.52      1278
weighted avg       0.93      0.95      0.93      1278



In [19]:
#Evaluation Metrics
print("Accuracy:",metrics.accuracy_score(y_test, Model_Predictions))
print("Precision:",metrics.precision_score(y_test, Model_Predictions))
print("Recall:",metrics.recall_score(y_test, Model_Predictions))

Accuracy: 0.9491392801251957
Precision: 0.5
Recall: 0.03076923076923077
