##### Logistic Regression Testing and Evaluation Three

In [1]:
#import required libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split, RepeatedKFold
from sklearn import metrics
from sklearn.metrics import classification_report
#supress warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
#import CSV file as pandas data frame 
data = pd.read_csv('healthcare-dataset-stroke-data.csv')
#Create copy of orginal Dataframe
data_original = data.copy(deep = True)
#rename columns
data = data.rename(columns = {'id':'Identification Number','gender':'Gender','age':'Age',
                              'hypertension':'Hypertension','heart_disease':'Heart Disease', 
                              'ever_married':'Marriage Status','work_type':'Work Type','Residence_type':'Residence Type',
                             'avg_glucose_level':'Average Glucose Level','bmi':'Body Mass Index',
                              'smoking_status':'Smoking Status','stroke':'Stroke'})
#data type conversions
data['Age'] = data['Age'].astype('int')
data['Hypertension'] = data['Hypertension'].astype('object')
data['Stroke'] = data['Stroke'].astype('object')
# Replace the missing values with mean of bmi attribute
data['Body Mass Index'].fillna(np.round(data['Body Mass Index'].mean(), 1), inplace = True)
# Deleting attribute (high cardinality)
data.drop(['Identification Number'], axis = 1, inplace = True)
#dataset head
data.head()

Unnamed: 0,Gender,Age,Hypertension,Heart Disease,Marriage Status,Work Type,Residence Type,Average Glucose Level,Body Mass Index,Smoking Status,Stroke
0,Male,67,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61,0,0,Yes,Self-employed,Rural,202.21,28.9,never smoked,1
2,Male,80,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


###### Data Encoding for LR algorithim preparation

In [3]:
#one-hot encode the Data
one_hot_encoded = pd.get_dummies(data)
one_hot_encoded.head()

Unnamed: 0,Age,Heart Disease,Average Glucose Level,Body Mass Index,Gender_Female,Gender_Male,Gender_Other,Hypertension_0,Hypertension_1,Marriage Status_No,...,Work Type_Self-employed,Work Type_children,Residence Type_Rural,Residence Type_Urban,Smoking Status_Unknown,Smoking Status_formerly smoked,Smoking Status_never smoked,Smoking Status_smokes,Stroke_0,Stroke_1
0,67,1,228.69,36.6,0,1,0,1,0,0,...,0,0,0,1,0,1,0,0,0,1
1,61,0,202.21,28.9,1,0,0,1,0,0,...,1,0,1,0,0,0,1,0,0,1
2,80,1,105.92,32.5,0,1,0,1,0,0,...,0,0,1,0,0,0,1,0,0,1
3,49,0,171.23,34.4,1,0,0,1,0,0,...,0,0,0,1,0,0,0,1,0,1
4,79,0,174.12,24.0,1,0,0,0,1,0,...,1,0,1,0,0,0,1,0,0,1


##### Split data into train test partitions

In [4]:
#Performing our train test split on the data
train, test = train_test_split(one_hot_encoded,test_size = 0.25,random_state=42)
X_train = train.drop(['Heart Disease'],axis=1)
y_train = train.filter(['Heart Disease'])
X_test = test.drop(['Heart Disease'],axis=1)
y_test = test.filter(['Heart Disease'])
#print shape of x test
print('X_test shape:', X_test.shape)
print('X_train shape:', X_train.shape)
print('y_test shape:', y_test.shape)
print('y_train shape:', y_train.shape)

X_test shape: (1278, 23)
X_train shape: (3832, 23)
y_test shape: (1278, 1)
y_train shape: (3832, 1)


##### Building the Logistic Regression Model
Performing Grid Searches

In [5]:
#Calling Logistic Regression Model Object
Model_Three = LogisticRegression(random_state = 31, max_iter = 500)

In [6]:
#Tune Grid for C parameter
tune_grid = {
 'C' : np.arange(0.01, 0.99, 0.01)
}
tune_grid


{'C': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11,
        0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21, 0.22,
        0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32, 0.33,
        0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43, 0.44,
        0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54, 0.55,
        0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65, 0.66,
        0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77,
        0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88,
        0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98])}

In [7]:
#Setting up 10 Fold - Cross validation process
cv = RepeatedKFold(n_splits=10, n_repeats=5, random_state = 31)

In [8]:
#Implementing Grid Search
opt = GridSearchCV(
 Model_Three, tune_grid, scoring='f1', 
 cv=cv, n_jobs=-1)
opt_results = opt.fit(X_train, y_train['Heart Disease'])


In [9]:
#F1 Scoring Metric
format_string = 'Average cross-validated in-sample F1 score {:.3f} {}'
print(format_string.format(opt_results.best_score_,str(opt_results.best_params_)))

Average cross-validated in-sample F1 score 0.050 {'C': 0.8}


###### Optimal Model from Hyperparamter Tune Grid Search

In [10]:
# Build a Logisitc Regression Model (Tuning Hyper-parameters)
Model_Three = LogisticRegression(random_state = 31, C = opt_results.best_params_['C'], max_iter = 500, solver='liblinear')

# fit the model with data (Training Model)
Model_Three.fit(X_train,y_train)

# Prediction using model (test Model)
Model_Predictions = Model_Three.predict(X_test)

###### Model Coefficient Analysis

In [11]:
#Print Model Coeffients
print(Model_Three.coef_)

[[ 0.07502336  0.00540942 -0.01186705 -1.23089962 -0.5148534   0.
  -0.99611204 -0.74964098 -0.72155623 -1.02419679 -0.60165244 -0.05335747
  -0.42479133 -0.5134405  -0.15251128 -0.79607557 -0.94967745 -0.54755677
  -0.47842491 -0.76485738  0.04508604 -1.05496423 -0.69078879]]


In [12]:
##Tidy up Coefficients
#Get the original variable names
original_variables = list(X_train.columns)
#zip together the names and coefficients
zipped = list(zip(original_variables, Model_Three.coef_[0]))
coefs = [list (x) for x in zipped]
#Put into a dataframe wth column labels
coefficients_df = pd.DataFrame(coefs,columns = ['variables','coefficient'])
#Sort Values
coefficients_df.sort_values(by = ['coefficient'], axis = 0, inplace = True, ascending = False)
#Show dataframe
coefficients_df

Unnamed: 0,variables,coefficient
0,Age,0.075023
20,Smoking Status_smokes,0.045086
1,Average Glucose Level,0.005409
5,Gender_Other,0.0
2,Body Mass Index,-0.011867
11,Work Type_Never_worked,-0.053357
14,Work Type_children,-0.152511
12,Work Type_Private,-0.424791
18,Smoking Status_formerly smoked,-0.478425
13,Work Type_Self-employed,-0.513441


##### Model Evaluation Metrics

In [13]:
#Confusion Matrix
confusion_matrix = metrics.confusion_matrix(y_test, Model_Predictions)
cm_df = pd.DataFrame(confusion_matrix, columns = ['True Positive','False Positive'],
                    index = ['False Negative','True Negative'])
cm_df

Unnamed: 0,True Positive,False Positive
False Negative,1209,4
True Negative,62,3


In [14]:
#classification report
target_names = ['class 0', 'class 1']
print(classification_report(y_test, Model_Predictions, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.95      1.00      0.97      1213
     class 1       0.43      0.05      0.08        65

    accuracy                           0.95      1278
   macro avg       0.69      0.52      0.53      1278
weighted avg       0.92      0.95      0.93      1278



In [15]:
#Evaluation Metrics
print("Accuracy:",metrics.accuracy_score(y_test, Model_Predictions))
print("Precision:",metrics.precision_score(y_test, Model_Predictions))
print("Recall:",metrics.recall_score(y_test, Model_Predictions))

Accuracy: 0.9483568075117371
Precision: 0.42857142857142855
Recall: 0.046153846153846156


##### Testing Model on the Training Set