###### Logistic Regression Testing and Evaluation Five

Testing Scaled Data Techniques on our best model (Model Two)

In [1]:
#import required libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from sklearn.metrics import classification_report
#supress warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
#import CSV file as pandas data frame 
data = pd.read_csv('healthcare-dataset-stroke-data.csv')
#Create copy of orginal Dataframe
data_original = data.copy(deep = True)
#rename columns
data = data.rename(columns = {'id':'Identification Number','gender':'Gender','age':'Age',
                              'hypertension':'Hypertension','heart_disease':'Heart Disease', 
                              'ever_married':'Marriage Status','work_type':'Work Type','Residence_type':'Residence Type',
                             'avg_glucose_level':'Average Glucose Level','bmi':'Body Mass Index',
                              'smoking_status':'Smoking Status','stroke':'Stroke'})
#data type conversions
data['Age'] = data['Age'].astype('int')
data['Hypertension'] = data['Hypertension'].astype('object')
data['Stroke'] = data['Stroke'].astype('object')
# Replace the missing values with mean of bmi attribute
data['Body Mass Index'].fillna(np.round(data['Body Mass Index'].mean(), 1), inplace = True)
# Deleting attribute (high cardinality)
data.drop(['Identification Number'], axis = 1, inplace = True)
#dataset head
data.head()

Unnamed: 0,Gender,Age,Hypertension,Heart Disease,Marriage Status,Work Type,Residence Type,Average Glucose Level,Body Mass Index,Smoking Status,Stroke
0,Male,67,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61,0,0,Yes,Self-employed,Rural,202.21,28.9,never smoked,1
2,Male,80,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


###### Min Max Scalling Data for LR Model Preparations

In [3]:
#Need to seperate all features for scaling into a new dataframe
scaling_df = data[['Age','Average Glucose Level','Body Mass Index']]
#Initiate minxmax Scaler Function
scalermm = MinMaxScaler() 
data_mm = scalermm.fit_transform(scaling_df) 
# Converting the numpy array into a pandas DataFrame 
data_minmaxscaled1 = pd.DataFrame(data_mm) 
# Renaming the columns 
data_minmaxscaled1.columns = scaling_df.columns 
data_minmaxscaled1

Unnamed: 0,Age,Average Glucose Level,Body Mass Index
0,0.817073,0.801265,0.301260
1,0.743902,0.679023,0.213058
2,0.975610,0.234512,0.254296
3,0.597561,0.536008,0.276060
4,0.963415,0.549349,0.156930
...,...,...,...
5105,0.975610,0.132167,0.213058
5106,0.987805,0.323516,0.340206
5107,0.426829,0.128658,0.232532
5108,0.621951,0.513203,0.175258


The conversions has been complete

###### Data Encoding for LR Model

In [20]:
#one-hot encode the Data
one_hot_encoded = pd.get_dummies(data)

#drop features that have been scaled above
one_hot_encoded.drop(['Age'], axis = 1, inplace = True)
one_hot_encoded.drop(['Average Glucose Level'], axis = 1, inplace = True)
one_hot_encoded.drop(['Body Mass Index'], axis = 1, inplace = True)

#join dfs
joins = pd.merge(data_minmaxscaled1,one_hot_encoded, on = one_hot_encoded.index, how = 'outer')
joins.drop(['key_0'], axis = 1, inplace = True)
joins.head()

Unnamed: 0,Age,Average Glucose Level,Body Mass Index,Heart Disease,Gender_Female,Gender_Male,Gender_Other,Hypertension_0,Hypertension_1,Marriage Status_No,...,Work Type_Self-employed,Work Type_children,Residence Type_Rural,Residence Type_Urban,Smoking Status_Unknown,Smoking Status_formerly smoked,Smoking Status_never smoked,Smoking Status_smokes,Stroke_0,Stroke_1
0,0.817073,0.801265,0.30126,1,0,1,0,1,0,0,...,0,0,0,1,0,1,0,0,0,1
1,0.743902,0.679023,0.213058,0,1,0,0,1,0,0,...,1,0,1,0,0,0,1,0,0,1
2,0.97561,0.234512,0.254296,1,0,1,0,1,0,0,...,0,0,1,0,0,0,1,0,0,1
3,0.597561,0.536008,0.27606,0,1,0,0,1,0,0,...,0,0,0,1,0,0,0,1,0,1
4,0.963415,0.549349,0.15693,0,1,0,0,0,1,0,...,1,0,1,0,0,0,1,0,0,1


###### Split Data into Train Test Partitions

In [21]:
#Performing our train test split on the data
train, test = train_test_split(one_hot_encoded,test_size = 0.25,random_state=42)
X_train = train.drop(['Heart Disease'],axis=1)
y_train = train.filter(['Heart Disease'])
X_test = test.drop(['Heart Disease'],axis=1)
y_test = test.filter(['Heart Disease'])
#print shape of x test
print('X_test shape:', X_test.shape)
print('X_train shape:', X_train.shape)
print('y_test shape:', y_test.shape)
print('y_train shape:', y_train.shape)

X_test shape: (1278, 20)
X_train shape: (3832, 20)
y_test shape: (1278, 1)
y_train shape: (3832, 1)


###### Builidng the Logisitc Regression Model

In [22]:
# Build a Logisitc Regression Model (Tuning Hyper-parameters)
Model_Two = LogisticRegression(random_state = 31, class_weight = 'balanced', 
                               max_iter = 500, penalty = 'l2', solver = 'newton-cg')

# fit the model with data (Training Model)
Model_Two.fit(X_train,y_train)

# Prediction using model (test Model)
Model_Predictions = Model_Two.predict(X_test)

###### Model Coefficient Analysis

In [23]:
#Print Model Coeffients
print(Model_Two.coef_)

[[-0.39461799  0.39461799  0.         -0.47399736  0.47399736 -0.40548945
   0.40548945  0.55616114 -0.82615317  0.61451268  1.283765   -1.62828565
  -0.02539908  0.02539908 -0.17130717  0.24163306 -0.29880998  0.22848409
  -0.68028467  0.68028467]]


In [24]:
##Tidy up Coefficients
#Get the original variable names
original_variables = list(X_train.columns)
#zip together the names and coefficients
zipped = list(zip(original_variables, Model_Two.coef_[0]))
coefs = [list (x) for x in zipped]
#Put into a dataframe wth column labels
coefficients_df = pd.DataFrame(coefs,columns = ['variables','coefficient'])
#Sort Values
coefficients_df.sort_values(by = ['coefficient'], axis = 0, inplace = True, ascending = False)
#Show dataframe
coefficients_df

Unnamed: 0,variables,coefficient
10,Work Type_Self-employed,1.283765
19,Stroke_1,0.680285
9,Work Type_Private,0.614513
7,Work Type_Govt_job,0.556161
4,Hypertension_1,0.473997
6,Marriage Status_Yes,0.405489
1,Gender_Male,0.394618
15,Smoking Status_formerly smoked,0.241633
17,Smoking Status_smokes,0.228484
13,Residence Type_Urban,0.025399


###### Model Evaluation Metrics

In [25]:
#Confusion Matrix
confusion_matrix = metrics.confusion_matrix(y_test, Model_Predictions)
cm_df = pd.DataFrame(confusion_matrix, columns = ['True Positive','False Positive'],
                    index = ['False Negative','True Negative'])
cm_df

Unnamed: 0,True Positive,False Positive
False Negative,810,403
True Negative,19,46


In [26]:
#classification report
target_names = ['class 0', 'class 1']
print(classification_report(y_test, Model_Predictions, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.98      0.67      0.79      1213
     class 1       0.10      0.71      0.18        65

    accuracy                           0.67      1278
   macro avg       0.54      0.69      0.49      1278
weighted avg       0.93      0.67      0.76      1278



In [27]:
#Evaluation Metrics
print("Accuracy:",metrics.accuracy_score(y_test, Model_Predictions))
print("Precision:",metrics.precision_score(y_test, Model_Predictions))
print("Recall:",metrics.recall_score(y_test, Model_Predictions))

Accuracy: 0.6697965571205008
Precision: 0.10244988864142539
Recall: 0.7076923076923077
