##### Logistic Regression Testing and Evaluation One

In [1]:
#import required libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report
#supress warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
#import CSV file as pandas data frame 
data = pd.read_csv('healthcare-dataset-stroke-data.csv')
#Create copy of orginal Dataframe
data_original = data.copy(deep = True)
#rename columns
data = data.rename(columns = {'id':'Identification Number','gender':'Gender','age':'Age',
                              'hypertension':'Hypertension','heart_disease':'Heart Disease', 
                              'ever_married':'Marriage Status','work_type':'Work Type','Residence_type':'Residence Type',
                             'avg_glucose_level':'Average Glucose Level','bmi':'Body Mass Index',
                              'smoking_status':'Smoking Status','stroke':'Stroke'})
#data type conversions
data['Age'] = data['Age'].astype('int')
data['Hypertension'] = data['Hypertension'].astype('object')
data['Stroke'] = data['Stroke'].astype('object')
# Replace the missing values with mean of bmi attribute
data['Body Mass Index'].fillna(np.round(data['Body Mass Index'].mean(), 1), inplace = True)
# Deleting attribute (high cardinality)
data.drop(['Identification Number'], axis = 1, inplace = True)
#dataset head
data.head()

Unnamed: 0,Gender,Age,Hypertension,Heart Disease,Marriage Status,Work Type,Residence Type,Average Glucose Level,Body Mass Index,Smoking Status,Stroke
0,Male,67,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61,0,0,Yes,Self-employed,Rural,202.21,28.9,never smoked,1
2,Male,80,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


###### Data Encoding for LR algorithim preparation

In [3]:
#one-hot encode the Data
one_hot_encoded = pd.get_dummies(data)
one_hot_encoded.head()

Unnamed: 0,Age,Heart Disease,Average Glucose Level,Body Mass Index,Gender_Female,Gender_Male,Gender_Other,Hypertension_0,Hypertension_1,Marriage Status_No,...,Work Type_Self-employed,Work Type_children,Residence Type_Rural,Residence Type_Urban,Smoking Status_Unknown,Smoking Status_formerly smoked,Smoking Status_never smoked,Smoking Status_smokes,Stroke_0,Stroke_1
0,67,1,228.69,36.6,0,1,0,1,0,0,...,0,0,0,1,0,1,0,0,0,1
1,61,0,202.21,28.9,1,0,0,1,0,0,...,1,0,1,0,0,0,1,0,0,1
2,80,1,105.92,32.5,0,1,0,1,0,0,...,0,0,1,0,0,0,1,0,0,1
3,49,0,171.23,34.4,1,0,0,1,0,0,...,0,0,0,1,0,0,0,1,0,1
4,79,0,174.12,24.0,1,0,0,0,1,0,...,1,0,1,0,0,0,1,0,0,1


In [5]:
one_hot_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 24 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Age                             5110 non-null   int32  
 1   Heart Disease                   5110 non-null   int64  
 2   Average Glucose Level           5110 non-null   float64
 3   Body Mass Index                 5110 non-null   float64
 4   Gender_Female                   5110 non-null   uint8  
 5   Gender_Male                     5110 non-null   uint8  
 6   Gender_Other                    5110 non-null   uint8  
 7   Hypertension_0                  5110 non-null   uint8  
 8   Hypertension_1                  5110 non-null   uint8  
 9   Marriage Status_No              5110 non-null   uint8  
 10  Marriage Status_Yes             5110 non-null   uint8  
 11  Work Type_Govt_job              5110 non-null   uint8  
 12  Work Type_Never_worked          51

###### Split data into train test partitions

In [6]:
#Performing our train test split on the data
train, test = train_test_split(one_hot_encoded,test_size = 0.25,random_state=42)
X_train = train.drop(['Heart Disease'],axis=1)
y_train = train.filter(['Heart Disease'])
X_test = test.drop(['Heart Disease'],axis=1)
y_test = test.filter(['Heart Disease'])
#print shape of x test
print('X_test shape:', X_test.shape)
print('X_train shape:', X_train.shape)
print('y_test shape:', y_test.shape)
print('y_train shape:', y_train.shape)

X_test shape: (1278, 23)
X_train shape: (3832, 23)
y_test shape: (1278, 1)
y_train shape: (3832, 1)


###### Building Logisitic Regression Model

In [7]:
# Build a Logisitc Regression Model (using the default parameters)
Model_One = LogisticRegression()

# fit the model with data (Training Model)
Model_One.fit(X_train,y_train)

# Prediction using model (test Model)
Model_Predictions = Model_One.predict(X_test)

###### Model Coefficient Analysis

In [8]:
#Print Model Coeffients
print(Model_One.coef_)

[[ 0.07622312  0.00554565 -0.00880284 -1.25836059 -0.53902182  0.
  -1.01853833 -0.77884408 -0.70875545 -1.08862697 -0.50471733 -0.03080798
  -0.31346894 -0.37531726 -0.5730709  -0.81036059 -0.98702182 -0.55965564
  -0.51138553 -0.76959496  0.04325372 -1.06264982 -0.73473259]]


In [9]:
##Tidy up Coefficients
#Get the original variable names
original_variables = list(X_train.columns)
#zip together the names and coefficients
zipped = list(zip(original_variables, Model_One.coef_[0]))
coefs = [list (x) for x in zipped]
#Put into a dataframe wth column labels
coefficients_df = pd.DataFrame(coefs,columns = ['variables','coefficient'])
#Sort Values
coefficients_df.sort_values(by = ['coefficient'], axis = 0, inplace = True, ascending = False)
#Show dataframe
coefficients_df

Unnamed: 0,variables,coefficient
0,Age,0.076223
20,Smoking Status_smokes,0.043254
1,Average Glucose Level,0.005546
5,Gender_Other,0.0
2,Body Mass Index,-0.008803
11,Work Type_Never_worked,-0.030808
12,Work Type_Private,-0.313469
13,Work Type_Self-employed,-0.375317
10,Work Type_Govt_job,-0.504717
18,Smoking Status_formerly smoked,-0.511386


###### Model Evaluation Metrics

In [12]:
#Confusion Matrix
confusion_matrix = metrics.confusion_matrix(y_test, Model_Predictions)
cm_df = pd.DataFrame(confusion_matrix, columns = ['True Positive','False Positive'],
                    index = ['False Negative','True Negative'])
cm_df

Unnamed: 0,True Positive,False Positive
False Negative,1210,3
True Negative,62,3


In [26]:
#classification report
target_names = ['class 0', 'class 1']
print(classification_report(y_test, Model_Predictions, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.95      1.00      0.97      1213
     class 1       0.50      0.05      0.08        65

    accuracy                           0.95      1278
   macro avg       0.73      0.52      0.53      1278
weighted avg       0.93      0.95      0.93      1278



In [27]:
#Evaluation Metrics
print("Accuracy:",metrics.accuracy_score(y_test, Model_Predictions))
print("Precision:",metrics.precision_score(y_test, Model_Predictions))
print("Recall:",metrics.recall_score(y_test, Model_Predictions))

Accuracy: 0.9491392801251957
Precision: 0.5
Recall: 0.046153846153846156
