##### Logistic Regression Testing and Evaluation Two

In [3]:
#import required libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report
#supress warnings
import warnings
warnings.filterwarnings('ignore')

In [4]:
#import CSV file as pandas data frame 
data = pd.read_csv('healthcare-dataset-stroke-data.csv')
#Create copy of orginal Dataframe
data_original = data.copy(deep = True)
#rename columns
data = data.rename(columns = {'id':'Identification Number','gender':'Gender','age':'Age',
                              'hypertension':'Hypertension','heart_disease':'Heart Disease', 
                              'ever_married':'Marriage Status','work_type':'Work Type','Residence_type':'Residence Type',
                             'avg_glucose_level':'Average Glucose Level','bmi':'Body Mass Index',
                              'smoking_status':'Smoking Status','stroke':'Stroke'})
#data type conversions
data['Age'] = data['Age'].astype('int')
data['Hypertension'] = data['Hypertension'].astype('object')
data['Stroke'] = data['Stroke'].astype('object')
# Replace the missing values with mean of bmi attribute
data['Body Mass Index'].fillna(np.round(data['Body Mass Index'].mean(), 1), inplace = True)
# Deleting attribute (high cardinality)
data.drop(['Identification Number'], axis = 1, inplace = True)
#dataset head
data.head()

Unnamed: 0,Gender,Age,Hypertension,Heart Disease,Marriage Status,Work Type,Residence Type,Average Glucose Level,Body Mass Index,Smoking Status,Stroke
0,Male,67,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61,0,0,Yes,Self-employed,Rural,202.21,28.9,never smoked,1
2,Male,80,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


###### Data Encoding for LR algorithim preparation

In [5]:
#one-hot encode the Data
one_hot_encoded = pd.get_dummies(data)
one_hot_encoded.head()

Unnamed: 0,Age,Heart Disease,Average Glucose Level,Body Mass Index,Gender_Female,Gender_Male,Gender_Other,Hypertension_0,Hypertension_1,Marriage Status_No,...,Work Type_Self-employed,Work Type_children,Residence Type_Rural,Residence Type_Urban,Smoking Status_Unknown,Smoking Status_formerly smoked,Smoking Status_never smoked,Smoking Status_smokes,Stroke_0,Stroke_1
0,67,1,228.69,36.6,0,1,0,1,0,0,...,0,0,0,1,0,1,0,0,0,1
1,61,0,202.21,28.9,1,0,0,1,0,0,...,1,0,1,0,0,0,1,0,0,1
2,80,1,105.92,32.5,0,1,0,1,0,0,...,0,0,1,0,0,0,1,0,0,1
3,49,0,171.23,34.4,1,0,0,1,0,0,...,0,0,0,1,0,0,0,1,0,1
4,79,0,174.12,24.0,1,0,0,0,1,0,...,1,0,1,0,0,0,1,0,0,1


###### Split data into train test partitions

In [6]:
#Performing our train test split on the data
train, test = train_test_split(one_hot_encoded,test_size = 0.25,random_state=42)
X_train = train.drop(['Heart Disease'],axis=1)
y_train = train.filter(['Heart Disease'])
X_test = test.drop(['Heart Disease'],axis=1)
y_test = test.filter(['Heart Disease'])
#print shape of x test
print('X_test shape:', X_test.shape)
print('X_train shape:', X_train.shape)
print('y_test shape:', y_test.shape)
print('y_train shape:', y_train.shape)

X_test shape: (1278, 23)
X_train shape: (3832, 23)
y_test shape: (1278, 1)
y_train shape: (3832, 1)


###### Building the Logistic Regression Model

In [7]:
# Build a Logisitc Regression Model (Tuning Hyper-parameters)
Model_Two = LogisticRegression(random_state = 31, class_weight = 'balanced', 
                               max_iter = 500, penalty = 'l2', solver = 'newton-cg')

# fit the model with data (Training Model)
Model_Two.fit(X_train,y_train)

# Prediction using model (test Model)
Model_Predictions = Model_Two.predict(X_test)

###### Model Coefficient Analysis

In [8]:
#Print Model Coeffients
print(Model_Two.coef_)

[[ 0.09044672  0.00631845  0.00257777 -0.42966943  0.42966627  0.
  -0.06339512  0.06339196  0.19265615 -0.19265932 -0.18287711 -0.23571526
  -0.17432902 -0.28547346  0.87839168 -0.05108782  0.05108466 -0.2455764
   0.04018871 -0.33978543  0.54516994 -0.12223157  0.1222284 ]]


In [10]:
##Tidy up Coefficients
#Get the original variable names
original_variables = list(X_train.columns)
#zip together the names and coefficients
zipped = list(zip(original_variables, Model_Two.coef_[0]))
coefs = [list (x) for x in zipped]
#Put into a dataframe wth column labels
coefficients_df = pd.DataFrame(coefs,columns = ['variables','coefficient'])
#Sort Values
coefficients_df.sort_values(by = ['coefficient'], axis = 0, inplace = True, ascending = False)
#Show dataframe
coefficients_df

Unnamed: 0,variables,coefficient
14,Work Type_children,0.878392
20,Smoking Status_smokes,0.54517
4,Gender_Male,0.429666
8,Marriage Status_No,0.192656
22,Stroke_1,0.122228
0,Age,0.090447
7,Hypertension_1,0.063392
16,Residence Type_Urban,0.051085
18,Smoking Status_formerly smoked,0.040189
1,Average Glucose Level,0.006318


##### Model Evaluation Metrics

In [18]:
#Confusion Matrix
confusion_matrix = metrics.confusion_matrix(y_test, Model_Predictions)
cm_df = pd.DataFrame(confusion_matrix, columns = ['True Positive','False Positive'],
                    index = ['False Negative','True Negative'])
cm_df

Unnamed: 0,True Positive,False Positive
False Negative,922,291
True Negative,14,51


In [15]:
#classification report
target_names = ['class 0', 'class 1']
print(classification_report(y_test, Model_Predictions, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.99      0.76      0.86      1213
     class 1       0.15      0.78      0.25        65

    accuracy                           0.76      1278
   macro avg       0.57      0.77      0.55      1278
weighted avg       0.94      0.76      0.83      1278



In [16]:
#Evaluation Metrics
print("Accuracy:",metrics.accuracy_score(y_test, Model_Predictions))
print("Precision:",metrics.precision_score(y_test, Model_Predictions))
print("Recall:",metrics.recall_score(y_test, Model_Predictions))

Accuracy: 0.7613458528951487
Precision: 0.14912280701754385
Recall: 0.7846153846153846
