<h1>Logistic Regression Model</h1>

In [1]:
#Importing Necessary Libraries

import pandas as pd
import numpy as np
import io
import datetime
import matplotlib.pyplot as mtp
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import (
    accuracy_score, 
    roc_auc_score, 
    precision_score, 
    recall_score, 
    f1_score, 
    matthews_corrcoef
)
from sklearn.preprocessing import StandardScaler

In [2]:
# Load 'NPHA-doctor-visits.csv' into a DataFrame.
# The path is relative, so the file must sit in the notebook's working directory.
source_data = pd.read_csv(filepath_or_buffer='NPHA-doctor-visits.csv')

In [3]:
# Preview the first five rows to sanity-check the loaded columns
source_data.head(n=5)

Unnamed: 0,Number of Doctors Visited,Age,Phyiscal Health,Mental Health,Dental Health,Employment,Stress Keeps Patient from Sleeping,Medication Keeps Patient from Sleeping,Pain Keeps Patient from Sleeping,Bathroom Needs Keeps Patient from Sleeping,Uknown Keeps Patient from Sleeping,Trouble Sleeping,Prescription Sleep Medication,Race,Gender
0,3,2,4,3,3,3,0,0,0,0,1,2,3,1,2
1,2,2,4,2,3,3,1,0,0,1,0,3,3,1,1
2,3,2,3,2,3,3,0,0,0,0,1,3,3,4,1
3,1,2,3,2,3,3,0,0,0,1,0,3,3,4,2
4,3,2,3,3,3,3,1,0,0,0,0,2,3,1,2


In [27]:
# Separate the dependent variable (target) from the independent variables (features).
# `source` is only another name for `source_data`; no copy is made here.
source = source_data
# Target: the 'Stress Keeps Patient from Sleeping' column
Y = source['Stress Keeps Patient from Sleeping']
# Features: every remaining column
X = source.drop(columns='Stress Keeps Patient from Sleeping')

In [28]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [29]:
# Write the held-out feature rows to disk.
# index=False keeps the DataFrame index out of the CSV.
file_path = 'LogTestData.csv'
df = X_test
df.to_csv(path_or_buf=file_path, index=False)

In [30]:
# Standardise the features.
# The scaler is fitted on the training split only; the same fitted transformation
# is then applied to both splits, so no test-set statistics are used.
# NOTE: X_train / X_test are rebound to the scaler's output, so re-running this
# cell on its own would scale already-scaled data.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [31]:
# Build a logistic-regression classifier with scikit-learn's default settings
logregression = LogisticRegression()
# Train on the scaled training split; as the last expression, the fitted
# estimator is also displayed as the cell output.
logregression.fit(X=X_train, y=Y_train)

In [32]:
# Hard class labels for the held-out rows
Y_pred = logregression.predict(X_test)
# Column 1 of predict_proba: the probability assigned to the second class
# listed in logregression.classes_
Y_probs = logregression.predict_proba(X_test)[:, 1]

# Show the labels first, then the probabilities
print(Y_pred)
print(Y_probs)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1
 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1 0 0]
[0.40726958 0.28278818 0.09455611 0.15644202 0.39116134 0.44575629
 0.32399448 0.1846613  0.10253119 0.0923415  0.04201952 0.19914654
 0.17972397 0.3880835  0.06782125 0.1369613  0.04266846 0.04929667
 0.2090342  0.40147989 0.18945209 0.10052119 0.1913764  0.36588057
 0.48310857 0.51754757 0.24218127 0.13861516 0.07996749 0.44423154
 0.17631047 0.14852932 0.3404677  0.0472337  0.30449586 0.27232837
 0.59044807 0.76103706 0.02861898 0.39783588 0.02760465 0.04987694
 0.05226449 0.36185858 0.07549131 0.16325943 0.29442081 0.23481703
 0.62845482 0.20983251 0.11750228 0.11830328 0.49323496 0.490337
 0.20366812 0.46059922 0.26249694 0.58399478 0.21190972 0.37852522
 0.03985658 0.03827946 0.04718426 0.01684

In [33]:
# Pair each true label with its prediction, one row per test sample:
# column 0 = actual value, column 1 = predicted value
y_compare = np.column_stack((Y_test, Y_pred))
# Display the first five pairs
y_compare[:5]

array([[1, 0],
       [0, 0],
       [0, 0],
       [1, 0],
       [0, 0]])

In [36]:
# Metrics computed from the hard class predictions (Y_pred)
accuracy = accuracy_score(y_true=Y_test, y_pred=Y_pred)
precision = precision_score(y_true=Y_test, y_pred=Y_pred)
recall = recall_score(y_true=Y_test, y_pred=Y_pred)
f1score = f1_score(y_true=Y_test, y_pred=Y_pred)
mccscore = matthews_corrcoef(y_true=Y_test, y_pred=Y_pred)

# ROC AUC is computed from the predicted probabilities (Y_probs), not the labels
aucscore = roc_auc_score(y_true=Y_test, y_score=Y_probs)

In [38]:
# Report every evaluation metric computed for the held-out split, to 4 decimals.
# The labels are padded to a common width so the values line up in one column.
print(f"Accuracy:  {accuracy:.4f}")
print(f"AUCScore:  {aucscore:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1score:.4f}")
# The Matthews correlation coefficient was computed alongside the other metrics
# but was previously left out of the report.
print(f"MCC:       {mccscore:.4f}")

Accuracy:  0.7063
AUCScore:  0.7221
Precision: 0.3913
Recall:    0.2432
F1-Score:  0.3000
