### Building a Heart Disease Prediction using the Regression Model

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


#### Data Collection and Processing

In [2]:
heart_data=pd.read_csv("healthcare-dataset-stroke-data.csv")

In [3]:
heart_data

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [4]:
heart_data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [5]:
heart_data.tail()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.2,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0
5109,44679,Female,44.0,0,0,Yes,Govt_job,Urban,85.28,26.2,Unknown,0


In [6]:
heart_data.shape


(5110, 12)

In [7]:
#getting some info about the data
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [8]:
#describle the avg,all by using describle
heart_data.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [9]:
#handling missing values
heart_data.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [31]:
heart_data=heart_data.drop('id',axis=1)

In [32]:
mean=heart_data['bmi'].mean()
print(mean)
heart_data['bmi']=heart_data['bmi'].fillna(mean)

28.893236911794663


In [33]:
#checking the distribution of heart_disease variable
heart_data['heart_disease'].value_counts()

heart_disease
0    4834
1     276
Name: count, dtype: int64

In [34]:
from sklearn.preprocessing import LabelEncoder
label_encoder=LabelEncoder()
numeric_data=label_encoder.fit_transform(heart_data['work_type'])
residencal_values=label_encoder.fit_transform(heart_data['Residence_type'])
smoking_values=label_encoder.fit_transform(heart_data['smoking_status'])
ever_married_values=label_encoder.fit_transform(heart_data['ever_married'])
gender_values=label_encoder.fit_transform(heart_data['gender'])

In [35]:
heart_data['work_type']=numeric_data

In [36]:
heart_data['Residence_type']=residencal_values
heart_data['smoking_status']=smoking_values
heart_data['ever_married']=ever_married_values
heart_data['gender']=gender_values
heart_data

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,2,1,228.69,36.600000,1,1
1,0,61.0,0,0,1,3,0,202.21,28.893237,2,1
2,1,80.0,0,1,1,2,0,105.92,32.500000,2,1
3,0,49.0,0,0,1,2,1,171.23,34.400000,3,1
4,0,79.0,1,0,1,3,0,174.12,24.000000,2,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,0,80.0,1,0,1,2,1,83.75,28.893237,2,0
5106,0,81.0,0,0,1,3,1,125.20,40.000000,2,0
5107,0,35.0,0,0,1,3,0,82.99,30.600000,2,0
5108,1,51.0,0,0,1,2,0,166.29,25.600000,1,0


### 1---> Defective Health
0 ---> Healthy Health

In [37]:
X=heart_data.drop(columns='stroke',axis=1)
Y=heart_data['stroke']
print(X,Y)

      gender   age  hypertension  heart_disease  ever_married  work_type  \
0          1  67.0             0              1             1          2   
1          0  61.0             0              0             1          3   
2          1  80.0             0              1             1          2   
3          0  49.0             0              0             1          2   
4          0  79.0             1              0             1          3   
...      ...   ...           ...            ...           ...        ...   
5105       0  80.0             1              0             1          2   
5106       0  81.0             0              0             1          3   
5107       0  35.0             0              0             1          3   
5108       1  51.0             0              0             1          2   
5109       0  44.0             0              0             1          0   

      Residence_type  avg_glucose_level        bmi  smoking_status  
0                 

In [38]:
#Splitting the data into Training data & Test Data
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=3)

In [39]:
print(X_train,X_test,Y_train,Y_test)

      gender   age  hypertension  heart_disease  ever_married  work_type  \
3186       0  68.0             0              0             1          0   
3348       0  19.0             0              0             0          2   
2313       1  18.0             0              0             0          2   
1832       0  45.0             0              0             1          2   
1060       0  19.0             0              0             0          2   
...      ...   ...           ...            ...           ...        ...   
5036       1  56.0             0              0             1          3   
5085       0  72.0             0              0             1          2   
2316       0  27.0             0              0             1          2   
2400       0  33.0             0              0             1          0   
4339       0  66.0             0              0             1          2   

      Residence_type  avg_glucose_level   bmi  smoking_status  
3186               0   

In [40]:
print(X_train.shape,X_test.shape,Y_train.shape,Y_test.shape)

(4088, 10) (1022, 10) (4088,) (1022,)


### Logistic Regression

In [41]:
#training the Logistic Regression model with training data
print(heart_data)
model=LogisticRegression()
model.fit(X_train,Y_train)

      gender   age  hypertension  heart_disease  ever_married  work_type  \
0          1  67.0             0              1             1          2   
1          0  61.0             0              0             1          3   
2          1  80.0             0              1             1          2   
3          0  49.0             0              0             1          2   
4          0  79.0             1              0             1          3   
...      ...   ...           ...            ...           ...        ...   
5105       0  80.0             1              0             1          2   
5106       0  81.0             0              0             1          3   
5107       0  35.0             0              0             1          3   
5108       1  51.0             0              0             1          2   
5109       0  44.0             0              0             1          0   

      Residence_type  avg_glucose_level        bmi  smoking_status  stroke  
0         

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Model Evaluation
#accuracy on traning data


In [42]:
X_train_prediction=model.predict(X_train)
training_data_accuracy=accuracy_score(X_train_prediction,Y_train)


In [43]:
print('Accuracy on Training data:',training_data_accuracy)

Accuracy on Training data: 0.9518101761252447


In [44]:
#accuracy score on test data
X_test_prediction=model.predict(X_test)
testing_data_accuracy=accuracy_score(X_test_prediction,Y_test)
print("Accuracy score on test data:",testing_data_accuracy)


Accuracy score on test data: 0.949119373776908


In [45]:
#this model is overfitting

# Building a Predictive System


In [48]:
input_data=(0,61.0,0,0,1,3,0,202.21,28.893237,2)
import_data_as_numpy_arrays=np.asarray(input_data)

In [49]:
input_data_reshaped=import_data_as_numpy_arrays.reshape(1,-1)
prediction=model.predict(input_data_reshaped)
print(prediction)
if (prediction[0] == 0):
    print("the person does not have a stroke , the person is healthy ")
else:
    print("the person have Heart stroke")


[0]
the person does not have a stroke , the person is healthy 


