# 1. Background information of dataset

Attribute Information
1) id: unique identifier
2) gender: "Male", "Female" or "Other"
3) age: age of the patient
4) hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension
5) heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease
6) ever_married: "No" or "Yes"
7) work_type: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"
8) Residence_type: "Rural" or "Urban"
9) avg_glucose_level: average glucose level in blood
10) bmi: body mass index
11) smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"*
12) stroke: 1 if the patient had a stroke or 0 if not
*Note: "Unknown" in smoking_status means that the information is unavailable for this patient

# 2. Libraries and Packages

In [1]:
# Import general packages - numpy, pandas, seaborn, matplotlib
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

# Import the LinearRegression model and supporting utilities from Scikit-Learn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Apply seaborn's default plot theme. The call parentheses are required:
# a bare `sb.set` only references the function and changes nothing.
sb.set()

# Create a Linear Regression object
linreg = LinearRegression()

In [2]:
# Import the data set from the CSV file (path is relative to the working directory)

sourcedata = pd.read_csv('healthcare-dataset-stroke-data.csv')
# Preview the first five rows
sourcedata.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [3]:
# Preview the last five rows of the data set
sourcedata.tail()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.2,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0
5109,44679,Female,44.0,0,0,Yes,Govt_job,Urban,85.28,26.2,Unknown,0


#### Observing the first 5 rows of the dataset, we can see that some values were not recorded in the survey. For example, some values under the column "bmi" are empty and shown as "NaN". To handle these missing values, we will remove the rows that contain them to clean the data. We will not fill these values with estimates, as doing so risks compromising the integrity of the data: we would then be working from assumptions rather than actual observations.

#### Also, under the column "smoking_status", some survey respondents did not indicate their smoking history, so the value is recorded as "Unknown". We will also clean the dataset of these values by removing the respective rows.

In [4]:
# Summarise column dtypes and non-null counts to spot columns with missing values
sourcedata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [5]:
# Summary statistics of the numeric columns before any cleaning
sourcedata.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [6]:
# Count the number of NaN values in each column
sourcedata.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [7]:
# Remove the rows with NaN values in 'bmi'.
# The result is rebound to the name rather than using inplace=True, so the
# cell does not mutate the DataFrame object that earlier cells displayed.
sourcedata = sourcedata.dropna(subset=['bmi'])
sourcedata.head(25)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
6,53882,Male,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
7,10434,Female,69.0,0,0,No,Private,Urban,94.39,22.8,never smoked,1
9,60491,Female,78.0,0,0,Yes,Private,Urban,58.57,24.2,Unknown,1
10,12109,Female,81.0,1,0,Yes,Private,Rural,80.43,29.7,never smoked,1
11,12095,Female,61.0,0,1,Yes,Govt_job,Rural,120.46,36.8,smokes,1


In [8]:
# After removing the null values under 'bmi': re-count NaN values in each column to confirm none remain
sourcedata.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [9]:
# Summary statistics again, now that rows with missing 'bmi' have been dropped
sourcedata.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,4909.0,4909.0,4909.0,4909.0,4909.0,4909.0,4909.0
mean,37064.313506,42.865374,0.091872,0.049501,105.30515,28.893237,0.042575
std,20995.098457,22.555115,0.288875,0.216934,44.424341,7.854067,0.201917
min,77.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,18605.0,25.0,0.0,0.0,77.07,23.5,0.0
50%,37608.0,44.0,0.0,0.0,91.68,28.1,0.0
75%,55220.0,60.0,0.0,0.0,113.57,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [10]:
# Remove rows with the value 'Unknown' under the smoking_status column, as we will not be filling in unknown values with estimations.
# An exact comparison replaces `str.contains("Unknown") == False`: str.contains performs a
# regex substring search, whereas the category to exclude is exactly the label 'Unknown'.
# smoking_status has no missing values in this data set, so the selected rows are unchanged.
sourcedata = sourcedata[sourcedata["smoking_status"] != "Unknown"]
sourcedata.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3426 entries, 0 to 5108
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 3426 non-null   int64  
 1   gender             3426 non-null   object 
 2   age                3426 non-null   float64
 3   hypertension       3426 non-null   int64  
 4   heart_disease      3426 non-null   int64  
 5   ever_married       3426 non-null   object 
 6   work_type          3426 non-null   object 
 7   Residence_type     3426 non-null   object 
 8   avg_glucose_level  3426 non-null   float64
 9   bmi                3426 non-null   float64
 10  smoking_status     3426 non-null   object 
 11  stroke             3426 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 348.0+ KB


In [11]:
# Summary statistics after also removing the rows with 'Unknown' smoking_status
sourcedata.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,3426.0,3426.0,3426.0,3426.0,3426.0,3426.0,3426.0
mean,37339.00613,48.645943,0.119089,0.060128,108.321891,30.290047,0.052539
std,21049.976345,18.851239,0.323941,0.237759,47.703541,7.295958,0.223145
min,84.0,10.0,0.0,0.0,55.12,11.5,0.0
25%,18997.5,34.0,0.0,0.0,77.2375,25.3,0.0
50%,38068.5,50.0,0.0,0.0,92.36,29.1,0.0
75%,55464.25,63.0,0.0,0.0,116.2075,34.1,0.0
max,72915.0,82.0,1.0,1.0,271.74,92.0,1.0


### Create a train data set populated with our desired variables.

In [12]:
# Work on a copy of the cleaned data so that later changes to traindata do not affect sourcedata
traindata = sourcedata.copy()
# Confirm the copy has the same columns, dtypes and row count
traindata.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3426 entries, 0 to 5108
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 3426 non-null   int64  
 1   gender             3426 non-null   object 
 2   age                3426 non-null   float64
 3   hypertension       3426 non-null   int64  
 4   heart_disease      3426 non-null   int64  
 5   ever_married       3426 non-null   object 
 6   work_type          3426 non-null   object 
 7   Residence_type     3426 non-null   object 
 8   avg_glucose_level  3426 non-null   float64
 9   bmi                3426 non-null   float64
 10  smoking_status     3426 non-null   object 
 11  stroke             3426 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 348.0+ KB
