## Dataset

In [37]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split


In [38]:
# Relative location of the raw CSV, kept as plain strings.
real_dir = "../../dataset"
real_path = os.path.join(real_dir, "stroke_healthcare.csv")


In [40]:
# Load the raw stroke dataset (comma is the default separator) and preview
# the first rows.
dataset = pd.read_csv(real_path)
dataset.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [41]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
dataset.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [42]:
# (rows, columns) of the raw frame.
dataset.shape

(5110, 12)

In [43]:
# Number of missing values in each column.
dataset.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [44]:
# Column dtypes, non-null counts and memory footprint of the raw frame.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [45]:
# List the distinct values found in every column, one bullet per column.
for name, values in dataset.items():
    print(f"* {name}: {values.unique()} \n")

* id: [ 9046 51676 31112 ... 19723 37544 44679] 

* gender: ['Male' 'Female' 'Other'] 

* age: [6.70e+01 6.10e+01 8.00e+01 4.90e+01 7.90e+01 8.10e+01 7.40e+01 6.90e+01
 5.90e+01 7.80e+01 5.40e+01 5.00e+01 6.40e+01 7.50e+01 6.00e+01 5.70e+01
 7.10e+01 5.20e+01 8.20e+01 6.50e+01 5.80e+01 4.20e+01 4.80e+01 7.20e+01
 6.30e+01 7.60e+01 3.90e+01 7.70e+01 7.30e+01 5.60e+01 4.50e+01 7.00e+01
 6.60e+01 5.10e+01 4.30e+01 6.80e+01 4.70e+01 5.30e+01 3.80e+01 5.50e+01
 1.32e+00 4.60e+01 3.20e+01 1.40e+01 3.00e+00 8.00e+00 3.70e+01 4.00e+01
 3.50e+01 2.00e+01 4.40e+01 2.50e+01 2.70e+01 2.30e+01 1.70e+01 1.30e+01
 4.00e+00 1.60e+01 2.20e+01 3.00e+01 2.90e+01 1.10e+01 2.10e+01 1.80e+01
 3.30e+01 2.40e+01 3.40e+01 3.60e+01 6.40e-01 4.10e+01 8.80e-01 5.00e+00
 2.60e+01 3.10e+01 7.00e+00 1.20e+01 6.20e+01 2.00e+00 9.00e+00 1.50e+01
 2.80e+01 1.00e+01 1.80e+00 3.20e-01 1.08e+00 1.90e+01 6.00e+00 1.16e+00
 1.00e+00 1.40e+00 1.72e+00 2.40e-01 1.64e+00 1.56e+00 7.20e-01 1.88e+00
 1.24e+00 8.00e-01 4.00e-01 8

In [46]:
# Number of rows that are exact duplicates of an earlier row.
dataset.duplicated().sum()

0

In [47]:
# Count the rows whose smoking status was recorded as "Unknown".
int((dataset["smoking_status"] == "Unknown").sum())

1544

In [48]:
# Count the rows whose gender was recorded as "Other".
int((dataset["gender"] == "Other").sum())

1

In [49]:
# Count the rows whose work type was recorded as "children".
int((dataset["work_type"] == "children").sum())

687

## Data Cleaning

After a short descriptive analysis, it turned out that:
- there are no duplicated rows
- one column, bmi, has missing values (201 rows); these rows will be dropped
- only 1 data entry has its gender recorded as "Other"; for the sake of a cleaner study, we will delete this row
- there are 1544 data entries whose smoking status is "Unknown". For the sake of a cleaner study, we will delete them as well. As we start from 5110 data entries, we will still have enough data
- the id column is just a unique identifier for each person and carries no information. For this reason, we will drop the whole column
- I will also drop the rows whose work_type equals "children", because the label is ambiguous: it could mean people caring for their own children, or nannies working in the private or the public sphere. For the sake of precision, these 687 rows will be deleted as well

In [50]:
# Apply all row filters in one pass over the raw frame, then discard the
# identifier column. `dataset` itself is left untouched.
rows_to_keep = (
    dataset["bmi"].notna()                        # rows with a recorded BMI
    & (dataset["gender"] != "Other")              # exclude the "Other" gender entry
    & (dataset["smoking_status"] != "Unknown")    # exclude unknown smoking status
    & (dataset["work_type"] != "children")        # exclude the "children" work type
)
dataset1 = dataset.loc[rows_to_keep].drop(columns="id")

- I will merge the "Self-employed" rows into the "Private" work type
- I will merge "formerly smoked" and "never smoked" into a single "no smoking" category, since both describe people who are currently non-smokers

In [51]:
# Collapse categories: "Self-employed" becomes "Private", and both
# "formerly smoked" and "never smoked" become "no smoking". All other values
# are left as they are.
dataset1["work_type"] = dataset1["work_type"].replace({"Self-employed": "Private"})
dataset1["smoking_status"] = dataset1["smoking_status"].replace(
    {"formerly smoked": "no smoking", "never smoked": "no smoking"}
)

In [52]:
# (rows, columns) remaining after cleaning.
dataset1.shape

(3357, 11)

In [53]:
# Show the cleaned frame; the rendered output is truncated to its first and last rows.
dataset1

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,no smoking,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,no smoking,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Private,Rural,174.12,24.0,no smoking,1
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,no smoking,1
...,...,...,...,...,...,...,...,...,...,...,...
5100,Male,82.0,1,0,Yes,Private,Rural,71.97,28.3,no smoking,0
5102,Female,57.0,0,0,Yes,Private,Rural,77.93,21.7,no smoking,0
5106,Female,81.0,0,0,Yes,Private,Urban,125.20,40.0,no smoking,0
5107,Female,35.0,0,0,Yes,Private,Rural,82.99,30.6,no smoking,0


## Train Test Split

In [54]:
# Separate the `stroke` target column from the remaining feature columns.
Y = dataset1["stroke"]
X = dataset1.drop(columns="stroke")

In [55]:
# Hold out 33% of the rows for testing, with a fixed seed so the split is
# reproducible. Stroke cases are rare (about 5% of the raw data), so the split
# is stratified on the target to keep the positive rate the same in both
# partitions instead of leaving it to chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.33, random_state=32, stratify=Y
)

In [56]:
# Re-attach the target to its features (aligned on the shared row index) so
# each partition is a single frame with `stroke` as the last column.
train_dataset = X_train.join(y_train)
test_dataset = X_test.join(y_test)

In [57]:
# Persist both partitions as CSV without the pandas index. The target folders
# are created first, because `to_csv` does not create missing directories and
# would otherwise fail on a fresh checkout.
train_path = "original_train_dataset/stroke_healthcare_original_train.csv"
test_path = "original_test_dataset/stroke_healthcare_original_test.csv"
for frame, path in ((train_dataset, train_path), (test_dataset, test_path)):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    frame.to_csv(path, index=False)

In [58]:
# Number of rows in the training partition.
len(train_dataset)

2249

In [59]:
# Number of rows in the test partition.
len(test_dataset)

1108