In [66]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

In [67]:
path = r'Datasets/healthcare-dataset-stroke-data.csv'
data = pd.read_csv(path)
data = pd.DataFrame(data)

In [68]:
columns_names = data.columns.tolist()
data[columns_names] = data[columns_names].values

In [69]:
data

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.2,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [70]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 5110 non-null   object
 1   gender             5110 non-null   object
 2   age                5110 non-null   object
 3   hypertension       5110 non-null   object
 4   heart_disease      5110 non-null   object
 5   ever_married       5110 non-null   object
 6   work_type          5110 non-null   object
 7   Residence_type     5110 non-null   object
 8   avg_glucose_level  5110 non-null   object
 9   bmi                4909 non-null   object
 10  smoking_status     5110 non-null   object
 11  stroke             5110 non-null   object
dtypes: object(12)
memory usage: 479.2+ KB


In [71]:
unknown_count = (data['smoking_status'] == 'Unknown').sum()
print("تعداد مقادیر 'Unknown' در ستون 'smoking_status':", unknown_count)

تعداد مقادیر 'Unknown' در ستون 'smoking_status': 1544


In [72]:
mapping = {
    'formerly smoked': 1,
    'never smoked': 2,
    'smokes': 3
}
data['smoking_status'] = data['smoking_status'].map(mapping).replace('Unknown', np.nan)

In [80]:
columns_to_impute = ['bmi', 'smoking_status']
imputer = KNNImputer(n_neighbors=6)
data[columns_to_impute] = imputer.fit_transform(data[columns_to_impute])

In [81]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   object 
 3   heart_disease      5110 non-null   object 
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                5110 non-null   float64
 9   smoking_status     5110 non-null   float64
 10  stroke             5110 non-null   object 
dtypes: float64(4), object(7)
memory usage: 439.3+ KB


In [75]:
data.drop('id', axis=1, inplace=True)

In [76]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   object 
 2   hypertension       5110 non-null   object 
 3   heart_disease      5110 non-null   object 
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   object 
 8   bmi                5110 non-null   float64
 9   smoking_status     3566 non-null   float64
 10  stroke             5110 non-null   object 
dtypes: float64(2), object(9)
memory usage: 439.3+ KB


In [77]:
data

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.600000,1.0,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,2.0,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.500000,2.0,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.400000,3.0,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.000000,2.0,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,Female,80.0,1,0,Yes,Private,Urban,83.75,28.893237,2.0,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.2,40.000000,2.0,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.600000,2.0,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.600000,1.0,0


In [78]:
columns_to_convert = ['age', 'avg_glucose_level']
data[columns_to_convert] = data[columns_to_convert].apply(pd.to_numeric)

In [79]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   object 
 3   heart_disease      5110 non-null   object 
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                5110 non-null   float64
 9   smoking_status     3566 non-null   float64
 10  stroke             5110 non-null   object 
dtypes: float64(4), object(7)
memory usage: 439.3+ KB
