In [39]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline,make_pipeline

In [41]:
df = pd.read_csv("C:/Users/DELL/OneDrive/Desktop/anu course/datasets/heart.csv")

In [43]:
df.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [45]:
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [47]:
df.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [51]:
df.columns.tolist()

['id',
 'gender',
 'age',
 'hypertension',
 'heart_disease',
 'ever_married',
 'work_type',
 'Residence_type',
 'avg_glucose_level',
 'bmi',
 'smoking_status',
 'stroke']

In [55]:
df['bmi']=df['bmi'].fillna(df['bmi'].median())

In [57]:
df.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [65]:
df.dtypes

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [129]:
for col in df.select_dtypes(include=['object']).columns:
    print(col, df[col].unique()[:10])


gender ['Male' 'Female' 'Other']
ever_married ['Yes' 'No']
work_type ['Private' 'Self-employed' 'Govt_job' 'children' 'Never_worked']
Residence_type ['Urban' 'Rural']
smoking_status ['formerly smoked' 'never smoked' 'smokes' 'Unknown']


In [133]:
for col in df.select_dtypes(include=['int64','float64']).columns:
    print(col, df[col].unique()[:10])

id [ 9046 51676 31112 60182  1665 56669 53882 10434 27419 60491]
age [67. 61. 80. 49. 79. 81. 74. 69. 59. 78.]
hypertension [0 1]
heart_disease [1 0]
avg_glucose_level [228.69 202.21 105.92 171.23 174.12 186.21  70.09  94.39  76.15  58.57]
bmi [36.6 28.1 32.5 34.4 24.  29.  27.4 22.8 24.2 29.7]
stroke [1 0]


In [131]:
for col in df.columns:
    print(col, df[col].map(type).unique())


id [<class 'int'>]
gender [<class 'str'>]
age [<class 'float'>]
hypertension [<class 'int'>]
heart_disease [<class 'int'>]
ever_married [<class 'str'>]
work_type [<class 'str'>]
Residence_type [<class 'str'>]
avg_glucose_level [<class 'float'>]
bmi [<class 'float'>]
smoking_status [<class 'str'>]
stroke [<class 'int'>]


In [59]:
x = df.drop('stroke',axis=1)
y = df['stroke']

In [61]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.33,random_state=42)

In [69]:
numerical_features = x.select_dtypes(include=['int64','float64']).columns.tolist()
print("Numerical Names : " , numerical_features)

Numerical Names :  ['id', 'age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']


In [71]:
categorical_features = x.select_dtypes(include=['object']).columns.tolist()
print("categorical Names : " , categorical_features)

categorical Names :  ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']


In [135]:
numerical_cols =  Pipeline(
    steps = [("simple imputation",SimpleImputer( strategy='mean')),
              ("scaling",StandardScaler())]
)

In [139]:
categorical_cols = Pipeline(
    steps = [("simple imputaion",SimpleImputer(strategy = 'most_frequent')),
             ("ohe",OneHotEncoder(handle_unknown='ignore'))]
)

In [141]:
preprocessing = ColumnTransformer(transformers = [("Categorical",categorical_cols,categorical_features),
                                                 ("Numerical",numerical_cols,numerical_features)])

In [143]:
pipe = Pipeline(
    steps = [("preprocessor",preprocessing),
             ("regressor",RandomForestClassifier())]
)

In [145]:
pipe.fit(x_train,y_train)

In [147]:
pipe.predict(x_test)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [151]:
y_pred = pipe.predict(x_test)
acc = accuracy_score(y_test,y_pred)
print ( "Accuracy Score" ,round(acc * 100 , 2), "%" )

Accuracy Score 94.31 %
