In [1]:
from sklearn.metrics import classification_report
import pandas as pd
import pickle
import warnings
warnings.filterwarnings('ignore')

In [2]:
cardio_df = pd.read_csv('data/cardiovascular.csv', sep=';')
cardio_df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [3]:
print(pd.isna(cardio_df).sum())

id             0
age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64


In [4]:
X = cardio_df.drop(["cardio"], axis=1)
y = cardio_df["cardio"]
X.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,0,18393,2,168,62.0,110,80,1,1,0,0,1
1,1,20228,1,156,85.0,140,90,3,1,0,0,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0
3,3,17623,2,169,82.0,150,100,1,1,0,0,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0


In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [6]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    random_state=42,
    n_estimators=30,
    criterion="gini",
    max_depth=5,
    warm_start=False,
    oob_score=True,
    class_weight=None,
)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)
print(y_pred[0])

[0.28945194 0.71054806]


In [7]:
y_pred_df = pd.DataFrame(data = y_pred, columns=['0', '1'])

y_pred_df.head()

Unnamed: 0,0,1
0,0.289452,0.710548
1,0.19753,0.80247
2,0.406864,0.593136
3,0.696297,0.303703
4,0.742905,0.257095


In [8]:
print(classification_report(model.predict(X_val), y_val))

              precision    recall  f1-score   support

           0       0.79      0.71      0.75      7807
           1       0.68      0.77      0.72      6193

    accuracy                           0.74     14000
   macro avg       0.74      0.74      0.73     14000
weighted avg       0.74      0.74      0.74     14000



In [9]:
pickle.dump(model, open('models/cardio_model', 'wb'))