In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

In [3]:
df = pd.read_csv('cardio_train.csv', sep=";")
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [4]:
print(f"Number of columns: { df.shape[1] }")

Number of columns: 13


In [5]:
print(f"Number of rows: { df.shape[0] }")

Number of rows: 70000


Checking if there's NULL value in any of the cells.

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age          70000 non-null  int64  
 2   gender       70000 non-null  int64  
 3   height       70000 non-null  int64  
 4   weight       70000 non-null  float64
 5   ap_hi        70000 non-null  int64  
 6   ap_lo        70000 non-null  int64  
 7   cholesterol  70000 non-null  int64  
 8   gluc         70000 non-null  int64  
 9   smoke        70000 non-null  int64  
 10  alco         70000 non-null  int64  
 11  active       70000 non-null  int64  
 12  cardio       70000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 6.9 MB


In [7]:
df['age'] = round(df['age']/365.25,2)

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_predict, cross_validate
from sklearn.metrics import accuracy_score, r2_score, confusion_matrix,confusion_matrix, roc_curve
from xgboost import plot_importance
import warnings
warnings.filterwarnings('ignore')

In [9]:
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,50.36,2,168,62.0,110,80,1,1,0,0,1,0
1,1,55.38,1,156,85.0,140,90,3,1,0,0,1,1
2,2,51.63,1,165,64.0,130,70,3,1,0,0,0,1
3,3,48.25,2,169,82.0,150,100,1,1,0,0,1,1
4,4,47.84,1,156,56.0,100,60,1,1,0,0,0,0


In [20]:
X = df.drop(['cardio'], axis=1)
y = df['cardio']

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

<hr>

<img src="https://upload.wikimedia.org/wikipedia/commons/7/76/Random_forest_diagram_complete.png" width="30%">

In [23]:
random_model = RandomForestClassifier(n_estimators=51,
                          max_depth=10,
                          random_state=0)

random_model.fit(X_train, y_train)
print(f"Testing accuracy: {round(accuracy_score(random_model.predict(X_test), y_test),4)*100}%")
print(f"Average testing accuracy: {round(cross_validate(random_model, X, y, cv=5)['test_score'].mean()*100,2)}%")

Testing accuracy: 73.78%
Average testing accuracy: 69.02%


In [24]:
import pickle
with open("cardio_model.pkl","wb") as f:
    pickle.dump(random_model,f)


In [30]:
random_model.predict(np.array([1234,43.33,1,168,65.0,120,80,1,1,0,0,0]).reshape(1,-1))

array([0], dtype=int64)

In [26]:
X_test

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
29755,42542,43.33,1,168,65.0,120,80,1,1,0,0,0
49336,70448,43.48,1,165,102.0,180,1000,1,1,0,0,1
58299,83189,61.66,1,149,89.0,130,90,1,1,0,0,0
66033,94264,59.86,1,164,110.0,140,80,3,1,0,0,1
48407,69119,63.85,1,150,83.0,160,100,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1381,1940,49.85,1,162,95.0,120,80,1,1,0,0,0
45773,65387,60.02,2,169,70.0,140,90,1,1,0,0,1
21431,30620,41.72,2,172,70.0,110,80,1,1,0,0,1
58015,82787,63.98,2,168,74.0,120,70,1,1,0,0,1
