In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV , RandomizedSearchCV
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import time
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the raw CSV into `data`.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file location exists.
data = pd.read_csv(filepath_or_buffer="D:/shineteach/cardio_train5.csv")

# Keep every column from position 2 onward (the first two columns are dropped;
# presumably they are identifier/index columns -- confirm against the CSV).
data2 = data.iloc[:, 2:]
data2

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
69995,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,22431,1,163,72.0,135,80,1,2,0,0,0,1


<a id="import"></a>
# <p style="background-color:royalblue; font-family:calibri; font-size:130%; color:white; text-align:center; border-radius:10px 10px; padding:15px">Preprocessing</p>

In [4]:
# Floor-divide the raw `age` column by 365 to get whole numbers
# (presumably days -> completed years; confirm the unit of `age`).
age = data2["age"]
new_age = age.floordiv(365)
new_age

0        50
1        55
2        51
3        48
4        47
         ..
69995    52
69996    61
69997    52
69998    61
69999    56
Name: age, Length: 70000, dtype: int64

In [5]:
# The raw `age` column is no longer needed once `new_age` has been derived.
data2 = data2.drop(columns="age")

In [6]:
# Put the derived age back as the first column (mutates `data2` in place, so
# re-running this cell alone raises because the column already exists).
data2.insert(loc=0, column="new_age", value=new_age)

In [7]:
# Cast `weight` to integers; any fractional part is truncated by the cast.
data2["weight"] = data2["weight"].astype(int)

In [8]:
# Working frame for the filtering steps that follow, built from `data2`.
df = pd.DataFrame(data=data2)

In [9]:
# Keep only rows whose derived age is strictly greater than 32.
df = df.loc[df["new_age"].gt(32)]

In [10]:
# Keep rows with height strictly between 144 and 200 (both bounds exclusive).
df = df.loc[(df["height"] > 144) & (df["height"] < 200)]

In [11]:
# Keep rows with weight strictly between 42 and 150 (both bounds exclusive).
df = df.loc[(df["weight"] > 42) & (df["weight"] < 150)]

In [12]:
# Keep rows with ap_hi strictly between 80 and 172 (both bounds exclusive).
df = df.loc[(df["ap_hi"] > 80) & (df["ap_hi"] < 172)]

In [13]:
# Keep rows with ap_lo strictly between 69 and 115 (both bounds exclusive).
df = df.loc[(df["ap_lo"] > 69) & (df["ap_lo"] < 115)]

In [14]:
# Collapse the two pressure readings into one feature: their arithmetic mean.
new_ap = df["ap_lo"].add(df["ap_hi"]).div(2)

In [15]:
# `ap_lo` is replaced by the combined `new_ap` feature.
df = df.drop(columns="ap_lo")

In [16]:
# `ap_hi` is replaced by the combined `new_ap` feature.
df = df.drop(columns="ap_hi")

In [17]:
# Add the averaged pressure as the first column (in-place mutation of `df`).
df.insert(loc=0, column="new_ap", value=new_ap)

In [94]:
# Flag every row that repeats an earlier row across all columns, report how
# many there are, then keep only the first occurrence of each row.
duplicates = df.duplicated(keep="first")
print(duplicates.sum())

df = df.loc[~duplicates]

3464


In [95]:
# Renumber rows 0..n-1 after filtering; the old labels are kept in a new
# "index" column.
df = df.reset_index(drop=False)

In [96]:
# Discard the "index" column so the old row labels do not become a feature.
df = df.drop(columns="index")

<a id="import"></a>
# <p style="background-color:royalblue; font-family:calibri; font-size:130%; color:white; text-align:center; border-radius:10px 10px; padding:15px">  Decision Tree Model Building</p>

In [97]:
# Preview the cleaned frame before modelling (the rich display truncates the
# middle rows of a long frame).
df

Unnamed: 0,new_ap,new_age,gender,height,weight,cholesterol,gluc,smoke,alco,active,cardio
0,95.0,50,2,168,62,1,1,0,0,1,0
1,115.0,55,1,156,85,3,1,0,0,1,1
2,100.0,51,1,165,64,3,1,0,0,0,1
3,125.0,48,2,169,82,1,1,0,0,1,1
4,100.0,60,1,151,67,2,2,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
60610,130.0,51,1,161,56,1,1,0,0,1,1
60611,110.0,53,1,172,70,1,1,0,0,1,1
60612,115.0,61,1,158,126,2,2,0,0,1,1
60613,107.5,61,1,163,72,1,2,0,0,0,1


In [103]:
# Target is the `cardio` column; every other column is a feature.
y = df["cardio"]
x = df.drop(columns="cardio")

In [104]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=4,
)

In [105]:
# Baseline: a decision tree with default hyperparameters and a fixed seed.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X=x_train, y=y_train)

In [106]:
# Baseline tree predictions on the held-out rows.
y_pred = clf.predict(X=x_test)

In [107]:
# Test-set accuracy of the untuned decision tree.
print("accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

accuracy: 0.6034809865544832


<a id="import"></a>
# <p style="background-color:royalblue; font-family:calibri; font-size:130%; color:white; text-align:center; border-radius:10px 10px; padding:15px">  Hyperparameter Tuning with RandomizedSearchCV</p>

In [108]:
# Search space for the decision-tree randomized search.
#
# Two corrections to the candidate values:
#   * min_samples_split starts at 2: an integer value of 1 is rejected by
#     scikit-learn (a split needs at least two samples), so every sampled
#     candidate with 1 failed to fit and wasted a search iteration.
#   * max_features no longer lists "auto": it was deprecated in scikit-learn
#     1.1 and removed in 1.3, and for classification trees it meant the same
#     as "sqrt", which is still in the list. The 'log_loss' criterion already
#     requires scikit-learn >= 1.1.
param_random = {
    "criterion": ["gini", "entropy", "log_loss"],
    "splitter": ["best", "random"],
    "max_depth": np.arange(1, 12),
    "min_samples_split": np.arange(2, 12),
    "min_samples_leaf": np.arange(1, 12),
    "max_features": ["sqrt", "log2"],
}

In [109]:
# Fresh, unfitted tree to serve as the base estimator for the search.
clf = DecisionTreeClassifier(random_state=0)

In [130]:
# Randomized search over the decision-tree space: 200 sampled candidates,
# fitted on all available cores.
# random_state fixes which candidates are sampled; without it every run draws
# a different set, so the best parameters and scores cannot be reproduced.
clf2_random = RandomizedSearchCV(
    clf,
    param_random,
    n_iter=200,
    n_jobs=-1,
    random_state=0,
)

In [131]:
# Run the search on the training split and report the winning parameters
# together with the wall-clock duration in whole seconds.
start_time = time.time()
clf2_random.fit(x_train, y_train)
end_time = time.time()

print(
    "Best hyperparameters:\n",
    clf2_random.best_params_,
    "\nTime:",
    int(end_time - start_time),
    "Seconds",
)

Best hyperparameters:
 {'splitter': 'best', 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_depth': 7, 'criterion': 'gini'} 
Time: 9 Seconds


In [132]:
# Predictions on the held-out rows from the fitted search object.
y_pred2 = clf2_random.predict(X=x_test)

In [133]:
# Test-set accuracy of the tuned decision tree.
print("accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred2))

accuracy: 0.721933514806566


<a id="import"></a>
# <p style="background-color:royalblue; font-family:calibri; font-size:130%; color:white; text-align:center; border-radius:10px 10px; padding:15px"> Overfitting or Underfitting?</p>

In [134]:
# Test-split predictions, to be compared with the train-split report below.
y_pred_test = clf2_random.predict(X=x_test)

In [135]:
# Per-class precision/recall/F1 on the test split.
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.69      0.78      0.73      5906
           1       0.76      0.67      0.71      6217

    accuracy                           0.72     12123
   macro avg       0.73      0.72      0.72     12123
weighted avg       0.73      0.72      0.72     12123



In [136]:
# Predictions on the rows the search was fitted on; a large gap to the test
# report would point to overfitting.
y_pred_train = clf2_random.predict(X=x_train)

In [137]:
# Per-class precision/recall/F1 on the training split.
print(classification_report(y_train, y_pred_train))

              precision    recall  f1-score   support

           0       0.69      0.77      0.73     23527
           1       0.76      0.67      0.71     24965

    accuracy                           0.72     48492
   macro avg       0.73      0.72      0.72     48492
weighted avg       0.73      0.72      0.72     48492



In [138]:
# Confusion matrix on the test split (rows: true class, columns: predicted).
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_test)

array([[4588, 1318],
       [2053, 4164]], dtype=int64)

<a id="import"></a>
# <p style="background-color:royalblue; font-family:calibri; font-size:130%; color:white; text-align:center; border-radius:10px 10px; padding:15px"> Random Forest Model Building</p>

In [139]:
# Rebuild features/target for the random-forest section (same definition as
# in the decision-tree section).
y = df["cardio"]
x = df.drop(columns="cardio")

In [140]:
# Same 80/20 split and seed as the decision-tree section, so both models are
# scored on the same held-out rows.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=4,
)

In [141]:
# Baseline: a random forest with default hyperparameters and a fixed seed.
clf2 = RandomForestClassifier(random_state=0)
clf2.fit(X=x_train, y=y_train)

In [142]:
# Baseline forest predictions on the held-out rows.
y_pred = clf2.predict(X=x_test)

In [143]:
# Test-set accuracy of the untuned random forest.
print("accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

accuracy: 0.6759877918007094


<a id="import"></a>
# <p style="background-color:royalblue; font-family:calibri; font-size:130%; color:white; text-align:center; border-radius:10px 10px; padding:15px">  Hyperparameter Tuning with RandomizedSearchCV</p>

In [144]:
# Search space for the random-forest randomized search. This rebinds
# `param_random`, replacing the decision-tree search space of the same name.
param_random = {
    "n_estimators": [50, 100, 130, 150, 180, 200, 250, 300, 350, 400],
    "max_depth": list(range(3, 12)),         # 3 .. 11
    "min_samples_split": list(range(2, 6)),  # 2 .. 5
    "min_samples_leaf": list(range(1, 6)),   # 1 .. 5
    "max_features": ["sqrt", "log2"],
}

In [145]:
# Randomized search over the random-forest space: 200 sampled candidates,
# fitted on all available cores.
# random_state fixes which candidates are sampled; without it every run draws
# a different set, so the best parameters and scores cannot be reproduced.
rf_random = RandomizedSearchCV(
    clf2,
    param_random,
    n_iter=200,
    n_jobs=-1,
    random_state=0,
)

In [146]:
# Run the search on the training split and report the winning parameters
# together with the wall-clock duration in whole seconds.
start_time = time.time()
rf_random.fit(x_train, y_train)
end_time = time.time()

print(
    "Best hyperparameters:\n",
    rf_random.best_params_,
    "\nTime:",
    int(end_time - start_time),
    "Seconds",
)

Best hyperparameters:
 {'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 10} 
Time: 479 Seconds


In [147]:
# Predictions on the held-out rows from the fitted forest search object.
y_pred_test = rf_random.predict(X=x_test)

In [148]:
# Test-set accuracy of the tuned random forest.
print("accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred_test))

accuracy: 0.7252330281283511


<a id="import"></a>
# <p style="background-color:royalblue; font-family:calibri; font-size:130%; color:white; text-align:center; border-radius:10px 10px; padding:15px">  Overfitting or Underfitting?</p>

In [149]:
# Test-split predictions, to be compared with the train-split report below.
y_pred_test = rf_random.predict(X=x_test)

In [150]:
# Per-class precision/recall/F1 on the test split.
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.70      0.77      0.73      5906
           1       0.76      0.68      0.72      6217

    accuracy                           0.73     12123
   macro avg       0.73      0.73      0.73     12123
weighted avg       0.73      0.73      0.72     12123



In [151]:
# Predictions on the rows the search was fitted on; a large gap to the test
# report would point to overfitting.
y_pred_train = rf_random.predict(X=x_train)

In [152]:
# Per-class precision/recall/F1 on the training split.
print(classification_report(y_train, y_pred_train))

              precision    recall  f1-score   support

           0       0.71      0.79      0.75     23527
           1       0.78      0.69      0.73     24965

    accuracy                           0.74     48492
   macro avg       0.74      0.74      0.74     48492
weighted avg       0.74      0.74      0.74     48492



In [153]:
# Confusion matrix on the test split (rows: true class, columns: predicted).
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_test)

array([[4564, 1342],
       [1989, 4228]], dtype=int64)