In [31]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.experimental import enable_hist_gradient_boosting 
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn import pipeline 
import time

In [32]:
df = pd.read_csv('heart.csv')
df

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [33]:
#feature engineer
df.isnull().sum()

age         0
sex         0
cp          0
trtbps      0
chol        0
fbs         0
restecg     0
thalachh    0
exng        0
oldpeak     0
slp         0
caa         0
thall       0
output      0
dtype: int64

In [34]:
np.random.seed(0)

In [35]:
def data_enhancement(df):
    
    data = df
    
    for output in data['output'].unique():
        output_data       =  data[data['output'] == output]
        trtbps_std = output_data['trtbps'].std()
        chol_std = output_data['chol'].std()
        thalachh_std = output_data['thalachh'].std()
       
        
        for i in data[data['output'] == output].index:
            if np.random.randint(2) == 1:
                data['trtbps'].values[i] += trtbps_std/10
            else:
                data['trtbps'].values[i] -= trtbps_std/10
                
            if np.random.randint(2) == 1:
                data['chol'].values[i] += chol_std/10
            else:
                data['chol'].values[i] -= chol_std/10
                
            if np.random.randint(2) == 1:
                data['thalachh'].values[i] += thalachh_std/10
            else:
                data['thalachh'].values[i] -= thalachh_std/10

    return data

In [36]:
new_data = data_enhancement(df)
new_data

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,143,238,1,0,151,0,2.3,0,0,1,1
1,37,1,2,128,255,0,1,188,0,3.5,0,0,2,1
2,41,0,1,131,209,0,0,173,0,1.4,2,0,2,1
3,56,1,1,121,241,0,1,176,0,0.8,2,0,2,1
4,57,0,0,118,359,0,1,161,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,138,245,0,1,120,1,0.2,1,0,3,0
299,45,1,3,111,259,0,1,134,0,1.2,1,0,3,0
300,68,1,0,145,197,1,1,143,0,3.4,1,2,3,0
301,57,1,0,128,126,0,1,112,1,1.2,1,1,3,0


In [37]:
x = df.iloc[:,:-1]
y = df.iloc[:,-1]
y

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: output, Length: 303, dtype: int64

In [38]:
scaler_Models = pipeline.Pipeline(steps=[('scaling' , StandardScaler())])

In [39]:
tree_classifiers = {
  "Decision Tree": DecisionTreeClassifier(random_state=0),
  "Extra Trees":ExtraTreesClassifier(random_state=0),
  "Random Forest":RandomForestClassifier(random_state=0, max_depth=4, n_estimators=200),
  "AdaBoost":AdaBoostClassifier(random_state=0),
  "Skl GBM": GradientBoostingClassifier(random_state=0),
  "Skl HistGBM":HistGradientBoostingClassifier(random_state=0),
  "XGBoost": XGBClassifier(random_state=0),
  "LightGBM":LGBMClassifier(random_state=0),
  "CatBoost": CatBoostClassifier(random_state=0)}
tree_classifiers = {name: pipeline.make_pipeline(scaler_Models, model) for name, model in tree_classifiers.items()}  
 


In [40]:
x_train, x_test, y_train, y_test = train_test_split(x,y, random_state=0, test_size=0.2)



results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})


for model_name, model in tree_classifiers.items():
    start_time = time.time()
    
    # FOR EVERY PIPELINE (PREPRO + MODEL) -> TRAIN WITH TRAIN DATA (x_train)
    # tree_prepro.fit(x_train)
    # X_train_transformed = tree_prepro.transform(x_train)
    # X_train_transformed = pd.DataFrame(X_train_transformed, columns=list(num_vars) + list(cat_vars))
    model.fit(x_train,y_train)
    # GET PREDICTIONS USING x_val
    pred = model.predict(x_test)

    total_time = time.time() - start_time

    results = results.append({"Model":    model_name,
                              "Accuracy": metrics.accuracy_score(y_test, pred)*100,
                              "Bal Acc.": metrics.balanced_accuracy_score(y_test, pred)*100,
                              "Time":     total_time},
                              ignore_index=True)
                              




results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
# results_ord.index += 1 
# results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')
results

  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,




  results = results.append({"Model":    model_name,


Learning rate set to 0.005621
0:	learn: 0.6885641	total: 1.73ms	remaining: 1.73s
1:	learn: 0.6838767	total: 3.09ms	remaining: 1.54s
2:	learn: 0.6800161	total: 4.35ms	remaining: 1.45s
3:	learn: 0.6755699	total: 5.6ms	remaining: 1.39s
4:	learn: 0.6711260	total: 6.83ms	remaining: 1.36s
5:	learn: 0.6671142	total: 8.04ms	remaining: 1.33s
6:	learn: 0.6637292	total: 9.32ms	remaining: 1.32s
7:	learn: 0.6601031	total: 10.5ms	remaining: 1.3s
8:	learn: 0.6558187	total: 11.6ms	remaining: 1.28s
9:	learn: 0.6516547	total: 12.8ms	remaining: 1.27s
10:	learn: 0.6484048	total: 14.1ms	remaining: 1.27s
11:	learn: 0.6439559	total: 15.7ms	remaining: 1.29s
12:	learn: 0.6409952	total: 17ms	remaining: 1.29s
13:	learn: 0.6366159	total: 18.3ms	remaining: 1.28s
14:	learn: 0.6330914	total: 19.4ms	remaining: 1.28s
15:	learn: 0.6295922	total: 20.7ms	remaining: 1.27s
16:	learn: 0.6255725	total: 21.9ms	remaining: 1.27s
17:	learn: 0.6219149	total: 23.1ms	remaining: 1.26s
18:	learn: 0.6181123	total: 24.3ms	remaining: 1.

  results = results.append({"Model":    model_name,


Unnamed: 0,Model,Accuracy,Bal Acc.,Time
0,Decision Tree,75.409836,76.034858,0.007701
1,Extra Trees,83.606557,83.3878,0.111154
2,Random Forest,85.245902,84.477124,0.291605
3,AdaBoost,86.885246,87.091503,0.067371
4,Skl GBM,81.967213,82.298475,0.081511
5,Skl HistGBM,81.967213,81.917211,0.294251
6,XGBoost,80.327869,80.446623,0.135156
7,LightGBM,83.606557,83.3878,0.060841
8,CatBoost,83.606557,83.3878,1.601901


In [41]:
extra_sample = new_data.sample(new_data.shape[0] // 4)
x_train_enh = pd.concat([x_train, extra_sample.drop(['output'], axis=1 ) ])
y_train_ehn = pd.concat([y_train, extra_sample['output'] ])

In [42]:
# scaler_2 = StandardScaler()
# x_train_enh = scaler_2.fit_transform(x_train_enh)
# x_test = scaler_2.transform(x_test)



results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})


for model_name, model in tree_classifiers.items():
    start_time = time.time()

    model.fit(x_train_enh,y_train_ehn)
   
    pred = model.predict(x_test)

    total_time = time.time() - start_time

    results = results.append({"Model":    model_name,
                              "Accuracy": metrics.accuracy_score(y_test, pred)*100,
                              "Bal Acc.": metrics.balanced_accuracy_score(y_test, pred)*100,
                              "Time":     total_time},
                              ignore_index=True)
                              




results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
# results_ord.index += 1 
# results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')
results

  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,




  results = results.append({"Model":    model_name,


Learning rate set to 0.006308
0:	learn: 0.6880177	total: 1.42ms	remaining: 1.42s
1:	learn: 0.6829580	total: 2.72ms	remaining: 1.36s
2:	learn: 0.6781583	total: 4.17ms	remaining: 1.39s
3:	learn: 0.6729274	total: 5.67ms	remaining: 1.41s
4:	learn: 0.6689475	total: 7.15ms	remaining: 1.42s
5:	learn: 0.6645438	total: 8.48ms	remaining: 1.4s
6:	learn: 0.6594881	total: 9.79ms	remaining: 1.39s
7:	learn: 0.6539352	total: 11.5ms	remaining: 1.42s
8:	learn: 0.6490164	total: 12.8ms	remaining: 1.41s
9:	learn: 0.6447617	total: 14.1ms	remaining: 1.4s
10:	learn: 0.6403053	total: 15.3ms	remaining: 1.38s
11:	learn: 0.6355263	total: 16.6ms	remaining: 1.36s
12:	learn: 0.6312023	total: 17.8ms	remaining: 1.35s
13:	learn: 0.6262382	total: 19ms	remaining: 1.34s
14:	learn: 0.6224726	total: 20.3ms	remaining: 1.33s
15:	learn: 0.6182759	total: 21.6ms	remaining: 1.32s
16:	learn: 0.6141935	total: 22.9ms	remaining: 1.32s
17:	learn: 0.6099959	total: 24.3ms	remaining: 1.32s
18:	learn: 0.6060152	total: 25.8ms	remaining: 1.

  results = results.append({"Model":    model_name,


Unnamed: 0,Model,Accuracy,Bal Acc.,Time
0,Decision Tree,83.606557,83.769063,0.007979
1,Extra Trees,88.52459,88.180828,0.127893
2,Random Forest,90.163934,90.03268,0.338442
3,AdaBoost,90.163934,90.03268,0.101498
4,Skl GBM,90.163934,90.795207,0.079265
5,Skl HistGBM,85.245902,85.239651,0.416592
6,XGBoost,86.885246,86.71024,0.11256
7,LightGBM,85.245902,85.239651,0.049835
8,CatBoost,86.885246,87.091503,2.346376


In [43]:
# extra_sample = new_data.sample(new_data.shape[0] // 4)
# new_df= pd.concat([df, extra_sample])
# new_df.shape

In [44]:
# x = new_df.iloc[:,:-1]
# y = new_df.iloc[:,-1]

In [45]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# X_train = scaler.fit_transform(x_train)
# X_test = scaler.transform(x_test)

# from sklearn.ensemble import RandomForestClassifier
# rf = RandomForestClassifier(random_state=0, max_depth=4, n_estimators=200)

# from sklearn.metrics import  accuracy_score
# rf.fit(X_train,y_train)
# pred =  rf.predict(X_test)
# acc = accuracy_score(y_test, pred)

# acc
