In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import VotingClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, KFold, cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix

from xgboost import XGBClassifier
from lazypredict.Supervised import LazyClassifier

# Read the heart-disease CSV into `heart` and preview a reproducible random
# sample of 15 rows (the fixed seed keeps the preview stable between runs).
heart = pd.read_csv(filepath_or_buffer="heart_disease_df_1.csv")
heart.sample(n=15, random_state=43)

In [None]:
# Column-by-column overview of the raw frame: dtypes together with the
# non-null counts, listing every column.
heart.info(show_counts=True, verbose=True)

In [None]:
# Descriptive statistics of the raw `heart` frame, computed before any cleaning.
heart.describe()

In [None]:
# Build the cleaned frame: discard the `oldpeak` column, then forward-fill so
# every missing entry takes the value from the closest preceding row.
# NOTE(review): forward-fill cannot fill a gap that sits in the very first row;
# the per-column null count that follows is what confirms nothing is left over.
heart_clean = (
    heart
    .drop('oldpeak', axis=1)
    .ffill()
)
heart_clean

In [None]:
# Making sure that there are no missing values: count the remaining NaNs per column.
heart_clean.isna().sum()

### EXPLORATORY DATA ANALYSIS

In [None]:
# Split the cleaned frame between men and women. `sex` is moved into the index
# once, so both subsets share the same layout (all remaining columns, `sex`
# as the index).
# NOTE(review): 1 -> men and 0 -> women follows the variable naming used here;
# confirm the encoding against the dataset's data dictionary.
by_sex = heart_clean.set_index('sex')

# A boolean mask always yields a DataFrame, whereas a scalar `.loc[1]` collapses
# to a Series when exactly one row matches, which would break the sort below.
men = by_sex.loc[by_sex.index == 1]
women = by_sex.loc[by_sex.index == 0]
men.sort_values('age')

In [None]:
# Group the dataset by sex and age and take the mean of the corresponding
# features. `target` is removed up front; the grouping keys (`sex`, `age`)
# become the index, so the result holds exactly the remaining feature columns.
heart_stats = (
    heart_clean
    .drop(columns='target')
    .groupby(['sex', 'age'])
    .mean()
)
heart_stats

In [None]:
# Mean `chol` for each age. Passing the raw columns straight to `plt.bar`
# draws one bar per row, and rows sharing the same age are painted on top of
# each other, so only the tallest bar per age would be visible. Aggregating
# first gives exactly one well-defined bar per age.
chol_by_age = heart_clean.groupby('age')['chol'].mean()

# Explicit figure/axes interface, with labels so the chart can stand alone.
fig, ax = plt.subplots()
ax.bar(chol_by_age.index, chol_by_age.values)
ax.set_xlabel('age')
ax.set_ylabel('mean chol')
ax.set_title('Mean chol by age')
plt.show()

### MODEL TRAINING

In [None]:
# Use LazyClassifier to benchmark many classifiers at once and shortlist
# the appropriate model family.
# Features are every column except `target`; `target` is the label.
x, y = heart_clean.drop(columns='target'), heart_clean['target']

# 70 % of the rows go to training; the seed keeps the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, train_size=0.7, random_state=34)

lazy = LazyClassifier(verbose=1, ignore_warnings=True, random_state=43)
# NOTE(review): presumably both returned objects are per-model score tables
# when LazyClassifier is built without `predictions=True` -- confirm; the
# names `models` / `predictions` are kept because later cells may rely on them.
models, predictions = lazy.fit(xtrain, xtest, ytrain, ytest)
predictions.sort_values(by='Time Taken')

In [None]:
# Search space shared by both single-tree classifiers. Every entry needs at
# least one candidate value (an empty list is rejected by the search), and the
# keys must match the estimators' parameter names exactly (`min_samples_split`).
# NOTE(review): the candidate values are a starting grid -- adjust as needed.
params = {"criterion": ["gini", "entropy"],
          "max_depth": [None, 3, 5, 8],
          "min_samples_split": [2, 5, 10]}

# Seeded so the randomized parts of tree construction are repeatable.
extra = ExtraTreeClassifier(random_state=41)
dtc = DecisionTreeClassifier(random_state=41)

xgb = XGBClassifier()
xgb_params = {}

bag = BaggingClassifier(estimator=extra, n_estimators=15, oob_score=True)
rf = RandomForestClassifier(n_estimators=200, criterion='entropy', max_depth=5)
# VotingClassifier expects (name, estimator) pairs, not bare estimators.
vote = VotingClassifier(estimators=[("extra", extra), ("dtc", dtc)], n_jobs=2)

models = [dtc, extra]

kfold = KFold(n_splits=5, shuffle=True, random_state=41)

# Hyperparameter tuning: RandomizedSearchCV samples 5 configurations per model
# from `params`. Results are stored and printed explicitly because a bare
# expression inside a `for` body is not displayed by the notebook.
search_results = {}
for model in models:
    rand = RandomizedSearchCV(estimator=model,
                              param_distributions=params,
                              n_iter=5,
                              cv=kfold,
                              random_state=456)
    rand.fit(xtrain, ytrain)
    search_results[type(model).__name__] = (rand.best_estimator_, rand.best_score_)
    print(f"{type(model).__name__}: best CV score {rand.best_score_:.3f} "
          f"with {rand.best_params_}")

# Baseline: cross-validated scores of the same models with their
# constructor settings, for comparison against the tuned scores above.
cv_scores = {}
for model in models:
    scores = cross_val_score(model, xtrain, ytrain, cv=kfold)
    cv_scores[type(model).__name__] = scores
    print(f"{type(model).__name__}: CV scores {scores} (mean {scores.mean():.3f})")

In [None]:
# `model` is the estimator left over from the last evaluation loop. The
# search and cross-validation helpers fit clones, not the object passed in,
# so it has to be fitted on the training split before it can predict.
model.fit(xtrain, ytrain)

# Row-normalized confusion matrix on the held-out split: each row (true
# class) sums to 1.
ypred = model.predict(xtest)
confusion_matrix(ytest, ypred, normalize='true')

In [None]:
from sklearn.feature_selection import SelectFromModel

# Fit the decision tree on the training split (fit works in place and returns
# the same estimator), then hand the already-fitted tree to SelectFromModel.
dtc.fit(xtrain, ytrain)

# NOTE(review): this rebinds `model` from a classifier to a selector, so
# re-running the earlier prediction cell after this one will not work -- consider
# a dedicated name if no later cell depends on `model`.
model = SelectFromModel(dtc, prefit=True)

# Boolean mask over the feature columns, mapped back to the column names.
# `xtrain` carries the same columns as the frame without `target`.
features_bool = model.get_support()
features = xtrain.columns[features_bool]
features