# Data columns desc
* age = age in years
* sex = (1 = male; 0 = female)
* cp= chest pain type
* trestbps= resting blood pressure (in mm Hg on admission to the hospital)
* chol = serum cholesterol in mg/dl
* fbs = ( fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
* restecg = resting electrocardiographic results
* thalach = maximum heart rate achieved
* exang = exercise induced angina (1 = yes; 0 = no)
* oldpeak = ST depression induced by exercise relative to rest
* slope = the slope of the peak exercise ST segment
* ca = number of major vessels (0-3) colored by fluoroscopy
* thal = (3 = normal; 6 = fixed defect; 7 = reversible defect)
* target = 1 or 0

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,KFold,cross_val_score,KFold
from collections import Counter
import os
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score 
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline






In [None]:
# Walk the Kaggle input directory, echoing every file found and remembering the CSV files.
csv_paths = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        path_file_name = os.path.join(dirname, filename)
        print(path_file_name)
        if filename.lower().endswith('.csv'):
            csv_paths.append(path_file_name)

# Fail with a clear message instead of a NameError when nothing usable was found.
if not csv_paths:
    raise FileNotFoundError("No CSV file found under /kaggle/input")

# As before, the last file discovered is the one loaded -- but only CSV files are considered.
path_file_name = csv_paths[-1]
data = pd.read_csv(path_file_name)

In [None]:
# Preview the first rows of the loaded data.
data.head()

In [None]:
# Number of rows in each target class.
data['target'].value_counts()

In [None]:
# Bar chart of the row count per target class.
sns.countplot(x='target',data=data)

The target is balanced

*Explore data*

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Summary statistics for the numeric columns.
data.describe()

Check for null values

In [None]:
# Number of missing values per column.
data.isnull().sum()

In [None]:
# Annotated heatmap of the pairwise column correlations, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 14))
sns.heatmap(
    data.corr(),
    annot=True,
    cmap='coolwarm',
    linewidths=1,
    linecolor='white',
    ax=ax,
)
# Keep the row labels horizontal so they stay readable.
plt.yticks(rotation=0)

In [None]:
# Sum of target (i.e. number of target == 1 rows, given a 0/1 target) per slope value.
data.groupby('slope')['target'].sum()

In [None]:
# Mean target per slope value (barplot aggregates with the mean by default).
sns.barplot(x='slope',y='target',data=data)

In [None]:
# Distribution of the maximum heart rate achieved.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases -- confirm the installed version.
sns.distplot(data['thalach'],color='red')

In [None]:
# Sum of target per chest-pain type.
data.groupby('cp')['target'].sum()

In [None]:
# Mean target per chest-pain type.
sns.barplot(x='cp',y='target',data=data)

In [None]:
# Distribution of age.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases -- confirm the installed version.
sns.distplot(data['age'])

# Preprocessing data

# Outliers

In [None]:
def get_outliers(df,n,features):
    """Return the index labels of rows that are outliers in more than ``n`` features.

    A value is an outlier for a column when it lies more than 1.5 * IQR below
    the first quartile or above the third quartile of that column (Tukey rule).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to inspect.
    n : int
        A row is reported only if it is an outlier in strictly more than
        ``n`` of the inspected columns.
    features : iterable of str
        Column names to inspect.

    Returns
    -------
    list
        Index labels of the offending rows, in order of first detection.
    """
    outlier_indices = []

    for col in features:
        # nanpercentile ignores missing values; with np.percentile a single NaN
        # turns both quartiles into NaN and silently disables detection.
        Q1 = np.nanpercentile(df[col], 25)
        Q3 = np.nanpercentile(df[col], 75)
        IQR = Q3 - Q1

        # Tukey fence distance
        outlier_step = 1.5 * IQR

        # Index labels of the rows outside the fences for this column
        is_outlier = (df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step)
        outlier_indices.extend(df[is_outlier].index)

    # Keep the rows flagged in more than n columns
    flagged_counts = Counter(outlier_indices)
    multiple_outliers = [k for k, v in flagged_counts.items() if v > n]

    return multiple_outliers
# Flag rows that are outliers in more than 2 of age, chol, restecg, trestbps and oldpeak
Outliers_to_drop = get_outliers(data,2,["age","chol","restecg","trestbps","oldpeak"])

In [None]:
# Show the flagged rows (an empty frame means nothing was flagged).
data.loc[Outliers_to_drop]

**No rows were found with outliers in more than two of the checked features**

In [None]:
# List the column names before feature-by-feature processing.
data.columns

* **Age**

In [None]:
# Age histograms, one panel per target value.
g= sns.FacetGrid(data,col='target')
g.map(plt.hist,'age',bins=20)

In [None]:
# Age histograms split by target (rows) and sex (columns).
# `height` is the current name of the FacetGrid facet-size argument; the old `size` was renamed.
grid = sns.FacetGrid(data, row='target', col='sex', height=2.2, aspect=1.6)
grid.map(plt.hist, 'age',  bins=20)
grid.add_legend()

Women are checked for disease at an older age than men.

In [None]:
# Split age into 5 equal-width bands and show the mean target per band, ordered by band.
data['AgeBand'] = pd.cut(data['age'], 5)
data[['AgeBand', 'target']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)

In [None]:
    
# Bucket age into five ordinal groups (1..5).
# The edges are presumably the AgeBand boundaries reported by pd.cut(data['age'], 5) -- verify
# against that output if the dataset changes.
# right=False makes the bins left-closed, matching the original `>= low` / `< high` comparisons.
age_edges = [-np.inf, 38.6, 48.2, 57.8, 67.4, np.inf]
data['age_group'] = pd.cut(
    data['age'], bins=age_edges, labels=[1, 2, 3, 4, 5], right=False
).astype('int')  # actually store the integer codes (the original discarded the astype result)
data.head()

In [None]:
# Drop age and AgeBand now that age_group replaces them.
# The drop is in place, so re-running this cell without reloading data raises a KeyError.
data.drop(['age','AgeBand'] , axis= 1,inplace=True)

In [None]:
# Age-group histograms split by target (rows) and sex (columns).
# `height` is the current name of the FacetGrid facet-size argument; the old `size` was renamed.
grid = sns.FacetGrid(data, row='target', col='sex', height=2.2, aspect=1.6)
grid.map(plt.hist, 'age_group',  bins=20)
grid.add_legend()

In [None]:
# One-hot encode age_group into AgeGrp_* indicator columns.
data = pd.get_dummies(data, columns = ['age_group'], prefix="AgeGrp")
data.head()

* **Sex**

In [None]:
# Number of rows per sex value (count of non-null targets).
data.groupby('sex')['target'].count()

In [None]:
# Row count per sex value; the barplot of mean target per sex is kept below as a disabled alternative.
#sns.barplot(x='sex',y='target',data=data)
sns.countplot(data=data,x='sex')


There are about twice as many men as women in the data.

In [None]:
# One-hot encode sex into Sex_* indicator columns.
data = pd.get_dummies(data,columns = ['sex'], prefix="Sex")

* **cp**

In [None]:
# One-hot encode chest-pain type into CP_* indicator columns.
data = pd.get_dummies(data,columns = ['cp'], prefix="CP")
data.head()

* **trestbps**

In [None]:
# Distribution of resting blood pressure.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases -- confirm the installed version.
sns.distplot(data['trestbps'])

* **chol**

In [None]:
# Distribution of serum cholesterol.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases -- confirm the installed version.
sns.distplot(data['chol'])

In [None]:
# Skewness of the cholesterol distribution (0 would be symmetric).
data['chol'].skew()

* **fbs**

In [None]:
# Percentage of rows for each fbs value.
data['fbs'].value_counts() /data.shape[0] * 100

In [None]:
# Row count per fbs value. Pass the column by keyword (as the other countplot cells do):
# newer seaborn versions treat the first positional argument as `data`, not `x`.
sns.countplot(x='fbs', data=data)

Not many people died from complications of a high blood sugar level.

In [None]:
# One-hot encode fbs into fbs_* indicator columns.
data = pd.get_dummies(data,columns=['fbs'],prefix='fbs')
data.head()

* **restecg**

The electrocardiogram (ECG or EKG) is a noninvasive test that is used to reflect underlying heart conditions by measuring the electrical activity of the heart. By positioning leads (electrical sensing devices) on the body in standardized locations, health care professionals can learn information about many heart conditions by looking for characteristic patterns on the EKG.

from https://www.medicinenet.com/electrocardiogram_ecg_or_ekg/article.htm

In [None]:
# Row count per restecg value. Pass the column by keyword (as the other countplot cells do):
# newer seaborn versions treat the first positional argument as `data`, not `x`.
sns.countplot(x='restecg', data=data)

In [None]:
# One-hot encode restecg into restecg_* indicator columns.
data = pd.get_dummies(data,columns=['restecg'],prefix='restecg')
data.head()

* **thalach**

In [None]:
# Distribution of maximum heart rate, one panel per target value.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases -- confirm the installed version.
g= sns.FacetGrid(data,col='target')
g.map(sns.distplot,'thalach')

* **exang**

Angina is a type of chest pain caused by reduced blood flow to the heart. Angina (an-JIE-nuh or AN-juh-nuh) is a symptom of coronary artery disease.

Angina, also called angina pectoris, is often described as squeezing, pressure, heaviness, tightness or pain in your chest. Some people with angina symptoms say angina feels like a vise squeezing their chest or a heavy weight lying on their chest. Angina may be a new pain that needs to be checked by a doctor, or recurring pain that goes away with treatment.

Although angina is relatively common, it can still be hard to distinguish from other types of chest pain, such as the discomfort of indigestion. If you have unexplained chest pain, seek medical attention right away.

https://www.mayoclinic.org/diseases-conditions/angina/symptoms-causes/syc-20369373

In [None]:
# Number of rows for each exang value.
data['exang'].value_counts()

In [None]:
# Row count per exang value. Pass the column by keyword (as the other countplot cells do):
# newer seaborn versions treat the first positional argument as `data`, not `x`.
sns.countplot(x='exang', data=data)

In [None]:
# One-hot encode exang into exang_* indicator columns.
data = pd.get_dummies(data,columns=['exang'],prefix='exang')

* **oldpeak**


In a cardiac stress test, an ST depression of at least 1 mm after adenosine administration indicates a reversible ischaemia, while an exercise stress test requires an ST depression of at least 2 mm to significantly indicate reversible ischaemia.

https://en.wikipedia.org/wiki/ST_depression

In [None]:
# Distribution of oldpeak (ST depression).
# NOTE(review): sns.distplot is deprecated in newer seaborn releases -- confirm the installed version.
sns.distplot(data['oldpeak'])

In [None]:
# Distribution of oldpeak, one panel per target value.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases -- confirm the installed version.
g= sns.FacetGrid(data,col='target')
g.map(sns.distplot,'oldpeak')

* **slope**

In [None]:
# One-hot encode slope into slope_* indicator columns.
data = pd.get_dummies(data,columns=['slope'],prefix='slope')

* **ca**

In [None]:
# Mean target per ca value.
sns.barplot(x='ca',y='target',data=data)

In [None]:
# One-hot encode ca into ca_* indicator columns.
data = pd.get_dummies(data,columns=['ca'],prefix='ca')

* **thal**

In [None]:
# Mean target per thal value.
sns.barplot(x='thal',y='target',data=data)

In [None]:
# One-hot encode thal into thal_* indicator columns.
data = pd.get_dummies(data,columns=['thal'],prefix='thal')

In [None]:
# Re-check column dtypes after all the one-hot encoding steps.
data.info()

In [None]:
# Separate the label from the features: pop() removes the column from data in place and returns it.
target = data.pop('target')

# Normalize numeric data

In [None]:
# Columns to standardize: everything that is not a one-hot indicator.
# get_dummies yields uint8 on older pandas and bool on newer pandas, so both are excluded;
# testing only for "uint8" would send every indicator column to the scaler on newer versions.
numeric_feats = data.select_dtypes(exclude=['uint8', 'bool']).columns
numeric_feats

In [None]:
# Standardize the numeric columns in place.
# NOTE(review): the scaler is fitted on all rows before the train/test split further down, so
# test-set statistics influence the training features -- consider fitting on the training split only.
ss= StandardScaler()
data[numeric_feats] = ss.fit_transform(data[numeric_feats])

In [None]:
# Preview the feature matrix after scaling.
data.head()

# Modeling

* Gradient boosting
* Random forest
* Adaboost
* SVM

In [None]:
#split data to train and test
# random_state makes the split (and every score below) reproducible across runs;
# stratify keeps the class ratio the same in both splits.
x_train,x_test,y_train,y_test = train_test_split(
    data.values, target.values, train_size=0.75, random_state=42, stratify=target.values
)

In [None]:
# Sanity-check the shapes of the four split arrays.
print(f'{x_train.shape}  {y_train.shape} {x_test.shape} {y_test.shape}')

*Cross validation*

In [None]:
def run_cv_model(classifier,x,y,**params):
    """Cross-validate a classifier and return its mean score as a percentage.

    Parameters
    ----------
    classifier : callable
        Estimator class (not an instance); it is instantiated with ``params``.
    x, y : array-like
        Features and labels handed to ``cross_val_score``.
    **params
        Keyword arguments forwarded to the estimator constructor.

    Returns
    -------
    float
        Mean of the 5-fold cross-validation scores, times 100, rounded to 2 decimals.
    """
    fold_scores = cross_val_score(classifier(**params), x, y, cv=5)
    mean_percent = fold_scores.mean() * 100
    return round(mean_percent, 2)

> * **Gradient boosting**

In [None]:
# Gradient boosting baseline: learning_rate=0.1, 500 estimators (mean 5-fold CV score, %).
gb_score = run_cv_model(GradientBoostingClassifier,x_train,y_train ,learning_rate = 0.1,n_estimators =500)
gb_score

In [None]:
# Gradient boosting with a higher learning rate: learning_rate=0.5, 500 estimators.
gb_score_1 =  run_cv_model(GradientBoostingClassifier,x_train,y_train ,learning_rate = 0.5,n_estimators= 500)
gb_score_1

In [None]:
# Gradient boosting with fewer estimators: learning_rate=0.1, 200 estimators.
gb_score_2 =  run_cv_model(GradientBoostingClassifier,x_train,y_train ,learning_rate  = 0.1,n_estimators = 200)
gb_score_2

* **Random Forest**

In [None]:
# Random forest of depth-1 trees: 500 estimators, gini criterion.
rf_score_1 = run_cv_model(RandomForestClassifier,x_train,y_train , n_estimators = 500 ,criterion ='gini',max_depth =1)
rf_score_1

In [None]:
# Random forest of depth-1 trees: 500 estimators, entropy criterion.
rf_score_2 = run_cv_model(RandomForestClassifier,x_train,y_train , n_estimators =500 ,criterion ='entropy',max_depth =1)
rf_score_2

In [None]:
# Random forest of depth-1 trees: 1000 estimators, entropy criterion.
rf_score_3 = run_cv_model(RandomForestClassifier,x_train,y_train , n_estimators =1000 ,criterion = 'entropy',max_depth =1)
rf_score_3

In [None]:
# Random forest of depth-1 trees: 1000 estimators, gini criterion.
rf_score_4 = run_cv_model(RandomForestClassifier,x_train,y_train , n_estimators = 1000 ,criterion = 'gini',max_depth =1)
rf_score_4

* **AdaBoost**

In [None]:
# AdaBoost: 100 estimators, learning_rate=1.
ab_score_1 = run_cv_model(AdaBoostClassifier,x_train,y_train , n_estimators = 100,learning_rate=1)
ab_score_1                         

In [None]:
# AdaBoost: 1000 estimators, learning_rate=1.
ab_score_2 = run_cv_model(AdaBoostClassifier,x_train,y_train , n_estimators =1000,learning_rate=1)
ab_score_2     

In [None]:
# AdaBoost: 100 estimators, learning_rate=0.1.
ab_score_3 = run_cv_model(AdaBoostClassifier,x_train,y_train , n_estimators =100,learning_rate =0.1)
ab_score_3   

In [None]:
# AdaBoost: 1000 estimators, learning_rate=0.1.
ab_score_4 = run_cv_model(AdaBoostClassifier,x_train,y_train , n_estimators = 1000,learning_rate= 0.1)
ab_score_4   

* **SVM**

In [None]:
# SVM: RBF kernel, C=1.
svm_score_1 = run_cv_model(SVC,x_train,y_train , C =1,kernel ='rbf')
svm_score_1   

In [None]:
# SVM: RBF kernel, C=0.1.
svm_score_2 = run_cv_model(SVC,x_train,y_train , C =0.1,kernel ='rbf')
svm_score_2 

In [None]:
# SVM: linear kernel, C=1.
svm_score_3 = run_cv_model(SVC,x_train,y_train , C = 1,kernel = 'linear')
svm_score_3

In [None]:
# SVM: linear kernel, C=0.1.
svm_score_4 = run_cv_model(SVC,x_train,y_train , C= 0.1,kernel='linear')
svm_score_4

In [None]:
# SVM: polynomial kernel, C=1.
svm_score_5 = run_cv_model(SVC,x_train,y_train , C =1,kernel ='poly')
svm_score_5

In [None]:
# SVM: polynomial kernel, C=0.1.
svm_score_6 = run_cv_model(SVC,x_train,y_train , C = 0.1,kernel='poly')
svm_score_6

In [None]:
# One label per cross-validation run, in the same order as the scores below:
# 3 gradient boosting, 4 random forest, 4 AdaBoost and 6 SVM runs.
models_names = (
    ['Gradient boosting'] * 3
    + ['Random forest'] * 4
    + ['Adaboost'] * 4
    + ['SVM'] * 6
)
models_scores = [
    gb_score, gb_score_1, gb_score_2,
    rf_score_1, rf_score_2, rf_score_3, rf_score_4,
    ab_score_1, ab_score_2, ab_score_3, ab_score_4,
    svm_score_1, svm_score_2, svm_score_3, svm_score_4, svm_score_5, svm_score_6,
]
# Tabulate every run so the configurations can be compared side by side.
cross_validation_df = pd.DataFrame({'Model': models_names, 'Score': models_scores})
cross_validation_df

In [None]:
# Best cross-validation score reached by each model family.
cross_validation_df.groupby('Model').max()

# Stacking best cross validation from 4 models

In [None]:
def get_oof(classifier, x_train, y_train, x_test,n_folds,**params):
    """Produce out-of-fold (OOF) predictions of one base model for stacking.

    The training rows are split into ``n_folds`` consecutive folds. For each
    fold the model is fitted on the remaining folds and predicts the held-out
    rows, so every training row gets a prediction from a model that did not
    see it. Each fold's model also predicts ``x_test``; those predictions are
    averaged over the folds.

    Parameters
    ----------
    classifier : callable
        Estimator class (not an instance); it is instantiated with ``params``.
    x_train, y_train : numpy.ndarray
        Training features and labels; they are indexed with integer index arrays.
    x_test : numpy.ndarray
        Test features.
    n_folds : int
        Number of K-fold splits.
    **params
        Keyword arguments forwarded to the estimator constructor.

    Returns
    -------
    tuple of numpy.ndarray
        ``(oof_train, oof_test)`` with shapes ``(n_train, 1)`` and ``(n_test, 1)``.
    """
    n_train = x_train.shape[0]
    n_test = x_test.shape[0]

    oof_train = np.zeros(n_train)
    # Size the per-fold test buffer from the x_test argument; the original read the
    # notebook-global y_test, which breaks as soon as a different test set is passed.
    oof_test_skf = np.empty((n_folds, n_test))

    clf = classifier(**params)
    kf = KFold(n_splits=n_folds)
    for i, (train_index, valid_index) in enumerate(kf.split(x_train,y_train)):
        clf.fit(x_train[train_index], y_train[train_index])

        oof_train[valid_index] = clf.predict(x_train[valid_index])
        oof_test_skf[i, :] = clf.predict(x_test)

    # Average the per-fold test predictions into a single column
    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [None]:
#start stacking
# One 5-fold out-of-fold prediction pair (train, test) per base model; the hyper-parameters
# are hard-coded here, one configuration per model family.
gd_oof_train , gd_oof_test =  get_oof(GradientBoostingClassifier,x_train,y_train ,x_test , 5,learning_rate  = 0.1,n_estimators = 200)
rf_oof_train , rf_oof_test =  get_oof(RandomForestClassifier,x_train,y_train ,x_test,5, n_estimators =500 ,criterion ='entropy',max_depth =1)
ad_oof_train , ad_oof_test =  get_oof(AdaBoostClassifier,x_train,y_train ,x_test,5, n_estimators = 1000,learning_rate= 0.1)
svm_oof_train,svm_oof_test =  get_oof(SVC,x_train,y_train ,x_test,5, C= 0.1,kernel='linear')

In [None]:
#construct new training set from stacking
# One column per base model holding its out-of-fold training predictions.
x_stacking_train = pd.DataFrame({'Gradient boosting':gd_oof_train.flatten() , 'Random forest': rf_oof_train.flatten(),
                                'Adaboost' : ad_oof_train.flatten() , 'SVM':svm_oof_train.flatten()})

x_stacking_train.head()

Stage 2 of stacking: run the stacked training data through another boosting meta-model.

In [None]:
# Stack the four base models' test predictions side by side, one column per model,
# in the order gradient boosting, random forest, AdaBoost, SVM.
x_stacking_test = np.c_[gd_oof_test, rf_oof_test, ad_oof_test, svm_oof_test]


In [None]:
# Expect (number of test rows, 4): one column per base model.
x_stacking_test.shape

In [None]:
# Second-stage (meta) model: an XGBoost classifier trained on the base models'
# out-of-fold predictions.
xgClf = xgb.XGBClassifier(n_estimators= 2000,
 max_depth= 4,
 gamma=0.9,
 nthread= -1,
 scale_pos_weight=1)

xgClf.fit(x_stacking_train.values,y_train)

# Predict the held-out test rows from the averaged base-model test predictions
y_pred = xgClf.predict(x_stacking_test)

# accuracy_score expects (y_true, y_pred); the value is the same either way, but the
# documented order keeps the call correct if the metric is ever swapped for an asymmetric one.
score = round(accuracy_score(y_test, y_pred) * 100,2)

print(f'Final score for predicting death by heart disease is {score}')