# Heart attack possibility prediction 
# Exploratory Data Analysis

## Part 1 - Data Preprocessing
### Importing the libraries

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
import matplotlib.pyplot as plt
import seaborn as sb;

import warnings
warnings.filterwarnings('ignore')

### Importing the dataset

In [None]:
# Load the raw data and shuffle the rows: sample(frac=1) draws every row once
# in random order, and reset_index(drop=True) renumbers them from 0.
# NOTE(review): the shuffle is unseeded, so the row order differs between
# runs; pass random_state to sample() if reproducibility is needed.
dataset = pd.read_csv('heart.csv')
s_dataset = dataset.sample(frac=1).reset_index(drop=True)

# Disabled alternatives that select the features by column position:
#X = s_dataset.iloc[:, [0,1,2,3,4,5,6,7,8,9,10,11,12]].values
#X = s_dataset.iloc[:, [0,1,2,7,8,9,10,11,12]].values

# Features are every column except the label. The label is selected by name
# so X and y stay consistent even if 'target' is not the last column.
X = s_dataset.drop('target', axis=1)
y = s_dataset['target'].values

In [None]:
# Preview the first rows of the shuffled dataset.
s_dataset.head()

In [None]:
# Summary statistics for the columns of the shuffled dataset.
s_dataset.describe()

In [None]:
# Print the (rows, columns) shape of the shuffled dataset.
print(s_dataset.shape)

In [None]:
# Number of samples for each distinct value of the target column.
s_dataset["target"].value_counts()

In [None]:
# Distribution of age, one curve per target class.
# `height` is the current name of FacetGrid's former `size` keyword, which
# recent seaborn releases no longer accept.
# NOTE(review): sb.distplot is deprecated in recent seaborn; consider
# histplot/kdeplot when upgrading.
sb.FacetGrid(s_dataset, hue="target", height=6)\
  .map(sb.distplot, "age")\
  .add_legend();

In [None]:
# Distribution of trestbps, one curve per target class.
# `height` is the current name of FacetGrid's former `size` keyword.
sb.FacetGrid(dataset, hue="target", height=6)\
  .map(sb.distplot, "trestbps")\
  .add_legend();

In [None]:
# Distribution of restecg, one curve per target class.
# `height` is the current name of FacetGrid's former `size` keyword.
sb.FacetGrid(dataset, hue="target", height=6)\
  .map(sb.distplot, "restecg")\
  .add_legend();

In [None]:
# Distribution of thalach, one curve per target class.
# `height` is the current name of FacetGrid's former `size` keyword.
sb.FacetGrid(dataset, hue="target", height=6)\
  .map(sb.distplot, "thalach")\
  .add_legend();

In [None]:
# Distribution of oldpeak, one curve per target class.
# `height` is the current name of FacetGrid's former `size` keyword.
sb.FacetGrid(dataset, hue="target", height=6)\
  .map(sb.distplot, "oldpeak")\
  .add_legend();

In [None]:
# Distribution of slope, one curve per target class.
# `height` is the current name of FacetGrid's former `size` keyword.
sb.FacetGrid(dataset, hue="target", height=6)\
  .map(sb.distplot, "slope")\
  .add_legend();

In [None]:
# Box plot of age for each target class.
sb.boxplot(data=dataset, x="target", y="age")
plt.show()

In [None]:
# Box plot of the target value for each sex.
sb.boxplot(data=dataset, x="sex", y="target")
plt.show()

In [None]:
 def bar_plot(variable):
     """Draw a bar chart of the value counts of one column.

     Reads the column from the module-level ``s_dataset`` frame, shows the
     chart, then prints the column name followed by the counts.

     Args:
         variable: Column name in ``s_dataset``, e.g. ``"sex"``.
     """
     # How often each distinct value occurs in the column.
     counts = s_dataset[variable].value_counts()

     plt.figure(figsize=(9, 3))
     plt.bar(counts.index, counts)
     # Put one tick on every category, labelled with the category value.
     plt.xticks(counts.index, counts.index.values)
     plt.ylabel("Frequency")
     plt.title(variable)
     plt.show()

     print(f"{variable}\n{counts}")

### Categorical variable bar plots

In [None]:
# Draw the value-count bar chart for every column listed here.
categorical = [
    "sex", "cp", "fbs", "restecg", "exang",
    "slope", "ca", "thal", "target",
]
for column in categorical:
    bar_plot(column)

#### Numerical variable histograms

In [None]:
def plot_hist(variable):
    """Show a 40-bin histogram of one column.

    Reads the column from the module-level ``s_dataset`` frame.

    Args:
        variable: Column name in ``s_dataset``, e.g. ``"age"``.
    """
    plt.figure(figsize=(9, 3))
    plt.hist(s_dataset[variable], bins=40)
    plt.xlabel(variable)
    plt.ylabel("Frequency")
    plt.title(f"{variable} distribution with histogram")
    plt.show()
# Draw a histogram for every column listed in numericVar.
numericVar = [
    "age",
    "trestbps",
    "chol",
    "thalach",
    "oldpeak",
]
for column in numericVar:
    plot_hist(column)

#### Inspecting the Kaggle dataset shows that there are no missing values

## Correlation heatmap

In [None]:
# Pairwise correlation between the dataset's columns.
s_dataset.corr()

In [None]:

# Heat map of the pairwise column correlations.
corr_matrix = dataset.corr()
plot = sb.heatmap(corr_matrix, linewidths=0)

# ML ALGORITHMS
## Cross Validation Algorithms
#### Decision tree classifier
#### Support vector classifier
#### Random Forest Classifier
#### Logistic Regression
#### KNeighbors Classifier

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

### Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=1 fixes how the given
# rows are divided.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

### Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both the training and the test rows.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Candidate models and the hyper-parameter grid searched for each one.
# The two lists are index-aligned: classifier[i] is tuned over
# classifier_param[i].
random_state = 42
classifier = [
    DecisionTreeClassifier(random_state=random_state),
    SVC(random_state=random_state),
    RandomForestClassifier(random_state=random_state),
    LogisticRegression(random_state=random_state),
    KNeighborsClassifier(),
]

dt_param_grid = {
    "min_samples_split": range(10, 500, 20),
    "max_depth": range(1, 20, 2),
}

svc_param_grid = {
    "kernel": ["rbf"],
    "gamma": [0.001, 0.01, 0.1, 1],
    "C": [1, 10, 50, 100, 200, 300, 1000],
}

# max_features takes an int count of features or a fraction in (0, 1];
# the candidates are 1, 3 and 10 features (a value of 1.3 is neither form
# and is rejected at fit time).
rf_param_grid = {
    "max_features": [1, 3, 10],
    "min_samples_split": [2, 3, 10],
    "min_samples_leaf": [1, 3, 10],
    "bootstrap": [False],
    "n_estimators": [100, 300],
    "criterion": ["gini"],
}

# The default lbfgs solver does not support the "l1" penalty, so the solver
# is pinned to liblinear, which handles both penalties in the grid.
lr_param_grid = {
    "C": np.logspace(-3, 3, 7),
    "penalty": ["l1", "l2"],
    "solver": ["liblinear"],
}

knn_param_grid = {
    "n_neighbors": np.linspace(1, 19, 10, dtype=int).tolist(),
    "weights": ["uniform", "distance"],
    "metric": ["euclidean", "manhattan"],
}

classifier_param = [
    dt_param_grid,
    svc_param_grid,
    rf_param_grid,
    lr_param_grid,
    knn_param_grid,
]

In [None]:
# Tune every model with a 10-fold stratified grid search and keep, per model,
# the best cross-validated accuracy and the corresponding best estimator.
model_names = ["DecisionTree :", "SVC : ", "RandomForest : ", "LogisticRegression : ", "KNN : "]
cv_result = []
best_estimators = []
for name, estimator, grid in zip(model_names, classifier, classifier_param):
    clf = GridSearchCV(
        estimator,
        param_grid=grid,
        cv=StratifiedKFold(n_splits=10),
        scoring="accuracy",
        n_jobs=-1,
        verbose=1,
    )
    clf.fit(X_train, y_train)
    cv_result.append(clf.best_score_)
    best_estimators.append(clf.best_estimator_)
    print(name, clf.best_score_)

In [None]:
print(cv_result)

# One row per model: its best cross-validation score and a display label.
ml_model_labels = [
    "DecisionTreeClassifier",
    "SVC",
    "RandomForestClassifier",
    "LogisticRegression",
    "KNeigborsClassifier",
]
cv_results = pd.DataFrame(
    {"Cross Validation Means": cv_result, "ML Models": ml_model_labels}
)

# Horizontal bars: score on the x axis, model label on the y axis.
g = sb.barplot(data=cv_results, x="Cross Validation Means", y="ML Models")
g.set_xlabel("Means Accuracy")
g.set_title("Cross Validation Scores")
plt.show()

# Artificial Neural Network(ANN)
##  Building the ANN
### Initialising the ANN

In [None]:
# Start from an empty Sequential model; layers are appended with ann.add().
ann = tf.keras.models.Sequential()

### Adding the input layer and the first hidden layer

In [None]:
# Dense layer of 9 ReLU units with an L2 (0.001) kernel penalty, followed by
# dropout at a rate of 0.45.
hidden_1 = tf.keras.layers.Dense(
    units=9,
    activation='relu',
    kernel_regularizer=keras.regularizers.l2(0.001),
)
ann.add(hidden_1)
ann.add(tf.keras.layers.Dropout(0.45))

In [None]:
# Dense layer of 6 ReLU units with an L2 (0.001) kernel penalty, followed by
# dropout at a rate of 0.52.
hidden_2 = tf.keras.layers.Dense(
    units=6,
    activation='relu',
    kernel_regularizer=keras.regularizers.l2(0.001),
)
ann.add(hidden_2)
ann.add(tf.keras.layers.Dropout(0.52))

### Adding the output layer

In [None]:
# Single sigmoid unit as the output layer.
output_layer = tf.keras.layers.Dense(units=1, activation='sigmoid')
ann.add(output_layer)

## Part 3 - Training the ANN
### Compiling the ANN

In [None]:
# Adam optimizer with a 0.01 learning rate, binary cross-entropy loss, and
# accuracy reported as the metric.
optimizer = keras.optimizers.Adam(learning_rate=0.01)
ann.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)


### Training the ANN on the Training set

In [None]:
# Train for 100 epochs in shuffled batches of 16, holding out 20% of the
# training rows for validation, then score the model on the test set.
ann.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=16,
    validation_split=0.2,
    shuffle=True,
    verbose=2,
)
test_loss, test_acc = ann.evaluate(X_test, y_test, verbose=2)
print('\nTest accuracy:', test_acc)


## XGBoost Classifier

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from xgboost import XGBClassifier

# Hyper-parameters of the gradient-boosted classifier, collected in one place.
xgb_params = dict(
    learning_rate=0.01,
    n_estimators=25,
    max_depth=15,
    gamma=0.6,
    subsample=0.52,
    colsample_bytree=0.6,
    seed=27,
    reg_lambda=2,
    booster='dart',
    colsample_bylevel=0.6,
    colsample_bynode=0.5,
)
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)

# Evaluate on the held-out test rows.
xgb_predicted = xgb.predict(X_test)
xgb_conf_matrix = confusion_matrix(y_test, xgb_predicted)
xgb_acc_score = accuracy_score(y_test, xgb_predicted)

print("confussion matrix")
print(xgb_conf_matrix)
print("\n")
print("Accuracy of Extreme Gradient Boost:", xgb_acc_score * 100, '\n')


In [None]:
# Test accuracy of the two models, scaled by 100, shown side by side.
model_ev = pd.DataFrame(
    {
        'Model': ['Extreme Gradient Boost', 'ANN'],
        'Accuracy': [xgb_acc_score * 100, test_acc * 100],
    }
)
model_ev

In [58]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import train_test_split

# NOTE(review): this cell rebinds clf, X_train, X_test, y_train and y_test,
# replacing the values that earlier cells assigned to those names.

# Base learners: a 10-tree random forest and a linear SVM that standardises
# its own input.
forest_base = RandomForestClassifier(n_estimators=10, random_state=42)
svm_base = make_pipeline(StandardScaler(), LinearSVC(random_state=42))
estimators = [('rf', forest_base), ('svr', svm_base)]

# A logistic regression is the final estimator on top of the base learners.
clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
)

# Stratified split of the feature frame X and label array y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.8421052631578947

## Making the predictions and evaluating the model

In [None]:
def _ask(prompt, cast=int):
    """Show ``prompt``, read one line of input and convert it with ``cast``."""
    return cast(input(prompt))


# Collect the thirteen feature values; every answer is an int except oldpeak.
age = _ask("Enter your age: ")
sex = _ask("Enter 0 if you are female, 1 if you are male: ")
cp = _ask("Enter your chest pain type 1/2/3/4: ")
trestbps = _ask("Enter your resting blood pressure: ")
chol = _ask("Enter your serum cholestrol in mg/dl: ")
fbs = _ask("Enter 1 if your fbs is greater than 120mg/dl: ")
restecg = _ask("Enter your resting ecg 0/1/2: ")
thalach = _ask("Enter maximum heart rate achieved : ")
exang = _ask("Do you have exercise induced agnia? 0/1:  ")
oldpeak = _ask("Enter ST depression induced by exercise relative to rest?:   ", float)
slope = _ask("Slope of the peak exercise ST segment: 0/1/2:  ")
ca = _ask("number of major vessels (0-3): ")
thal = _ask("0 = normal; 1 = fixed defect; 2 = reversable defect:  ")


In [None]:
# Scale the answers with the fitted scaler and threshold the network's output
# at 0.5.
# NOTE(review): assumes this value order matches the column order the scaler
# and network were trained on - TODO confirm.
patient = [[age, sex, cp, trestbps, chol, fbs, restecg,
            thalach, exang, oldpeak, slope, ca, thal]]
risk = ann.predict(sc.transform(patient))
if risk > 0.5:
    print("You have a possibility of heart attack")
else:
    print("No worries")