In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,roc_auc_score,classification,roc_curve
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import classification_report
from yellowbrick.classifier import ClassificationReport
from sklearn.metrics import log_loss
from matplotlib import pyplot
from numpy import array
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest,SelectPercentile
from mlxtend.classifier import StackingClassifier
from sklearn.externals import joblib 

In [0]:
data = pd.read_csv("/content/gdrive/My Drive/Heart Disease/heart.csv")

In [0]:
data.head(5)

In [0]:
col = data.columns

In [0]:
data.isnull().sum()

In [0]:
X = data.drop(['target'],1)
y=data['target']

In [0]:
X_train, X_test, y_train, y_test = tts(
    X,
    y,
    test_size=0.3,
    random_state=0)
 
X_train.shape, X_test.shape

In [0]:
######################################################### Feature Selection Methods ##################################################################

In [0]:
##Forwards Feature Selection
sfs1=SFS(RandomForestClassifier(n_jobs=-1,n_estimators=100),
         k_features=10,
         forward=True,
         floating=False,
         verbose=2,
         scoring='roc_auc',
         cv=3
         )

sfs1=sfs1.fit(X_train,y_train)

In [0]:
select_feat_forward= X_train.columns[list(sfs1.k_feature_idx_)]
print("Feature Selection Method - Forward Feature Selection : ",select_feat_forward)

In [0]:
## Backward Feature Selection
sfs2=SFS(RandomForestClassifier(n_jobs=1,n_estimators=100),
         k_features=10,
         forward=False,
         floating=False,
         verbose=2,
         scoring='roc_auc',
         cv=3
         )

sfs2=sfs2.fit(np.array(X_train),y_train)

In [0]:
select_feat_backward= X_train.columns[list(sfs1.k_feature_idx_)]
print("Frature Selection Method - Backward Feature Selection : ", select_feat_backward)


Frature Selection Method - Backward Feature Selection :  Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'exang', 'oldpeak', 'ca',
       'thal'],
      dtype='object')


In [0]:
##Mutual Information Gain
mi=mutual_info_classif(X_train,y_train)
mi=pd.Series(mi)
mi.index=X_train.columns
mi.sort_values(ascending=False)
mi.sort_values(ascending=False).plot.bar(figsize=(20,8))

In [0]:
sel_= SelectPercentile(mutual_info_classif,percentile=80).fit(X_train,y_train)
selected_feat_mutual_information=X_train.columns[sel_.get_support()]
print("Feature Selection method - Mutual Information Gain : ",selected_feat_mutual_information)

In [0]:
##univariate Analysis
univariate = f_classif(X_train,y_train)
univariate

univariate = pd.Series(univariate[1])
univariate.index=X_train.columns
univariate.sort_values(ascending=False,inplace=True)

univariate.sort_values(ascending=False).plot.bar(figsize=(20,8))

In [0]:
sel_ =SelectPercentile(f_classif,percentile=80).fit(X_train,y_train)
selected_feat_univariate=X_train.columns[sel_.get_support()]
print("Feature Selection Method - Univariate Analysis : ",selected_feat_univariate)

In [0]:
  ################################################################ Models Applied ############################################################################

In [0]:
def plot_roc_curve(fpr, tpr):  
    plt.plot(fpr, tpr, color='orange', label='ROC')
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.show()

In [0]:
#Randomise Search CV for Ada Boost Random Forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
                'max_features': max_features,
                'max_depth': max_depth,
                'min_samples_split': min_samples_split,
                'min_samples_leaf': min_samples_leaf,
                'bootstrap': bootstrap}
print(random_grid)

In [0]:
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train,y_train)

In [0]:
rf_random.best_params_

In [0]:
def AdaBoost(X_train,y_train,X_test,y_test):
  ada = AdaBoostClassifier(RandomForestClassifier(),n_estimators= 50, random_state=42)
  ada.fit(X_train,y_train)
  print("Ada Boost:train set")
  y_pred = ada.predict(X_train)
  pred=ada.predict_proba(X_test)   
  print("Ada Boost:Confusion Matrix: ", confusion_matrix(y_train, y_pred))
  print ("Ada Boost:Accuracy : ", accuracy_score(y_train,y_pred)*100)
  print("Ada Boost:Test set")
  y_pred = ada.predict(X_test)
  print("Ada Boost:Confusion Matrix: ", confusion_matrix(y_test, y_pred))
  print ("Ada Boost:Accuracy : ", accuracy_score(y_test,y_pred)*100)
  #confusion Matrix
  matrix =confusion_matrix(y_test, y_pred)
  class_names=[0,1] 
  fig, ax = plt.subplots()
  tick_marks = np.arange(len(class_names))
  plt.xticks(tick_marks, class_names)
  plt.yticks(tick_marks, class_names)
  sns.heatmap(pd.DataFrame(matrix), annot=True, cmap="YlGnBu" ,fmt='g')
  ax.xaxis.set_label_position("top")
  plt.tight_layout()
  plt.title('Confusion matrix', y=1.1)
  plt.ylabel('Actual label')
  plt.xlabel('Predicted label')
  plt.show()
  #ROC_AUC curve
  probs = ada.predict_proba(X_test) 
  probs = probs[:, 1]  
  auc = roc_auc_score(y_test, probs)  
  print('AUC: %.2f' % auc)
  le = preprocessing.LabelEncoder()
  y_test1=le.fit_transform(y_test)
  fpr, tpr, thresholds = roc_curve(y_test1, probs)
  plot_roc_curve(fpr, tpr)
  #Log_Loss
  loss = log_loss(y_test, probs)
  print("Log Loss : ", loss)
  yhat = [x*0.01 for x in range(0, 101)]
  # evaluate predictions for a 0 true value
  losses_0 = [log_loss([0], [x], labels=[0,1]) for x in yhat]
  # evaluate predictions for a 1 true value
  losses_1 = [log_loss([1], [x], labels=[0,1]) for x in yhat]
  # plot input to loss
  pyplot.plot(yhat, losses_0, label='true=0')
  pyplot.plot(yhat, losses_1, label='true=1')
  pyplot.legend()
  pyplot.show()
  #Classification Report
  target_names = ['Yes', 'No']
  prediction=ada.predict(X_test)
  print(classification_report(y_test, prediction, target_names=target_names))
  classes = ["Yes", "No"]
  visualizer = ClassificationReport(ada, classes=classes, support=True)
  visualizer.fit(X_train, y_train)  
  visualizer.score(X_test, y_test)  
  g = visualizer.poof()

In [0]:
AdaBoost(X_train[select_feat_forward],y_train,X_test[select_feat_forward],y_test)

In [0]:
#XGBOOST
def XGBoost(X_train,y_train,X_test,y_test):
  xgb=XGBClassifier(max_depth=5, learning_rate=6, n_estimators=100, subsample=0.8, colsample_bytree=1, reg_alpha=0.5)
  xgb.fit(X_train,y_train)
  print("xgb Boost:train set")
  y_pred = xgb.predict(X_train)
  pred=xgb.predict_proba(X_test)   
  print("xgb Boost:Confusion Matrix: ", confusion_matrix(y_train, y_pred))
  print ("xgb Boost:Accuracy : ", accuracy_score(y_train,y_pred)*100)
  print("xgb Boost:Test set")
  y_pred = xgb.predict(X_test)
  print("xgb Boost:Confusion Matrix: ", confusion_matrix(y_test, y_pred))
  print ("xgb Boost:Accuracy : ", accuracy_score(y_test,y_pred)*100)
  #confusion Matrix
  matrix =confusion_matrix(y_test, y_pred)
  class_names=[0,1] 
  fig, ax = plt.subplots()
  tick_marks = np.arange(len(class_names))
  plt.xticks(tick_marks, class_names)
  plt.yticks(tick_marks, class_names)
  sns.heatmap(pd.DataFrame(matrix), annot=True, cmap="YlGnBu" ,fmt='g')
  ax.xaxis.set_label_position("top")
  plt.tight_layout()
  plt.title('Confusion matrix', y=1.1)
  plt.ylabel('Actual label')
  plt.xlabel('Predicted label')
  plt.show()
  #ROC_AUC curve
  probs = xgb.predict_proba(X_test) 
  probs = probs[:, 1]  
  auc = roc_auc_score(y_test, probs)  
  print('AUC: %.2f' % auc)
  le = preprocessing.LabelEncoder()
  y_test1=le.fit_transform(y_test)
  fpr, tpr, thresholds = roc_curve(y_test1, probs)
  plot_roc_curve(fpr, tpr)
  #Log_Loss
  loss = log_loss(y_test, probs)
  print("Log Loss : ", loss)
  yhat = [x*0.01 for x in range(0, 101)]
  # evaluate predictions for a 0 true value
  losses_0 = [log_loss([0], [x], labels=[0,1]) for x in yhat]
  # evaluate predictions for a 1 true value
  losses_1 = [log_loss([1], [x], labels=[0,1]) for x in yhat]
  # plot input to loss
  pyplot.plot(yhat, losses_0, label='true=0')
  pyplot.plot(yhat, losses_1, label='true=1')
  pyplot.legend()
  pyplot.show()
  #Classification Report
  target_names = ['Yes', 'No']
  prediction=xgb.predict(X_test)
  print(classification_report(y_test, prediction, target_names=target_names))
  classes = ["Yes", "No"]
  visualizer = ClassificationReport(xgb, classes=classes, support=True)
  visualizer.fit(X_train, y_train)  
  visualizer.score(X_test, y_test)  
  g = visualizer.poof()

In [0]:
XGBoost(X_train[select_feat_forward],y_train,X_test[select_feat_forward],y_test)

In [0]:
#Voting Classifier
knn = KNeighborsClassifier()
params_knn = {'n_neighbors': np.arange(1, 25)}
knn_gs = GridSearchCV(knn, params_knn, cv=5)
knn_gs.fit(X_train, y_train)
knn_best = knn_gs.best_estimator_
log_reg = LogisticRegression()
svm = SVC(probability=True)

In [0]:
def Voting(X_train,y_train,X_test,y_test):
  estimators=[('KNN', knn_best), ('SVM', svm), ('Logistic', log_reg)]
  vc = VotingClassifier(estimators,voting='soft')  
  vc.fit(X_train,y_train)
  print("Voting Classifier :train set")
  y_pred = vc.predict(X_train)
  #pred=vc.predict_proba(X_test)   
  print("Voting Classifier :Confusion Matrix: ", confusion_matrix(y_train, y_pred))
  print ("Voting Classifier :Accuracy : ", accuracy_score(y_train,y_pred)*100)
  print("Voting Classifier :Test set")
  y_pred = vc.predict(X_test)
  print("Voting Classifier :Confusion Matrix: ", confusion_matrix(y_test, y_pred))
  print ("Voting Classifier :Accuracy : ", accuracy_score(y_test,y_pred)*100)
  #confusion Matrix
  matrix =confusion_matrix(y_test, y_pred)
  class_names=[0,1] 
  fig, ax = plt.subplots()
  tick_marks = np.arange(len(class_names))
  plt.xticks(tick_marks, class_names)
  plt.yticks(tick_marks, class_names)
  sns.heatmap(pd.DataFrame(matrix), annot=True, cmap="YlGnBu" ,fmt='g')
  ax.xaxis.set_label_position("top")
  plt.tight_layout()
  plt.title('Confusion matrix', y=1.1)
  plt.ylabel('Actual label')
  plt.xlabel('Predicted label')
  plt.show()
  #ROC_AUC curve
  probs = vc.predict_proba(X_test) 
  probs = probs[:, 1]  
  auc = roc_auc_score(y_test, probs)  
  print('AUC: %.2f' % auc)
  le = preprocessing.LabelEncoder()
  y_test1=le.fit_transform(y_test)
  fpr, tpr, thresholds = roc_curve(y_test1, probs)
  plot_roc_curve(fpr, tpr)
  #Log_Loss
  loss = log_loss(y_test, probs)
  print("Log Loss : ", loss)
  yhat = [x*0.01 for x in range(0, 101)]
  # evaluate predictions for a 0 true value
  losses_0 = [log_loss([0], [x], labels=[0,1]) for x in yhat]
  # evaluate predictions for a 1 true value
  losses_1 = [log_loss([1], [x], labels=[0,1]) for x in yhat]
  # plot input to loss
  pyplot.plot(yhat, losses_0, label='true=0')
  pyplot.plot(yhat, losses_1, label='true=1')
  pyplot.legend()
  pyplot.show()
  #Classification Report
  target_names = ['Yes', 'No']
  prediction=vc.predict(X_test)
  print(classification_report(y_test, prediction, target_names=target_names))
  classes = ["Yes", "No"]
  visualizer = ClassificationReport(vc, classes=classes, support=True)
  visualizer.fit(X_train, y_train)  
  visualizer.score(X_test, y_test)  
  g = visualizer.poof()

In [0]:
Voting(X_train[select_feat_forward],y_train,X_test[select_feat_forward],y_test)

In [0]:
#knn1 = KNeighborsClassifier()
#log_reg1 = LogisticRegression()
#svm1 = SVC(probability=True)
#gb = GradientBoostingClassifier(n_estimators=20, learning_rate = 0.05, max_features=2, max_depth = 2, random_state = 0)
ada = AdaBoostClassifier(RandomForestClassifier(bootstrap=True,max_depth= 70,max_features= 'auto',min_samples_leaf= 4,min_samples_split= 10,n_estimators= 400),n_estimators= 400, random_state=42)
xgb=XGBClassifier(max_depth=5, learning_rate=0.01, n_estimators=100, gamma=0,min_child_weight=1, subsample=0.8, colsample_bytree=0.8, reg_alpha=0.005)
rf2=RandomForestClassifier(bootstrap=True,max_depth= 70,max_features= 'auto',min_samples_leaf= 4,min_samples_split= 10,n_estimators= 400)
def stacking(X_train,y_train,X_test,y_test):
  classifiers=[xgb,rf2]
  sc = StackingClassifier(classifiers,meta_classifier=ada)  
  sc.fit(X_train,y_train)
  joblib.dump(sc, '/content/gdrive/My Drive/Heart Disease/stacking.pkl') 
  print("Stacking Classifier :train set")
  y_pred = sc.predict(X_train)
  #pred=vc.predict_proba(X_test)   
  print("Stacking Classifier :Confusion Matrix: ", confusion_matrix(y_train, y_pred))
  print ("Stacking Classifier :Accuracy : ", accuracy_score(y_train,y_pred)*100)
  print("Stacking Classifier :Test set")
  y_pred = sc.predict(X_test)
  print("Stacking Classifier :Confusion Matrix: ", confusion_matrix(y_test, y_pred))
  print ("Stacking Classifier :Accuracy : ", accuracy_score(y_test,y_pred)*100)
  #confusion Matrix
  matrix =confusion_matrix(y_test, y_pred)
  class_names=[0,1] 
  fig, ax = plt.subplots()
  tick_marks = np.arange(len(class_names))
  plt.xticks(tick_marks, class_names)
  plt.yticks(tick_marks, class_names)
  sns.heatmap(pd.DataFrame(matrix), annot=True, cmap="YlGnBu" ,fmt='g')
  ax.xaxis.set_label_position("top")
  plt.tight_layout()
  plt.title('Confusion matrix', y=1.1)
  plt.ylabel('Actual label')
  plt.xlabel('Predicted label')
  plt.show()
  #ROC_AUC curve
  probs = sc.predict_proba(X_test) 
  probs = probs[:, 1]  
  auc = roc_auc_score(y_test, probs)  
  print('AUC: %.2f' % auc)
  le = preprocessing.LabelEncoder()
  y_test1=le.fit_transform(y_test)
  fpr, tpr, thresholds = roc_curve(y_test1, probs)
  plot_roc_curve(fpr, tpr)
  #Log_Loss
  loss = log_loss(y_test, probs)
  print("Log Loss : ", loss)
  yhat = [x*0.01 for x in range(0, 101)]
  # evaluate predictions for a 0 true value
  losses_0 = [log_loss([0], [x], labels=[0,1]) for x in yhat]
  # evaluate predictions for a 1 true value
  losses_1 = [log_loss([1], [x], labels=[0,1]) for x in yhat]
  # plot input to loss
  pyplot.plot(yhat, losses_0, label='true=0')
  pyplot.plot(yhat, losses_1, label='true=1')
  pyplot.legend()
  pyplot.show()
  #Classification Report
  target_names = ['Yes', 'No']
  prediction=sc.predict(X_test)
  print(classification_report(y_test, prediction, target_names=target_names))
  classes = ["Yes", "No"]
  visualizer = ClassificationReport(sc, classes=classes, support=True)
  visualizer.fit(X_train, y_train)  
  visualizer.score(X_test, y_test)  
  g = visualizer.poof()

In [0]:
stacking(X_train,y_train,X_test,y_test)

In [0]:
###Prediction of New Datas
age = float(input("enter Value of age : "))	
# male = 1 , female = 0 
sex = float(input("enter Value of sex : "))
cp = float(input("enter Value of cp : "))
trestbps = float(input("enter Value of trestbps : "))
chol = float(input("enter Value of chol : "))
fbs = float(input("enter Value of fbs : "))
restecg = float(input("enter Value of restecg : "))
thalach = float(input("enter Value of thalach : "))
exang = float(input("enter Value of exang : "))
oldpeak = float(input("enter Value of oldpeak : "))
slope = float(input("enter Value of slope : "))
ca = float(input("enter Value of ca : "))
thal = float(input("enter Value of thal : "))


In [0]:
col=col[:-1]

In [0]:
output_data=[]
output_data.append(age)
output_data.append(sex)
output_data.append(cp)
output_data.append(trestbps)
output_data.append(chol)
output_data.append(fbs)
output_data.append(restecg)
output_data.append(thalach)
output_data.append(exang)
output_data.append(oldpeak)
output_data.append(slope)
output_data.append(ca)
output_data.append(thal)

In [0]:
output_data=pd.DataFrame([output_data],columns = col)

In [0]:
sc1 = joblib.load('/content/gdrive/My Drive/Heart Disease/stacking.pkl')  

In [0]:
pred=sc1.predict(output_data)
print("Prediction for newly added data : ",pred)