In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cufflinks as cf
%matplotlib inline
from nltk import DecisionTreeClassifier, accuracy, metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import  RandomizedSearchCV, train_test_split
#from xgboost import XGBClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.impute import  SimpleImputer
# Read the CSV file "heart.csv" (relative path) into a DataFrame.
data = pd.read_csv("heart.csv")
# Bare last expression: the notebook renders the loaded DataFrame as the cell output.
data

In [None]:
# Check for missing values: info() lists the non-null count and dtype of every column.
data.info()
# Author's observation: no NULL values at this stage.

In [None]:
# Check for inconsistent data.
# nunique() gives the number of distinct values per column, which shows whether each
# feature behaves as binary, categorical or continuous.
data.nunique()
# Feature 'ca' should range from 0-3, but the data contains values 0-4, so the rows holding 4 are located and changed to NaN.
# Feature 'thal' should range from 1-3, but the data contains values 0-3; the author notes two rows with value 0, which are also changed to NaN.

In [None]:
# List each distinct value present in feature 'ca'.
data['ca'].unique()
# The value 4 has to be changed to NaN, so first find the affected rows and how many of them there are.

In [None]:
data.ca.value_counts()
# Author's observation: 5 rows have the wrong 'ca' value.

In [None]:
data[data['ca']==4]
# Shows the rows whose 'ca' value is the invalid 4.

In [None]:
# Mark the invalid 'ca' value (4) as missing.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0 (AttributeError there).
data.loc[data['ca']==4,'ca']=np.nan
# Check the unique values again: 4 should be gone and NaN should now appear.
data['ca'].unique()

In [None]:
# Same procedure for feature 'thal'.
# List each distinct value present in feature 'thal'.
data['thal'].unique()
# The value 0 has to be changed to NaN, so first find the affected rows and how many of them there are.

In [None]:
data.thal.value_counts()
# Author's observation: 2 rows have the wrong 'thal' value.

In [None]:
data[data['thal']==0]
# Shows the rows whose 'thal' value is the invalid 0.

In [None]:
# Replace the invalid 'thal' value (0) with NaN.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0 (AttributeError there).
data.loc[data['thal']==0,'thal']=np.nan
# Check the unique values again: 0 should be gone and NaN should now appear.
data['thal'].unique()

In [None]:
# Check for missing values again: previously there were none, but now there are 5 for 'ca' and 2 for 'thal'.
data.isnull().sum()

In [None]:
# Impute every NaN with the median of its column (the author prefers this over dropping the whole row).
column_medians = data.median()
data = data.fillna(value=column_medians)
# Confirm that no missing values remain.
data.isna().sum()

In [None]:
# Count rows that are exact copies of an earlier row.
duplicated = data.duplicated().sum()
print("Duplicated values = {} ".format(duplicated) if duplicated else "No duplicate values")

# Display the duplicated row(s); the boolean mask from duplicated() is used directly.
data[data.duplicated()]
# Author's observation: there is one duplicated row.

In [None]:
# Remove duplicated rows: the first occurrence of each row is kept, later copies are dropped.
data = data.drop_duplicates(keep='first')
data

In [None]:
# Visual outlier check: one box plot per column, each on its own axes.
data.plot.box(
    subplots=True,
    layout=(2, 7),
    sharex=False,
    sharey=False,
    figsize=(20, 10),
    color='blue',
);
# Columns treated as continuous-valued in the outlier report below.
continous_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
def outliers(data, drop = False):
    """Report or remove IQR outliers for every column of ``data``.

    A value counts as an outlier when it lies outside the fences
    ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` of its own column. Missing values are
    skipped when the quartiles are computed and are never counted as outliers.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are all numeric.
    drop : bool, default False
        If False, print the number of outliers found in each column.
        If True, drop the outlier rows from ``data`` in place, column by
        column (later columns are evaluated on the already reduced frame),
        and print a confirmation per column. Because the drop is in place,
        pass the frame that should be modified; a temporary selection such as
        ``frame[cols]`` would be modified instead of ``frame``.

    Returns
    -------
    None
    """
    for each_feature in data.columns:
        feature_data = data[each_feature]
        # Series.quantile ignores NaN. np.percentile would return NaN for a
        # column containing NaN, which made every row look like an outlier.
        Q1 = feature_data.quantile(0.25)  # 25th percentile of the feature
        Q3 = feature_data.quantile(0.75)  # 75th percentile of the feature
        IQR = Q3 - Q1  # interquartile range
        outlier_step = IQR * 1.5  # fence distance beyond the quartiles
        # Explicit "below or above the fences" test: NaN compares False on
        # both sides, so missing values are not reported as outliers.
        is_outlier = (feature_data < Q1 - outlier_step) | (feature_data > Q3 + outlier_step)
        outlier_index = feature_data[is_outlier].index.tolist()
        if drop:
            data.drop(outlier_index, inplace = True, errors = 'ignore')
            print('Outliers from {} feature removed'.format(each_feature))
        else:
            print('For the feature {}, No of Outliers is {}'.format(each_feature, len(outlier_index)))

# Report the outlier count of each continuous feature; drop is left at its default (False), so nothing is removed here.
outliers(data[continous_features])

In [None]:

# Outlier treatment: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are set to NaN
# and then imputed with the column median.
#
# Only the continuous features are processed. Applying the IQR rule to every
# column (as before) also hit the label and the binary/categorical features; for
# a column whose IQR is 0 every minority value was wiped and replaced by the
# median, destroying the feature.
for feature in continous_features:
    q75, q25 = np.percentile(data[feature], [75, 25])
    intr_qr = q75 - q25

    # Named *_fence instead of max/min so the Python builtins are not
    # shadowed for the rest of the notebook session.
    upper_fence = q75 + (1.5 * intr_qr)
    lower_fence = q25 - (1.5 * intr_qr)

    data.loc[data[feature] < lower_fence, feature] = np.nan
    data.loc[data[feature] > upper_fence, feature] = np.nan

# Impute the values just blanked out with the median of their column.
data = data.fillna(data.median())
data

In [None]:
# Check again for NULLs after the outlier treatment.
data.isnull().sum()
# Author's observation: no NaN values remain.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

# Separate the feature columns from the label column.
predictors = data.drop("target",axis=1)
target = data["target"]

# Hold out 20 % of the rows for evaluation; random_state=0 makes the split reproducible.
X_train,X_test,Y_train,Y_test = train_test_split(predictors,target,train_size=0.80,test_size=0.20,random_state=0)

# KNN is distance based, so features with large numeric ranges would dominate
# the neighbour search. The pipeline standardises the features (scaler fitted on
# the training rows only) before the classifier, and still accepts the raw
# feature frame in fit()/predict().
knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors = 8))
knn.fit(X_train,Y_train)
y_pred1 = knn.predict(X_test)
# Stored under its own name; it used to be written to score_nb, where the
# Naive Bayes score overwrote it.
score_knn = round(accuracy_score(Y_test,y_pred1)*100,2)

print("The accuracy score achieved using KNN is: "+str(score_knn)+" %")

from sklearn.naive_bayes import GaussianNB

# fit() returns the estimator itself, so construction and training are chained.
nb = GaussianNB().fit(X_train,Y_train)

# Predict the held-out rows and express the accuracy as a percentage with two decimals.
Y_pred_nb = nb.predict(X_test)
nb_accuracy = accuracy_score(Y_pred_nb,Y_test)
score_nb = round(nb_accuracy*100,2)

print("The accuracy score achieved using Naive Bayes is: "+str(score_nb)+" %")

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


def _best_random_state(model_cls, X_fit, y_fit, X_val, y_val, n_seeds=302):
    """Return ``(best_seed, best_accuracy)`` over ``random_state`` in ``range(n_seeds)``.

    For each seed a fresh ``model_cls(random_state=seed)`` is fitted on
    ``(X_fit, y_fit)`` and scored on ``(X_val, y_val)``. Accuracy is a
    percentage rounded to two decimals. Ties keep the lowest seed. The
    default of 302 seeds is the search range used by the original notebook.
    """
    # Starting below any reachable accuracy guarantees a seed is always chosen
    # (the previous "start at 0, strictly greater" logic could leave it unset).
    best_seed, best_accuracy = 0, -1.0
    for seed in range(n_seeds):
        model = model_cls(random_state=seed)
        model.fit(X_fit, y_fit)
        current_accuracy = round(accuracy_score(y_val, model.predict(X_val)) * 100, 2)
        if current_accuracy > best_accuracy:
            best_seed, best_accuracy = seed, current_accuracy
    return best_seed, best_accuracy


# The seed is a tuned choice, so it must not be selected on the test set:
# doing so leaks test information and inflates the reported accuracy.
# A validation split is carved out of the training rows for the search; the
# final models are refitted on the full training set and scored once on the
# untouched test set.
X_fit, X_val, Y_fit, Y_val = train_test_split(
    X_train, Y_train, test_size=0.20, random_state=0, stratify=Y_train
)

# --- Decision Tree ---
best_x, max_accuracy = _best_random_state(DecisionTreeClassifier, X_fit, Y_fit, X_val, Y_val)

dt = DecisionTreeClassifier(random_state=best_x)
dt.fit(X_train, Y_train)
Y_pred_dt = dt.predict(X_test)
score_dt = round(accuracy_score(Y_test, Y_pred_dt)*100,2)
clf_report = pd.DataFrame(classification_report(Y_test, Y_pred_dt, output_dict=True))
print("The accuracy score achieved using Decision Tree is: "+str(score_dt)+" %")
#print(clf_report)

# --- Random Forest ---
# best_x / max_accuracy are reused, so after this cell they describe the forest.
best_x, max_accuracy = _best_random_state(RandomForestClassifier, X_fit, Y_fit, X_val, Y_val)

rf = RandomForestClassifier(random_state=best_x)
rf.fit(X_train, Y_train)
Y_pred_rf = rf.predict(X_test)

score_rf = round(accuracy_score(Y_test, Y_pred_rf)*100,2)

# The label now names the model actually used (it previously said "random classifier").
print("The accuracy score achieved using Random Forest is: "+str(score_rf)+" %")

In [None]:

# Annotated heatmap of the pairwise correlations between all columns.
plt.figure(figsize=(20, 12))
sns.set_context('notebook', font_scale=1.3)
correlation_matrix = data.corr()
sns.heatmap(correlation_matrix, annot=True, linewidth=2)
plt.tight_layout()

In [None]:
# Bar chart of each feature's correlation with the 'target' label.
sns.set_context('notebook', font_scale=2.3)
feature_columns = data.drop(columns='target')
target_correlation = feature_columns.corrwith(data['target'])
target_correlation.plot(
    kind='bar',
    grid=True,
    figsize=(20, 10),
    title="Correlation with the target feature",
)
plt.tight_layout()
plt.show()
# The author's reading of the chart, kept as bare strings (the last one is the cell output).
"Four feature( “cp”, “restecg”, “thalach”, “slope” ) are positively correlated with the target feature."
"oldpeak, exang, ca, thal, sex, age shows a good negative correlation with target"
"fbs ,chol, trestbps, restecg has low correlation with our target"

In [None]:
"Age Analysis"
" divide the Age feature into three parts  “Young”, “Middle” and “Elder”"
# Split the rows into three age groups that together cover every row:
#   Young: age < 40, Middle: 40 <= age < 55, Elder: age >= 55.
# Elder previously used "> 55", so rows with age exactly 55 fell into no group;
# the hard-coded lower bound of 29 on Young is dropped for the same reason.
Young = data[data.age<40]
Middle = data[(data.age>=40)&(data.age<55)]
Elder = data[data.age>=55]

# Bar chart of the number of rows in each age group.
plt.figure(figsize=(23,10))
sns.set_context('notebook',font_scale = 1.5)
sns.barplot(x=['young ages','middle ages','elderly ages'],y=[len(Young),len(Middle),len(Elder)])
plt.tight_layout()

# NOTE(review): the chart counts all rows per age group regardless of 'target',
# so it shows group sizes rather than disease rates - confirm the inference below.
"we can see that elder people are the most affected by heart disease and young ones are the least affected."

In [None]:
# Pie chart of the age-group sizes; the elderly slice is pulled out slightly.
group_sizes = [len(group) for group in (Young, Middle, Elder)]
group_labels = ['young ages', 'middle ages', 'elderly ages']
colors = ['blue', 'green', 'yellow']
explode = [0, 0, 0.1]

plt.figure(figsize=(10, 10))
sns.set_context('notebook', font_scale=1.2)
plt.pie(group_sizes, labels=group_labels, explode=explode, colors=colors, autopct='%1.1f%%')
plt.tight_layout()
"to prove above inference"

In [None]:
# Bar height is the mean of 'target' for each value of 'sex' (barplot's default estimator).
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=data["sex"], y=data["target"])
"Analysis sex vs target column "

In [None]:
"Chest Pain Type(cp) Analysis"
# Number of rows for each chest-pain type.
plt.figure(figsize=(18,9))
sns.set_context('notebook',font_scale = 1.5)
# The column is passed as x=: seaborn >= 0.12 treats the first positional argument as `data`.
sns.countplot(x=data['cp'])
plt.tight_layout()
"Inference: As seen, there are 4 types of chest pain"

In [None]:
"Analyzing cp vs target column"
# Bar height is the mean of 'target' for each chest-pain type.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=data["cp"], y=data["target"])
# The stray trailing comma after the next string (which made it a one-element tuple) is removed.
"Inference: From the above graph we can make some inferences"

"People having the least chest pain are not likely to have heart disease."
"People having severe chest pain are likely to have heart disease."
"Elderly people are more likely to have chest pain."

In [None]:
"Thal Analysis"
# Bar height is the mean of 'target' for each 'thal' value.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=data["thal"], y=data["target"])

"thal : [1 = normal, 2 = fixed defect, 3 = reversible defect]"

In [None]:
# Bar height is the mean of 'target' for each 'slope' value.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=data["slope"], y=data["target"])
"We observe, that Slope '2' causes heart pain much more than Slope '0' and '1'"


In [None]:
# Class balance: number of rows for each value of the 'target' label.
data.target.value_counts()

In [None]:
# Count plot of the 'target' label to visualise the class balance.
plt.figure(figsize=(18,9))
sns.set_context('notebook',font_scale = 1.5)
# The column is passed as x=: seaborn >= 0.12 treats the first positional argument as `data`.
sns.countplot(x=data['target'])
plt.tight_layout()