In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
df = pd.read_csv('../input/heart.csv')

In [None]:
df.shape

In [None]:
df.columns

In [None]:
import seaborn as sns

#### Plotting the no.of Men and Women in the dataset

In [None]:
sns.countplot(x = 'sex',data = df)  # 1-Male 0-Female

#### Plotting to find the relation b/w sex and the presence of heart disease

In [None]:
sns.countplot(x = 'sex',data = df,hue='target') # 1-Male, 0-Female

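###### From the above count plot, men appear to have a higher chance of heart disease than women. Note that a count plot shows absolute counts, so this should be confirmed by comparing the proportion of positive cases within each sex.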

#### Finding the correlation b/w the various features

In [None]:
# Pairwise correlation between all columns, rendered as a blue-scale heatmap.
corr_matrix = df.corr()
sns.heatmap(data=corr_matrix, cmap='Blues')
plt.title('Correlation B/w Various Features')

##### The target variable is highly correlated with the `cp` feature, i.e. the chest pain type

#### Exploring the CP feature

In [None]:
# Distinct codes present in the chest-pain-type column.
df.loc[:, 'cp'].unique()

###### So, there are 4 chest pain types

In [None]:
# How many records fall into each chest-pain type.
sns.countplot(data=df, x='cp')

#### Chest pain of type 0 is the most common. Let's explore its relationship with the target variable

In [None]:
# Chest-pain type counts split by `target`; the axes handle is kept so the
# title is attached to this plot explicitly.
cp_ax = sns.countplot(data=df, x='cp', hue='target')
cp_ax.set_title('Chest Pain Type vs Target')

###### From the above count plot, patients with chest pain of type 2 are the most susceptible to heart disease

#### Thalach - Max heart rate achieved, is also highly correlated with the target variable

In [None]:
df['thalach'].max()

In [None]:
df['thalach'].min()

In [None]:
df['thalach'].mean()

In [None]:
df[df['thalach'] == df['thalach'].max()]

In [None]:
df[df['thalach'] == df['thalach'].min()]

In [None]:
sns.swarmplot(y = 'thalach',x = 'target',data=df)

#### How Target varies wrt Age feature

In [None]:
# Age distribution for each value of `target`.
sns.boxplot(data=df, x='target', y='age')

## Data Processing

In [None]:
X = df.drop('target',axis = 1)
y = df['target']

In [None]:
X.shape

In [None]:
y.shape

In [None]:
X.columns

In [None]:
X.sex.unique()

In [None]:
sex = pd.get_dummies(X['sex'],prefix = 'Gender')

In [None]:
sex = sex.rename(columns = {
    "Gender_0": "Female",
    "Gender_1": "Male"
})

In [None]:
X.drop('sex',inplace=True,axis = 1)

In [None]:
X.sex = sex

In [None]:
X = pd.concat([sex,X],axis = 1)

In [None]:
X.columns

In [None]:
chestPainType = pd.get_dummies(X.cp,prefix = 'Type')

In [None]:
X.drop('cp',axis = 1,inplace=True)

In [None]:
X = pd.concat([X,chestPainType],axis = 1)

In [None]:
X.columns

In [None]:
exang = pd.get_dummies(X.exang,prefix = 'Exang')

In [None]:
X.drop('exang',axis = 1,inplace=True)

In [None]:
X = pd.concat([X,exang],axis = 1)

In [None]:
X.columns

In [None]:
X.ca.unique()

In [None]:
X.head(5)

#### Scaling the data

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
sca = StandardScaler()

In [None]:
sca.fit(X)

In [None]:
X = sca.transform(X)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.131)

In [None]:
X_train.shape

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
log = LogisticRegression()

In [None]:
log.fit(X_train,y_train)

In [None]:
pred = log.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y_test,pred))

# K Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn = KNeighborsClassifier(n_neighbors = 2)

In [None]:
knn.fit(X_train,y_train)

In [None]:
pred = knn.predict(X_test)

In [None]:
print(classification_report(y_test,pred))

In [None]:
### Choosing the appropriate no.of neighbors

In [None]:
from sklearn.model_selection import cross_val_score

# Estimate the error of each candidate k with 5-fold cross-validation on the
# training split only. Choosing k by test-set error would tune the model on the
# data later used to evaluate it, biasing the final report.
error_rate = []

for i in range(1,10):
    knn = KNeighborsClassifier(n_neighbors=i)
    cv_accuracy = cross_val_score(knn, X_train, y_train, cv=5, scoring='accuracy')
    # Error rate = 1 - mean accuracy across the folds.
    error_rate.append(1 - cv_accuracy.mean())

In [None]:
# Error rate for each candidate k (1..9), drawn on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    range(1, 10),
    error_rate,
    color='blue',
    linestyle='dashed',
    marker='o',
    markerfacecolor='red',
    markersize=10,
)
ax.set_title('Error Rate vs. K Value')
ax.set_xlabel('K')
ax.set_ylabel('Error Rate')

###### 2, 6, 7 and 8 are some good choices for the number of neighbors

### Support Vector Classifier

In [None]:
from sklearn.svm import SVC

In [None]:
svc = SVC()

In [None]:
svc.fit(X_train,y_train)

In [None]:
pred = svc.predict(X_test)

In [None]:
print(classification_report(y_test,pred))

## Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dec = DecisionTreeClassifier()

In [None]:
dec.fit(X_train,y_train)

In [None]:
pred = dec.predict(X_test)

In [None]:
print(classification_report(y_test,pred))

## Random Forests Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rfc = RandomForestClassifier(n_estimators=10)

In [None]:
rfc.fit(X_train,y_train)

In [None]:
pred = rfc.predict(X_test)

In [None]:
print(classification_report(y_test,pred))

### Selecting the optimal no.of estimators

In [None]:
from sklearn.model_selection import cross_val_score

# Estimate the error of each forest size with 5-fold cross-validation on the
# training split only, so the test set is not used to pick the hyper-parameter.
# `random_state` is fixed because an unseeded forest gives a different curve on
# every run, which makes the choice of size irreproducible.
error_rate = []
for i in range(1,10):
    rfc = RandomForestClassifier(n_estimators=i, random_state=42)
    cv_accuracy = cross_val_score(rfc, X_train, y_train, cv=5, scoring='accuracy')
    # Error rate = 1 - mean accuracy across the folds.
    error_rate.append(1 - cv_accuracy.mean())

In [None]:
# Error rate for each forest size (1..9), drawn on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    range(1, 10),
    error_rate,
    color='blue',
    linestyle='dashed',
    marker='o',
    markerfacecolor='red',
    markersize=10,
)
ax.set_title('Error Rate vs. No.of Estimators')
ax.set_xlabel('No.of Estimators')
ax.set_ylabel('Error Rate')

### Choosing no.of estimators as 7

In [None]:
rfc = RandomForestClassifier(n_estimators = 7)
rfc.fit(X_train,y_train)

In [None]:
pred = rfc.predict(X_test)

In [None]:
print(classification_report(y_test,pred))