# Predicting Heart Disease using Machine Learning
1. Problem Definition
2. Data
3. Evaluation
4. Features
5. Modelling
6. Experimentation
## 1. Problem Definition


In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the heart-disease dataset from a CSV file (path is relative to the working directory)
hea=pd.read_csv('heart-disease.csv')

In [None]:
# Preview the first few rows to sanity-check the load
hea.head()

In [None]:
# Summarise the columns: dtypes and non-null counts
hea.info()

In [None]:
# Count missing values in each column
hea.isnull().sum()

## Preparing the tools 

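We're going to use pandas and NumPy for data handling, Matplotlib and seaborn for plotting, and scikit-learn for modelling and evaluation.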

In [None]:
#Import the tools we need

#regular EDA libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#Models from sklearn 
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

#Model Evaluation
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import precision_score,recall_score,f1_score
from sklearn.metrics import RocCurveDisplay

In [None]:
hea.shape # (rows, columns)

## Data Exploration EDA

The goal is to find out more about the data and become a subject matter expert on the dataset you're working with.
1. What questions are you trying to solve?
2. What kind of data do you have, and how do you treat the different types?
3. What's missing from the data, and how do you deal with it?
4. Where are the outliers, and why should you care about them?
5. How can you add, change or remove features to get more out of your data?


In [None]:
# Class balance: number of samples for each value of the target column
hea['target'].value_counts()

In [None]:
# Plot the class counts as a bar chart, one colour per bar
hea['target'].value_counts().plot(kind='bar',color=['red','lightblue'])

In [None]:
# Summary statistics for the columns
hea.describe()

### Heart Disease Frequency according to Sex

In [None]:
# Number of samples for each value of the sex column
hea.sex.value_counts()

In [None]:
# Cross-tabulate target against sex, then draw the counts as grouped bars
sex_by_target=pd.crosstab(hea.target,hea.sex)
sex_by_target.plot(kind='bar')

# Axis text first, then the title and legend
plt.xlabel('0 = No Disease , 1 = Disease')
plt.ylabel('Amount')
plt.title('Heart Disease Frequency according to Sex')
plt.legend(['Female','Male'])

# Keep the x tick labels horizontal
plt.xticks(rotation=0)
plt.show()

### Age vs Max Heart Rate for Heart Disease

In [None]:
# Scatter age against maximum heart rate, one colour per target class
has_disease=hea.target==1
no_disease=hea.target==0

# Red points: rows where target is 1
plt.scatter(hea.age[has_disease],hea.thalach[has_disease],color='red')
# Blue points: rows where target is 0
plt.scatter(hea.age[no_disease],hea.thalach[no_disease],color='blue')

plt.title('Age v Max Heart rate')
plt.xlabel('Age')
plt.ylabel('Max Heart rate')
# Legend entries follow the order in which the two scatters were drawn
plt.legend(['Have heart Disease','No heart disease'])

In [None]:
# Check the distribution of the age column with a histogram (helps spot skew or outliers)
hea.age.plot.hist()

### Heart Disease Frequency per Chest pain type

In [None]:
# Counts of each chest-pain type (rows) split by target value (columns)
pd.crosstab(hea.cp,hea.target)

In [None]:
# Visualise the chest-pain-type counts per target class as grouped bars.
# Title, axis labels and legend names are set so the figure can be read on
# its own, in the same way as the sex/target plot earlier in the notebook.
pd.crosstab(hea.cp,hea.target).plot(kind='bar')
plt.title('Heart Disease Frequency per Chest pain type')
plt.xlabel('Chest pain type')
plt.ylabel('Amount')
# Crosstab columns are the target values 0 and 1, in that order
plt.legend(['No Disease','Disease'])
plt.xticks(rotation=0)
plt.show()

In [None]:
# Build the correlation matrix: pairwise correlation between the columns
hea.corr()


In [None]:
# Let's make our correlation matrix a little prettier

In [None]:
# Draw the correlation matrix as an annotated heatmap on a wide canvas
corr_matrix=hea.corr()
fig,ax=plt.subplots(figsize=(15,10))

# Each cell shows its value with two decimals; thin lines separate the cells
ax=sns.heatmap(corr_matrix,
               ax=ax,
               annot=True,
               fmt=".2f",
               linewidths=0.5)


## Modelling

In [None]:
# Look at the columns again before splitting into features and labels
hea.head()

In [None]:
# Split the frame into the label (the target column) and the features (every other column)
y=hea.target
x=hea.drop(columns=['target'])

In [None]:

# Hold out 20% of the rows for testing. The fixed random_state makes the
# split, and every score computed from it, repeatable across kernel restarts.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [None]:
# Peek at the training features
x_train.head()

In [None]:
# Candidate classifiers keyed by display name; each is constructed with no arguments
models={'Logistic Regression':LogisticRegression(),
       'KNN':KNeighborsClassifier(),
       'Random Forest': RandomForestClassifier()}
# Helper that trains every candidate model and records its test score
def fit_score_(models,x_train,y_train,x_test,y_test):
    """Fit each model on the training split and score it on the test split.

    Parameters
    ----------
    models : dict
        Maps a display name to an object exposing ``fit`` and ``score``.
    x_train, y_train
        Training features and labels, passed to ``fit``.
    x_test, y_test
        Held-out features and labels, passed to ``score``.

    Returns
    -------
    dict
        ``{name: score}`` with one entry per model, in iteration order.
    """
    # Seed NumPy's global RNG before any fitting happens
    np.random.seed(42)
    results={}
    for label in models:
        estimator=models[label]
        # The model is fitted in place, then scored on the held-out data
        estimator.fit(x_train,y_train)
        results[label]=estimator.score(x_test,y_test)
    return results
x=fit_score_(models,x_train,y_train,x_test,y_test)
x

## Model Comparison

In [None]:
model_compare=pd.DataFrame(x,index=['accuracy'])
model_compare.T.plot(kind='bar')

* Hyperparameter tuning (KNN, by hand)

In [None]:
# Tune KNN by hand: try every neighbour count from 1 to 20
neighbors=range(1,21)
knn=KNeighborsClassifier()

# One score per neighbour count, in the same order as `neighbors`
train_scores=[]
test_scores=[]

for k in neighbors:
    # Reuse the same estimator, changing only n_neighbors
    knn.set_params(n_neighbors=k)
    knn.fit(x_train,y_train)

    # Record the held-out score first, then the training score
    test_scores.append(knn.score(x_test,y_test))
    train_scores.append(knn.score(x_train,y_train))
    

In [None]:
# Best test-set score across the neighbour counts tried
max(test_scores)

In [None]:
# Training-set score for each neighbour count, in the order they were tried
train_scores

## Hyperparameter tuning with RandomizedSearchCV

In [None]:
# Search spaces used with RandomizedSearchCV

# LogisticRegression: 20 log-spaced values of C from 1e-4 to 1e4, liblinear solver only
log_reg_grid=dict(C=np.logspace(-4,4,num=20),
                  solver=["liblinear"])

# RandomForestClassifier: ranges for forest size, tree depth and the split/leaf minimums
rf_grid=dict(n_estimators=np.arange(10,1000,50),
             max_depth=[None,3,5,10],
             min_samples_split=np.arange(2,20,2),
             min_samples_leaf=np.arange(1,20,2))

RandomizedSearchCV

In [None]:
# Logistic regression: randomised hyperparameter search

# Seed NumPy's global RNG before running the search
np.random.seed(42)

# 5-fold cross-validated search over log_reg_grid, trying 20 candidates
rs_log_reg=RandomizedSearchCV(
    LogisticRegression(),
    param_distributions=log_reg_grid,
    n_iter=20,
    cv=5,
    verbose=True,
)

# Run the search on the training split
rs_log_reg.fit(x_train, y_train)

In [None]:
# Hyperparameters selected by the logistic-regression search
rs_log_reg.best_params_

In [None]:
# Evaluate the tuned logistic-regression search on the held-out test split
rs_log_reg.score(x_test,y_test)

In [None]:
# Random forest: randomised hyperparameter search

# Re-seed NumPy's global RNG, as the logistic-regression search cell does, so
# that re-running this cell on its own samples the same candidates from rf_grid
np.random.seed(42)

# 5-fold cross-validated search over rf_grid, trying 20 candidates
rf=RandomizedSearchCV(RandomForestClassifier(),
                      param_distributions=rf_grid,
                      cv=5,
                      n_iter=20,
                      verbose=True)
rf.fit(x_train,y_train)

In [None]:
# Hyperparameters selected by the random-forest search
rf.best_params_

In [None]:
# Evaluate the tuned random-forest search on the held-out test split
rf.score(x_test,y_test)