In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import random as rd
import matplotlib.patches as mpatches
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import matplotlib.style as stl

from sklearn.metrics import classification_report,confusion_matrix,accuracy_score   

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import  RandomizedSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import os
import csv



In [None]:
#pip install pandas_profiling
from pandas_profiling import ProfileReport

In [None]:
# Basic helper functions
def find_extention(path):
    """Return the extension of ``path`` without its leading dot.

    The result is an empty string when the path has no extension.
    """
    _, dotted_suffix = os.path.splitext(path)
    # splitext keeps the leading "." on the suffix; slice it away
    return dotted_suffix[1:]

def find_delimiter(filename):
    """Guess the field delimiter of a delimited text file.

    Only the first 5000 characters of the file are handed to ``csv.Sniffer``.

    Parameters
    ----------
    filename : str
        Path of the text file to inspect.

    Returns
    -------
    str
        The delimiter character detected by the sniffer.

    Raises
    ------
    csv.Error
        If the sniffer cannot determine a delimiter from the sample.
    """
    sniffer = csv.Sniffer()
    # newline="" is what the csv module recommends for files it will parse;
    # the with-block closes the handle, so no explicit close() is needed.
    with open(filename, newline="") as fp:
        sample = fp.read(5000)
    return sniffer.sniff(sample).delimiter


def open_file(path, extention):
    """Load a csv or xlsx file into a DataFrame.

    Parameters
    ----------
    path : str
        Location of the file to read.
    extention : str
        File extension without the dot; only "csv" and "xlsx" are handled.

    Returns
    -------
    pandas.DataFrame
        The loaded data when reading succeeds.
    Exception
        The exception instance (returned, NOT raised) when reading fails.
    bool
        ``False`` when the extension is not supported.

    Callers must therefore check the type of the result before using it.
    """
    if extention=="csv":
        try:
            # Take the platform default text encoding from a handle that is
            # closed right away instead of leaking an open file object.
            with open(path) as fp:
                encoding = fp.encoding
            df=pd.read_csv(path, encoding=encoding, sep=find_delimiter(path))
            return df

        except Exception as e:
            #In case of error, return the message
            return e

    elif extention=="xlsx":
        try:
            # read_excel has no ``encoding`` argument (an xlsx file is a binary
            # container), so passing one made every xlsx load fail.
            df=pd.read_excel(path)
            return df
        except Exception as e:
            #In case of error, return the message
            return e
    else:
        print("Extension not supported, please upload a csv or xlsx file")
        return False

def g_coor(df):
    """Draw a lower-triangle heatmap of the pairwise correlations in ``df``.

    Only numeric and boolean columns are correlated, so frames that still
    contain string columns can be passed in. The figure is drawn on the
    current matplotlib state; nothing is returned.
    """
    # set_theme() resets the plotting context, so the font scale has to be
    # passed to it directly (a preceding set_context() call would be discarded).
    sns.set_theme(context='notebook', font_scale=1.3)
    plt.figure(figsize=(20,12))
    # Compute the correlation matrix once and reuse it for mask and heatmap
    corr = df.select_dtypes(include=['number', 'bool']).corr()
    # Boolean mask of the upper triangle (diagonal included); building it from
    # ones instead of the correlation values also hides cells that are exactly 0.
    mask = np.triu(np.ones_like(corr, dtype=bool))
    sns.heatmap(corr,annot=True,linewidth =2,mask=mask)
    plt.tight_layout()

def rd_forest(col, df, target, test_size=0.3, n_estimators=100, random_state=42):
    """Train a random forest classifier and predict on a held-out split.

    Parameters
    ----------
    col : list
        Columns of ``df`` to use (must include ``target``). An empty list
        means "use every column".
    df : pandas.DataFrame
        Data containing the features and the target column.
    target : str
        Name of the column to predict.
    test_size : float, default 0.3
        Fraction of rows held out for testing.
    n_estimators : int, default 100
        Number of trees in the forest.
    random_state : int or None, default 42
        Seed used for both the split and the forest so that repeated runs
        give the same result; pass ``None` for a non-deterministic run.

    Returns
    -------
    tuple
        ``(y_pred, y_test)`` -- predictions first, true labels second.
    """
    if len(col)>0:
        df=df[col]

    X = df.drop(target, axis=1)
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    # Create the random forest classifier
    clf=RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    # Train the model on the training split
    clf.fit(X_train,y_train)

    y_pred=clf.predict(X_test)

    return y_pred, y_test

def knn_teste(col, df, target, n_neighbors=10, test_size=0.3, random_state=42, scale=False):
    """Train a k-nearest-neighbours classifier and predict on a held-out split.

    Parameters
    ----------
    col : list
        Columns of ``df`` to use (must include ``target``). An empty list
        means "use every column".
    df : pandas.DataFrame
        Data containing the features and the target column.
    target : str
        Name of the column to predict.
    n_neighbors : int, default 10
        Number of neighbours used by the classifier.
    test_size : float, default 0.3
        Fraction of rows held out for testing.
    random_state : int or None, default 42
        Seed for the train/test split.
    scale : bool, default False
        When True, standardise the features with a ``StandardScaler`` fitted
        on the training split only. KNN is distance based, so features on
        different scales otherwise dominate the neighbour search.

    Returns
    -------
    tuple
        ``(y_test, y_pred)`` -- true labels first, predictions second.
        Note that this is the opposite order of ``rd_forest``.
    """
    if len(col)>0:
        df=df[col]

    X = df.drop(target, axis=1)
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    if scale:
        # Fit on the training rows only so no test information leaks in
        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
    knn = KNeighborsClassifier(n_neighbors = n_neighbors)
    knn.fit(X_train,y_train)
    y_pred1 = knn.predict(X_test)

    return y_test,y_pred1

In [None]:
file_heart_part1="../input/heart-desease-dataset/Heart-disease/heart_part1.csv"
file_heart_part2="../input/heart-desease-dataset/Heart-disease/heart_part2.csv"


# Open both files

df_heart_part1=open_file(file_heart_part1,find_extention(file_heart_part1))
df_heart_part2=open_file(file_heart_part2, find_extention(file_heart_part2))

# open_file returns the exception (or False) instead of raising it, so stop
# here with a clear message rather than letting pd.concat fail obscurely.
for loaded_path, loaded in ((file_heart_part1, df_heart_part1), (file_heart_part2, df_heart_part2)):
    if not isinstance(loaded, pd.DataFrame):
        raise RuntimeError("Could not load {}: {!r}".format(loaded_path, loaded))

# ignore_index gives the combined frame a unique 0..n-1 index instead of
# repeating the row labels of each part.
df_all_data=pd.concat((df_heart_part1,df_heart_part2), axis=0, ignore_index=True)

df_all_data=df_all_data.drop("Unnamed: 0", axis=1)
#simplifying to take up less memory
int16_columns = ["Age", "RestingBP", "Cholesterol", "FastingBS", "MaxHR", "HeartDisease"]
df_all_data = df_all_data.astype({column: np.int16 for column in int16_columns})

In [None]:
# Preview the first rows of the combined dataset
df_all_data.head()

In [None]:
# Build the pandas-profiling report for the combined dataset
ProfileReport(df_all_data)

In [None]:
# set_theme() resets the plotting context, so the font scale is passed to it
# directly; a separate set_context() call before it would be discarded.
sns.set_theme(context='notebook', font_scale=2.3)
df=df_all_data.drop('HeartDisease', axis=1)
# Correlate only numeric/bool features: string columns cannot be correlated
# and make corrwith raise on recent pandas versions.
df.select_dtypes(include=['number', 'bool']).corrwith(df_all_data.HeartDisease).plot(kind='bar', grid=False, figsize=(20, 10) )
plt.tight_layout()
plt.show()

The correlations are not very strong: none exceeds .04 in absolute value. For clarity, let's look only at the correlation with our target: HeartDisease.

In [None]:
# set_theme() resets the plotting context, so the font scale is passed to it
# directly; a separate set_context() call before it would be discarded.
sns.set_theme(context='notebook', font_scale=2.3)
df_sem_target=df_all_data.drop('HeartDisease', axis=1)
# Correlate only numeric/bool features: string columns cannot be correlated
# and make corrwith raise on recent pandas versions.
df_sem_target.select_dtypes(include=['number', 'bool']).corrwith(df_all_data.HeartDisease).plot(kind='bar', grid=False, figsize=(20, 10), title="Correlação com HeartDisease" )
plt.tight_layout()

This means that we cannot expect a very accurate model. A realistic model has an accuracy between 70% and 90%. Thus, for the model to be usable, it must reach at least 70% accuracy.

Considering that we are dealing with a labeled dataset and a discrete target, the most suitable approach is supervised learning with a classification model.

The main question is: given this information, will the patient have heart disease or not?

This is a YES/NO decision, for which a decision-tree algorithm is a natural fit, so I believe Random Forest is the most suitable tool for the job. As a point of comparison, KNN will also be applied and the results compared.

As it turned out, the dataframe has high correlation in its numeric variables. The data has already been through a "cleanup", so it remains to be seen whether the literal (string) data can be used to improve the performance of the model.
So, let's make a comparison.

FIRST, ONLY WITH THE INTEGER DATA FROM THE DATASET

In [None]:
# Keep only the columns whose dtype is int16
cols=[col for col in df_all_data.columns.unique() if df_all_data[col].dtype == np.dtype('int16')]

pred_rd_forest, teste_rd_forest=rd_forest(cols, df_all_data, "HeartDisease")
# knn_teste returns (y_test, y_pred) -- the opposite order of rd_forest --
# so the names are unpacked accordingly.
teste_knn, pred_knn=knn_teste(cols, df_all_data,"HeartDisease")

print("Randrom Forest's accuracy : {}".format(accuracy_score(teste_rd_forest,pred_rd_forest)))
print("KNN's accuracy: {}".format(accuracy_score(teste_knn,pred_knn)))

Random Forest performed better, and an accuracy between 70% and 90% can be considered realistic. But our result was on the threshold of what is acceptable, and the model is very generic (underfitting).

Let's try to improve this by replacing the strings with integers that act as the "weight" of each result type. It may sound "holistic", but considering that Random Forest is a combination of decision trees, it's worth a try.

Obviously I'm not a doctor (and this may be the Achilles heel of this approach), but what counts is the correlation between the data, so, in theory, the values, even if arbitrary, can yield the same result.

In [None]:
#Escalando as variáveis 
# Integer code assigned to each label of the categorical (string) columns
dict_RestingECG={
                'Normal':0,
                'ST':2,
                'LVH':1,
                }

dict_ST_Slope={
                'Up':2,
                'Flat':1,
                'Down':0,
                }

dict_ChestPainType = {
                        "TA":1,
                        "ATA":2,
                        "NAP":3,
                        "ASY":4,
                     }

#for binary data, just 1 or 0
dict_Sex = {"M":1, "F":0}
dict_ExerciseAngina = {"Y":1, "N":0}

encodings = {
    "ChestPainType": dict_ChestPainType,
    "RestingECG": dict_RestingECG,
    "ST_Slope": dict_ST_Slope,
    "Sex": dict_Sex,
    "ExerciseAngina": dict_ExerciseAngina,
}

for column, codes in encodings.items():
    values = df_all_data[column]
    # A numeric column has already been encoded (the cell was re-run);
    # skipping it keeps the cell idempotent instead of raising KeyError.
    if pd.api.types.is_numeric_dtype(values):
        continue
    # Fail early on labels that have no code instead of silently keeping them
    unknown = set(values.dropna().unique()) - set(codes)
    if unknown:
        raise KeyError("Unexpected labels in {}: {}".format(column, sorted(map(str, unknown))))
    # One vectorised lookup per column replaces the per-value replace() loop
    df_all_data[column] = values.map(codes)

df_all_data.head()

In [None]:
# Let's look at the new correlations: rebuild the profiling report for df_all_data
ProfileReport(df_all_data)

It seems we've made some progress.

In [None]:
# An empty column list makes both helpers use every column of the frame
cols=[]

pred_rd_forest, teste_rd_forest=rd_forest(cols, df_all_data, "HeartDisease")
# knn_teste returns (y_test, y_pred) -- the opposite order of rd_forest --
# so the names are unpacked accordingly.
teste_knn, pred_knn=knn_teste(cols, df_all_data,"HeartDisease")

print("Randrom Forest's accuracy : {}".format(accuracy_score(teste_rd_forest,pred_rd_forest)))
print("KNN's accuracy: {}".format(accuracy_score(teste_knn,pred_knn)))

KNN has not improved much and is still below the acceptable minimum, so we can conclude that KNN is not suitable for our analysis.

However, Random Forest improved and is now in a fully acceptable range.

We will bring a more complete analysis to evaluate our models.

Finally, let's check out the classification report.

In [None]:
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps the precision and recall columns of the report.
print(classification_report(teste_rd_forest, pred_rd_forest))