# HEART DISEASSE DETECTION

## Imports

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Read dataset

In [2]:
data_set = pd.read_csv('heart_all.csv')

## Exploring data

In [3]:
data_set.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [4]:
data_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  object 
 1   BMI               319795 non-null  float64
 2   Smoking           319795 non-null  object 
 3   AlcoholDrinking   319795 non-null  object 
 4   Stroke            319795 non-null  object 
 5   PhysicalHealth    319795 non-null  float64
 6   MentalHealth      319795 non-null  float64
 7   DiffWalking       319795 non-null  object 
 8   Sex               319795 non-null  object 
 9   AgeCategory       319795 non-null  object 
 10  Race              319795 non-null  object 
 11  Diabetic          319795 non-null  object 
 12  PhysicalActivity  319795 non-null  object 
 13  GenHealth         319795 non-null  object 
 14  SleepTime         319795 non-null  float64
 15  Asthma            319795 non-null  object 
 16  KidneyDisease     31

## Remove nulls if exist

In [5]:
data_set.isnull().sum()

HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64

In [6]:
data_set.isna().sum()

HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64

## Remove duplicates if exists

In [7]:
data_set.duplicated().sum()
data_set = data_set.drop(columns=["SleepTime"])

18078

In [8]:
data_set = data_set.drop_duplicates()

In [9]:
data_set.nunique()

HeartDisease           2
BMI                 3604
Smoking                2
AlcoholDrinking        2
Stroke                 2
PhysicalHealth        31
MentalHealth          31
DiffWalking            2
Sex                    2
AgeCategory           13
Race                   6
Diabetic               4
PhysicalActivity       2
GenHealth              5
SleepTime             24
Asthma                 2
KidneyDisease          2
SkinCancer             2
dtype: int64

In [10]:
data_set['GenHealth'].unique()

array(['Very good', 'Fair', 'Good', 'Poor', 'Excellent'], dtype=object)

## Replace strings with numirical logical values

In [11]:
# replacing code here
data_set["Sex"] = data_set["Sex"].replace({"Male": 1, "Female": 0})
columns_to_convert = ["HeartDisease", "Smoking", "AlcoholDrinking", "DiffWalking", "Diabetic", "PhysicalActivity", "Asthma", "KidneyDisease", "SkinCancer", "Stroke"]
for column in columns_to_convert:
    data_set[column] = data_set[column].replace({"Yes": 1, "No": 0})
data_set["Male"] = data_set["Sex"]
data_set["Female"] = 1 - data_set["Sex"]
data_set = data_set.drop(columns=["Sex", "Race"])
data_set['GenHealth'] = data_set['GenHealth'].replace({"Excellent": 5, "Very good": 4, "Good": 3, "Fair": 2,"Poor":1 })
data_set['AgeCategory']=data_set['AgeCategory'].replace(['55-59', '80 or older', '65-69', '75-79', '40-44', '70-74',
       '60-64', '50-54', '45-49', '18-24', '35-39', '30-34', '25-29'],[57, 85, 67, 77, 42, 72, 62, 52, 47, 21, 37, 32, 27])
data_set.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,AgeCategory,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer,Male,Female
0,0,16.6,1,0,0,3.0,30.0,0,55-59,1,1,4,5.0,1,0,1,0,1
1,0,20.34,0,0,1,0.0,0.0,0,80 or older,0,1,4,7.0,0,0,0,0,1
2,0,26.58,1,0,0,20.0,30.0,0,65-69,1,1,2,8.0,1,0,0,1,0
3,0,24.21,0,0,0,0.0,0.0,0,75-79,0,0,3,6.0,0,0,1,0,1
4,0,23.71,0,0,0,28.0,0.0,1,40-44,0,1,4,8.0,0,0,0,0,1


## Detect outliers

In [22]:
without_label = data_set[:,1:]
data_mean = np.mean(without_label, axis=0)
data_standered = np.std(without_label, axis=0)
z_score = np.abs((without_label-data_mean) / data_standered)
outliers = np.where(z_score > 3)

InvalidIndexError: (slice(None, None, None), slice(1, None, None))

## Remove outliers

In [None]:
# remove outliers code 