In [44]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [45]:
data = pd.read_csv('./data/heart_attack_prediction_dataset.csv')
target = data["Heart Attack Risk"]
data = data.drop(columns=["Patient ID", "Heart Attack Risk"])
categorical_columns = data.select_dtypes(include=['object']).columns
numerical_columns = data.select_dtypes(exclude=['object']).columns
print(" Columns with numerical values are :", list(numerical_columns),"\n","Columns with categorical values are :", list(categorical_columns))

categorical_data = data[categorical_columns]
numerical_data  = data[numerical_columns]

 Columns with numerical values are : ['Age', 'Cholesterol', 'Heart Rate', 'Diabetes', 'Family History', 'Smoking', 'Obesity', 'Alcohol Consumption', 'Exercise Hours Per Week', 'Previous Heart Problems', 'Medication Use', 'Stress Level', 'Sedentary Hours Per Day', 'Income', 'BMI', 'Triglycerides', 'Physical Activity Days Per Week', 'Sleep Hours Per Day'] 
 Columns with categorical values are : ['Sex', 'Blood Pressure', 'Diet', 'Country', 'Continent', 'Hemisphere']


### Data preparation

As seen before, there are no missing values in our dataset. We only need to scale our data and split it into train/dev/test.

In [46]:
data["Blood Pressure"].head()

0     158/88
1     165/93
2     174/99
3    163/100
4      91/88
Name: Blood Pressure, dtype: object

The feature "Blood Pressure" is expressed as a ratio of two numbers, like "158/88" or "165/93." These values represent blood pressure measurements in millimeters of mercury (mm Hg) and are typically written in the form of "systolic/diastolic" blood pressure. We'll use these values to categorize the blood pressure into different ranges: 

    Normal Blood Pressure:
        Systolic Pressure: Less than 120 mm Hg
        Diastolic Pressure: Less than 80 mm Hg

    Elevated Blood Pressure:
        Systolic Pressure: 120-129 mm Hg
        Diastolic Pressure: Less than 80 mm Hg

    Hypertension Stage 1:
        Systolic Pressure: 130-139 mm Hg
        Diastolic Pressure: 80-89 mm Hg

    Hypertension Stage 2:
        Systolic Pressure: 140 mm Hg or higher
        Diastolic Pressure: 90 mm Hg or higher

    Hypertensive Crisis:
        Systolic Pressure: Higher than 180 mm Hg
        Diastolic Pressure: Higher than 120 mm Hg

In [47]:
# Function to categorize blood pressure
def categorize_blood_pressure(blood_pressure):
    systolic, diastolic = map(int, blood_pressure.split('/'))
    if systolic < 120 or diastolic < 80:
        return 'Normal'
    elif systolic < 130 or diastolic < 80:
        return 'Elevated'
    elif systolic < 140 or diastolic < 90:
        return 'Hypertension Stage 1'
    elif systolic >= 140 or diastolic >= 90:
        return 'Hypertension Stage 2'
    else:
        return 'Unknown'


categorical_data['Blood Pressure'] = data['Blood Pressure'].apply(categorize_blood_pressure)

categorical_data['Blood Pressure'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  categorical_data['Blood Pressure'] = data['Blood Pressure'].apply(categorize_blood_pressure)


Blood Pressure
Normal                  5139
Hypertension Stage 2    1685
Hypertension Stage 1    1348
Elevated                 591
Name: count, dtype: int64

We can see that many patients suffer from high blood pressure

In [48]:
print(data["Sex"].value_counts(),"\n")
print(data["Diet"].value_counts(),"\n")
print(data["Country"].value_counts(),"\n")


Sex
Male      6111
Female    2652
Name: count, dtype: int64 

Diet
Healthy      2960
Average      2912
Unhealthy    2891
Name: count, dtype: int64 

Country
Germany           477
Argentina         471
Brazil            462
United Kingdom    457
Australia         449
Nigeria           448
France            446
Canada            440
China             436
New Zealand       435
Japan             433
Italy             431
Spain             430
Colombia          429
Thailand          428
South Africa      425
Vietnam           425
United States     420
India             412
South Korea       409
Name: count, dtype: int64 



In [49]:
print(data["Continent"].value_counts(),"\n")
print(data["Hemisphere"].value_counts(),"\n")

Continent
Asia             2543
Europe           2241
South America    1362
Australia         884
Africa            873
North America     860
Name: count, dtype: int64 

Hemisphere
Northern Hemisphere    5660
Southern Hemisphere    3103
Name: count, dtype: int64 



Of course features "Country", "Continent" and "Hemisphere" will be correlated. So we decide to keep feature "Continent" only.

In [50]:
categorical_data = categorical_data.drop(columns=["Country", "Hemisphere"])

In [51]:
#pip install -U scikit-learn
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
categorical_data_encoded = ordinal_encoder.fit_transform(categorical_data)

In [52]:
correlation_matrix = np.corrcoef(categorical_data_encoded, rowvar=False)
print(correlation_matrix)

[[ 1.          0.00998315 -0.00804504 -0.01690162]
 [ 0.00998315  1.         -0.01749413 -0.03176742]
 [-0.00804504 -0.01749413  1.          0.01659173]
 [-0.01690162 -0.03176742  0.01659173  1.        ]]
