## 6. Feature Engineering in the Heart Disease Dataset
### <b>Task:</b> Create new features from existing ones in the Heart Disease dataset, such as age groups, cholesterol levels, and more.

In [2]:
# Importing Libraries
import pandas as pd

In [3]:
# Loading the dataset
# A forward slash is used as the path separator so the relative path resolves
# on Windows, Linux and macOS alike (a backslash is a separator on Windows only).
heart_dataset = pd.read_csv('Datasets/HeartDisease.csv')

# Showing the (rows, columns) shape, then previewing the first 10 rows
print(heart_dataset.shape, '\n')
heart_dataset.head(10)

(1025, 14) 



Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
5,58,0,0,100,248,0,0,122,0,1.0,1,0,2,1
6,58,1,0,114,318,0,2,140,0,4.4,0,3,1,0
7,55,1,0,160,289,0,0,145,1,0.8,1,1,3,0
8,46,1,0,120,249,0,0,144,0,0.8,2,0,3,0
9,54,1,0,122,286,0,0,116,1,3.2,1,2,2,0


<h3>Understanding the Existing features</h3>
<p><ul>
    <li><b>age: </b>Age of the patient in years.</li>
    <li><b>sex: </b>Gender of the patient (1 = male, 0 = female).</li>
    <li><b>cp (chest pain type): </b>Type of chest pain experienced by the patient. <br>
            0: Typical angina <br>
            1: Atypical angina <br>
            2: Non-anginal pain <br>
            3: Asymptomatic</li>
    <li><b>trestbps (resting blood pressure): </b>Resting blood pressure in mm Hg when the patient was admitted to the hospital.</li>
    <li><b>chol (serum cholesterol): </b>Serum cholesterol level in mg/dl.</li>
    <li><b>fbs (fasting blood sugar): </b>Fasting blood sugar level (1 = fasting blood sugar > 120 mg/dl, 0 = otherwise).</li>
    <li><b>restecg (resting electrocardiographic results): </b>Results of the resting electrocardiogram. <br>
            0: Normal <br>
            1: Having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV) <br>
            2: Showing probable or definite left ventricular hypertrophy by Estes' criteria</li>
    <li><b>thalach (maximum heart rate achieved): </b>Maximum heart rate achieved during a stress test.</li>
    <li><b>exang (exercise induced angina): </b>Exercise-induced angina (1 = yes, 0 = no).</li>
    <li><b>oldpeak: </b>ST depression induced by exercise relative to rest (numeric value measuring the extent of depression).</li>
    <li><b>slope (slope of the peak exercise ST segment): </b>Slope of the peak exercise ST segment. <br>
            0: Upsloping <br>
            1: Flat <br>
            2: Downsloping</li>
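    <li><b>ca (number of major vessels colored by fluoroscopy): </b>Number of major vessels (0-3) colored by fluoroscopy. Note: the summary statistics below show a maximum of 4 for this column, so a value outside the documented 0-3 range is also present.</li>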
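    <li><b>thal: </b>Thalassemia blood disorder status. <br>
            1: Normal <br>
            2: Fixed defect <br>
            3: Reversible defect <br>
            Note: the summary statistics below show a minimum of 0 for this column, so a value outside the documented 1-3 coding is also present.</li>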
    <li><b>target: </b>Presence of heart disease (1 = yes, 0 = no).</li>
</ul></p>

In [4]:
# Counting the missing (NaN) entries in each column
heart_dataset.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

-> Since no column has any missing values, we can proceed directly to feature engineering.

In [5]:
# Printing the dataset info: per-column non-null counts and dtypes, plus memory usage
heart_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


In [6]:
# Printing the basic summary statistics (count, mean, std, min, quartiles, max) of each column
heart_dataset.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,0.69561,0.942439,131.611707,246.0,0.149268,0.529756,149.114146,0.336585,1.071512,1.385366,0.754146,2.323902,0.513171
std,9.07229,0.460373,1.029641,17.516718,51.59251,0.356527,0.527878,23.005724,0.472772,1.175053,0.617755,1.030798,0.62066,0.50007
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


<h2>Feature Creation</h2>
<p>Creating new features from existing features</p>

In [None]:
# Age bin edges; pd.cut's default right-closed intervals give (0, 30], (30, 40], ...
age_bins = [0, 30, 40, 50, 60, 70, 80]
# One 'lower-upper' label per pair of consecutive edges: '0-30', '30-40', ..., '70-80'
age_labels = [f'{lower}-{upper}' for lower, upper in zip(age_bins[:-1], age_bins[1:])]

# Bucketing every patient's age, then storing the result as the age group feature
age_group = pd.cut(heart_dataset['age'], bins=age_bins, labels=age_labels)
heart_dataset['Age Group'] = age_group

In [None]:
# Cholesterol (mg/dl): the label of each interval and the edges that delimit them
chol_labels = ['Normal', 'Borderline', 'High', 'Very High']
chol_bins = [0, 200, 240, 300, 600]

# Bucketing the 'chol' column, then storing the result as the cholesterol level feature
chol_level = pd.cut(heart_dataset['chol'], chol_bins, labels=chol_labels)
heart_dataset['Cholestrol Level'] = chol_level

In [14]:
# Max heart rate: the label of each interval and the edges that delimit them
thalach_labels = ['Very Low', 'Low', 'Normal', 'High']
thalach_bins = [0, 100, 140, 180, 220]

# Bucketing the 'thalach' column into the max heart rate level feature
max_heart_rate_level = pd.cut(
    heart_dataset['thalach'],
    bins=thalach_bins,
    labels=thalach_labels,
)
heart_dataset['Max Heart Rate Level'] = max_heart_rate_level

In [15]:
# Defining resting blood pressure bins and labels.
# The top bin is open-ended so that a reading above 200 mm Hg is still labelled
# 'Very High' instead of silently becoming NaN; readings up to 200 are bucketed
# exactly as with a hard-coded upper edge of 200.
trestbps_bins = [0, 120, 130, 140, float('inf')]
trestbps_labels = ['Normal', 'Elevated', 'High', 'Very High']

# Creating resting blood pressure level feature
# (pd.cut intervals are right-closed by default: (0, 120], (120, 130], ...)
heart_dataset['Resting BP Level'] = pd.cut(heart_dataset['trestbps'], bins=trestbps_bins, labels=trestbps_labels)

In [20]:
# Overall cardiovascular risk: the cholesterol x resting blood pressure product,
# split into three bands at its 0.33 and 0.66 quantiles
chol_bp_product = heart_dataset['chol'] * heart_dataset['trestbps']
interaction_quantiles = chol_bp_product.quantile([0.33, 0.66])


def map_risk(value):
    """Return the risk band for a single chol * trestbps product.

    Values at or below the 0.33 quantile map to 'Low', values at or above the
    0.66 quantile map to 'High', and everything in between maps to 'Normal'.
    The cut-offs are looked up in the module-level ``interaction_quantiles``.
    """
    if value <= interaction_quantiles[0.33]:
        return 'Low'
    if value >= interaction_quantiles[0.66]:
        return 'High'
    return 'Normal'


heart_dataset['Cardiovascular Risk'] = chol_bp_product.apply(map_risk)

In [21]:
# Previewing the first 15 rows of the dataset, now including the newly created feature columns
heart_dataset.head(15)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,Age Group,Cholestrol Level,Max Heart Rate Level,Resting BP Level,Cardiovascular Risk
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0,50-60,Borderline,Normal,Elevated,Low
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0,50-60,Borderline,Normal,High,Normal
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0,60-70,Normal,Low,Very High,Low
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0,60-70,Borderline,Normal,Very High,Normal
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0,60-70,High,Low,High,High
5,58,0,0,100,248,0,0,122,0,1.0,1,0,2,1,50-60,High,Low,Normal,Low
6,58,1,0,114,318,0,2,140,0,4.4,0,3,1,0,50-60,Very High,Low,Normal,High
7,55,1,0,160,289,0,0,145,1,0.8,1,1,3,0,50-60,High,Normal,Very High,High
8,46,1,0,120,249,0,0,144,0,0.8,2,0,3,0,40-50,High,Normal,Normal,Normal
9,54,1,0,122,286,0,0,116,1,3.2,1,2,2,0,50-60,High,Low,Elevated,High


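-> The five new categorical features (age group, cholesterol level, max heart rate level, resting BP level and cardiovascular risk) summarize the raw numeric measurements in a more interpretable form.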

<hr>