# Feature Engineering

## 📌 Import Necessary Libraries for Feature Engineering


In [1]:
# Suppress warnings  
import warnings
warnings.simplefilter("ignore")

# Data handling and processing  
import pandas as pd  
import numpy as np  
from scipy import stats  

# Data visualization  
import matplotlib.pyplot as plt  
import seaborn as sns  
sns.set()  

# Feature Scaling & Selection  
from sklearn.preprocessing import StandardScaler  
from sklearn.model_selection import train_test_split  
from sklearn.model_selection import cross_val_score  

### 📂 Loading the dataset

In [2]:
data = pd.read_csv("../data/processed/processed_data.csv")

data.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,50,2,168,62.0,110,80,1,1,0,0,1,0
1,1,55,1,156,85.0,140,90,3,1,0,0,1,1
2,2,51,1,165,64.0,130,70,3,1,0,0,0,1
3,3,48,2,169,82.0,150,100,1,1,0,0,1,1
4,4,47,1,156,56.0,100,60,1,1,0,0,0,0


### Standarization

In [3]:
from sklearn.preprocessing import StandardScaler
# List of numerical columns to scale
numerical_features = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']
# Initialize the scaler
scaler = StandardScaler()
# Fit and transform the numerical features
data[numerical_features] = scaler.fit_transform(data[numerical_features])
# Check the scaled data
print(data.head())

   id       age  gender    height    weight     ap_hi     ap_lo  cholesterol  \
0   0 -0.418315       2  0.443881 -0.847599 -1.016518 -0.143537            1   
1   1  0.320429       1 -1.022764  0.759808  0.815668  0.940917            3   
2   2 -0.270566       1  0.077220 -0.707824  0.204939 -1.227990            3   
3   3 -0.713813       2  0.566101  0.550146  1.426396  2.025371            1   
4   4 -0.861561       1 -1.022764 -1.266923 -1.627246 -2.312444            1   

   gluc  smoke  alco  active  cardio  
0     1      0     0       1       0  
1     1      0     0       1       1  
2     1      0     0       0       1  
3     1      0     0       1       1  
4     1      0     0       0       0  


# Feature Selection

## Chi-Square Method

In [4]:
from sklearn.feature_selection import SelectKBest, chi2
X = data.drop(columns=['cardio'])  # Features
y = data['cardio']  # Target variable

# Since chi-square works with non-negative values, ensure no negative values are in X
X_abs = X.abs()

# Apply chi-square feature selection
chi2_selector = SelectKBest(chi2, k='all') 
X_new = chi2_selector.fit_transform(X_abs, y)

# Get the scores of each feature
feature_scores = chi2_selector.scores_

# Create a DataFrame to view the feature scores
feature_scores_df = pd.DataFrame({
    'Feature': X.columns,
    'Chi-Square Score': feature_scores
})

# Sort features based on chi-square score
feature_scores_df = feature_scores_df.sort_values(by='Chi-Square Score', ascending=False)

feature_scores_df

Unnamed: 0,Feature,Chi-Square Score
0,id,16518.574821
7,cholesterol,1129.276708
5,ap_hi,729.412109
6,ap_lo,495.783947
8,gluc,145.404794
4,weight,65.18889
11,active,19.449967
1,age,18.4509
9,smoke,16.458641
3,height,9.68999


In [5]:
# Drop the 'id' column
X = data.drop(columns=['id', 'cardio'])
y = data['cardio']

## Transferring data for model Part

In [6]:
processed_data = data
# Save preprocessed data
processed_data.to_csv("../data/processed/processed_data.csv", index=False)

print("Preprocessed data saved successfully!")

Preprocessed data saved successfully!
