<h1>Task 2: Data Cleaning & Preprocessing</h1>

In [2]:
import pandas as pd
import numpy as np

<h3>Load dataset</h3>

In [10]:
df = pd.read_csv("cardio_train.csv", sep=';')
df

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1


<h3>Remove id and duplicate data</h3>

In [11]:
df.drop(columns=['id'], inplace=True)
df.drop_duplicates()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
69995,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,22431,1,163,72.0,135,80,1,2,0,0,0,1


<h3>Convert age from days to years</h3>

In [12]:
df['age_years'] = (df['age'] / 365).astype(int)
df['age_group'] = pd.cut(df['age_years'], bins=[0, 30, 50, 100], labels=[1, 2, 3])#labels = 1: Young, 2: Middle-aged, 3: Senior
df.drop(columns=['age'], inplace=True)

<h3>Basic sanity check for impossible values</h3>

<h5>Blood pressure sanity: systolic (ap_hi) should be >= diastolic (ap_lo)</h5>

In [13]:
swap_mask = df['ap_hi'] < df['ap_lo']
df.loc[swap_mask, ['ap_hi', 'ap_lo']] = df.loc[swap_mask, ['ap_lo', 'ap_hi']].values

df['ap_hi'] = df['ap_hi'].clip(lower=90, upper=250)
df['ap_lo'] = df['ap_lo'].clip(lower=40, upper=180)

In [14]:
df.shape

(70000, 13)

<h3>Remove unreasonable height or weight outliers</h3>

In [15]:
df = df[(df['height'] > 100) & (df['height'] < 220)]
df = df[(df['weight'] > 30) & (df['weight'] < 180)]

<h3>normalize blood pressure columns using z-score</h3>

In [16]:
df['ap_hi_z'] = (df['ap_hi'] - df['ap_hi'].mean()) / df['ap_hi'].std()
df['ap_lo_z'] = (df['ap_lo'] - df['ap_lo'].mean()) / df['ap_lo'].std()

In [17]:
for col in ['ap_hi_z', 'ap_lo_z']:
    z_threshold = 3
    mask = df[col].abs() > z_threshold
    outliers = df[mask]
    if not outliers.empty:
        df.loc[df[col] > z_threshold, col] = z_threshold
        df.loc[df[col] < -z_threshold, col] = -z_threshold

<h3>Add BMI feature</h3>

In [18]:
df['bmi'] = df['weight'] / ((df['height']/100) ** 2)

In [19]:
for col in ['ap_hi', 'ap_lo', 'bmi']:
    df[f'{col}_log'] = np.log1p(df[col])

<h3>Smooth statistical outliers</h3>

In [20]:
for col in ['bmi', 'ap_hi_log', 'ap_lo_log', 'bmi_log', 'age_years']:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    mask = (df[col] < lower) | (df[col] > upper)
    outliers = df[mask]
    if not outliers.empty:
        df[col] = np.where(df[col] < lower, lower, df[col])
        df[col] = np.where(df[col] > upper, upper, df[col])

In [21]:
df

Unnamed: 0,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,age_group,ap_hi_z,ap_lo_z,bmi,ap_hi_log,ap_lo_log,bmi_log
0,2,168,62.0,110,80,1,1,0,0,1,0,50.0,2,-0.829405,-0.166623,21.967120,4.709530,4.394449,3.134064
1,1,156,85.0,140,90,3,1,0,0,1,1,55.0,3,0.526506,0.603754,34.927679,4.948760,4.510860,3.581508
2,1,165,64.0,130,70,3,1,0,0,0,1,51.0,3,0.074536,-0.937000,23.507805,4.875197,4.262680,3.198992
3,2,169,82.0,150,100,1,1,0,0,1,1,48.0,2,0.978476,1.374130,28.710479,5.017280,4.615121,3.391500
4,1,156,56.0,100,60,1,1,0,0,0,0,47.0,2,-1.281375,-1.707376,23.011177,4.615121,4.219834,3.178519
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,2,168,76.0,120,80,1,1,1,0,1,0,52.0,3,-0.377434,-0.166623,26.927438,4.795791,4.394449,3.329610
69996,1,158,126.0,140,90,2,2,0,0,1,1,61.0,3,0.526506,0.603754,39.661304,4.948760,4.510860,3.779409
69997,2,183,105.0,180,90,3,1,0,1,0,1,52.0,3,2.334387,0.603754,31.353579,5.178214,4.510860,3.476725
69998,1,163,72.0,135,80,1,2,0,0,0,1,61.0,3,0.300521,-0.166623,27.099251,4.912655,4.394449,3.335743


<h3>Check for NaN values</h3>

In [454]:
df.isnull().sum()

gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
age_years      0
age_group      0
ap_hi_z        0
ap_lo_z        0
bmi            0
ap_hi_log      0
ap_lo_log      0
bmi_log        0
dtype: int64

<h3>Reset index after dropping rows</h3>

In [455]:
df.reset_index(drop=True, inplace=True)

<h3>Final preprocessed dataset shape</h3>

In [456]:
df.shape

(69949, 19)

<h3>Verify value distributions </h3>

In [457]:
df[['age_years', 'bmi', 'ap_hi', 'ap_lo']].describe()

Unnamed: 0,age_years,bmi,ap_hi,ap_lo
count,69949.0,69949.0,69949.0,69949.0
mean,52.842199,27.367558,128.35087,82.162876
std,6.765543,4.852338,22.125354,12.980663
min,33.0,14.403401,90.0,40.0
25%,48.0,23.875115,120.0,80.0
50%,53.0,26.370238,120.0,80.0
75%,58.0,30.189591,140.0,90.0
max,64.0,39.661304,250.0,180.0


<h3>Save cleaned dataset</h3>

In [458]:
df.to_csv("cardio_cleaned.csv", index=False)
print("Cleaned dataset saved as cardio_cleaned.csv")

Cleaned dataset saved as cardio_cleaned.csv
