In [1]:
import pandas as pd

### Part 1: Reading the Data

In [2]:
# Load the raw dataset; the path is resolved relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="data.csv")

In [3]:
print(df.head())

   Index  id   age  gender  height  weight  ap_hi  ap_lo  cholesterol  gluc  \
0      0   0  9125       2     149      54    177     78            3     2   
1      1   1  9125       2     146      55    149     95            1     1   
2      2   2  9125       2     145      75    141     98            3     1   
3      3   3  9125       2     158      79    135     95            1     1   
4      4   4  9125       2     178      62    155     98            1     1   

   smoke  alco  active  CVD  Blood pressure  
0      0     0       1    0               1  
1      0     0       1    0               1  
2      0     0       0    0               1  
3      0     1       0    0               1  
4      0     0       1    0               2  


In [4]:
# Remove the leftover "Index" column by name instead of by position: a positional
# drop removes a real column (e.g. "id") if this cell is ever executed twice.
# errors="ignore" keeps the cell safe to re-run once the column is already gone.
df = df.drop(columns=["Index"], errors="ignore")

In [5]:
# Standardise the target column name to "CVD". rename() silently skips labels that
# are not present, so this is a no-op when the file already ships a "CVD" column
# (as the preview printed after loading suggests).
df = df.rename({"cardio": "CVD"}, axis="columns")

In [6]:
print(df.head())

   id   age  gender  height  weight  ap_hi  ap_lo  cholesterol  gluc  smoke  \
0   0  9125       2     149      54    177     78            3     2      0   
1   1  9125       2     146      55    149     95            1     1      0   
2   2  9125       2     145      75    141     98            3     1      0   
3   3  9125       2     158      79    135     95            1     1      0   
4   4  9125       2     178      62    155     98            1     1      0   

   alco  active  CVD  Blood pressure  
0     0       1    0               1  
1     0       1    0               1  
2     0       0    0               1  
3     1       0    0               1  
4     0       1    0               2  


In [7]:
# Translate the numeric category codes into readable labels, one mapping per column.
# Values that are not listed in a column's mapping are left unchanged by replace().
# NOTE(review): confirm the gender coding (1 -> male, 2 -> female) against the
# dataset's data dictionary; some public datasets with this schema use the opposite.
df = df.replace({
    "gender": {1: "male", 2: "female"},
    "cholesterol": {1: "normal", 2: "borderline high", 3: "elevated"},
    "gluc": {1: "normal", 2: "pre diabetic", 3: "diabetic"},
    "smoke": {0: "no-smoke", 1: "smoke"},
    "alco": {0: "non-alco", 1: "alco"},
    "active": {0: "non-active", 1: "active"},
    "CVD": {0: "non-cvd", 1: "cvd"},
    # "type 1" previously carried a trailing space, which made the label differ
    # from what a reader would type when filtering or grouping on it.
    "Blood pressure": {1: "normal", 2: "type 1", 3: "type 2"},
})


In [8]:
print(df.head())

   id   age  gender  height  weight  ap_hi  ap_lo cholesterol          gluc  \
0   0  9125  female     149      54    177     78    elevated  pre diabetic   
1   1  9125  female     146      55    149     95      normal        normal   
2   2  9125  female     145      75    141     98    elevated        normal   
3   3  9125  female     158      79    135     95      normal        normal   
4   4  9125  female     178      62    155     98      normal        normal   

      smoke      alco      active      CVD Blood pressure  
0  no-smoke  non-alco      active  non-cvd         normal  
1  no-smoke  non-alco      active  non-cvd         normal  
2  no-smoke  non-alco  non-active  non-cvd         normal  
3  no-smoke      alco  non-active  non-cvd         normal  
4  no-smoke  non-alco      active  non-cvd        type 1   


In [9]:
# Convert age to whole years by dividing by 365 and truncating the fraction
# (the raw values look like day counts, e.g. 9125 -> 25 — TODO confirm).
# Not idempotent: running this cell twice divides the converted values again.
df["age"] = df["age"].div(365).astype(int)

In [10]:
print(df.age)

0         25
1         25
2         25
3         25
4         25
          ..
299995    84
299996    84
299997    84
299998    84
299999    84
Name: age, Length: 300000, dtype: int64


In [11]:
# info() writes its report to stdout itself and returns None, so wrapping it in
# print() only appends a stray "None" line after the summary.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   id              300000 non-null  int64 
 1   age             300000 non-null  int64 
 2   gender          300000 non-null  object
 3   height          300000 non-null  int64 
 4   weight          300000 non-null  int64 
 5   ap_hi           300000 non-null  int64 
 6   ap_lo           300000 non-null  int64 
 7   cholesterol     300000 non-null  object
 8   gluc            300000 non-null  object
 9   smoke           300000 non-null  object
 10  alco            300000 non-null  object
 11  active          300000 non-null  object
 12  CVD             300000 non-null  object
 13  Blood pressure  300000 non-null  object
dtypes: int64(6), object(8)
memory usage: 32.0+ MB
None


### Part 2: Data Cleaning (Preprocessing)

#### 1. Checking for Missing Values

In [12]:
print(df.isnull().sum())

id                0
age               0
gender            0
height            0
weight            0
ap_hi             0
ap_lo             0
cholesterol       0
gluc              0
smoke             0
alco              0
active            0
CVD               0
Blood pressure    0
dtype: int64


#### 2. Checking for Duplicates

In [13]:
print(df.duplicated().sum())

0


#### 3. Handling Outliers (Invalid Ranges)

##### Filter the dataset to keep only rows within medically valid ranges

In [14]:
# Keep only the rows whose measurements fall inside the accepted ranges.
# between() is inclusive on both ends, matching the original >= / <= checks.
in_valid_range = (
    df["age"].between(18, 100)
    & df["height"].between(140, 210)
    & df["weight"].between(40, 200)
    & df["ap_hi"].between(80, 250)
    & df["ap_lo"].between(50, 150)
)

# A single mask replaces five successive filtered frames. The explicit copy()
# makes the result an independent frame, so the new columns added in later cells
# do not trigger SettingWithCopyWarning.
df = df[in_valid_range].copy()

### Part 3: Feature Engineering (Adding New Columns)

#### 1. Body Mass Index (BMI) 

In [15]:
# Body mass index: weight divided by the square of (height / 100).
# Presumably height is in centimetres and weight in kilograms — TODO confirm.
df["BMI"] = df["weight"].div(df["height"].div(100).pow(2))

In [16]:
# Keep two decimal places; the classification below operates on these rounded values.
df["BMI"] = df["BMI"].round(decimals=2)

In [17]:
# BMI classification
def bmi_class(bmi):
    """Return the weight category label for a single BMI value.

    Bands are contiguous half-open intervals, so every number maps to exactly
    one category:

        bmi < 18.5          -> "Under weight"
        18.5 <= bmi < 25    -> "Normal weight"
        25 <= bmi < 30      -> "Over weight"
        bmi >= 30           -> "Obese"

    The earlier closed bands (18.5-24.9, 25-29.9) left gaps: a value rounded to
    two decimals such as 24.95 or 29.95 matched no band and fell through to
    "Obese".

    Parameters
    ----------
    bmi : float
        Body mass index value.

    Returns
    -------
    str
        One of "Under weight", "Normal weight", "Over weight", "Obese".
        A NaN input fails every comparison and therefore yields "Obese",
        as it did before; filter missing values upstream if that matters.
    """
    if bmi < 18.5:
        return "Under weight"
    elif bmi < 25:
        return "Normal weight"
    elif bmi < 30:
        return "Over weight"
    else:
        return "Obese"

# Label every row with its weight category, one BMI value at a time.
df["BMI_Class"] = df["BMI"].map(bmi_class)

#### 2. Blood Pressure Classification


In [18]:
# Function to classify blood pressure
def bp_class(row):
    """Return the blood pressure category for one record.

    Categories are checked from lowest to highest with strict upper bounds, so
    a reading lands in the first category whose systolic AND diastolic limits
    it satisfies. When the two readings disagree, the higher category wins:

        sbp < 120 and dbp < 80   -> "Normal"
        sbp < 130 and dbp < 80   -> "Elevated"
        sbp < 140 and dbp < 90   -> "Hypertension Stage 1"
        otherwise                -> "Hypertension Stage 2"

    The earlier version tested Stage 1 with ``130 <= sbp <= 139 or
    80 <= dbp <= 89`` before Stage 2, so readings such as 135/95 or 180/85
    were labelled Stage 1 although one component is already in the Stage 2
    range. Non-integer readings between bands (e.g. 129.5/70) also matched
    nothing and fell through to Stage 2.

    Parameters
    ----------
    row : mapping
        Any object indexable by the keys "ap_hi" and "ap_lo" (a DataFrame row
        or a plain dict).

    Returns
    -------
    str
        "Normal", "Elevated", "Hypertension Stage 1" or "Hypertension Stage 2".
    """
    sbp = row["ap_hi"]
    dbp = row["ap_lo"]

    if sbp < 120 and dbp < 80:
        return "Normal"
    elif sbp < 130 and dbp < 80:
        return "Elevated"
    elif sbp < 140 and dbp < 90:
        return "Hypertension Stage 1"
    else:
        # Reached when sbp >= 140 or dbp >= 90 (and for NaN inputs, as before).
        return "Hypertension Stage 2"

# Classify every record. bp_class only reads "ap_hi" and "ap_lo", so only those
# two columns are handed to the row-wise apply; this avoids materialising a full
# mixed-dtype row object for each record, which is noticeably slower on a frame
# of this size.
df["BP_Class"] = df[["ap_hi", "ap_lo"]].apply(bp_class, axis=1)

In [19]:
print(df.head())

   id  age  gender  height  weight  ap_hi  ap_lo cholesterol          gluc  \
0   0   25  female     149      54    177     78    elevated  pre diabetic   
1   1   25  female     146      55    149     95      normal        normal   
2   2   25  female     145      75    141     98    elevated        normal   
3   3   25  female     158      79    135     95      normal        normal   
4   4   25  female     178      62    155     98      normal        normal   

      smoke      alco      active      CVD Blood pressure    BMI  \
0  no-smoke  non-alco      active  non-cvd         normal  24.32   
1  no-smoke  non-alco      active  non-cvd         normal  25.80   
2  no-smoke  non-alco  non-active  non-cvd         normal  35.67   
3  no-smoke      alco  non-active  non-cvd         normal  31.65   
4  no-smoke  non-alco      active  non-cvd        type 1   19.57   

       BMI_Class              BP_Class  
0  Normal weight  Hypertension Stage 2  
1    Over weight  Hypertension Stage 2  

#### 3. Age Distribution

In [20]:
# Five-year age bands with edges 19, 24, 29, ..., 84. With include_lowest=True the
# first band is [19, 24]; every later band is (low, high], i.e. low+1..high for
# integer ages, which is what the labels spell out.
# NOTE(review): ages outside 19-84 receive NaN here, while the outlier filter
# accepts 18-100 — confirm the data never contains such ages.
bins = list(range(19, 85, 5))
labels = ['19-24'] + [f'{low + 1}-{high}' for low, high in zip(bins[1:-1], bins[2:])]
df['age_distribution'] = pd.cut(df['age'], bins=bins, labels=labels, include_lowest=True)

In [21]:
print(df[['age', 'age_distribution']].head())

   age age_distribution
0   25            25-29
1   25            25-29
2   25            25-29
3   25            25-29
4   25            25-29


### Part 4: Final Verification

##### After cleaning and adding new features, verify the dataset.

In [22]:
print(df.head(10))

   id  age  gender  height  weight  ap_hi  ap_lo cholesterol          gluc  \
0   0   25  female     149      54    177     78    elevated  pre diabetic   
1   1   25  female     146      55    149     95      normal        normal   
2   2   25  female     145      75    141     98    elevated        normal   
3   3   25  female     158      79    135     95      normal        normal   
4   4   25  female     178      62    155     98      normal        normal   
5   5   25  female     149      85    143     89      normal        normal   
6   6   25  female     147      75    151     66      normal        normal   
7   7   25  female     166      62    109     63      normal      diabetic   
8   8   25  female     177      82    101     89      normal      diabetic   
9   9   25  female     162      77    144     72      normal  pre diabetic   

      smoke      alco      active      CVD Blood pressure    BMI  \
0  no-smoke  non-alco      active  non-cvd         normal  24.32   
1  no

In [23]:
print(df.columns)

Index(['id', 'age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
       'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'CVD',
       'Blood pressure', 'BMI', 'BMI_Class', 'BP_Class', 'age_distribution'],
      dtype='object')


In [24]:
print(df.head())

   id  age  gender  height  weight  ap_hi  ap_lo cholesterol          gluc  \
0   0   25  female     149      54    177     78    elevated  pre diabetic   
1   1   25  female     146      55    149     95      normal        normal   
2   2   25  female     145      75    141     98    elevated        normal   
3   3   25  female     158      79    135     95      normal        normal   
4   4   25  female     178      62    155     98      normal        normal   

      smoke      alco      active      CVD Blood pressure    BMI  \
0  no-smoke  non-alco      active  non-cvd         normal  24.32   
1  no-smoke  non-alco      active  non-cvd         normal  25.80   
2  no-smoke  non-alco  non-active  non-cvd         normal  35.67   
3  no-smoke      alco  non-active  non-cvd         normal  31.65   
4  no-smoke  non-alco      active  non-cvd        type 1   19.57   

       BMI_Class              BP_Class age_distribution  
0  Normal weight  Hypertension Stage 2            25-29  
1    O

In [25]:
# Final column layout: identifiers and demographics, lifestyle flags, body
# measurements, blood pressure fields, and the target column last.
# Selecting by label raises KeyError if any expected column is missing.
column_order = [
    "id", "age", "age_distribution", "gender",
    "cholesterol", "gluc", "smoke", "alco", "active",
    "height", "weight", "BMI", "BMI_Class",
    "ap_hi", "ap_lo", "BP_Class", "Blood pressure",
    "CVD",
]
df = df.loc[:, column_order]

In [26]:
print(df.columns)

Index(['id', 'age', 'age_distribution', 'gender', 'cholesterol', 'gluc',
       'smoke', 'alco', 'active', 'height', 'weight', 'BMI', 'BMI_Class',
       'ap_hi', 'ap_lo', 'BP_Class', 'Blood pressure', 'CVD'],
      dtype='object')


In [27]:
print(df.head())

   id  age age_distribution  gender cholesterol          gluc     smoke  \
0   0   25            25-29  female    elevated  pre diabetic  no-smoke   
1   1   25            25-29  female      normal        normal  no-smoke   
2   2   25            25-29  female    elevated        normal  no-smoke   
3   3   25            25-29  female      normal        normal  no-smoke   
4   4   25            25-29  female      normal        normal  no-smoke   

       alco      active  height  weight    BMI      BMI_Class  ap_hi  ap_lo  \
0  non-alco      active     149      54  24.32  Normal weight    177     78   
1  non-alco      active     146      55  25.80    Over weight    149     95   
2  non-alco  non-active     145      75  35.67          Obese    141     98   
3      alco  non-active     158      79  31.65          Obese    135     95   
4  non-alco      active     178      62  19.57  Normal weight    155     98   

               BP_Class Blood pressure      CVD  
0  Hypertension Stage 2 

In [28]:
print(df.shape)

(300000, 18)


In [29]:
# Persist the cleaned frame; index=False keeps the row index out of the file.
df.to_csv(path_or_buf="heart_data1_cleaned1.csv", index=False)