## Feature engineering

## 2.1a.0 $$BMI = \frac{Mass(kg)}{Height(m)^2}$$

In [535]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly_express as px

In [536]:
cardio_dataset = pd.read_csv("../data/cardio_train.csv", delimiter=";")
cardio_dataset

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1


In [537]:
# Filter out women heights where height is less 140 and more than 180
cardio_dataset = cardio_dataset[((cardio_dataset['gender'] == 2) | ((cardio_dataset['gender'] == 1) & (cardio_dataset['height'] >= 140) & (cardio_dataset['height'] <= 180)))]

In [538]:
# Filter out men heights where height is less 150 and more than 200
cardio_dataset = cardio_dataset[((cardio_dataset['gender'] == 1) | ((cardio_dataset['gender'] == 2) & (cardio_dataset['height'] >= 150) & (cardio_dataset['height'] <= 200)))]

cardio_dataset

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1


In [539]:
cardio_dataset['height'].min(), cardio_dataset['height'].max()

(140, 200)

In [540]:
# Filter out women weights where weight is less 140 and more than 180
cardio_dataset = cardio_dataset[((cardio_dataset['gender'] == 2) | ((cardio_dataset['gender'] == 1) & (cardio_dataset['weight'] >=50)))]

In [541]:
# Filter out men weights where weight is less 150 and more than 200
cardio_dataset = cardio_dataset[((cardio_dataset['gender'] == 1) | ((cardio_dataset['gender'] == 2) & (cardio_dataset['weight'] >= 60)))]

cardio_dataset

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1


In [542]:
# Convert age in days to age in years

cardio_dataset["age_years"] = (cardio_dataset["age"] / (365.25)).round().astype(int)
cardio_dataset

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0,50
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1,55
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1,52
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1,48
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0,48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1,0,53
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1,62
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0,1,52
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1,61


In [543]:
cardio_dataset['weight'].min(), cardio_dataset['weight'].max()

(50.0, 200.0)

In [544]:
cardio_dataset['bmi'] = (cardio_dataset['weight']) / ((cardio_dataset['height']/100) ** 2)
cardio_dataset

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,bmi
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0,50,21.967120
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1,55,34.927679
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1,52,23.507805
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1,48,28.710479
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0,48,23.011177
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1,0,53,26.927438
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1,62,50.472681
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0,1,52,31.353579
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1,61,27.099251


In [545]:
cardio_dataset['bmi'].min(), cardio_dataset['bmi'].max()

(15.559636771757985, 69.82755294934609)

In [546]:
# cardio_dataset = cardio_dataset[(cardio_dataset['bmi'] < 40)]

In [547]:
cardio_dataset['bmi'].min(), cardio_dataset['bmi'].max()

(15.559636771757985, 69.82755294934609)

In [548]:
# create a new column to specify weight category based on bmi
def weight_category(bmi):
    if bmi < 18.5:
        return 'underweight'
    elif 18.5 <= bmi <= 24.9:
        return 'normal'
    elif 25.0 <= bmi <= 29.9:
        return 'overweight'
    elif 30.0<= bmi <= 34.9:
        return 'obese 1'
    elif 35.0<= bmi <= 39.9:
        return 'obese 2'
    else:
        return 'obese 3'

cardio_dataset['bmi_category'] = cardio_dataset['bmi'].apply(weight_category)

In [549]:
cardio_dataset

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,bmi,bmi_category
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0,50,21.967120,normal
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1,55,34.927679,obese 3
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1,52,23.507805,normal
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1,48,28.710479,overweight
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0,48,23.011177,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1,0,53,26.927438,overweight
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1,62,50.472681,obese 3
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0,1,52,31.353579,obese 1
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1,61,27.099251,overweight


In [550]:
cardio_dataset[cardio_dataset['bmi_category'] == 'obese 1']

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,bmi,bmi_category
13,21,19809,1,158,78.0,110,70,1,1,0,0,1,0,54,31.244993,obese 1
20,30,19778,2,163,83.0,120,80,1,1,0,0,1,0,54,31.239414,obese 1
27,38,18085,1,159,78.0,120,80,1,1,0,0,1,0,50,30.853210,obese 1
36,49,18328,2,175,95.0,120,80,1,1,0,0,1,0,50,31.020408,obese 1
52,67,19575,2,166,85.0,150,100,1,1,0,0,1,1,54,30.846277,obese 1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69970,99958,22572,2,173,103.0,140,80,3,1,1,1,0,1,62,34.414782,obese 1
69975,99963,21264,2,182,100.0,120,80,1,1,0,0,1,1,58,30.189591,obese 1
69981,99972,17500,2,182,110.0,130,90,2,2,0,0,1,1,48,33.208550,obese 1
69989,99985,21013,1,157,83.0,120,70,1,1,0,0,1,1,58,33.672766,obese 1


In [551]:
fig = px.scatter(cardio_dataset, x='bmi', y='age_years', color='bmi_category', title='BMI distribution')
fig.show()

---

## 2.1.1 Feature Engeneering Blood Pressure

- Age | Objective Feature | age | int (days)
- Height | Objective Feature | height | int (cm) |
- Weight | Objective Feature | weight | float (kg) |
- Gender | Objective Feature | gender | categorical code |
- Systolic blood pressure | Examination Feature | ap_hi | int |
- Diastolic blood pressure | Examination Feature | ap_lo | int |
- Cholesterol | Examination Feature | cholesterol | 1: normal, 2: above normal, 3: well above normal |
- Glucose | Examination Feature | gluc | 1: normal, 2: above normal, 3: well above normal |
- Smoking | Subjective Feature | smoke | binary |
- Alcohol intake | Subjective Feature | alco | binary |
- Physical activity | Subjective Feature | active | binary |
- Presence or absence of cardiovascular disease | Target Variable | cardio | binary |

In [552]:
cardio_dataset['ap_hi'].max(),cardio_dataset['ap_hi'].min(),

(16020, -140)

In [553]:
cardio_dataset['ap_lo'].max(),cardio_dataset['ap_lo'].min(),

(11000, -70)

*Extremely high blood pressure can damage blood vessels and weaken arteries in the brain, increasing the risk of stroke. Blood pressure readings above 180/120 mmHg are considered stroke-level, dangerously high, and require immediate medical attention.*
*https://www.medicinenet.com/what_is_stroke-level_high_blood_pressure/article.htm*

In [554]:
cardio_dataset = cardio_dataset[(cardio_dataset['ap_hi'] >= 0) & (cardio_dataset['ap_hi'] <= 190)]
cardio_dataset = cardio_dataset[(cardio_dataset['ap_lo'] >= 0) & (cardio_dataset['ap_lo'] <= 130)]

cardio_dataset.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,bmi,bmi_category
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0,50,21.96712,normal
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1,55,34.927679,obese 3
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1,52,23.507805,normal
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1,48,28.710479,overweight
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0,48,23.011177,normal


In [555]:
cardio_dataset['ap_lo'].max(),cardio_dataset['ap_lo'].min(),cardio_dataset['ap_hi'].max(),cardio_dataset['ap_hi'].min(),

(130, 0, 190, 7)

In [556]:
# Here I am defining a function to categorize blood pressure based on systolic(ap_hi) and diastolic(ap_lo) readings
def categorize_bp(row):
    if row['ap_hi'] < 120 and row['ap_lo'] < 80:
        return 'healthy'
    elif 120 <= row['ap_hi'] <= 129 and row['ap_lo'] < 80:
        return 'elevated'
    elif (130 <= row['ap_hi'] <= 139) or (80 <= row['ap_lo'] <= 89):
        return 'stage 1 hyperpetension'
    elif (140 <= row['ap_hi'] <= 180) or (90 <= row['ap_lo'] <= 120):
        return 'stage 2 hyperpetension'
    elif (row['ap_hi'] > 180) or (row['ap_lo'] > 120):
        return 'hypertension crisis'


# Apply the function to create a new column 'blood_pressure'
cardio_dataset['blood_pressure'] = cardio_dataset.apply(categorize_bp, axis=1)

In [557]:
cardio_dataset.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,bmi,bmi_category,blood_pressure
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0,50,21.96712,normal,stage 1 hyperpetension
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1,55,34.927679,obese 3,stage 2 hyperpetension
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1,52,23.507805,normal,stage 1 hyperpetension
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1,48,28.710479,overweight,stage 2 hyperpetension
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0,48,23.011177,normal,healthy


In [558]:
cardio_dataset[cardio_dataset['blood_pressure'] == 'hypertension crisis'].head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,bmi,bmi_category,blood_pressure
3447,4880,19992,2,180,80.0,80,125,3,3,1,1,1,1,55,24.691358,normal,hypertension crisis
5446,7737,18968,1,165,110.0,95,130,1,1,0,1,1,1,52,40.40404,obese 3,hypertension crisis
8422,12019,23152,2,162,60.0,80,130,1,1,0,0,0,1,63,22.862369,normal,hypertension crisis
8947,12756,17320,2,172,102.0,80,130,1,1,0,1,1,0,47,34.478096,obese 1,hypertension crisis
9010,12847,20400,2,176,82.0,90,130,3,3,0,0,1,1,56,26.472107,overweight,hypertension crisis


---

## 2.2.0 Visualisation for portion of sickness

In [559]:

blood_pressure = cardio_dataset.groupby(['blood_pressure', 'cardio']).agg({'gender': 'count'}).reset_index()
blood_pressure['cardio'] = blood_pressure['cardio'].replace({0: 'disease absent', 1: 'disease present'})
blood_pressure

Unnamed: 0,blood_pressure,cardio,gender
0,elevated,disease absent,2007
1,elevated,disease present,972
2,healthy,disease absent,6788
3,healthy,disease present,2015
4,hypertension crisis,disease absent,6
5,hypertension crisis,disease present,21
6,stage 1 hyperpetension,disease absent,21076
7,stage 1 hyperpetension,disease present,17430
8,stage 2 hyperpetension,disease absent,3134
9,stage 2 hyperpetension,disease present,12661


In [560]:
bmi = cardio_dataset.groupby(['bmi_category','cardio']).agg({'gender': 'count'}).reset_index()
bmi['gender'] = bmi['gender'].replace({1: 'women', 2: 'men'})
bmi['cardio'] = bmi['cardio'].replace({0: 'disease absent', 1: 'disease present'})
bmi

Unnamed: 0,bmi_category,cardio,gender
0,normal,disease absent,13510
1,normal,disease present,9045
2,obese 1,disease absent,4736
3,obese 1,disease present,7042
4,obese 2,disease absent,1369
5,obese 2,disease present,2806
6,obese 3,disease absent,1232
7,obese 3,disease present,1847
8,overweight,disease absent,12084
9,overweight,disease present,12327


In [561]:
import plotly_express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

fig = make_subplots(rows=1, cols=2, shared_yaxes=True)

# Add trace 1
trace1 = px.bar(blood_pressure, x="blood_pressure", y="gender",  hover_data=['cardio'])
fig.add_trace(
    go.Bar(
        name="Blood Pressure",
        x=trace1.data[0]["x"],
        y=trace1.data[0]["y"],
        showlegend=True,
        text=[str(val) for val in trace1.data[0]["x"]],
        hovertext=trace1.data[0]['hovertext']
    ),
    row=1,
    col=1
)

# Add trace 2
trace2 = px.bar(bmi, x="bmi_category", y="gender",  hover_data=['cardio'])
fig.add_trace(
    go.Bar(
        name="BMI",
        x=trace2.data[0]["x"],
        y=trace2.data[0]["y"],
        showlegend=True,
        text=[f"BMI: " + str(val) for val in trace2.data[0]["x"]],
        hovertext=trace2.data[0]['hovertext']
    ),
    row=1,
    col=2
)

# Update layout
fig["layout"].update(
    height=600,
    width=1200,
    title="Blood Pressure and BMI distribution for existence of cardiovascular disease",
    plot_bgcolor="rgba(255,255,255,0.1)",
    paper_bgcolor="rgba(255,255,255,0.9)",
    showlegend=True,
)
fig.update_yaxes(
    tickmode="auto",
    showgrid=False,
    linecolor="#000",
    showline=True,
    spikecolor="#000000",
    title="Participants",
    type='log',
    row=1,
    col=1,
)
fig.update_xaxes(
    tickmode="auto",
    showgrid=False,
    zeroline=True,
    title="Blood Pressure",
    linecolor="#000",
    showline=True,
    spikecolor="#000000",
    showticklabels=True,
    row=1,
    col=1,
    
)
fig.update_xaxes(
    tickmode="auto",
    showgrid=False,
    zeroline=True,
    title="BMI",
    linecolor="#000",
    showline=True,
    spikecolor="#000000",
    showticklabels=True,
    
    row=1,
    col=2
)
fig.show()


---

## 2.2.1 Visualisation correlation heatmap

In [562]:
cardio_dataset.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,bmi,bmi_category,blood_pressure
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0,50,21.96712,normal,stage 1 hyperpetension
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1,55,34.927679,obese 3,stage 2 hyperpetension
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1,52,23.507805,normal,stage 1 hyperpetension
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1,48,28.710479,overweight,stage 2 hyperpetension
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0,48,23.011177,normal,healthy


In [563]:
import numpy as np
import plotly.figure_factory as ff

columns = ['gender', 'age', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc', 'smoke', 'alco', 'cardio', 'bmi']

corr = cardio_dataset[columns].corr().round(2)
mask = np.triu(np.ones_like(corr, dtype=bool))
df_mask = corr.mask(mask)

fig = ff.create_annotated_heatmap(z=df_mask.to_numpy(), 
                                  x=df_mask.columns.tolist(),
                                  y=df_mask.columns.tolist(),
                                  colorscale=px.colors.diverging.RdBu,
                                  showscale=True, ygap=1, xgap=1
                                 )

fig.update_xaxes(side="bottom")

fig.update_layout(
    title_text='Heatmap for some features in dataframe', 
    title_x=0.1, 
    width=1000, 
    height=800,
    xaxis_showgrid=False,
    yaxis_showgrid=False,
    xaxis_zeroline=False,
    yaxis_zeroline=False,
    yaxis_autorange='reversed',
    template='plotly_white'
)

fig.show()

---

## 2.3 Create Two datasets

In [564]:
dataset_1 = cardio_dataset.drop(['ap_hi', 'ap_lo', 'height', 'weight', 'bmi'], axis=1)
dataset_1.head()

Unnamed: 0,id,age,gender,cholesterol,gluc,smoke,alco,active,cardio,age_years,bmi_category,blood_pressure
0,0,18393,2,1,1,0,0,1,0,50,normal,stage 1 hyperpetension
1,1,20228,1,3,1,0,0,1,1,55,obese 3,stage 2 hyperpetension
2,2,18857,1,3,1,0,0,0,1,52,normal,stage 1 hyperpetension
3,3,17623,2,1,1,0,0,1,1,48,overweight,stage 2 hyperpetension
4,4,17474,1,1,1,0,0,0,0,48,normal,healthy


In [565]:
dataset_1['gender'] = dataset_1['gender'].replace({1: 'women', 2: 'men'})
dataset_1.head()

Unnamed: 0,id,age,gender,cholesterol,gluc,smoke,alco,active,cardio,age_years,bmi_category,blood_pressure
0,0,18393,men,1,1,0,0,1,0,50,normal,stage 1 hyperpetension
1,1,20228,women,3,1,0,0,1,1,55,obese 3,stage 2 hyperpetension
2,2,18857,women,3,1,0,0,0,1,52,normal,stage 1 hyperpetension
3,3,17623,men,1,1,0,0,1,1,48,overweight,stage 2 hyperpetension
4,4,17474,women,1,1,0,0,0,0,48,normal,healthy


In [566]:
dataset_1 = pd.get_dummies(dataset_1, columns = ["bmi_category", "blood_pressure", 'gender'], drop_first=True)
dataset_1.head()

Unnamed: 0,id,age,cholesterol,gluc,smoke,alco,active,cardio,age_years,bmi_category_obese 1,bmi_category_obese 2,bmi_category_obese 3,bmi_category_overweight,bmi_category_underweight,blood_pressure_healthy,blood_pressure_hypertension crisis,blood_pressure_stage 1 hyperpetension,blood_pressure_stage 2 hyperpetension,gender_women
0,0,18393,1,1,0,0,1,0,50,0,0,0,0,0,0,0,1,0,0
1,1,20228,3,1,0,0,1,1,55,0,0,1,0,0,0,0,0,1,1
2,2,18857,3,1,0,0,0,1,52,0,0,0,0,0,0,0,1,0,1
3,3,17623,1,1,0,0,1,1,48,0,0,0,1,0,0,0,0,1,0
4,4,17474,1,1,0,0,0,0,48,0,0,0,0,0,1,0,0,0,1


In [567]:
dataset_1.columns

Index(['id', 'age', 'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio',
       'age_years', 'bmi_category_obese 1', 'bmi_category_obese 2',
       'bmi_category_obese 3', 'bmi_category_overweight',
       'bmi_category_underweight', 'blood_pressure_healthy',
       'blood_pressure_hypertension crisis',
       'blood_pressure_stage 1 hyperpetension',
       'blood_pressure_stage 2 hyperpetension', 'gender_women'],
      dtype='object')

In [568]:
dataset_2 = cardio_dataset.drop(['bmi_category', 'blood_pressure', 'height', 'weight'], axis=1)
dataset_2.head()

Unnamed: 0,id,age,gender,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,bmi
0,0,18393,2,110,80,1,1,0,0,1,0,50,21.96712
1,1,20228,1,140,90,3,1,0,0,1,1,55,34.927679
2,2,18857,1,130,70,3,1,0,0,0,1,52,23.507805
3,3,17623,2,150,100,1,1,0,0,1,1,48,28.710479
4,4,17474,1,100,60,1,1,0,0,0,0,48,23.011177


In [569]:
dataset_2['gender'] = dataset_2['gender'].replace({1: 'women', 2: 'men'})
dataset_2.head()

Unnamed: 0,id,age,gender,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,bmi
0,0,18393,men,110,80,1,1,0,0,1,0,50,21.96712
1,1,20228,women,140,90,3,1,0,0,1,1,55,34.927679
2,2,18857,women,130,70,3,1,0,0,0,1,52,23.507805
3,3,17623,men,150,100,1,1,0,0,1,1,48,28.710479
4,4,17474,women,100,60,1,1,0,0,0,0,48,23.011177


In [570]:
dataset_2 = pd.get_dummies(dataset_2, columns = ['gender'], drop_first=True)
dataset_2.head()

Unnamed: 0,id,age,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,bmi,gender_women
0,0,18393,110,80,1,1,0,0,1,0,50,21.96712,0
1,1,20228,140,90,3,1,0,0,1,1,55,34.927679,1
2,2,18857,130,70,3,1,0,0,0,1,52,23.507805,1
3,3,17623,150,100,1,1,0,0,1,1,48,28.710479,0
4,4,17474,100,60,1,1,0,0,0,0,48,23.011177,1
