## Data Exploration


In [130]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [94]:
# Load the semicolon-delimited cardiovascular dataset into a DataFrame.
df = pd.read_csv("cardiovascular_diseases.csv", sep=';')

In [95]:
# First five rows (head() defaults to n=5).
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [96]:
# Last five rows (tail() defaults to n=5).
df.tail()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1
69999,99999,20540,1,170,72.0,120,80,2,1,0,0,1,0


In [97]:
# Five randomly chosen rows; no random_state is set, so the rows differ on every run.
df.sample(n=5)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
42866,61237,14568,1,165,80.0,110,70,1,1,0,0,0,0
31501,44969,21997,1,155,56.0,140,90,1,1,0,0,1,1
2100,2971,22710,2,170,72.0,125,80,1,1,0,0,1,0
11995,17122,20231,1,172,82.0,110,70,2,1,0,0,1,0
67420,96269,22560,1,157,73.0,140,1000,1,1,0,1,0,1


In [98]:
# (rows, columns) of the loaded frame.
df.shape

(70000, 13)

In [99]:
# Per-column dtype and non-null count, plus overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age          70000 non-null  int64  
 2   gender       70000 non-null  int64  
 3   height       70000 non-null  int64  
 4   weight       70000 non-null  float64
 5   ap_hi        70000 non-null  int64  
 6   ap_lo        70000 non-null  int64  
 7   cholesterol  70000 non-null  int64  
 8   gluc         70000 non-null  int64  
 9   smoke        70000 non-null  int64  
 10  alco         70000 non-null  int64  
 11  active       70000 non-null  int64  
 12  cardio       70000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 6.9 MB


In [100]:
# List the column names.
df.columns

Index(['id', 'age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
       'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio'],
      dtype='object')

## Data Pre-Processing

In [101]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

id             0
age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64

In [102]:
# Drop the 'id' column: it is only a record identifier and is not used in the analysis.
# errors='ignore' keeps the cell safe to re-run once 'id' has already been removed
# (without it a second run raises KeyError).
df = df.drop(columns='id', errors='ignore')

In [103]:
# Confirm the 'id' column is gone.
df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [104]:
# Express age in whole years by dividing the stored value by 365.25 and rounding.
# NOTE: 'age' is overwritten in place, so running this cell twice divides twice.
df['age'] = (df['age'] / 365.25).round()

In [105]:
# Check the converted 'age' values.
df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,50.0,2,168,62.0,110,80,1,1,0,0,1,0
1,55.0,1,156,85.0,140,90,3,1,0,0,1,1
2,52.0,1,165,64.0,130,70,3,1,0,0,0,1
3,48.0,2,169,82.0,150,100,1,1,0,0,1,1
4,48.0,1,156,56.0,100,60,1,1,0,0,0,0


In [106]:
# Add a 'bmi' (Body Mass Index) column: weight / (height / 100)^2, rounded to 1 decimal.
# Presumably height is in cm and weight in kg -- confirm against the dataset description.
df['bmi'] = (df['weight'] / (df['height'] / 100) ** 2).round(1)

In [107]:
# Check the new 'bmi' column.
df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,bmi
0,50.0,2,168,62.0,110,80,1,1,0,0,1,0,22.0
1,55.0,1,156,85.0,140,90,3,1,0,0,1,1,34.9
2,52.0,1,165,64.0,130,70,3,1,0,0,0,1,23.5
3,48.0,2,169,82.0,150,100,1,1,0,0,1,1,28.7
4,48.0,1,156,56.0,100,60,1,1,0,0,0,0,23.0


In [108]:
# Decode the numeric gender code: 1 -> 'Female', anything else -> 'Male'.
def gender(gender):
    """Return the text label for a numeric gender code.

    Code 1 is labelled 'Female'; every other value (2 in the rows shown
    in this notebook) falls through to 'Male'.
    """
    return 'Female' if gender == 1 else 'Male'
    

In [109]:
# Replace the numeric gender codes with labels.
# NOTE: re-running this on already-labelled data turns every row into 'Male'.
df['gender'] = df['gender'].apply(gender)

In [110]:
# Check the relabelled 'gender' column.
df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,bmi
0,50.0,Male,168,62.0,110,80,1,1,0,0,1,0,22.0
1,55.0,Female,156,85.0,140,90,3,1,0,0,1,1,34.9
2,52.0,Female,165,64.0,130,70,3,1,0,0,0,1,23.5
3,48.0,Male,169,82.0,150,100,1,1,0,0,1,1,28.7
4,48.0,Female,156,56.0,100,60,1,1,0,0,0,0,23.0


In [111]:
# Labels for the cholesterol codes; codes start at 1 and follow the order listed.
cholesterol = dict(enumerate(('normal', 'above normal', 'well above normal'), start=1))

In [112]:
# Swap the cholesterol codes for their labels; codes missing from the dict become NaN.
df['cholesterol'] = df['cholesterol'].map(cholesterol)

In [113]:
# Check the relabelled 'cholesterol' column on the first ten rows.
df.head(n=10)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,bmi
0,50.0,Male,168,62.0,110,80,normal,1,0,0,1,0,22.0
1,55.0,Female,156,85.0,140,90,well above normal,1,0,0,1,1,34.9
2,52.0,Female,165,64.0,130,70,well above normal,1,0,0,0,1,23.5
3,48.0,Male,169,82.0,150,100,normal,1,0,0,1,1,28.7
4,48.0,Female,156,56.0,100,60,normal,1,0,0,0,0,23.0
5,60.0,Female,151,67.0,120,80,above normal,2,0,0,0,0,29.4
6,61.0,Female,157,93.0,130,80,well above normal,1,0,0,1,0,37.7
7,62.0,Male,178,95.0,130,90,well above normal,3,0,0,1,1,30.0
8,48.0,Female,158,71.0,110,70,normal,1,0,0,1,0,28.4
9,54.0,Female,164,68.0,110,60,normal,1,0,0,0,0,25.3


In [114]:
# Labels for the glucose codes; codes start at 1 and follow the order listed.
gluc = dict(enumerate(('normal', 'above normal', 'well above normal'), start=1))

In [115]:
# Swap the glucose codes for their labels; codes missing from the dict become NaN.
df['gluc'] = df['gluc'].map(gluc)

In [116]:
# Check the relabelled 'gluc' column.
df.head(n=20)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,bmi
0,50.0,Male,168,62.0,110,80,normal,normal,0,0,1,0,22.0
1,55.0,Female,156,85.0,140,90,well above normal,normal,0,0,1,1,34.9
2,52.0,Female,165,64.0,130,70,well above normal,normal,0,0,0,1,23.5
3,48.0,Male,169,82.0,150,100,normal,normal,0,0,1,1,28.7
4,48.0,Female,156,56.0,100,60,normal,normal,0,0,0,0,23.0
5,60.0,Female,151,67.0,120,80,above normal,above normal,0,0,0,0,29.4
6,61.0,Female,157,93.0,130,80,well above normal,normal,0,0,1,0,37.7
7,62.0,Male,178,95.0,130,90,well above normal,well above normal,0,0,1,1,30.0
8,48.0,Female,158,71.0,110,70,normal,normal,0,0,1,0,28.4
9,54.0,Female,164,68.0,110,60,normal,normal,0,0,0,0,25.3


In [117]:
# Labels for the smoke flag: 0 -> 'non smoker', 1 -> 'smoker'.
smoke = {0: 'non smoker', 1: 'smoker'}

In [118]:
# Swap the 0/1 smoke flag for its label; values missing from the dict become NaN.
df['smoke'] = df['smoke'].map(smoke)

In [119]:
# Check the relabelled 'smoke' column.
df.head(n=20)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,bmi
0,50.0,Male,168,62.0,110,80,normal,normal,non smoker,0,1,0,22.0
1,55.0,Female,156,85.0,140,90,well above normal,normal,non smoker,0,1,1,34.9
2,52.0,Female,165,64.0,130,70,well above normal,normal,non smoker,0,0,1,23.5
3,48.0,Male,169,82.0,150,100,normal,normal,non smoker,0,1,1,28.7
4,48.0,Female,156,56.0,100,60,normal,normal,non smoker,0,0,0,23.0
5,60.0,Female,151,67.0,120,80,above normal,above normal,non smoker,0,0,0,29.4
6,61.0,Female,157,93.0,130,80,well above normal,normal,non smoker,0,1,0,37.7
7,62.0,Male,178,95.0,130,90,well above normal,well above normal,non smoker,0,1,1,30.0
8,48.0,Female,158,71.0,110,70,normal,normal,non smoker,0,1,0,28.4
9,54.0,Female,164,68.0,110,60,normal,normal,non smoker,0,0,0,25.3


In [120]:
# Labels for the alco flag: 0 -> 'not drinker', 1 -> 'drinker'.
alco = {0: 'not drinker', 1: 'drinker'}

In [121]:
# Swap the 0/1 alco flag for its label; values missing from the dict become NaN.
df['alco'] = df['alco'].map(alco)

In [122]:
# Check the relabelled 'alco' column.
df.head(n=20)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,bmi
0,50.0,Male,168,62.0,110,80,normal,normal,non smoker,not drinker,1,0,22.0
1,55.0,Female,156,85.0,140,90,well above normal,normal,non smoker,not drinker,1,1,34.9
2,52.0,Female,165,64.0,130,70,well above normal,normal,non smoker,not drinker,0,1,23.5
3,48.0,Male,169,82.0,150,100,normal,normal,non smoker,not drinker,1,1,28.7
4,48.0,Female,156,56.0,100,60,normal,normal,non smoker,not drinker,0,0,23.0
5,60.0,Female,151,67.0,120,80,above normal,above normal,non smoker,not drinker,0,0,29.4
6,61.0,Female,157,93.0,130,80,well above normal,normal,non smoker,not drinker,1,0,37.7
7,62.0,Male,178,95.0,130,90,well above normal,well above normal,non smoker,not drinker,1,1,30.0
8,48.0,Female,158,71.0,110,70,normal,normal,non smoker,not drinker,1,0,28.4
9,54.0,Female,164,68.0,110,60,normal,normal,non smoker,not drinker,0,0,25.3


In [124]:
# Labels for the active flag: 0 -> 'not active', 1 -> 'active'.
active = {0: 'not active', 1: 'active'}

In [125]:
# Swap the 0/1 active flag for its label; values missing from the dict become NaN.
df['active'] = df['active'].map(active)

In [126]:
# Check the relabelled 'active' column.
df.head(n=20)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,bmi
0,50.0,Male,168,62.0,110,80,normal,normal,non smoker,not drinker,active,0,22.0
1,55.0,Female,156,85.0,140,90,well above normal,normal,non smoker,not drinker,active,1,34.9
2,52.0,Female,165,64.0,130,70,well above normal,normal,non smoker,not drinker,not active,1,23.5
3,48.0,Male,169,82.0,150,100,normal,normal,non smoker,not drinker,active,1,28.7
4,48.0,Female,156,56.0,100,60,normal,normal,non smoker,not drinker,not active,0,23.0
5,60.0,Female,151,67.0,120,80,above normal,above normal,non smoker,not drinker,not active,0,29.4
6,61.0,Female,157,93.0,130,80,well above normal,normal,non smoker,not drinker,active,0,37.7
7,62.0,Male,178,95.0,130,90,well above normal,well above normal,non smoker,not drinker,active,1,30.0
8,48.0,Female,158,71.0,110,70,normal,normal,non smoker,not drinker,active,0,28.4
9,54.0,Female,164,68.0,110,60,normal,normal,non smoker,not drinker,not active,0,25.3


In [127]:
# Labels for the cardio target: 0 -> 'absence', 1 -> 'presence'.
cardio = {0: 'absence', 1: 'presence'}

In [128]:
# Swap the 0/1 cardio target for its label; values missing from the dict become NaN.
df['cardio'] = df['cardio'].map(cardio)

In [129]:
# Check the relabelled 'cardio' column.
df.head(n=20)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,bmi
0,50.0,Male,168,62.0,110,80,normal,normal,non smoker,not drinker,active,absence,22.0
1,55.0,Female,156,85.0,140,90,well above normal,normal,non smoker,not drinker,active,presence,34.9
2,52.0,Female,165,64.0,130,70,well above normal,normal,non smoker,not drinker,not active,presence,23.5
3,48.0,Male,169,82.0,150,100,normal,normal,non smoker,not drinker,active,presence,28.7
4,48.0,Female,156,56.0,100,60,normal,normal,non smoker,not drinker,not active,absence,23.0
5,60.0,Female,151,67.0,120,80,above normal,above normal,non smoker,not drinker,not active,absence,29.4
6,61.0,Female,157,93.0,130,80,well above normal,normal,non smoker,not drinker,active,absence,37.7
7,62.0,Male,178,95.0,130,90,well above normal,well above normal,non smoker,not drinker,active,presence,30.0
8,48.0,Female,158,71.0,110,70,normal,normal,non smoker,not drinker,active,absence,28.4
9,54.0,Female,164,68.0,110,60,normal,normal,non smoker,not drinker,not active,absence,25.3


In [None]:
#Classify a reading into a blood pressure category from ap_hi and ap_lo
def blood_pressure_categories(x,y):
    """Return the blood pressure category for one reading.

    Parameters
    ----------
    x : number
        Upper pressure value (the ``ap_hi`` column).
    y : number
        Lower pressure value (the ``ap_lo`` column).

    Returns
    -------
    str or None
        One of 'Normal', 'Elevated', 'Stage_1', 'Stage_2', 'Stage_3',
        or None when either value is missing (None / NaN).

    Notes
    -----
    Categories are tested from most to least severe, so a single high
    number is enough to escalate the category. The earlier version tested
    ``x<=139 or y<=89`` first, which let one low number mask a high one
    (e.g. 150/85 and 110/95 were labelled 'Stage_1', 200/100 'Stage_2').

    NOTE(review): the 120/80 boundary is kept inclusive for 'Normal', as in
    the original code; published charts usually use strict ``<`` -- confirm
    which is intended. Implausible readings (a sampled row shows ap_lo of
    1000) are not filtered here and land in 'Stage_3'.
    """
    # NaN is the only value not equal to itself; treat it like a missing reading.
    if x is None or y is None or x != x or y != y:
        return None
    if x > 180 or y > 120:
        return 'Stage_3'
    if x > 139 or y > 89:
        return "Stage_2"
    if x > 129 or y > 80:
        return 'Stage_1'
    # From here on y <= 80, so only the upper value separates the last two.
    if x > 120:
        return 'Elevated'
    return 'Normal'

In [None]:
# Label every row with its blood pressure category, computed from ap_hi / ap_lo.
df['bp_c'] = df.apply(
    lambda row: blood_pressure_categories(row['ap_hi'], row['ap_lo']),
    axis=1,
)


In [131]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=199)