In [23]:
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from sqlalchemy import create_engine

from config import db_password

In [24]:
# Read the cardio training CSV into a DataFrame and preview the first rows
file_path = "Resources/cardio_train.csv"
cardio_df = pd.read_csv(filepath_or_buffer=file_path)
cardio_df.head(5)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [25]:
# Display the column labels of the loaded frame
cardio_df.columns

Index(['id', 'age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
       'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio'],
      dtype='object')

In [26]:
# Display the dtype pandas inferred for each column
cardio_df.dtypes


id               int64
age              int64
gender           int64
height           int64
weight         float64
ap_hi            int64
ap_lo            int64
cholesterol      int64
gluc             int64
smoke            int64
alco             int64
active           int64
cardio           int64
dtype: object

In [27]:
# Report how many missing entries each column contains
null_counts = cardio_df.isnull().sum()
for column, n_missing in null_counts.items():
    print(f"Column {column} has {n_missing} null values")

Column id has 0 null values
Column age has 0 null values
Column gender has 0 null values
Column height has 0 null values
Column weight has 0 null values
Column ap_hi has 0 null values
Column ap_lo has 0 null values
Column cholesterol has 0 null values
Column gluc has 0 null values
Column smoke has 0 null values
Column alco has 0 null values
Column active has 0 null values
Column cardio has 0 null values


In [28]:
# Count rows that exactly repeat an earlier row across every column
duplicate_count = cardio_df.duplicated().sum()
print(f"Duplicate entries: {duplicate_count}")

Duplicate entries: 0


In [29]:
# Display the frame's dimensions as (rows, columns)
cardio_df.shape

(70000, 13)

In [30]:
# Append a placeholder "Prediction" column holding an empty string in every row
cardio_df = cardio_df.assign(Prediction='')
cardio_df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,Prediction
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0,
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1,
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1,
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1,
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0,


In [31]:
# Re-check the dimensions after the "Prediction" column was added
cardio_df.shape

(70000, 14)

In [32]:
# Rescale `age` by 365 (the raw values are presumably day counts, giving years)
cardio_df['age'] = cardio_df['age'].div(365)
cardio_df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,Prediction
0,0,50.391781,2,168,62.0,110,80,1,1,0,0,1,0,
1,1,55.419178,1,156,85.0,140,90,3,1,0,0,1,1,
2,2,51.663014,1,165,64.0,130,70,3,1,0,0,0,1,
3,3,48.282192,2,169,82.0,150,100,1,1,0,0,1,1,
4,4,47.873973,1,156,56.0,100,60,1,1,0,0,0,0,


In [33]:
# Cast `age` to integer, which discards the fractional part
age_whole = cardio_df['age'].astype(int)
cardio_df['age'] = age_whole
cardio_df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,Prediction
0,0,50,2,168,62.0,110,80,1,1,0,0,1,0,
1,1,55,1,156,85.0,140,90,3,1,0,0,1,1,
2,2,51,1,165,64.0,130,70,3,1,0,0,0,1,
3,3,48,2,169,82.0,150,100,1,1,0,0,1,1,
4,4,47,1,156,56.0,100,60,1,1,0,0,0,0,


In [34]:
# Build the PostgreSQL connection URL for the local Cardio_Mockup database.
# The password is percent-encoded so that characters such as "@", ":", "/"
# or "%" cannot be misread as URL delimiters when the string is parsed;
# str() keeps non-string passwords working as they did with plain formatting.
db_string = f"postgresql://postgres:{quote_plus(str(db_password))}@127.0.0.1:5432/Cardio_Mockup"


In [35]:
# Create the SQLAlchemy engine from the connection string built above
engine = create_engine(db_string)

In [36]:
# Write the frame to the "cardio" table, replacing any existing table of that
# name. index=False keeps the DataFrame's positional index from being stored
# as an extra "index" column, matching the CSV export, which also omits it.
cardio_df.to_sql(name='cardio', con=engine, if_exists='replace', index=False)

In [37]:
# Write the processed frame to a new CSV, leaving out the index column
file_path = "Resources/cardio_mockup.csv"
cardio_df.to_csv(path_or_buf=file_path, index=False)