In [1]:
# Import dependencies
import pandas as pd
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt


In [2]:
# Path to the source dataset, relative to the notebook's working directory.
file = Path("CVD_cleaned.csv")
# Read the CSV into a DataFrame.
# The previous encoding name, "ANSI", is only registered as a codec alias on
# Windows (it maps to "mbcs"), so the read failed with LookupError on
# Linux/macOS. "cp1252" is available on every platform.
# NOTE(review): cp1252 is what "ANSI" resolves to on Western-locale Windows;
# confirm if the file was produced under a different locale.
diabetes_df = pd.read_csv(file, encoding="cp1252")
# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [3]:
# Add a "UniqueID" column that numbers the rows 1..N in their current order.
row_count = len(diabetes_df)
diabetes_df = diabetes_df.assign(UniqueID=range(1, row_count + 1))


In [4]:
# Preview the first five rows to confirm the UniqueID column is present.
diabetes_df.head(n=5)

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption,UniqueID
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150.0,32.66,14.54,Yes,0.0,30.0,16.0,12.0,1
1,Very Good,Within the past year,No,Yes,No,No,No,Yes,No,Female,70-74,165.0,77.11,28.29,No,0.0,30.0,0.0,4.0,2
2,Very Good,Within the past year,Yes,No,No,No,No,Yes,No,Female,60-64,163.0,88.45,33.47,No,4.0,12.0,3.0,16.0,3
3,Poor,Within the past year,Yes,Yes,No,No,No,Yes,No,Male,75-79,180.0,93.44,28.73,No,0.0,30.0,30.0,8.0,4
4,Good,Within the past year,No,No,No,No,No,No,No,Male,80+,191.0,88.45,24.37,Yes,0.0,8.0,4.0,0.0,5


In [5]:
# Build a DataFrame holding only the consumption columns, with UniqueID first,
# then preview its first five rows.
consumption_columns = [
    "UniqueID",
    "Alcohol_Consumption",
    "Fruit_Consumption",
    "Green_Vegetables_Consumption",
    "FriedPotato_Consumption",
]
consumption_df = diabetes_df[consumption_columns]
consumption_df.head(n=5)

Unnamed: 0,UniqueID,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,1,0.0,30.0,16.0,12.0
1,2,0.0,30.0,0.0,4.0
2,3,4.0,12.0,3.0,16.0
3,4,0.0,30.0,30.0,8.0
4,5,0.0,8.0,4.0,0.0


In [6]:
# Build a DataFrame holding only the non-consumption columns, with UniqueID
# first, then preview its first five rows.
health_columns = [
    "UniqueID",
    "General_Health",
    "Checkup",
    "Exercise",
    "Heart_Disease",
    "Skin_Cancer",
    "Other_Cancer",
    "Depression",
    "Diabetes",
    "Arthritis",
    "Sex",
    "Age_Category",
    "Height_(cm)",
    "Weight_(kg)",
    "BMI",
    "Smoking_History",
]
health_df = diabetes_df[health_columns]
health_df.head(n=5)

Unnamed: 0,UniqueID,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History
0,1,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150.0,32.66,14.54,Yes
1,2,Very Good,Within the past year,No,Yes,No,No,No,Yes,No,Female,70-74,165.0,77.11,28.29,No
2,3,Very Good,Within the past year,Yes,No,No,No,No,Yes,No,Female,60-64,163.0,88.45,33.47,No
3,4,Poor,Within the past year,Yes,Yes,No,No,No,Yes,No,Male,75-79,180.0,93.44,28.73,No
4,5,Good,Within the past year,No,No,No,No,No,No,No,Male,80+,191.0,88.45,24.37,Yes


In [7]:
# Write the consumption DataFrame to a CSV file: UTF-8 encoded, header row
# included, row index omitted.
consumption_csv = Path("consumptiondata.csv")
consumption_df.to_csv(
    consumption_csv,
    header=True,
    index=False,
    encoding="utf-8",
)

In [8]:
# Write the health DataFrame to a CSV file: UTF-8 encoded, header row
# included, row index omitted.
health_csv = Path("healthdata.csv")
health_df.to_csv(
    health_csv,
    header=True,
    index=False,
    encoding="utf-8",
)