### IMPORTS

In [1]:
# manipulate data
import numpy as np
import pandas as pd

# visualize data
import matplotlib.pyplot as plt
import seaborn as sn

# model preparation
from sklearn.model_selection import (train_test_split, learning_curve,
                                     LearningCurveDisplay, validation_curve,
                                     ValidationCurveDisplay)
from sklearn.preprocessing import StandardScaler

# machine learning models
import mlrose_hiive as rh

# model evaluation
from sklearn.metrics import (balanced_accuracy_score, 
                             accuracy_score, make_scorer,
                             recall_score, log_loss)

np.random.seed(123)

### Load and Preprocess: Cardiovascular Disease

In [2]:
file1 = "cardio_data_processed.csv"
cardio_vasc = pd.read_csv(file1)

In [3]:
# id: just an index
# useing age_years over age
# using weight and height instead of bmi
# using ap_hi/lo instead of bp_category and bp_category_encoded
cardio_cols_drop = ['id', 'age', 'bmi', 'bp_category',
                    'bp_category_encoded', "alco", "smoke",
                    'gender', 'gluc', 'active', 'cholesterol']
cardio_vasc = cardio_vasc.drop(columns=cardio_cols_drop, axis=1)

In [4]:
cardio_vasc = cardio_vasc.sample(frac=0.15)

In [5]:
cardio_vasc.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10231 entries, 12148 to 67751
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   height     10231 non-null  int64  
 1   weight     10231 non-null  float64
 2   ap_hi      10231 non-null  int64  
 3   ap_lo      10231 non-null  int64  
 4   cardio     10231 non-null  int64  
 5   age_years  10231 non-null  int64  
dtypes: float64(1), int64(5)
memory usage: 559.5 KB


In [6]:
cardio_vasc.head(3)

Unnamed: 0,height,weight,ap_hi,ap_lo,cardio,age_years
12148,173,70.0,110,70,0,43
5541,168,75.0,120,80,1,51
40053,158,85.0,140,80,1,64


In [7]:
cardio_vasc['cardio'].value_counts() / len(cardio_vasc['cardio'])

cardio
0    0.510898
1    0.489102
Name: count, dtype: float64

In [8]:
# split data
cd_train, cd_test = train_test_split(cardio_vasc, test_size=.15,
                                     random_state=123, stratify=cardio_vasc['cardio'])

# get X, y
target_col = "cardio"
cols_drop = ["cardio"]
X_train_cd = cd_train.drop(cols_drop, axis=1)
X_test_cd = cd_test.drop(cols_drop, axis=1)
y_train_cd = cd_train[target_col]
y_test_cd = cd_test[target_col]

scale = StandardScaler()
scale.fit(X_train_cd)
X_train_scaled_cd = pd.DataFrame(data=scale.transform(X_train_cd),
                                 columns=X_train_cd.columns,
                                 index=X_train_cd.index)
X_test_scaled_cd = pd.DataFrame(data=scale.transform(X_test_cd),
                                columns=X_test_cd.columns,
                                index=X_test_cd.index)

### Baseline: Cardiovascular Disease

In [9]:
# calculat a baseline
act_pred_error_cd = pd.DataFrame({"actual": y_train_cd})
act_pred_error_cd["baseline_prediction"] = y_train_cd.value_counts().index[0]

baseline_acc_cd = accuracy_score(act_pred_error_cd["actual"], act_pred_error_cd["baseline_prediction"])

# print baseline accuracy
print(f"Baseline Accuracy Score: {round(baseline_acc_cd, 2)}%")

Baseline Accuracy Score: 0.51%


### Load and Preprocess: Nutrition Facts

In [10]:
file2 = "MyFoodData_Nutrition_Facts_SpreadSheet_Release_1.4.xlsx"
nutrition_facts = pd.read_excel(file2)

In [11]:
nutrition_facts = nutrition_facts.dropna(axis=0, subset=["Food Group"])

# dropping columns that have every value missing
cols_drop = ["Added Sugar g", "Soluble Fiber g", "Insoluble Fiber g",
             "Total sugar alcohols g", "Molybdenum mcg", "Chlorine mg",
             "Biotin B7 mcg", "NetCarbs g"]
nutrition_facts = nutrition_facts.drop(columns=cols_drop, axis=1)

# dropping cols that don't seem to mean much
more_drop = ["PRAL score", "ID", "Name", '183 n3 ccc ALA mg',
             '205 n3 EPA mg', '225 n3 DPA mg', '226 n3 DHA mg',
             "Serving Weight 1 g", "Serving Weight 2 g", "Serving Weight 3 g",
             "Serving Weight 4 g", "Serving Weight 5 g", "Serving Weight 6 g",
             "Serving Weight 7 g", "Serving Weight 8 g", "Serving Weight 9 g",
             "200 Calorie Weight g", "Saturated Fats g",
             "Fat g", "Fiber g", "Calcium mg", "Iron Fe mg", "Potassium K mg", "Magnesium mg",
             "Vitamin A RAE mcg", "Vitamin C mg", "Vitamin B12 mcg", "Vitamin D mcg",
             "Vitamin E AlphaTocopherol mg", "Omega 3s mg", "Omega 6s mg", "Phosphorus P mg",
             "Copper Cu mg", "Thiamin B1 mg", "Riboflavin B2 mg", "Vitamin B6 mg", "Folate B9 mcg",
             "Folic acid mcg", "Food Folate mcg", "Folate DFE mcg", "Choline mg", "Retinol mcg",
             "Carotene beta mcg", "Carotene alpha mcg", "Lycopene mcg", "Lutein + Zeaxanthin mcg",
             "Vitamin K mcg", "Fatty acids total monounsaturated mg", "Fatty acids total polyunsaturated mg",
             "Alcohol g", "Caffeine mg", "Theobromine mg", "Sugars g", "Niacin B3 mg",
             "Selenium Se mcg", "Zinc Zn mg", "Calories"]
nutrition_facts = nutrition_facts.drop(columns=more_drop, axis=1)

# drop column if 70% of its rows are empty
threshold = int(.70*len(nutrition_facts))
nutrition_facts.dropna(axis=1, thresh=threshold, inplace=True)

nutrition_facts.fillna(0, inplace=True)

nutrition_facts.columns = nutrition_facts.columns.str.lower()

cols_rename = {"food group": "food_group", "protein g": "protein", "carbohydrate g": "carbohydrate",
                "cholesterol mg": "cholesterol", "water g": "water", "sodium mg": "sodium"}

nutrition_facts = nutrition_facts.rename(mapper=cols_rename, axis=1)

In [12]:
nutrition_facts['food_group'].value_counts() / len(nutrition_facts)

food_group
Meats                      0.227415
Vegetables                 0.162582
Baked Foods                0.066899
Fish                       0.055144
Prepared Meals             0.054360
Fast Foods                 0.049444
Beverages                  0.043246
Baby Foods                 0.038757
Soups and Sauces           0.037903
Sweets                     0.035836
Fruits                     0.032488
Beans and Lentils          0.032132
Breakfast Cereals          0.026788
Dairy and Egg Products     0.026218
Snacks                     0.020732
Dairy and Egg Products     0.019521
Fats and Oils              0.017241
Grains and Pasta           0.016885
Nuts and Seeds             0.012183
American Indian            0.011755
Restaurant Foods           0.007979
Spices and Herbs           0.004488
Name: count, dtype: float64

In [13]:
bool_mask1 = (nutrition_facts['food_group'] == 'Meats') | (nutrition_facts['food_group'] == 'Vegetables') 
bool_mask2 = bool_mask1 | (nutrition_facts['food_group'] == 'Baked Foods')
bool_mask3 = bool_mask2 | (nutrition_facts['food_group'] == 'Fish')
nutrition_facts = nutrition_facts[bool_mask3]

In [14]:
nutrition_facts['food_group'].value_counts() / len(nutrition_facts)

food_group
Meats          0.444135
Vegetables     0.317518
Baked Foods    0.130653
Fish           0.107694
Name: count, dtype: float64

In [15]:
nutrition_facts.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7187 entries, 0 to 13869
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   food_group    7187 non-null   object 
 1   protein       7187 non-null   float64
 2   carbohydrate  7187 non-null   float64
 3   cholesterol   7187 non-null   float64
 4   water         7187 non-null   float64
 5   sodium        7187 non-null   float64
dtypes: float64(5), object(1)
memory usage: 393.0+ KB


In [16]:
# split data
nf_train, nf_test = train_test_split(nutrition_facts,
                                     test_size=.15,
                                     random_state=123,
                                     stratify=nutrition_facts['food_group'])

# get X, y
target_col = "food_group"
cols_drop = ["food_group"]
X_train_nf = nf_train.drop(cols_drop, axis=1)
X_test_nf = nf_test.drop(cols_drop, axis=1)
y_train_nf = nf_train[target_col]
y_test_nf = nf_test[target_col]

# scale data
scale = StandardScaler()
scale.fit(X_train_nf)
X_train_scaled_nf = pd.DataFrame(data=scale.transform(X_train_nf),
                                 columns=X_train_nf.columns,
                                 index=X_train_nf.index)
X_test_scaled_nf = pd.DataFrame(data=scale.transform(X_test_nf),
                                columns=X_test_nf.columns,
                                index=X_test_nf.index)

### Baseline: Nutrition Facts

In [17]:
# calculat a baseline
act_pred_error_nf = pd.DataFrame({"actual": y_train_nf})
act_pred_error_nf["baseline_prediction"] = y_train_nf.value_counts().index[0]

baseline_acc_nf = accuracy_score(act_pred_error_nf["actual"], act_pred_error_nf["baseline_prediction"])

# print baseline accuracy
print(f"Baseline Accuracy Score: {round(baseline_acc_nf, 2)}%")

Baseline Accuracy Score: 0.44%
