## FV and Fat Outcomes Extraction
#### This notebook extracts 212 participants' self-reported fruit/vegetable credits and fat intake from the raw dataset at daily-level granularity
#### The inputs and outputs are in csv format

In [2]:
import pandas as pd
import numpy as np
import pandasql as ps
import time 
import datetime 
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
#import tables
# Read the four raw CSV exports (uploads, food items, servings, users),
# in that order, into their own dataframes.
df_food, df_food_item, df_food_serving, df_user = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Raw Data/food_upload.csv',
        'Raw Data/food_foods_fv.csv',
        'Raw Data/food_servings.csv',
        'Raw Data/users.csv',
    )
)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
df_food.head(5)

Unnamed: 0,event_time,serving_id,food_id,serving_time,amount,time_upload,user_id,aid,status,update_time,credit,favorites
0,2012-08-08 06:58:48,218791,1063878,Breakfast,0.5,2012-08-08 06:59:30,10000020,05656201278A000002C8187AF,,,1.0,0
1,2012-08-08 06:57:49,172122,1102188,Lunch,1.0,2012-08-08 06:59:30,10000020,14957201278A000002C8187AF,,,0.0,0
2,2012-08-08 06:58:11,190182,1063324,Lunch,0.75,2012-08-08 06:59:30,10000020,21158201278A000002C8187AF,,,0.03,0
3,2012-08-08 07:44:28,218791,1063878,Breakfast,0.75,2012-08-08 07:47:07,10000020,02844201278A000002C80E683,,,1.5,0
4,2012-08-08 07:44:37,218791,1063878,Lunch,0.5,2012-08-08 07:47:07,10000020,13644201278A000002C80E683,,,1.0,0


In [154]:
df_food_item.head(5)

Unnamed: 0,food_id,category_id,name,is_fv,fv_excl_crit,is_fv_auto,isfv_tagger1,why1,isfv_tagger2,why2,...,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calcium,sodium,saturated_fatty_acids,cholesterol
0,1001004,8148,"Blue, bleu",,,False,,,,,...,353.0,21.4,28.74,2.34,0.5,0.0,528.0,1395.0,18.669,75.0
1,1001006,8148,Brie,,,False,,,,,...,334.0,20.75,27.68,0.45,0.45,0.0,184.0,629.0,17.41,100.0
2,1001009,8148,Cheddar,,,False,,,,,...,403.0,24.9,33.14,1.28,0.52,0.0,721.0,621.0,21.092,105.0
3,1001011,8148,Colby,,,False,,,,,...,394.0,23.76,32.11,2.57,0.52,0.0,685.0,604.0,20.218,95.0
4,1001026,8148,"Mozzarella, whole milk",,,False,,,,,...,300.0,22.17,22.35,2.19,1.03,0.0,505.0,627.0,13.152,79.0


In [155]:
df_food_serving.head(5)

Unnamed: 0,serving_id,food_id,name,size,fv_credit,fv_cup,fv_type
0,115152,1112899,"serving, 1/5 broccoflower (3.5 oz)",99.0,3.04,1.52,S = Standard x 2
1,191727,1063605,"melon, 15"" long x 7-1/2"" dia (9 lbs 15.4 oz)",4518.0,59.0,29.5,S = Standard x 2
2,200244,1173830,bag (4 lbs 8 oz),2041.0,58.32,29.16,S = Standard x 2
3,200217,1173827,bag (4 lbs 8 oz),2041.0,45.36,22.68,S = Standard x 2
4,200235,1122058,bag (3 lbs 8 oz),1588.0,37.36,18.68,S = Standard x 2


### Extract User (intervention + follow-up)

In [156]:
#all participants
# Participants are the accounts whose username starts with 'mbc2'.
# The vectorised prefix test treats a missing username as "not a participant"
# (na=False) instead of raising when a NaN value is sliced, and it does not
# depend on df_user having a default 0..n-1 index.
is_participant = df_user['username'].str.startswith('mbc2', na=False)
# Sorted list of participant user ids, used by the later filtering cells.
id_list = sorted(df_user.loc[is_participant, 'user_id'].tolist())
print('There are', len(id_list), 'participants in intervention/follow-up period')

There are 212 participants in intervention/follow-up period


In [157]:
# selecting rows for participants only
# Boolean mask of uploads whose user_id is in id_list; the surviving rows are
# then ordered by upload timestamp.
participant_rows = df_food['user_id'].isin(id_list)
df_food_new = df_food.loc[participant_rows].sort_values(by=['time_upload'])

In [158]:
# Row counts before and after restricting the uploads to participants.
n_all_entries = len(df_food)
n_participant_entries = len(df_food_new)
print('Total number of daily food entries:', n_all_entries)
print('Total number of daily food entries for', len(id_list), 'participants only:', n_participant_entries)

Total number of daily food entries: 280208
Total number of daily food entries for 212 participants only: 279358


In [159]:
#sanity check (sql)
# Number of distinct user_id values in the unfiltered upload table.
ps.sqldf("SELECT count(distinct user_id) as Total_Number_User FROM df_food")

Unnamed: 0,Total_Number_User
0,226


In [160]:
#sanity check (sql)
# Number of distinct user_id values left after filtering to participants.
# The result column is labelled as a user count (it was previously labelled
# "Record", although the query counts distinct users, not records).
ps.sqldf("SELECT count(distinct user_id) as Total_Number_User FROM df_food_new")

Unnamed: 0,Total_Number_Record
0,206


In [161]:
#Missing Users
# Participants listed in id_list that have no row in the filtered upload table.
exsiting_user = df_food_new['user_id'].tolist()
missing_users = np.setdiff1d(id_list, exsiting_user)
print('User record not found in food upload:', missing_users)

User record not found in food upload: [ 42 125 239 250 543 581]


## Merging food_upload / food_foods_fv / food_servings

In [162]:
#sanity check for duplicates(sql)
# Counts distinct food_id values; if this equals the number of rows in
# df_food_item, every row is a distinct food item.
ps.sqldf("SELECT count(distinct food_id) as total_foods FROM df_food_item")

Unnamed: 0,total_foods
0,71853


In [163]:
df_food_item.shape

(71853, 30)

Since the two numbers (71853) match, each row is a unique food item.

#### Merge food_upload + food_foods_fv

In [164]:
#Merge upload and item (sql)
# LEFT JOIN on food_id: every participant upload row is kept; when the food_id
# has no match in df_food_item, Food_ID2 and the item columns (name, base and
# the nutrition metrics) come back as null and are filled in later cells.
df_test = ps.sqldf("SELECT d1.food_id as Food_ID, d2.food_id as Food_ID2, d1.credit, d1.serving_id, d1.time_upload, d1.user_id, d1.serving_time, d1.amount, d2.name, d2.base, d2.calories, d2.protein, d2.total_fat, d2.total_carbohydrate, d2.sugars, d2.fiber, d2.calcium, d2.sodium, d2.saturated_fatty_acids, d2.cholesterol FROM df_food_new as d1 LEFT JOIN df_food_item as d2 ON d1.food_id = d2.food_id")

In [165]:
df_test

Unnamed: 0,Food_ID,Food_ID2,credit,serving_id,time_upload,user_id,serving_time,amount,name,base,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calcium,sodium,saturated_fatty_acids,cholesterol
0,1001129,1001129.0,0.0,190677,2012-08-13 15:53:24,14,Breakfast,2.00,"Chicken Egg, whole, hard-boiled",100.0,155.0,12.58,10.61,1.12,1.12,0.0,50.0,124.0,3.267,424.0
1,1159175,1159175.0,2.25,175688,2012-08-13 15:53:24,14,Lunch,1.50,"Crispy Lettuce, Original Iceberg Garden Salad",85.0,15.0,1.00,0.00,3.00,2.00,4.0,20.0,0.0,0.000,0.0
2,1063636,1063636.0,0.5,108974,2012-08-13 15:53:24,14,Lunch,0.25,"Strawberries, sweetened, sliced",100.0,96.0,0.53,0.13,25.92,24.01,1.9,11.0,3.0,0.007,0.0
3,1063878,1063878.0,4.0,218795,2012-08-13 16:31:43,14,Snacks,1.00,"Apple, with Skin, raw",100.0,52.0,0.26,0.17,13.81,10.39,2.4,6.0,1.0,0.028,0.0
4,1073329,1073329.0,1.5,218833,2012-08-14 08:04:37,9,Breakfast,1.00,"Banana, raw",100.0,89.0,1.09,0.33,22.84,12.23,2.6,5.0,1.0,0.112,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279353,1142000,1142000.0,4.0,158829,2015-03-29 13:55:36,596,Lunch,2.00,"Concord Grapes, raw",100.0,67.0,0.60,0.35,17.00,16.25,0.9,14.0,2.0,0.110,0.0
279354,1063719,1063719.0,4.08,191649,2015-03-29 13:56:13,596,Lunch,2.00,"Oranges, Average all Varieties, raw",100.0,47.0,0.94,0.12,11.75,9.35,2.4,40.0,0.0,0.015,0.0
279355,1071028,1071028.0,1.9,163508,2015-03-29 13:57:12,596,Dinner,2.00,"Spinach, raw, edible portion",100.0,23.0,2.86,0.39,3.63,0.42,2.2,99.0,79.0,0.063,0.0
279356,1069387,1069387.0,8.0,109110,2015-03-29 13:59:02,596,Dinner,2.00,"Raisins, golden seedless",100.0,302.0,3.39,0.46,79.52,59.19,4.0,53.0,12.0,0.151,0.0


In [166]:
#new columns
df_test.columns

Index(['Food_ID', 'Food_ID2', 'credit', 'serving_id', 'time_upload', 'user_id',
       'serving_time', 'amount', 'name', 'base', 'calories', 'protein',
       'total_fat', 'total_carbohydrate', 'sugars', 'fiber', 'calcium',
       'sodium', 'saturated_fatty_acids', 'cholesterol'],
      dtype='object')

In [169]:
df_food_new.shape

(279358, 12)

In [168]:
df_test.shape

(279358, 20)

The row count (279358) is the same in food_upload and in the merged dataframe (sanity check)

In [170]:
#fill null fv base values with 1 so the later size / base ratio is defined
# (a null base would otherwise propagate as NaN through the calculation)
df_test = df_test.fillna({"base": 1})

In [171]:
#fill all null nutrition facts with 0
# Names of the nutrition metric columns; reused by the calculation cells.
nutrition_List = [
    'calories', 'protein', 'total_fat', 'total_carbohydrate', 'sugars',
    'fiber', 'calcium', 'sodium', 'saturated_fatty_acids', 'cholesterol',
]
# Replace missing values in all nutrition columns in a single step.
df_test[nutrition_List] = df_test[nutrition_List].fillna(0)

#### Merge food_upload + food_foods_fv + food_servings

In [172]:
#Merge the upload+item frame (df_test) with servings (sql)
# INNER JOIN on both food_id and serving_id: uploads whose (food, serving) pair
# has no row in df_food_serving are dropped. Adds the serving size and
# fv_credit; Food_ID3 / Serving_ID2 are the serving-side copies of the join keys.
df_test2 = ps.sqldf("SELECT d1.Food_ID as Food_ID, d1.Food_ID2 as Food_ID2, d2.food_id as Food_ID3, d1.credit, d1.serving_id as Serving_ID1, d2.serving_id as Serving_ID2, d2.size, d2.fv_credit, d1.time_upload, d1.user_id, d1.serving_time, d1.amount, d1.name, d1.base, d1.calories, d1.protein, d1.total_fat, d1.total_carbohydrate, d1.sugars, d1.fiber, d1.calcium, d1.sodium, d1.saturated_fatty_acids, d1.cholesterol FROM df_test as d1 INNER JOIN df_food_serving as d2 ON d1.Food_ID = d2.food_id and d1.serving_id = d2.serving_id")

In [173]:
df_test2

Unnamed: 0,Food_ID,Food_ID2,Food_ID3,credit,Serving_ID1,Serving_ID2,size,fv_credit,time_upload,user_id,...,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calcium,sodium,saturated_fatty_acids,cholesterol
0,1001129,1001129.0,1001129,0.0,190677,190677,50.0,0.00,2012-08-13 15:53:24,14,...,155.0,12.58,10.61,1.12,1.12,0.0,50.0,124.0,3.267,424.0
1,1159175,1159175.0,1159175,2.25,175688,175688,85.0,1.50,2012-08-13 15:53:24,14,...,15.0,1.00,0.00,3.00,2.00,4.0,20.0,0.0,0.000,0.0
2,1063636,1063636.0,1063636,0.5,108974,108974,255.0,2.00,2012-08-13 15:53:24,14,...,96.0,0.53,0.13,25.92,24.01,1.9,11.0,3.0,0.007,0.0
3,1063878,1063878.0,1063878,4.0,218795,218795,223.0,4.00,2012-08-13 16:31:43,14,...,52.0,0.26,0.17,13.81,10.39,2.4,6.0,1.0,0.028,0.0
4,1073329,1073329.0,1073329,1.5,218833,218833,136.0,1.50,2012-08-14 08:04:37,9,...,89.0,1.09,0.33,22.84,12.23,2.6,5.0,1.0,0.112,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
274112,1142000,1142000.0,1142000,4.0,158829,158829,92.0,2.00,2015-03-29 13:55:36,596,...,67.0,0.60,0.35,17.00,16.25,0.9,14.0,2.0,0.110,0.0
274113,1063719,1063719.0,1063719,4.08,191649,191649,184.0,2.04,2015-03-29 13:56:13,596,...,47.0,0.94,0.12,11.75,9.35,2.4,40.0,0.0,0.015,0.0
274114,1071028,1071028.0,1071028,1.9,163508,163508,28.4,0.95,2015-03-29 13:57:12,596,...,23.0,2.86,0.39,3.63,0.42,2.2,99.0,79.0,0.063,0.0
274115,1069387,1069387.0,1069387,8.0,109110,109110,145.0,4.00,2015-03-29 13:59:02,596,...,302.0,3.39,0.46,79.52,59.19,4.0,53.0,12.0,0.151,0.0


In [174]:
# Difference in row count between the left-joined frame and the inner-joined
# frame, i.e. the uploads dropped by the join with the servings table.
n_unmatched_uploads = len(df_test) - len(df_test2)
print('About', n_unmatched_uploads, 'number of food uploads could not be found with both food item and serving matched')

About 5241 number of food uploads could not be found with both food item and serving matched


<b>Calculation Part</b>

In [175]:
df_test2.columns

Index(['Food_ID', 'Food_ID2', 'Food_ID3', 'credit', 'Serving_ID1',
       'Serving_ID2', 'size', 'fv_credit', 'time_upload', 'user_id',
       'serving_time', 'amount', 'name', 'base', 'calories', 'protein',
       'total_fat', 'total_carbohydrate', 'sugars', 'fiber', 'calcium',
       'sodium', 'saturated_fatty_acids', 'cholesterol'],
      dtype='object')

In [176]:
nutrition_List

['calories',
 'protein',
 'total_fat',
 'total_carbohydrate',
 'sugars',
 'fiber',
 'calcium',
 'sodium',
 'saturated_fatty_acids',
 'cholesterol']

In [177]:
# Columns shown in the sample calculation: identifiers and the scaling inputs
# (amount, base, size) first, followed by every nutrition metric.
included_List = [
    'user_id', 'Food_ID', 'Serving_ID1', 'time_upload',
    'serving_time', 'amount', 'base', 'size',
    *nutrition_List,
]

In [197]:
#sample test
# First five merged rows, restricted to the columns named in included_List.
df_calcualte_test = df_test2.head(5).loc[:, included_List]

In [198]:
#Before Calculation
df_calcualte_test

Unnamed: 0,user_id,Food_ID,Serving_ID1,time_upload,serving_time,amount,base,size,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calcium,sodium,saturated_fatty_acids,cholesterol
0,14,1001129,190677,2012-08-13 15:53:24,Breakfast,2.0,100.0,50.0,155.0,12.58,10.61,1.12,1.12,0.0,50.0,124.0,3.267,424.0
1,14,1159175,175688,2012-08-13 15:53:24,Lunch,1.5,85.0,85.0,15.0,1.0,0.0,3.0,2.0,4.0,20.0,0.0,0.0,0.0
2,14,1063636,108974,2012-08-13 15:53:24,Lunch,0.25,100.0,255.0,96.0,0.53,0.13,25.92,24.01,1.9,11.0,3.0,0.007,0.0
3,14,1063878,218795,2012-08-13 16:31:43,Snacks,1.0,100.0,223.0,52.0,0.26,0.17,13.81,10.39,2.4,6.0,1.0,0.028,0.0
4,9,1073329,218833,2012-08-14 08:04:37,Breakfast,1.0,100.0,136.0,89.0,1.09,0.33,22.84,12.23,2.6,5.0,1.0,0.112,0.0


<b>Formula: (food_servings.size / food_foods_fv.base) * food_upload.amount * (nutrition metric)</b>

In [199]:
#After Calculation
# Per-row factor from the formula: (size / base) * amount.
serving_scale = df_calcualte_test['size'] / df_calcualte_test['base'] * df_calcualte_test['amount']
# Multiply every nutrition column by that factor in one vectorised step.
# Whole-column assignment replaces the element-wise chained writes
# (`df[col][i] = ...`), which only ran quietly with all warnings silenced and
# are not applied when pandas Copy-on-Write is enabled.
df_calcualte_test = df_calcualte_test.copy()
df_calcualte_test[nutrition_List] = df_calcualte_test[nutrition_List].mul(serving_scale, axis=0)

In [200]:
df_calcualte_test

Unnamed: 0,user_id,Food_ID,Serving_ID1,time_upload,serving_time,amount,base,size,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calcium,sodium,saturated_fatty_acids,cholesterol
0,14,1001129,190677,2012-08-13 15:53:24,Breakfast,2.0,100.0,50.0,155.0,12.58,10.61,1.12,1.12,0.0,50.0,124.0,3.267,424.0
1,14,1159175,175688,2012-08-13 15:53:24,Lunch,1.5,85.0,85.0,22.5,1.5,0.0,4.5,3.0,6.0,30.0,0.0,0.0,0.0
2,14,1063636,108974,2012-08-13 15:53:24,Lunch,0.25,100.0,255.0,61.2,0.337875,0.082875,16.524,15.306375,1.21125,7.0125,1.9125,0.004462,0.0
3,14,1063878,218795,2012-08-13 16:31:43,Snacks,1.0,100.0,223.0,115.96,0.5798,0.3791,30.7963,23.1697,5.352,13.38,2.23,0.06244,0.0
4,9,1073329,218833,2012-08-14 08:04:37,Breakfast,1.0,100.0,136.0,121.04,1.4824,0.4488,31.0624,16.6328,3.536,6.8,1.36,0.15232,0.0


In [201]:
#Applying to entire dataframe
# Work on a copy: a plain `df_final = df_test2` only aliases the frame, so the
# scaling would overwrite df_test2 and re-running this cell would scale the
# nutrition columns a second time.
df_final = df_test2.copy()
# Per-row factor from the formula: (size / base) * amount.
serving_scale = df_final['size'] / df_final['base'] * df_final['amount']
# Vectorised equivalent of the former row-by-row loop (same operation order:
# size / base * amount * metric), without chained assignment.
df_final[nutrition_List] = df_final[nutrition_List].mul(serving_scale, axis=0)

In [204]:
df_final.shape

(274117, 24)

In [203]:
df_final.head(5)

Unnamed: 0,Food_ID,Food_ID2,Food_ID3,credit,Serving_ID1,Serving_ID2,size,fv_credit,time_upload,user_id,...,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calcium,sodium,saturated_fatty_acids,cholesterol
0,1001129,1001129.0,1001129,0.0,190677,190677,50.0,0.0,2012-08-13 15:53:24,14,...,155.0,12.58,10.61,1.12,1.12,0.0,50.0,124.0,3.267,424.0
1,1159175,1159175.0,1159175,2.25,175688,175688,85.0,1.5,2012-08-13 15:53:24,14,...,22.5,1.5,0.0,4.5,3.0,6.0,30.0,0.0,0.0,0.0
2,1063636,1063636.0,1063636,0.5,108974,108974,255.0,2.0,2012-08-13 15:53:24,14,...,61.2,0.337875,0.082875,16.524,15.306375,1.21125,7.0125,1.9125,0.004462,0.0
3,1063878,1063878.0,1063878,4.0,218795,218795,223.0,4.0,2012-08-13 16:31:43,14,...,115.96,0.5798,0.3791,30.7963,23.1697,5.352,13.38,2.23,0.06244,0.0
4,1073329,1073329.0,1073329,1.5,218833,218833,136.0,1.5,2012-08-14 08:04:37,9,...,121.04,1.4824,0.4488,31.0624,16.6328,3.536,6.8,1.36,0.15232,0.0


### Save the results (local csv)

In [205]:
#Save as csv
# Write the scaled per-upload table without the dataframe index column.
output_csv = 'Result/food_upload_combined.csv'
df_final.to_csv(output_csv, index=False)