## FV and Fat Outcomes Extraction
#### This notebook extracts self-reported fruit/vegetable credits and fat intake for 212 participants from the raw dataset and aggregates them at daily granularity.
#### All inputs and outputs are CSV files.

In [1]:
import pandas as pd
import numpy as np
import pandasql as ps
import time 
import datetime 
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the five raw CSV exports: food uploads, food catalogue, serving sizes, users and recipes.
df_food, df_food_item, df_food_serving, df_user, df_recipe = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Raw Data/food_upload.csv',
        'Raw Data/food_foods_fv.csv',
        'Raw Data/food_servings.csv',
        'Raw Data/users.csv',
        'Raw Data/recipe_upload.csv',
    )
)

  interactivity=interactivity, compiler=compiler, result=result)


### Visualize raw food-related data 

In [3]:
# Preview the first five rows of the raw food-upload table.
df_food.head(5)

Unnamed: 0,event_time,serving_id,food_id,serving_time,amount,time_upload,user_id,aid,status,update_time,credit,favorites
0,2012-08-08 06:58:48,218791,1063878,Breakfast,0.5,2012-08-08 06:59:30,10000020,05656201278A000002C8187AF,,,1.0,0
1,2012-08-08 06:57:49,172122,1102188,Lunch,1.0,2012-08-08 06:59:30,10000020,14957201278A000002C8187AF,,,0.0,0
2,2012-08-08 06:58:11,190182,1063324,Lunch,0.75,2012-08-08 06:59:30,10000020,21158201278A000002C8187AF,,,0.03,0
3,2012-08-08 07:44:28,218791,1063878,Breakfast,0.75,2012-08-08 07:47:07,10000020,02844201278A000002C80E683,,,1.5,0
4,2012-08-08 07:44:37,218791,1063878,Lunch,0.5,2012-08-08 07:47:07,10000020,13644201278A000002C80E683,,,1.0,0


In [4]:
# Preview the first five rows of the food catalogue (per-food nutrition facts).
df_food_item.head(5)

Unnamed: 0,food_id,category_id,name,is_fv,fv_excl_crit,is_fv_auto,isfv_tagger1,why1,isfv_tagger2,why2,...,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calcium,sodium,saturated_fatty_acids,cholesterol
0,1001004,8148,"Blue, bleu",,,False,,,,,...,353.0,21.4,28.74,2.34,0.5,0.0,528.0,1395.0,18.669,75.0
1,1001006,8148,Brie,,,False,,,,,...,334.0,20.75,27.68,0.45,0.45,0.0,184.0,629.0,17.41,100.0
2,1001009,8148,Cheddar,,,False,,,,,...,403.0,24.9,33.14,1.28,0.52,0.0,721.0,621.0,21.092,105.0
3,1001011,8148,Colby,,,False,,,,,...,394.0,23.76,32.11,2.57,0.52,0.0,685.0,604.0,20.218,95.0
4,1001026,8148,"Mozzarella, whole milk",,,False,,,,,...,300.0,22.17,22.35,2.19,1.03,0.0,505.0,627.0,13.152,79.0


In [5]:
# Preview the first five rows of the serving table (size and fv_credit per serving_id).
df_food_serving.head(5)

Unnamed: 0,serving_id,food_id,name,size,fv_credit,fv_cup,fv_type
0,115152,1112899,"serving, 1/5 broccoflower (3.5 oz)",99.0,3.04,1.52,S = Standard x 2
1,191727,1063605,"melon, 15"" long x 7-1/2"" dia (9 lbs 15.4 oz)",4518.0,59.0,29.5,S = Standard x 2
2,200244,1173830,bag (4 lbs 8 oz),2041.0,58.32,29.16,S = Standard x 2
3,200217,1173827,bag (4 lbs 8 oz),2041.0,45.36,22.68,S = Standard x 2
4,200235,1122058,bag (3 lbs 8 oz),1588.0,37.36,18.68,S = Standard x 2


In [7]:
# Preview the first five rows of the raw recipe-upload table.
df_recipe.head(5)

Unnamed: 0,itemid,creation_time,food_id,item_amount,serving_id,recipe_name,user_id,recipe_id,server_time,creditrecipe,servamount,id
0,7318282012910A000002C80E683,2012-10-10 12:28:18,1141151,0.5,180700,applle,10000020,7334282012910A000002C80E683,2012-10-10 12:34:04,0.1,1,1
1,7323282012910A000002C80E683,2012-10-10 12:28:23,1129043,0.25,125669,applle,10000020,7334282012910A000002C80E683,2012-10-10 12:34:04,0.0,1,2
2,7318282012910A000002C80E683,2012-10-10 12:28:18,1141151,0.5,180700,applle,10000020,7457282012910A000002C80E683,2012-10-10 12:34:04,0.1,1,3
3,7323282012910A000002C80E683,2012-10-10 12:28:23,1129043,0.25,125669,applle,10000020,7457282012910A000002C80E683,2012-10-10 12:34:04,0.0,1,4
4,7318282012910A000002C80E683,2012-10-10 12:28:18,1141151,0.5,180700,applle,10000020,7911442012910A000002C80E683,2012-10-10 17:49:22,0.1,3,5


### Extract Valid Users (intervention + follow-up)

In [8]:
# all participants
# Study accounts are the users whose username starts with 'mbc2'.
# The vectorised test does not depend on df_user having a 0..n-1 index, and
# `na=False` treats a missing username as a non-participant instead of raising.
is_participant = df_user['username'].str.startswith('mbc2', na=False)
id_list = sorted(df_user.loc[is_participant, 'user_id'].tolist())

print('There are', len(id_list), 'participants in intervention/follow-up period')

There are 212 participants in intervention/follow-up period


### Extract Recipe

In [9]:
# Restrict the recipe rows to study participants, ordered by creation timestamp.
participant_rows = df_recipe['user_id'].isin(id_list)
df_recipe = df_recipe.loc[participant_rows].sort_values(by=['creation_time'])
df_recipe.head(5)

Unnamed: 0,itemid,creation_time,food_id,item_amount,serving_id,recipe_name,user_id,recipe_id,server_time,creditrecipe,servamount,id
27,865920121019A000002C81055D,2012-11-19 14:59:06,1143646,0.5,149246,Minestrone,69,8473020121019A000002C81055D,2012-11-19 19:38:53,4.0,9,28
48,865920121019A000002C81055D,2012-11-19 14:59:06,1143646,0.5,149246,Minestrone,69,21434320121022none,2012-11-23 05:21:36,4.0,9,49
65,865920121019A000002C81055D,2012-11-19 14:59:06,1143646,0.5,149246,Minestrone,69,5842520121027none,2012-11-27 23:16:21,4.0,9,66
41,865920121019A000002C81055D,2012-11-19 14:59:06,1143646,0.5,149246,Minestrone,69,1725320121022none,2012-11-22 15:00:59,4.0,9,42
34,865920121019A000002C81055D,2012-11-19 14:59:06,1143646,0.5,149246,Minestrone,69,16453620121020none,2012-11-20 23:45:47,4.0,9,35


In [10]:
# Keep only the recipe fields used by the following cells.
col = [
    'user_id', 'creation_time', 'food_id', 'serving_id',
    'item_amount', 'recipe_name', 'creditrecipe', 'servamount',
]
df_recipe = df_recipe.loc[:, col]
df_recipe.head(5)

Unnamed: 0,user_id,creation_time,food_id,serving_id,item_amount,recipe_name,creditrecipe,servamount
27,69,2012-11-19 14:59:06,1143646,149246,0.5,Minestrone,4.0,9
48,69,2012-11-19 14:59:06,1143646,149246,0.5,Minestrone,4.0,9
65,69,2012-11-19 14:59:06,1143646,149246,0.5,Minestrone,4.0,9
41,69,2012-11-19 14:59:06,1143646,149246,0.5,Minestrone,4.0,9
34,69,2012-11-19 14:59:06,1143646,149246,0.5,Minestrone,4.0,9


In [11]:
# Merge recipe uploads with the food catalogue (sql).
# LEFT JOIN on food_id: every recipe row is kept; rows with no catalogue match
# get NULL name/base/nutrition values. creation_time is exposed as upload_time
# and item_amount as amount.
df_test = ps.sqldf("SELECT d1.food_id as food_id, d1.creditrecipe, d1.serving_id, d1.creation_time as upload_time, d1.user_id, d1.item_amount as amount, d2.name, d2.base, d2.calories, d2.protein, d2.total_fat, d2.total_carbohydrate, d2.sugars, d2.fiber, d2.calcium, d2.sodium, d2.saturated_fatty_acids, d2.cholesterol FROM df_recipe as d1 LEFT JOIN df_food_item as d2 ON d1.food_id = d2.food_id")

In [12]:
# Preview the first six joined recipe rows.
df_test.head(6)

Unnamed: 0,food_id,creditrecipe,serving_id,upload_time,user_id,amount,name,base,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calcium,sodium,saturated_fatty_acids,cholesterol
0,1143646,4.0,149246,2012-11-19 14:59:06,69,0.5,Mixed Vegetables,85.0,60.0,2.0,0.0,11.0,3.0,3.0,20.0,40.0,0.0,0.0
1,1143646,4.0,149246,2012-11-19 14:59:06,69,0.5,Mixed Vegetables,85.0,60.0,2.0,0.0,11.0,3.0,3.0,20.0,40.0,0.0,0.0
2,1143646,4.0,149246,2012-11-19 14:59:06,69,0.5,Mixed Vegetables,85.0,60.0,2.0,0.0,11.0,3.0,3.0,20.0,40.0,0.0,0.0
3,1143646,4.0,149246,2012-11-19 14:59:06,69,0.5,Mixed Vegetables,85.0,60.0,2.0,0.0,11.0,3.0,3.0,20.0,40.0,0.0,0.0
4,1143646,4.0,149246,2012-11-19 14:59:06,69,0.5,Mixed Vegetables,85.0,60.0,2.0,0.0,11.0,3.0,3.0,20.0,40.0,0.0,0.0
5,1143646,4.0,149246,2012-11-19 14:59:06,69,0.5,Mixed Vegetables,85.0,60.0,2.0,0.0,11.0,3.0,3.0,20.0,40.0,0.0,0.0


In [13]:
# drop duplicated records and fill missing value with 999999
# `.copy()` gives an independent frame, so the column assignment below no
# longer raises SettingWithCopyWarning.
df_test_clean = df_test.drop_duplicates().copy()
# A missing `base` becomes a very large sentinel; `base` is the divisor in the
# later size / base * amount scaling.
df_test_clean["base"] = df_test_clean["base"].fillna(999999)
df_test_clean.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,food_id,creditrecipe,serving_id,upload_time,user_id,amount,name,base,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calcium,sodium,saturated_fatty_acids,cholesterol
0,1143646,4.0,149246,2012-11-19 14:59:06,69,0.5,Mixed Vegetables,85.0,60.0,2.0,0.0,11.0,3.0,3.0,20.0,40.0,0.0,0.0
6,1053727,3.8,183733,2012-11-19 15:00:36,69,1.0,"Kidney Beans, Dark Red",127.0,110.0,8.0,0.0,20.0,2.0,6.0,60.0,340.0,0.0,0.0
12,1104261,3.26,130771,2012-11-19 15:02:30,69,1.0,"Plain Beans, Garbanzo",130.0,130.0,7.0,1.0,21.0,0.5,5.0,60.0,30.0,0.0,0.0
18,1135298,3.0,136437,2012-11-19 15:06:44,69,1.5,"Tomatoes, Diced",121.0,30.0,1.0,0.0,6.0,3.0,2.0,40.0,280.0,0.0,0.0
24,1093661,2.88,169393,2012-11-19 15:18:32,69,2.0,"100%, Low Sodium",236.6,50.0,2.0,0.0,10.0,8.0,2.0,20.0,140.0,0.0,0.0


In [14]:
# fill all empty nutrition facts with 0
nutrition_List = ['calories', 'protein','total_fat', 'total_carbohydrate', 'sugars', 'fiber', 'calcium','sodium', 'saturated_fatty_acids', 'cholesterol']
# One frame-level fillna (per-column fill values) returns a new DataFrame, which
# avoids the SettingWithCopyWarning raised by assigning column-by-column into a
# frame derived from drop_duplicates().
df_test_clean = df_test_clean.fillna({nutrition_item: 0 for nutrition_item in nutrition_List})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [15]:
# merge upload and serving (sql)
# INNER JOIN on (food_id, serving_id): recipe rows with no matching row in
# df_food_serving are dropped. Adds the serving `size` and per-serving `fv_credit`.
df_test2 = ps.sqldf("SELECT d1.food_id as food_id, d1.creditrecipe, d1.serving_id as serving_id, d2.size, d2.fv_credit, d1.upload_time, d1.user_id, d1.amount, d1.name, d1.base, d1.calories, d1.protein, d1.total_fat, d1.total_carbohydrate, d1.sugars, d1.fiber, d1.calcium, d1.sodium, d1.saturated_fatty_acids, d1.cholesterol FROM df_test_clean as d1 INNER JOIN df_food_serving as d2 ON d1.Food_ID = d2.food_id and d1.serving_id = d2.serving_id")
df_test2.head(5)

Unnamed: 0,food_id,creditrecipe,serving_id,size,fv_credit,upload_time,user_id,amount,name,base,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calcium,sodium,saturated_fatty_acids,cholesterol
0,1143646,4.0,149246,907.2,8.0,2012-11-19 14:59:06,69,0.5,Mixed Vegetables,85.0,60.0,2.0,0.0,11.0,3.0,3.0,20.0,40.0,0.0,0.0
1,1053727,3.8,183733,425.0,3.8,2012-11-19 15:00:36,69,1.0,"Kidney Beans, Dark Red",127.0,110.0,8.0,0.0,20.0,2.0,6.0,60.0,340.0,0.0,0.0
2,1104261,3.26,130771,425.0,3.26,2012-11-19 15:02:30,69,1.0,"Plain Beans, Garbanzo",130.0,130.0,7.0,1.0,21.0,0.5,5.0,60.0,30.0,0.0,0.0
3,1135298,3.0,136437,242.0,2.0,2012-11-19 15:06:44,69,1.5,"Tomatoes, Diced",121.0,30.0,1.0,0.0,6.0,3.0,2.0,40.0,280.0,0.0,0.0
4,1093661,2.88,169393,340.0,1.44,2012-11-19 15:18:32,69,2.0,"100%, Low Sodium",236.6,50.0,2.0,0.0,10.0,8.0,2.0,20.0,140.0,0.0,0.0


In [16]:
# Nutrient columns that are scaled and summed later.
nutrition_List = [
    'calories', 'protein', 'total_fat', 'total_carbohydrate', 'sugars',
    'fiber', 'calcium', 'sodium', 'saturated_fatty_acids', 'cholesterol',
]

# Identifier and quantity columns first, followed by the nutrient columns.
included_List = [
    'user_id', 'food_id', 'serving_id', 'upload_time', 'name',
    'amount', 'base', 'size', 'creditrecipe', 'fv_credit',
    *nutrition_List,
]
df_test2 = df_test2[included_List]

In [17]:
# calculate total nutrients
# Work on an explicit copy so the assignment below targets a real frame, not a
# slice of df_test2. This replaces the element-by-element chained-indexing loop
# that needed a blanket warnings.filterwarnings("ignore").
df_final = df_test2.copy()
# Per-row multiplier: (serving size / base) * amount.
serving_multiplier = df_final['size'] / df_final['base'] * df_final['amount']
# Scale every nutrient column row-wise by that multiplier in one vectorised step.
df_final[nutrition_List] = df_final[nutrition_List].mul(serving_multiplier, axis=0)

In [21]:
# calculate result fv credit (multiply by amount)
# Vectorised replacement for the per-row chained-indexing loop.
# NOTE: fv_credit is overwritten in place, so re-running this cell multiplies by
# `amount` a second time.
df_final['fv_credit'] = df_final['fv_credit'] * df_final['amount']

# export recipe final table
# The CSV is written before upload_time is truncated, so it keeps full timestamps.
df_recipe_final = df_final
df_recipe_final.to_csv('Result/Food/recipe.csv', index=False)
#get user level
# Keep only the first 10 characters of the timestamp (the date part) for daily grouping.
df_recipe_final['upload_time'] = df_recipe_final['upload_time'].astype('string').str[:10]
df_recipe_final.head(3)

Unnamed: 0,user_id,food_id,serving_id,upload_time,name,amount,base,size,creditrecipe,fv_credit,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calcium,sodium,saturated_fatty_acids,cholesterol
0,69,1143646,149246,2012-11-19,Mixed Vegetables,0.5,85.0,907.2,4.0,4.0,320.188235,10.672941,0.0,58.701176,16.009412,16.009412,106.729412,213.458824,0.0,0.0
1,69,1053727,183733,2012-11-19,"Kidney Beans, Dark Red",1.0,127.0,425.0,3.8,3.8,368.110236,26.771654,0.0,66.929134,6.692913,20.07874,200.787402,1137.795276,0.0,0.0
2,69,1104261,130771,2012-11-19,"Plain Beans, Garbanzo",1.0,130.0,425.0,3.26,3.26,425.0,22.884615,3.269231,68.653846,1.634615,16.346154,196.153846,98.076923,0.0,0.0


In [22]:
# Aggregate day-level nutrients: one row per (user_id, upload_time) with summed
# nutrients and fv_credit.
# NOTE(review): the calcium sum is aliased as `calciumm` (double "m"). It looks
# like a typo, but the name ends up in the exported CSV, so confirm with
# downstream consumers before renaming.
df_recipe_user = ps.sqldf("SELECT user_id, upload_time as upload_time, sum(calories) as calories, sum(protein) as protein, sum(total_fat) as total_fat, sum(total_carbohydrate) as total_carbohydrate, sum(sugars) as sugars, sum(fiber) as fiber, sum(calcium) as calciumm, sum(sodium) as sodium, sum(saturated_fatty_acids) as saturated_fatty_acids, sum(cholesterol) as cholesterol, sum(fv_credit) as fv_credit FROM df_recipe_final group by user_id, upload_time")

In [23]:
# Preview the per-user, per-day recipe totals.
df_recipe_user.head(5)

Unnamed: 0,user_id,upload_time,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calciumm,sodium,saturated_fatty_acids,cholesterol,fv_credit
0,13,2013-02-13,336.845114,23.452183,4.490644,54.404366,10.984407,5.489605,309.376299,1558.170478,1.496881,39.91684,3.19
1,13,2013-02-22,355.845114,23.452183,4.490644,64.404366,16.984407,5.489605,309.376299,2678.170478,1.496881,39.91684,4.55
2,13,2013-06-21,338.144114,23.730363,4.602314,56.621356,14.008067,5.588205,324.623299,2115.517478,1.51914,39.91684,2.6125
3,51,2012-12-07,1808.693214,131.653757,70.720755,155.357035,54.792319,40.196097,608.006774,5226.455968,18.724713,456.026786,16.87
4,51,2012-12-12,273.2,7.8573,12.7254,31.365325,8.680025,3.08975,127.9475,668.73,3.960848,18.9125,3.445


In [24]:
# Export the per-user, per-day recipe totals (row index not written).
df_recipe_user.to_csv('Result/Food/recipe_userLevel.csv', index=False)

### Regular food 

In [25]:
# Keep only participants' food uploads, ordered by upload timestamp.
participant_uploads = df_food['user_id'].isin(id_list)
df_food_new = df_food.loc[participant_uploads].sort_values(by=['time_upload'])

In [26]:
# Compare the raw upload row count with the count restricted to study participants.
print('Total number of daily food entries:', df_food.shape[0])
print('Total number of daily food entries for', len(id_list), 'participants only:', df_food_new.shape[0])

Total number of daily food entries: 280208
Total number of daily food entries for 212 participants only: 279358


In [27]:
# Sanity check (sql): number of distinct users in the raw upload table.
ps.sqldf("SELECT count(distinct user_id) as Total_Number_User FROM df_food")

Unnamed: 0,Total_Number_User
0,226


In [28]:
# Sanity check (sql): number of distinct participants that have at least one upload.
# NOTE: despite the alias `Total_Number_Record`, this counts distinct user_id values, not records.
ps.sqldf("SELECT count(distinct user_id) as Total_Number_Record FROM df_food_new")

Unnamed: 0,Total_Number_Record
0,206


In [29]:
# Participants from id_list that never appear in the food uploads.
exsiting_user = df_food_new['user_id'].tolist()
missing_ids = np.setdiff1d(id_list, exsiting_user)
print('User record not found in food upload:', missing_ids)

User record not found in food upload: [ 42 125 239 250 543 581]


## Merging food_upload / food_foods_fv / food_servings

#### Merge food_upload + food_foods_fv

In [30]:
# Merge participant uploads with the food catalogue (sql).
# LEFT JOIN on food_id keeps every upload; Food_ID2 (the catalogue side) is NULL
# when the food has no catalogue entry.
df_test = ps.sqldf("SELECT d1.food_id as Food_ID, d2.food_id as Food_ID2, d1.credit, d1.serving_id, d1.time_upload, d1.user_id, d1.serving_time, d1.amount, d2.name, d2.base, d2.calories, d2.protein, d2.total_fat, d2.total_carbohydrate, d2.sugars, d2.fiber, d2.calcium, d2.sodium, d2.saturated_fatty_acids, d2.cholesterol FROM df_food_new as d1 LEFT JOIN df_food_item as d2 ON d1.food_id = d2.food_id")

In [31]:
# Column names of the upload + catalogue join.
df_test.columns

Index(['Food_ID', 'Food_ID2', 'credit', 'serving_id', 'time_upload', 'user_id',
       'serving_time', 'amount', 'name', 'base', 'calories', 'protein',
       'total_fat', 'total_carbohydrate', 'sugars', 'fiber', 'calcium',
       'sodium', 'saturated_fatty_acids', 'cholesterol'],
      dtype='object')

In [32]:
# (rows, columns) of the joined frame; compared against the upload count in the note below.
df_test.shape

(279358, 20)

Sanity check: the merged dataframe has 279358 rows, the same as the participant-only food_upload table, so the left join neither dropped nor duplicated any upload.

In [33]:
# A missing `base` becomes 1 so the later size / base scaling divides by 1
# instead of propagating NaN.
df_test = df_test.fillna({"base": 1})

In [34]:
# Column names of df_test as a plain list.
list(df_test)

['Food_ID',
 'Food_ID2',
 'credit',
 'serving_id',
 'time_upload',
 'user_id',
 'serving_time',
 'amount',
 'name',
 'base',
 'calories',
 'protein',
 'total_fat',
 'total_carbohydrate',
 'sugars',
 'fiber',
 'calcium',
 'sodium',
 'saturated_fatty_acids',
 'cholesterol']

In [35]:
# Foods without nutrition facts contribute 0 to every nutrient.
nutrition_List = [
    'calories', 'protein', 'total_fat', 'total_carbohydrate', 'sugars',
    'fiber', 'calcium', 'sodium', 'saturated_fatty_acids', 'cholesterol',
]
df_test[nutrition_List] = df_test[nutrition_List].fillna(0)

#### Merge food_upload + food_foods_fv + food_servings

In [36]:
# Merge the upload + catalogue frame with the serving table (sql).
# INNER JOIN on (food_id, serving_id): uploads with no matching serving row are
# dropped. Adds the serving `size` and per-serving `fv_credit`.
df_test2 = ps.sqldf("SELECT d1.Food_ID as Food_ID, d1.Food_ID2 as Food_ID2, d2.food_id as Food_ID3, d1.credit, d1.serving_id as Serving_ID1, d2.serving_id as Serving_ID2, d2.size, d2.fv_credit, d1.time_upload, d1.user_id, d1.serving_time, d1.amount, d1.name, d1.base, d1.calories, d1.protein, d1.total_fat, d1.total_carbohydrate, d1.sugars, d1.fiber, d1.calcium, d1.sodium, d1.saturated_fatty_acids, d1.cholesterol FROM df_test as d1 INNER JOIN df_food_serving as d2 ON d1.Food_ID = d2.food_id and d1.serving_id = d2.serving_id")

In [37]:
# Preview the three-way join (upload + catalogue + serving).
df_test2.head(5)

Unnamed: 0,Food_ID,Food_ID2,Food_ID3,credit,Serving_ID1,Serving_ID2,size,fv_credit,time_upload,user_id,...,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calcium,sodium,saturated_fatty_acids,cholesterol
0,1001129,1001129.0,1001129,0.0,190677,190677,50.0,0.0,2012-08-13 15:53:24,14,...,155.0,12.58,10.61,1.12,1.12,0.0,50.0,124.0,3.267,424.0
1,1159175,1159175.0,1159175,2.25,175688,175688,85.0,1.5,2012-08-13 15:53:24,14,...,15.0,1.0,0.0,3.0,2.0,4.0,20.0,0.0,0.0,0.0
2,1063636,1063636.0,1063636,0.5,108974,108974,255.0,2.0,2012-08-13 15:53:24,14,...,96.0,0.53,0.13,25.92,24.01,1.9,11.0,3.0,0.007,0.0
3,1063878,1063878.0,1063878,4.0,218795,218795,223.0,4.0,2012-08-13 16:31:43,14,...,52.0,0.26,0.17,13.81,10.39,2.4,6.0,1.0,0.028,0.0
4,1073329,1073329.0,1073329,1.5,218833,218833,136.0,1.5,2012-08-14 08:04:37,9,...,89.0,1.09,0.33,22.84,12.23,2.6,5.0,1.0,0.112,0.0


In [38]:
# df_test holds every participant upload; df_test2 keeps only the rows that the
# INNER JOIN matched on (food_id, serving_id). The difference is the row count lost.
print('About', df_test.shape[0] - df_test2.shape[0], 'number of food uploads could not be found with both food item and serving matched')

About 5241 number of food uploads could not be found with both food item and serving matched


In [39]:
# Column names of the three-way join.
df_test2.columns

Index(['Food_ID', 'Food_ID2', 'Food_ID3', 'credit', 'Serving_ID1',
       'Serving_ID2', 'size', 'fv_credit', 'time_upload', 'user_id',
       'serving_time', 'amount', 'name', 'base', 'calories', 'protein',
       'total_fat', 'total_carbohydrate', 'sugars', 'fiber', 'calcium',
       'sodium', 'saturated_fatty_acids', 'cholesterol'],
      dtype='object')

<b>Calculation Part</b>

In [40]:
# Nutrient columns that are scaled per serving and summed per day.
nutrition_List = [
    'calories', 'protein', 'total_fat', 'total_carbohydrate', 'sugars',
    'fiber', 'calcium', 'sodium', 'saturated_fatty_acids', 'cholesterol',
]

In [41]:
# Identifier and quantity columns first, followed by the nutrient columns.
included_List = [
    'user_id', 'Food_ID', 'Serving_ID1', 'time_upload', 'serving_time',
    'amount', 'base', 'size', 'credit', 'fv_credit',
    *nutrition_List,
]
included_List

['user_id',
 'Food_ID',
 'Serving_ID1',
 'time_upload',
 'serving_time',
 'amount',
 'base',
 'size',
 'credit',
 'fv_credit',
 'calories',
 'protein',
 'total_fat',
 'total_carbohydrate',
 'sugars',
 'fiber',
 'calcium',
 'sodium',
 'saturated_fatty_acids',
 'cholesterol']

In [42]:
# Reduce the joined frame to the columns in included_List, then show the resulting columns.
df_test2 = df_test2[included_List]
df_test2.columns

Index(['user_id', 'Food_ID', 'Serving_ID1', 'time_upload', 'serving_time',
       'amount', 'base', 'size', 'credit', 'fv_credit', 'calories', 'protein',
       'total_fat', 'total_carbohydrate', 'sugars', 'fiber', 'calcium',
       'sodium', 'saturated_fatty_acids', 'cholesterol'],
      dtype='object')

<b>Formula: (food_servings.size / food_foods_fv.base) * food_upload.amount * (nutrition metric)</b>

In [43]:
#Applying to entire dataframe
#After Calculation
# Work on an explicit copy so the assignment below targets a real frame, not a
# slice of df_test2. This replaces the element-by-element chained-indexing loop
# (rows x nutrients Python-level assignments) that needed a blanket
# warnings.filterwarnings("ignore").
df_final = df_test2.copy()
# Per-row multiplier: (serving size / base) * amount.
serving_multiplier = df_final['size'] / df_final['base'] * df_final['amount']
# Scale every nutrient column row-wise by that multiplier in one vectorised step.
df_final[nutrition_List] = df_final[nutrition_List].mul(serving_multiplier, axis=0)

In [44]:
#calculate result fv credit (multiply by amount)
# Vectorised replacement for the per-row chained-indexing loop.
# NOTE: fv_credit is overwritten in place, so re-running this cell multiplies by
# `amount` a second time.
df_final['fv_credit'] = df_final['fv_credit'] * df_final['amount']

In [45]:
# Preview the uploads after nutrient and fv_credit scaling.
df_final.head(5)

Unnamed: 0,user_id,Food_ID,Serving_ID1,time_upload,serving_time,amount,base,size,credit,fv_credit,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calcium,sodium,saturated_fatty_acids,cholesterol
0,14,1001129,190677,2012-08-13 15:53:24,Breakfast,2.0,100.0,50.0,0.0,0.0,155.0,12.58,10.61,1.12,1.12,0.0,50.0,124.0,3.267,424.0
1,14,1159175,175688,2012-08-13 15:53:24,Lunch,1.5,85.0,85.0,2.25,2.25,22.5,1.5,0.0,4.5,3.0,6.0,30.0,0.0,0.0,0.0
2,14,1063636,108974,2012-08-13 15:53:24,Lunch,0.25,100.0,255.0,0.5,0.5,61.2,0.337875,0.082875,16.524,15.306375,1.21125,7.0125,1.9125,0.004462,0.0
3,14,1063878,218795,2012-08-13 16:31:43,Snacks,1.0,100.0,223.0,4.0,4.0,115.96,0.5798,0.3791,30.7963,23.1697,5.352,13.38,2.23,0.06244,0.0
4,9,1073329,218833,2012-08-14 08:04:37,Breakfast,1.0,100.0,136.0,1.5,1.5,121.04,1.4824,0.4488,31.0624,16.6328,3.536,6.8,1.36,0.15232,0.0


In [46]:
# Keep only the rows that have a serving size.
has_size = df_final['size'].notna()
df_final2 = df_final.loc[has_size]

In [47]:
# Number of rows removed by the dropna on `size` (rows whose serving size is null).
print('About', df_final.shape[0] - df_final2.shape[0], 'number of food uploads are missing from upload information')

About 25209 number of food uploads are missing from upload information


In [48]:
# Export the item-level food uploads with scaled nutrients (row index not written).
df_final2.to_csv('Result/Food/food_upload_combined.csv', index=False)

## Daily Level Aggregation 

In [49]:
# Reload the combined export and keep only the first 10 characters of each
# timestamp (the date part) so rows can be grouped per day.
df_final2 = pd.read_csv('Result/Food/food_upload_combined.csv')
upload_day = df_final2['time_upload'].astype('string').str.slice(0, 10)
df_final2['time_upload'] = upload_day
df_final2.head(3)

Unnamed: 0,user_id,Food_ID,Serving_ID1,time_upload,serving_time,amount,base,size,credit,fv_credit,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calcium,sodium,saturated_fatty_acids,cholesterol
0,14,1001129,190677,2012-08-13,Breakfast,2.0,100.0,50.0,0.0,0.0,155.0,12.58,10.61,1.12,1.12,0.0,50.0,124.0,3.267,424.0
1,14,1159175,175688,2012-08-13,Lunch,1.5,85.0,85.0,2.25,2.25,22.5,1.5,0.0,4.5,3.0,6.0,30.0,0.0,0.0,0.0
2,14,1063636,108974,2012-08-13,Lunch,0.25,100.0,255.0,0.5,0.5,61.2,0.337875,0.082875,16.524,15.306375,1.21125,7.0125,1.9125,0.004463,0.0


In [50]:
# Aggregate day-level nutrients: one row per (user_id, day) with summed nutrients
# and fv_credit; time_upload is exposed as upload_time.
# NOTE(review): the calcium sum is aliased as `calciumm` (double "m"). It looks
# like a typo, but the name ends up in the exported CSV, so confirm with
# downstream consumers before renaming.

df_final3= ps.sqldf("SELECT user_id, time_upload as upload_time, sum(calories) as calories, sum(protein) as protein, sum(total_fat) as total_fat, sum(total_carbohydrate) as total_carbohydrate, sum(sugars) as sugars, sum(fiber) as fiber, sum(calcium) as calciumm, sum(sodium) as sodium, sum(saturated_fatty_acids) as saturated_fatty_acids, sum(cholesterol) as cholesterol, sum(fv_credit) as fv_credit FROM df_final2 group by user_id, time_upload")

In [51]:
# Preview the per-user, per-day totals for regular food uploads.
df_final3.head(5)

Unnamed: 0,user_id,upload_time,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calciumm,sodium,saturated_fatty_acids,cholesterol,fv_credit
0,1,2012-08-27,1664.16975,48.666667,62.0,248.934197,179.934197,7.666667,0.0,2968.688416,22.833333,163.0,0.0
1,1,2012-08-29,767.16975,2.0,16.0,173.934197,171.934197,0.0,0.0,472.688416,9.0,60.0,0.0
2,1,2012-08-30,362.711416,2.5,0.0,96.559197,94.059197,2.5,25.0,310.063416,0.0,0.0,1.25
3,1,2012-08-31,62.5,2.5,0.0,12.5,10.0,2.5,25.0,175.0,0.0,0.0,1.25
4,1,2012-09-02,1812.5,40.5,63.0,278.0,184.5,9.5,45.0,1262.5,19.0,125.0,1.75


In [52]:
# Export the per-user, per-day totals for regular food uploads (row index not written).
df_final3.to_csv('Result/Food/food_upload_combined_userlevel.csv', index=False)

### Combine Recipe with Food Items

In [53]:
# Reload the two item-level exports. Note that this rebinds df_recipe and df_food,
# which previously held the raw tables.
df_recipe = pd.read_csv('Result/Food/recipe.csv')
df_food = pd.read_csv('Result/Food/food_upload_combined.csv')
# Keep only the rows whose Food_ID parses as a number.
has_numeric_food_id = pd.to_numeric(df_food['Food_ID'], errors='coerce').notna()
df_food = df_food[has_numeric_food_id]

In [54]:
#rename column and select columns of interest
# Rename by name instead of overwriting the whole `.columns` list positionally:
# a positional overwrite silently mislabels data if the CSV column order ever
# changes, while a name-based rename only touches the columns that differ.
# Recipe export: the timestamp column is aligned with the food export's name.
df_recipe = df_recipe.rename(columns={'upload_time': 'time_upload'})

# Food export: the identifier columns are aligned with the recipe export's names.
df_food = df_food.rename(columns={'Food_ID': 'food_id', 'Serving_ID1': 'serving_id'})

In [55]:
# Attach each food's catalogue name to the uploads (default inner join on food_id).
# With an inner join, uploads whose food_id is absent from df_food_item are dropped.
# NOTE(review): if food_id is not unique in df_food_item this would duplicate upload rows; confirm.
food_item = df_food_item[['food_id', 'name']]
df_food = food_item.merge(df_food, on=['food_id'])

In [56]:
# Shared schema for the food and recipe frames so they can be stacked.
columns_interest = [
    'user_id', 'food_id', 'serving_id', 'time_upload', 'name',
    'amount', 'base', 'size', 'fv_credit',
    'calories', 'protein', 'total_fat', 'total_carbohydrate', 'sugars',
    'fiber', 'calcium', 'sodium', 'saturated_fatty_acids', 'cholesterol',
]

df_food, df_recipe = df_food[columns_interest], df_recipe[columns_interest]

In [57]:
# Stack regular food rows and recipe rows. DataFrame.append was removed in
# pandas 2.0; pd.concat gives the same result (original row indexes are kept).
df_food_all = pd.concat([df_food, df_recipe])

In [58]:
# Preview the stacked food + recipe rows.
df_food_all.head(5)

Unnamed: 0,user_id,food_id,serving_id,time_upload,name,amount,base,size,fv_credit,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calcium,sodium,saturated_fatty_acids,cholesterol
0,45,1001006,191067,2012-10-20 16:02:58,Brie,2.0,100.0,17.0,0.0,113.56,7.055,9.4112,0.153,0.153,0.0,62.56,213.86,5.9194,34.0
1,112,1001006,191067,2012-12-24 19:29:42,Brie,3.0,100.0,17.0,0.0,170.34,10.5825,14.1168,0.2295,0.2295,0.0,93.84,320.79,8.8791,51.0
2,112,1001006,191069,2013-01-13 10:48:24,Brie,1.0,100.0,128.0,0.0,427.52,26.56,35.4304,0.576,0.576,0.0,235.52,805.12,22.2848,128.0
3,113,1001006,191070,2013-01-15 16:09:38,Brie,0.5,100.0,144.0,0.0,240.48,14.94,19.9296,0.324,0.324,0.0,132.48,452.88,12.5352,72.0
4,71,1001006,191067,2013-01-24 20:57:33,Brie,1.5,100.0,17.0,0.0,85.17,5.29125,7.0584,0.11475,0.11475,0.0,46.92,160.395,4.43955,25.5


In [59]:
# Export the item-level food + recipe rows (row index not written).
df_food_all.to_csv('Result/Food/food_all.csv', index=False)

In [60]:
# aggregate day-level nutrients (sum)
# Keep only the first 10 characters of the timestamp (the date part), then sum
# nutrients and fv_credit per (user_id, day).
# NOTE(review): the calcium sum is aliased as `calciumm` (double "m"). It looks
# like a typo, but the name ends up in the exported CSV, so confirm with
# downstream consumers before renaming.
df_food_all['time_upload'] = df_food_all['time_upload'].astype('string').str[:10]
df_food_all_userlevel = ps.sqldf("SELECT user_id, time_upload as upload_time, sum(calories) as calories, sum(protein) as protein, sum(total_fat) as total_fat, sum(total_carbohydrate) as total_carbohydrate, sum(sugars) as sugars, sum(fiber) as fiber, sum(calcium) as calciumm, sum(sodium) as sodium, sum(saturated_fatty_acids) as saturated_fatty_acids, sum(cholesterol) as cholesterol, sum(fv_credit) as fv_credit FROM df_food_all group by user_id, time_upload")

In [61]:
# Export the per-user, per-day totals for food + recipes (row index not written).
df_food_all_userlevel.to_csv('Result/Food/food_all_userlevel.csv', index=False)

### Tag Variables (by gender, IQR)

In [62]:
# Reload the per-user, per-day totals. Note that this rebinds df_food, which
# earlier held item-level rows.
df_food = pd.read_csv('Result/Food/food_all_userlevel.csv')
df_food.head(5)

Unnamed: 0,user_id,upload_time,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calciumm,sodium,saturated_fatty_acids,cholesterol,fv_credit
0,1,2012-08-27,1664.16975,48.666667,62.0,248.934197,179.934197,7.666667,0.0,2968.688416,22.833333,163.0,0.0
1,1,2012-08-29,767.16975,2.0,16.0,173.934197,171.934197,0.0,0.0,472.688416,9.0,60.0,0.0
2,1,2012-08-30,362.711416,2.5,0.0,96.559197,94.059197,2.5,25.0,310.063416,0.0,0.0,1.25
3,1,2012-08-31,62.5,2.5,0.0,12.5,10.0,2.5,25.0,175.0,0.0,0.0,1.25
4,1,2012-09-02,1812.5,40.5,63.0,278.0,184.5,9.5,45.0,1262.5,19.0,125.0,1.75


In [63]:
def outliers(df, column, IQR_level):
    """Return the IQR fences ``[low, high]`` for one column.

    Every value of ``df[column]`` is converted to ``float``. The fences are
    ``Q1 - IQR_level * IQR`` and ``Q3 + IQR_level * IQR``, where Q1 and Q3 are
    the 25th and 75th percentiles computed by ``np.percentile``.
    """
    values = [float(v) for v in df[column]]
    q1, q3 = (np.percentile(values, q) for q in (25, 75))
    spread = q3 - q1
    return [q1 - IQR_level * spread, q3 + IQR_level * spread]

# add gender
# Map each participant's REDCap record (Participant ID, Sex) to the app user_id via study_id.
df_redcap = pd.read_csv('Raw Data/mbc2_redcap.csv')
df_gender = df_redcap[['Participant ID', 'Sex']].rename(
    columns={'Participant ID': 'study_id', 'Sex': 'gender'}
)
df_user_info = df_user[['study_id','user_id']]
df_info = df_gender.merge(df_user_info, on = 'study_id')
# Keep only the first character of the gender value, as a string.
df_info['gender'] = df_info['gender'].astype('string').str[0]

#merge gender with info, then remove user's ID
df_food_gender = df_food.merge(df_info, on = 'user_id').drop(['user_id'], axis=1)
df_food_gender.head(5)

Unnamed: 0,upload_time,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calciumm,sodium,saturated_fatty_acids,cholesterol,fv_credit,study_id,gender
0,2012-08-27,1664.16975,48.666667,62.0,248.934197,179.934197,7.666667,0.0,2968.688416,22.833333,163.0,0.0,1436,1
1,2012-08-29,767.16975,2.0,16.0,173.934197,171.934197,0.0,0.0,472.688416,9.0,60.0,0.0,1436,1
2,2012-08-30,362.711416,2.5,0.0,96.559197,94.059197,2.5,25.0,310.063416,0.0,0.0,1.25,1436,1
3,2012-08-31,62.5,2.5,0.0,12.5,10.0,2.5,25.0,175.0,0.0,0.0,1.25,1436,1
4,2012-09-02,1812.5,40.5,63.0,278.0,184.5,9.5,45.0,1262.5,19.0,125.0,1.75,1436,1


In [64]:
# split by gender
gender_code = df_food_gender['gender']
df_male = df_food_gender[gender_code == '1']
df_female = df_food_gender[gender_code == '2']

# IQR fences per gender and variable: the unsuffixed name uses 1.5 x IQR,
# the name ending in "2" uses 3.0 x IQR.
cal_male, cal_male2 = (outliers(df_male, 'calories', k) for k in (1.5, 3.0))
cal_female, cal_female2 = (outliers(df_female, 'calories', k) for k in (1.5, 3.0))

fv_male, fv_male2 = (outliers(df_male, 'fv_credit', k) for k in (1.5, 3.0))
fv_female, fv_female2 = (outliers(df_female, 'fv_credit', k) for k in (1.5, 3.0))

fat_male, fat_male2 = (outliers(df_male, 'total_fat', k) for k in (1.5, 3.0))
fat_female, fat_female2 = (outliers(df_female, 'total_fat', k) for k in (1.5, 3.0))

In [65]:
high_male1 = cal_male[1]
high_male2 = cal_male2[1]
high_female1= cal_female[1]
high_female2 = cal_female2[1]

def outliers_cal(row,
                 male_fences=(high_male1, high_male2),
                 female_fences=(high_female1, high_female2)):
    """Tag one daily row's ``calories`` value as 0, 1 or 2.

    Returns 2 when the value is above the 3.0 x IQR upper fence, 1 when it is
    above the 1.5 x IQR upper fence, and 0 otherwise. Rows with gender ``'1'``
    use the male fences; every other row uses the female fences.

    The fences are bound as default arguments when the function is defined.
    The ``high_*`` globals are rebound by later cells for other variables, so
    reading them at call time would silently apply the wrong thresholds if this
    function were applied again afterwards.
    """
    mild_fence, extreme_fence = male_fences if row['gender'] == '1' else female_fences
    value = row['calories']
    if value > extreme_fence:
        return 2
    if value > mild_fence:
        return 1
    return 0

df_food_gender['calories_outlier'] = df_food_gender.apply(outliers_cal, axis=1)

In [66]:
high_male1 = fv_male[1]
high_male2 = fv_male2[1]
high_female1= fv_female[1]
high_female2 = fv_female2[1]

def outliers_fv(row,
                male_fences=(high_male1, high_male2),
                female_fences=(high_female1, high_female2)):
    """Tag one daily row's ``fv_credit`` value as 0, 1 or 2.

    Returns 2 when the value is above the 3.0 x IQR upper fence, 1 when it is
    above the 1.5 x IQR upper fence, and 0 otherwise. Rows with gender ``'1'``
    use the male fences; every other row uses the female fences.

    The fences are bound as default arguments when the function is defined.
    The ``high_*`` globals are reused for other variables in neighbouring
    cells, so reading them at call time would silently apply the wrong
    thresholds if this function were applied again later.
    """
    mild_fence, extreme_fence = male_fences if row['gender'] == '1' else female_fences
    value = row['fv_credit']
    if value > extreme_fence:
        return 2
    if value > mild_fence:
        return 1
    return 0

df_food_gender['fv_outlier'] = df_food_gender.apply(outliers_fv, axis=1)

In [67]:
high_male1 = fat_male[1]
high_male2 = fat_male2[1]
high_female1= fat_female[1]
high_female2 = fat_female2[1]

def outliers_fat(row,
                 male_fences=(high_male1, high_male2),
                 female_fences=(high_female1, high_female2)):
    """Tag one daily row's ``total_fat`` value as 0, 1 or 2.

    Returns 2 when the value is above the 3.0 x IQR upper fence, 1 when it is
    above the 1.5 x IQR upper fence, and 0 otherwise. Rows with gender ``'1'``
    use the male fences; every other row uses the female fences.

    The fences are bound as default arguments when the function is defined.
    The ``high_*`` globals are reused for other variables in neighbouring
    cells, so reading them at call time would silently apply the wrong
    thresholds if the cells were re-run out of order.
    """
    mild_fence, extreme_fence = male_fences if row['gender'] == '1' else female_fences
    value = row['total_fat']
    if value > extreme_fence:
        return 2
    if value > mild_fence:
        return 1
    return 0

df_food_gender['fat_outlier'] = df_food_gender.apply(outliers_fat, axis=1)

In [69]:
# Preview the daily totals with the three outlier tags attached.
df_food_gender.head(5)

Unnamed: 0,upload_time,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calciumm,sodium,saturated_fatty_acids,cholesterol,fv_credit,study_id,gender,calories_outlier,fv_outlier,fat_outlier
0,2012-08-27,1664.16975,48.666667,62.0,248.934197,179.934197,7.666667,0.0,2968.688416,22.833333,163.0,0.0,1436,1,0,0,0
1,2012-08-29,767.16975,2.0,16.0,173.934197,171.934197,0.0,0.0,472.688416,9.0,60.0,0.0,1436,1,0,0,0
2,2012-08-30,362.711416,2.5,0.0,96.559197,94.059197,2.5,25.0,310.063416,0.0,0.0,1.25,1436,1,0,0,0
3,2012-08-31,62.5,2.5,0.0,12.5,10.0,2.5,25.0,175.0,0.0,0.0,1.25,1436,1,0,0,0
4,2012-09-02,1812.5,40.5,63.0,278.0,184.5,9.5,45.0,1262.5,19.0,125.0,1.75,1436,1,0,0,0


In [70]:
# save final food result table
# Rows are daily totals keyed by study_id (user_id was dropped earlier), with
# gender and the calories/fv/fat outlier tags; the row index is not written.
df_food_gender.to_csv('Result/Food/food_final.csv',  index=False)