## FV and Fat Outcomes Extraction
#### This notebook extracts 212 participants' self-reported fruit/vegetable credits and fat from raw dataset with daily level granularity
#### The inputs and outputs are in csv format

In [1]:
import pandas as pd
import numpy as np
import pandasql as ps
import time 
import datetime 
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#import tables
df_food = pd.read_csv('Raw Data/food_upload.csv') 
df_food_item = pd.read_csv('Raw Data/food_foods_fv.csv')
df_food_serving = pd.read_csv('Raw Data/food_servings.csv')
df_user = pd.read_csv('Raw Data/users.csv') 
df_recipe = pd.read_csv('Raw Data/recipe_upload.csv') 

In [3]:
df_food.head(5)

Unnamed: 0,event_time,serving_id,food_id,serving_time,amount,time_upload,user_id,aid,status,update_time,credit,favorites
0,2012-07-18 09:31:14,202550,1092354,Breakfast,0.75,2012-07-18 09:33:10,1000001,014312012618A000002C80E683,,,0.39,0
1,2012-07-18 14:53:38,55917,1083753,Breakfast,1.0,2012-07-18 14:56:44,1,038532012618A000002C813830,,,0.0,0
2,2012-07-18 14:55:32,190733,1067522,Breakfast,1.0,2012-07-18 14:56:44,1,132552012618A000002C813830,,,1.0,0
3,2012-07-18 14:57:15,19474,1055153,Breakfast,1.0,2012-07-18 15:33:44,1,215572012618A000002C813830,,,0.0,0
4,2012-07-19 13:08:00,40275,1052381,Breakfast,0.5,2012-07-19 15:58:14,3,05972012619A000002C816464,,,0.0,0


In [4]:
df_food_item.head(5)

Unnamed: 0,food_id,category_id,name,is_fv,fv_excl_crit,is_fv_auto,isfv_tagger1,why1,isfv_tagger2,why2,...,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calcium,sodium,saturated_fatty_acids,cholesterol
0,1001004,8148,"Blue, bleu",,,False,,,,,...,353.0,21.4,28.74,2.34,0.5,0.0,528.0,1395.0,18.669,75.0
1,1001006,8148,Brie,,,False,,,,,...,334.0,20.75,27.68,0.45,0.45,0.0,184.0,629.0,17.41,100.0
2,1001009,8148,Cheddar,,,False,,,,,...,403.0,24.9,33.14,1.28,0.52,0.0,721.0,621.0,21.092,105.0
3,1001011,8148,Colby,,,False,,,,,...,394.0,23.76,32.11,2.57,0.52,0.0,685.0,604.0,20.218,95.0
4,1001026,8148,"Mozzarella, whole milk",,,False,,,,,...,300.0,22.17,22.35,2.19,1.03,0.0,505.0,627.0,13.152,79.0


In [5]:
df_food_serving.head(5)

Unnamed: 0,serving_id,food_id,name,size,fv_credit,fv_cup,fv_type
0,115152,1112899,"serving, 1/5 broccoflower (3.5 oz)",99.0,3.04,1.52,S = Standard x 2
1,191727,1063605,"melon, 15"" long x 7-1/2"" dia (9 lbs 15.4 oz)",4518.0,59.0,29.5,S = Standard x 2
2,200244,1173830,bag (4 lbs 8 oz),2041.0,58.32,29.16,S = Standard x 2
3,200217,1173827,bag (4 lbs 8 oz),2041.0,45.36,22.68,S = Standard x 2
4,200235,1122058,bag (3 lbs 8 oz),1588.0,37.36,18.68,S = Standard x 2


In [6]:
df_food_serving[df_food_serving['food_id']==1122058]

Unnamed: 0,serving_id,food_id,name,size,fv_credit,fv_cup,fv_type
4,200235,1122058,bag (3 lbs 8 oz),1588.0,37.36,18.68,S = Standard x 2
345,124520,1122058,box (10 oz),284.0,6.68,3.34,S = Standard x 2
1770,103368,1122058,cup (3 oz),85.0,2.0,1.0,S = Standard x 2


In [7]:
df_recipe.head(5)

Unnamed: 0,itemid,creation_time,food_id,item_amount,serving_id,recipe_name,user_id,recipe_id,server_time,creditrecipe,servamount,id
0,154082013115A000002CC5E5C1,2013-02-15 12:08:40,1063879,0.75,218681,apple,1000001,1949272013115A000002CC5E5C1,2013-02-15 12:47:07,2.19,2,25331
1,154982013115A000002CC5E5C1,2013-02-15 12:08:49,1062683,0.75,110732,apple,1000001,1949272013115A000002CC5E5C1,2013-02-15 12:47:07,0.15,2,25332
2,154082013115A000002CC5E5C1,2013-02-15 12:08:40,1063879,0.75,218681,apple,1000001,2147292013115A000002CC5E5C1,2013-02-15 12:47:07,2.19,2,25333
3,154982013115A000002CC5E5C1,2013-02-15 12:08:49,1062683,0.75,110732,apple,1000001,2147292013115A000002CC5E5C1,2013-02-15 12:47:07,0.15,2,25334
4,4723232013115A000002C81488E,2013-02-15 17:23:23,1109969,1.0,88248,Breakfast,244,4756242013115A000002C81488E,2013-02-15 18:00:48,0.0,1,25335


### Extract User (intervention + follow-up)

In [8]:
#all participants
id_list = []
for i in range(df_user.shape[0]):
    #extract users' name starting with 'mbc2'
    if (df_user['username'][i][0:4] == 'mbc2'):
        id_list.append(df_user['user_id'][i])
id_list.sort()

print('There are', len(id_list), 'participants in intervention/follow-up period')

There are 212 participants in intervention/follow-up period


### Extract Recipe

In [9]:
# selecting rows for participants only
df_recipe = df_recipe[df_recipe['user_id'].isin(id_list)]
df_recipe = df_recipe.sort_values(by=['creation_time'])
df_recipe.head(5)

Unnamed: 0,itemid,creation_time,food_id,item_amount,serving_id,recipe_name,user_id,recipe_id,server_time,creditrecipe,servamount,id
146,16432013122A000002C8119D3,2013-02-22 12:43:06,1068093,2.0,190899,Turkey Sandwich,291,3021823201333A000002C8119D3,2013-04-03 11:24:09,0.0,1,25477
147,13442013122A000002C8119D3,2013-02-22 12:44:03,1179404,3.0,210370,Turkey Sandwich,291,3021823201333A000002C8119D3,2013-04-03 11:24:09,0.0,1,25478
148,145442013122A000002C8119D3,2013-02-22 12:44:45,1001009,1.0,191079,Turkey Sandwich,291,3021823201333A000002C8119D3,2013-04-03 11:24:09,0.0,1,25479
149,139452013122A000002C8119D3,2013-02-22 12:45:39,1111083,1.0,172296,Turkey Sandwich,291,3021823201333A000002C8119D3,2013-04-03 11:24:09,0.0,1,25480
24,1081432013126A000002C81309B,2013-02-26 12:43:01,1159186,1.0,175721,Whole Foods Salad,288,12421412013127none,2013-02-27 19:29:49,3.0,1,25355


In [10]:
#columns of interest
col = ['user_id', 'creation_time', 'food_id', 'serving_id', 'item_amount', 'recipe_name', 'creditrecipe', 'servamount']
df_recipe = df_recipe[col]
df_recipe.head(5)

Unnamed: 0,user_id,creation_time,food_id,serving_id,item_amount,recipe_name,creditrecipe,servamount
146,291,2013-02-22 12:43:06,1068093,190899,2.0,Turkey Sandwich,0.0,1
147,291,2013-02-22 12:44:03,1179404,210370,3.0,Turkey Sandwich,0.0,1
148,291,2013-02-22 12:44:45,1001009,191079,1.0,Turkey Sandwich,0.0,1
149,291,2013-02-22 12:45:39,1111083,172296,1.0,Turkey Sandwich,0.0,1
24,288,2013-02-26 12:43:01,1159186,175721,1.0,Whole Foods Salad,3.0,1


In [11]:
#Merge upload and item (sql)
df_test = ps.sqldf("SELECT d1.food_id as food_id, d1.creditrecipe, d1.serving_id, d1.creation_time as upload_time, d1.user_id, d1.item_amount as amount, d2.name, d2.base, d2.calories, d2.protein, d2.total_fat, d2.total_carbohydrate, d2.sugars, d2.fiber, d2.calcium, d2.sodium, d2.saturated_fatty_acids, d2.cholesterol FROM df_recipe as d1 LEFT JOIN df_food_item as d2 ON d1.food_id = d2.food_id")

In [12]:
df_test.head(6)

Unnamed: 0,food_id,creditrecipe,serving_id,upload_time,user_id,amount,name,base,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calcium,sodium,saturated_fatty_acids,cholesterol
0,1068093,0.0,190899,2013-02-22 12:43:06,291,2.0,White Bread,100.0,266.0,7.64,3.29,50.61,4.31,2.4,151.0,681.0,0.717,0.0
1,1179404,0.0,210370,2013-02-22 12:44:03,291,3.0,"Deep Fried, Turkey Breast, Extra Thin Sliced, ...",56.0,60.0,10.0,1.5,3.0,1.0,0.0,0.0,550.0,0.0,25.0
2,1001009,0.0,191079,2013-02-22 12:44:45,291,1.0,Cheddar,100.0,403.0,24.9,33.14,1.28,0.52,0.0,721.0,621.0,21.092,105.0
3,1111083,0.0,172296,2013-02-22 12:45:39,291,1.0,"Mustard, Classic Yellow",5.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,55.0,0.0,0.0
4,1159186,3.0,175721,2013-02-26 12:43:01,288,1.0,"Tender Lettuce Mixes, Fancy Field Greens",85.0,20.0,1.0,0.0,3.0,1.0,2.0,20.0,15.0,0.0,0.0
5,1159186,3.0,175721,2013-02-26 12:43:01,288,1.0,"Tender Lettuce Mixes, Fancy Field Greens",85.0,20.0,1.0,0.0,3.0,1.0,2.0,20.0,15.0,0.0,0.0


In [13]:
df_test_clean = df_test.drop_duplicates()
df_test_clean["base"] = df_test_clean["base"].fillna(1)
df_test_clean.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,food_id,creditrecipe,serving_id,upload_time,user_id,amount,name,base,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calcium,sodium,saturated_fatty_acids,cholesterol
0,1068093,0.0,190899,2013-02-22 12:43:06,291,2.0,White Bread,100.0,266.0,7.64,3.29,50.61,4.31,2.4,151.0,681.0,0.717,0.0
1,1179404,0.0,210370,2013-02-22 12:44:03,291,3.0,"Deep Fried, Turkey Breast, Extra Thin Sliced, ...",56.0,60.0,10.0,1.5,3.0,1.0,0.0,0.0,550.0,0.0,25.0
2,1001009,0.0,191079,2013-02-22 12:44:45,291,1.0,Cheddar,100.0,403.0,24.9,33.14,1.28,0.52,0.0,721.0,621.0,21.092,105.0
3,1111083,0.0,172296,2013-02-22 12:45:39,291,1.0,"Mustard, Classic Yellow",5.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,55.0,0.0,0.0
4,1159186,3.0,175721,2013-02-26 12:43:01,288,1.0,"Tender Lettuce Mixes, Fancy Field Greens",85.0,20.0,1.0,0.0,3.0,1.0,2.0,20.0,15.0,0.0,0.0


In [14]:
#fill all null nutrition facts with 0
nutrition_List = ['calories', 'protein','total_fat', 'total_carbohydrate', 'sugars', 'fiber', 'calcium','sodium', 'saturated_fatty_acids', 'cholesterol']
for nutrition_item in nutrition_List:
    df_test_clean[nutrition_item] = df_test_clean[nutrition_item].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [15]:
#Merge upload and serving (sql)
df_test2 = ps.sqldf("SELECT d1.food_id as food_id, d1.creditrecipe, d1.serving_id as serving_id, d2.size, d2.fv_credit, d1.upload_time, d1.user_id, d1.amount, d1.name, d1.base, d1.calories, d1.protein, d1.total_fat, d1.total_carbohydrate, d1.sugars, d1.fiber, d1.calcium, d1.sodium, d1.saturated_fatty_acids, d1.cholesterol FROM df_test_clean as d1 INNER JOIN df_food_serving as d2 ON d1.Food_ID = d2.food_id and d1.serving_id = d2.serving_id")
df_test2.head(5)

Unnamed: 0,food_id,creditrecipe,serving_id,size,fv_credit,upload_time,user_id,amount,name,base,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calcium,sodium,saturated_fatty_acids,cholesterol
0,1068093,0.0,190899,30.0,0.0,2013-02-22 12:43:06,291,2.0,White Bread,100.0,266.0,7.64,3.29,50.61,4.31,2.4,151.0,681.0,0.717,0.0
1,1179404,0.0,210370,8.0,0.0,2013-02-22 12:44:03,291,3.0,"Deep Fried, Turkey Breast, Extra Thin Sliced, ...",56.0,60.0,10.0,1.5,3.0,1.0,0.0,0.0,550.0,0.0,25.0
2,1001009,0.0,191079,28.0,0.0,2013-02-22 12:44:45,291,1.0,Cheddar,100.0,403.0,24.9,33.14,1.28,0.52,0.0,721.0,621.0,21.092,105.0
3,1111083,0.0,172296,5.0,0.0,2013-02-22 12:45:39,291,1.0,"Mustard, Classic Yellow",5.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,55.0,0.0,0.0
4,1159186,3.0,175721,85.0,3.0,2013-02-26 12:43:01,288,1.0,"Tender Lettuce Mixes, Fancy Field Greens",85.0,20.0,1.0,0.0,3.0,1.0,2.0,20.0,15.0,0.0,0.0


In [16]:
#calculation
nutrition_List = ['calories',
 'protein',
 'total_fat',
 'total_carbohydrate',
 'sugars',
 'fiber',
 'calcium',
 'sodium',
 'saturated_fatty_acids',
 'cholesterol']

included_List = ['user_id', 'food_id','serving_id','upload_time','name', 'amount','base','size','creditrecipe','fv_credit']
included_List = included_List + nutrition_List
df_test2 = df_test2[included_List]

In [17]:
#Applying to entire dataframe
#After Calculation
import warnings
warnings.filterwarnings("ignore")
df_final = df_test2
for nutrition_item in nutrition_List:
    for i in range(df_final.shape[0]):
        df_final[nutrition_item][i] = df_final['size'][i] / df_final['base'][i] * df_final['amount'][i] * df_final[nutrition_item][i]    

In [18]:
#calculate result fv credit (multiply by amount)
for i in range(df_final.shape[0]):
    df_final['fv_credit'][i] = df_final['fv_credit'][i] * df_final['amount'][i]

In [19]:
df_final.head(5)

Unnamed: 0,user_id,food_id,serving_id,upload_time,name,amount,base,size,creditrecipe,fv_credit,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calcium,sodium,saturated_fatty_acids,cholesterol
0,291,1068093,190899,2013-02-22 12:43:06,White Bread,2.0,100.0,30.0,0.0,0.0,159.6,4.584,1.974,30.366,2.586,1.44,90.6,408.6,0.4302,0.0
1,291,1179404,210370,2013-02-22 12:44:03,"Deep Fried, Turkey Breast, Extra Thin Sliced, ...",3.0,56.0,8.0,0.0,0.0,25.714286,4.285714,0.642857,1.285714,0.428571,0.0,0.0,235.714286,0.0,10.714286
2,291,1001009,191079,2013-02-22 12:44:45,Cheddar,1.0,100.0,28.0,0.0,0.0,112.84,6.972,9.2792,0.3584,0.1456,0.0,201.88,173.88,5.90576,29.4
3,291,1111083,172296,2013-02-22 12:45:39,"Mustard, Classic Yellow",1.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,55.0,0.0,0.0
4,288,1159186,175721,2013-02-26 12:43:01,"Tender Lettuce Mixes, Fancy Field Greens",1.0,85.0,85.0,3.0,3.0,20.0,1.0,0.0,3.0,1.0,2.0,20.0,15.0,0.0,0.0


In [20]:
df_recipe_final = df_final
df_recipe_final.to_csv('Result/Food/recipe.csv', index=False)

In [21]:
#get user level
df_recipe_final['upload_time'] = df_recipe_final['upload_time'].astype('string').str[:10]
df_recipe_final.head(3)

Unnamed: 0,user_id,food_id,serving_id,upload_time,name,amount,base,size,creditrecipe,fv_credit,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calcium,sodium,saturated_fatty_acids,cholesterol
0,291,1068093,190899,2013-02-22,White Bread,2.0,100.0,30.0,0.0,0.0,159.6,4.584,1.974,30.366,2.586,1.44,90.6,408.6,0.4302,0.0
1,291,1179404,210370,2013-02-22,"Deep Fried, Turkey Breast, Extra Thin Sliced, ...",3.0,56.0,8.0,0.0,0.0,25.714286,4.285714,0.642857,1.285714,0.428571,0.0,0.0,235.714286,0.0,10.714286
2,291,1001009,191079,2013-02-22,Cheddar,1.0,100.0,28.0,0.0,0.0,112.84,6.972,9.2792,0.3584,0.1456,0.0,201.88,173.88,5.90576,29.4


In [22]:
#aggregate day-level nutrients (sum)
df_recipe_user = ps.sqldf("SELECT user_id, upload_time as upload_time, sum(calories) as calories, sum(protein) as protein, sum(total_fat) as total_fat, sum(total_carbohydrate) as total_carbohydrate, sum(sugars) as sugars, sum(fiber) as fiber, sum(calcium) as calciumm, sum(sodium) as sodium, sum(saturated_fatty_acids) as saturated_fatty_acids, sum(cholesterol) as cholesterol, sum(fv_credit) as fv_credit FROM df_recipe_final group by user_id, upload_time")

In [23]:
df_recipe_user.head(5)

Unnamed: 0,user_id,upload_time,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calciumm,sodium,saturated_fatty_acids,cholesterol,fv_credit
0,288,2013-02-26,357.37,16.18315,16.47155,38.61695,18.7398,8.06925,308.0575,251.5975,6.780085,29.6625,6.59
1,288,2013-03-05,385.472,31.86657,18.95547,24.0766,12.6959,7.0122,352.2075,630.48978,8.391765,70.9125,5.055
2,291,2013-02-22,298.154286,15.841714,11.896057,33.010114,3.660171,1.44,292.48,873.194286,6.33596,40.114286,0.0
3,291,2013-03-03,1428.0,90.0,56.0,145.0,17.0,6.5,1600.0,2740.0,28.0,150.0,3.4
4,291,2013-03-11,759.335,26.27575,64.114875,32.297125,2.5,10.2475,341.925,259.83,13.338722,31.25,2.2


In [24]:
df_recipe_user.to_csv('Result/Food/recipe_userLevel.csv', index=False)

### Regular food 

In [25]:
# selecting rows for participants only
df_food_new = df_food[df_food['user_id'].isin(id_list)]
df_food_new = df_food_new.sort_values(by=['time_upload'])

In [26]:
print('Total number of daily food entries:', df_food.shape[0])
print('Total number of daily food entries for', len(id_list), 'participants only:', df_food_new.shape[0])

Total number of daily food entries: 57738
Total number of daily food entries for 212 participants only: 37270


In [27]:
#sanity check (sql)
ps.sqldf("SELECT count(distinct user_id) as Total_Number_User FROM df_food")

Unnamed: 0,Total_Number_User
0,463


In [28]:
#sanity check (sql)
ps.sqldf("SELECT count(distinct user_id) as Total_Number_Record FROM df_food_new")

Unnamed: 0,Total_Number_Record
0,212


In [29]:
#Missing Users
exsiting_user = list(df_food_new['user_id'])
print('User record not found in food upload:', np.setdiff1d(id_list,exsiting_user))

User record not found in food upload: []


## Merging food_upload / food_foods_fv / food_servings

#### Merge food_upload + food_foods_fv

In [30]:
#Merge upload and item (sql)
df_test = ps.sqldf("SELECT d1.food_id as Food_ID, d2.food_id as Food_ID2, d1.credit, d1.serving_id, d1.time_upload, d1.user_id, d1.serving_time, d1.amount, d2.name, d2.base, d2.calories, d2.protein, d2.total_fat, d2.total_carbohydrate, d2.sugars, d2.fiber, d2.calcium, d2.sodium, d2.saturated_fatty_acids, d2.cholesterol FROM df_food_new as d1 LEFT JOIN df_food_item as d2 ON d1.food_id = d2.food_id")

In [31]:
#new columns
df_test.columns

Index(['Food_ID', 'Food_ID2', 'credit', 'serving_id', 'time_upload', 'user_id',
       'serving_time', 'amount', 'name', 'base', 'calories', 'protein',
       'total_fat', 'total_carbohydrate', 'sugars', 'fiber', 'calcium',
       'sodium', 'saturated_fatty_acids', 'cholesterol'],
      dtype='object')

In [32]:
df_test.shape

(37270, 20)

279358 matches from food_upload and merged dataframe (sanity check)

In [33]:
#fill null fv base values with 1 (avoid zero dividing error)
df_test["base"] = df_test["base"].fillna(1)

In [34]:
list(df_test)

['Food_ID',
 'Food_ID2',
 'credit',
 'serving_id',
 'time_upload',
 'user_id',
 'serving_time',
 'amount',
 'name',
 'base',
 'calories',
 'protein',
 'total_fat',
 'total_carbohydrate',
 'sugars',
 'fiber',
 'calcium',
 'sodium',
 'saturated_fatty_acids',
 'cholesterol']

In [35]:
#fill all null nutrition facts with 0
nutrition_List = ['calories', 'protein','total_fat', 'total_carbohydrate', 'sugars', 'fiber', 'calcium','sodium', 'saturated_fatty_acids', 'cholesterol']
for nutrition_item in nutrition_List:
    df_test[nutrition_item] = df_test[nutrition_item].fillna(0)

#### Merge food_upload + food_foods_fv + food_servings

In [36]:
#Merge upload and item (sql)
df_test2 = ps.sqldf("SELECT d1.Food_ID as Food_ID, d1.Food_ID2 as Food_ID2, d2.food_id as Food_ID3, d1.credit, d1.serving_id as Serving_ID1, d2.serving_id as Serving_ID2, d2.size, d2.fv_credit, d1.time_upload, d1.user_id, d1.serving_time, d1.amount, d1.name, d1.base, d1.calories, d1.protein, d1.total_fat, d1.total_carbohydrate, d1.sugars, d1.fiber, d1.calcium, d1.sodium, d1.saturated_fatty_acids, d1.cholesterol FROM df_test as d1 INNER JOIN df_food_serving as d2 ON d1.Food_ID = d2.food_id and d1.serving_id = d2.serving_id")

In [37]:
df_test2.head(5)

Unnamed: 0,Food_ID,Food_ID2,Food_ID3,credit,Serving_ID1,Serving_ID2,size,fv_credit,time_upload,user_id,...,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calcium,sodium,saturated_fatty_acids,cholesterol
0,1083753,1083753.0,1083753,0.0,55917,55917,164.0,0.0,2012-07-18 14:56:44,1,...,420.0,15.0,18.0,48.0,15.0,2.0,200.0,1110.0,8.0,240.0
1,1067522,1067522.0,1067522,1.0,190733,190733,257.0,1.0,2012-07-18 14:56:44,1,...,45.0,0.7,0.2,10.4,8.4,0.2,11.0,1.0,0.024,0.0
2,1055153,1055153.0,1055153,0.0,19474,19474,56.0,0.0,2012-07-18 15:33:44,1,...,150.0,1.0,9.0,15.0,0.0,2.0,0.0,310.0,1.5,0.0
3,1101026,1101026.0,1101026,0.0,77345,77345,198.0,0.0,2012-07-19 16:43:30,1,...,510.0,29.0,26.0,40.0,9.0,3.0,300.0,1190.0,12.0,90.0
4,1055117,1055117.0,1055117,0.0,31270,31270,117.0,0.0,2012-07-19 16:43:30,1,...,500.0,6.0,25.0,63.0,0.0,6.0,20.0,350.0,3.5,0.0


In [38]:
print('About', df_test.shape[0] - df_test2.shape[0], 'number of food uploads could not be found with both food item and serving matched')

About 143 number of food uploads could not be found with both food item and serving matched


In [39]:
df_test2.columns

Index(['Food_ID', 'Food_ID2', 'Food_ID3', 'credit', 'Serving_ID1',
       'Serving_ID2', 'size', 'fv_credit', 'time_upload', 'user_id',
       'serving_time', 'amount', 'name', 'base', 'calories', 'protein',
       'total_fat', 'total_carbohydrate', 'sugars', 'fiber', 'calcium',
       'sodium', 'saturated_fatty_acids', 'cholesterol'],
      dtype='object')

<b>Calculation Part</b>

In [40]:
nutrition_List = ['calories',
 'protein',
 'total_fat',
 'total_carbohydrate',
 'sugars',
 'fiber',
 'calcium',
 'sodium',
 'saturated_fatty_acids',
 'cholesterol']

In [41]:
included_List = ['user_id', 'Food_ID','Serving_ID1','time_upload','serving_time','amount','base','size','credit','fv_credit']
included_List = included_List + nutrition_List
included_List

['user_id',
 'Food_ID',
 'Serving_ID1',
 'time_upload',
 'serving_time',
 'amount',
 'base',
 'size',
 'credit',
 'fv_credit',
 'calories',
 'protein',
 'total_fat',
 'total_carbohydrate',
 'sugars',
 'fiber',
 'calcium',
 'sodium',
 'saturated_fatty_acids',
 'cholesterol']

In [42]:
#sample test
df_test2 = df_test2[included_List]
df_test2.columns

Index(['user_id', 'Food_ID', 'Serving_ID1', 'time_upload', 'serving_time',
       'amount', 'base', 'size', 'credit', 'fv_credit', 'calories', 'protein',
       'total_fat', 'total_carbohydrate', 'sugars', 'fiber', 'calcium',
       'sodium', 'saturated_fatty_acids', 'cholesterol'],
      dtype='object')

<b>Formula: (food_servings.size / food_foods_fv.base) * food_upload.amount * (nutrition metric)</b>

In [43]:
#Applying to entire dataframe
#After Calculation
import warnings
warnings.filterwarnings("ignore")
df_final = df_test2
for nutrition_item in nutrition_List:
    for i in range(df_final.shape[0]):
        df_final[nutrition_item][i] = df_final['size'][i] / df_final['base'][i] * df_final['amount'][i] * df_final[nutrition_item][i]    

In [44]:
#calculate result fv credit (multiply by amount)
for i in range(df_final.shape[0]):
    df_final['fv_credit'][i] = df_final['fv_credit'][i] * df_final['amount'][i]

In [45]:
df_final.head(5)

Unnamed: 0,user_id,Food_ID,Serving_ID1,time_upload,serving_time,amount,base,size,credit,fv_credit,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calcium,sodium,saturated_fatty_acids,cholesterol
0,1,1083753,55917,2012-07-18 14:56:44,Breakfast,1.0,164.0,164.0,0.0,0.0,420.0,15.0,18.0,48.0,15.0,2.0,200.0,1110.0,8.0,240.0
1,1,1067522,190733,2012-07-18 14:56:44,Breakfast,1.0,95.5,257.0,1.0,1.0,121.099476,1.88377,0.53822,27.987435,22.605236,0.53822,29.602094,2.691099,0.064586,0.0
2,1,1055153,19474,2012-07-18 15:33:44,Breakfast,1.0,56.0,56.0,0.0,0.0,150.0,1.0,9.0,15.0,0.0,2.0,0.0,310.0,1.5,0.0
3,1,1101026,77345,2012-07-19 16:43:30,Lunch,1.0,198.0,198.0,0.0,0.0,510.0,29.0,26.0,40.0,9.0,3.0,300.0,1190.0,12.0,90.0
4,1,1055117,31270,2012-07-19 16:43:30,Lunch,1.0,154.0,117.0,0.0,0.0,379.87013,4.558442,18.993506,47.863636,0.0,4.558442,15.194805,265.909091,2.659091,0.0


In [46]:
#drop rows with null size/missing values
df_final2 = df_final.dropna(subset=['size'])

In [47]:
print('About', df_final.shape[0] - df_final2.shape[0], 'number of food uploads are missing from upload information')

About 4359 number of food uploads are missing from upload information


In [48]:
df_final2.to_csv('Result/Food/food_upload_combined.csv', index=False)

## Daily Level Aggregation 

In [50]:
#get day-level date
df_final2 = pd.read_csv('Result/Food/food_upload_combined.csv') 
df_final2['time_upload'] = df_final2['time_upload'].astype('string').str[:10]
df_final2.head(3)

Unnamed: 0,user_id,Food_ID,Serving_ID1,time_upload,serving_time,amount,base,size,credit,fv_credit,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calcium,sodium,saturated_fatty_acids,cholesterol
0,1,1083753,55917,2012-07-18,Breakfast,1.0,164.0,164.0,0.0,0.0,420.0,15.0,18.0,48.0,15.0,2.0,200.0,1110.0,8.0,240.0
1,1,1067522,190733,2012-07-18,Breakfast,1.0,95.5,257.0,1.0,1.0,121.099476,1.88377,0.53822,27.987435,22.605236,0.53822,29.602094,2.691099,0.064586,0.0
2,1,1055153,19474,2012-07-18,Breakfast,1.0,56.0,56.0,0.0,0.0,150.0,1.0,9.0,15.0,0.0,2.0,0.0,310.0,1.5,0.0


In [51]:
#aggregate day-level nutrients (sum)

df_final3= ps.sqldf("SELECT user_id, time_upload as upload_time, sum(calories) as calories, sum(protein) as protein, sum(total_fat) as total_fat, sum(total_carbohydrate) as total_carbohydrate, sum(sugars) as sugars, sum(fiber) as fiber, sum(calcium) as calciumm, sum(sodium) as sodium, sum(saturated_fatty_acids) as saturated_fatty_acids, sum(cholesterol) as cholesterol, sum(fv_credit) as fv_credit FROM df_final2 group by user_id, time_upload")

In [52]:
df_final3.head(5)

Unnamed: 0,user_id,upload_time,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calciumm,sodium,saturated_fatty_acids,cholesterol,fv_credit
0,1,2012-07-18,691.099476,17.88377,27.53822,90.987435,37.605236,4.53822,229.602094,1422.691099,9.564586,240.0,1.0
1,1,2012-07-19,1169.87013,33.558442,44.993506,163.863636,9.0,7.558442,315.194805,1467.909091,14.659091,90.0,0.0
2,1,2012-07-20,450.0,27.5,37.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,2012-07-25,1602.0,48.0,63.0,249.0,121.0,7.0,0.0,2648.0,25.0,677.0,0.0
4,1,2012-07-28,915.0,13.9,35.0,123.1,72.5,3.5,20.0,1060.052854,5.5,7.5,0.0


In [53]:
df_final3.to_csv('Result/Food/food_upload_combined_userlevel.csv', index=False)

## Statistical Summary

### Combine Recipe with Food Items

In [54]:
df_recipe = pd.read_csv('Result/Food/recipe.csv')
df_food = pd.read_csv('Result/Food/food_upload_combined.csv')
df_food = df_food[pd.to_numeric(df_food['Food_ID'], errors='coerce').notnull()]

In [55]:
#rename column and select columns of interest
df_recipe.columns = ['user_id',
 'food_id',
 'serving_id',
 'time_upload',
 'name',
 'amount',
 'base',
 'size',
 'creditrecipe',
 'fv_credit',
 'calories',
 'protein',
 'total_fat',
 'total_carbohydrate',
 'sugars',
 'fiber',
 'calcium',
 'sodium',
 'saturated_fatty_acids',
 'cholesterol']

df_food.columns = ['user_id',
 'food_id',
 'serving_id',
 'time_upload',
 'serving_time',
 'amount',
 'base',
 'size',
 'credit',
 'fv_credit',
 'calories',
 'protein',
 'total_fat',
 'total_carbohydrate',
 'sugars',
 'fiber',
 'calcium',
 'sodium',
 'saturated_fatty_acids',
 'cholesterol']

In [56]:
food_item = df_food_item[['food_id', 'name']]
df_food = pd.merge(food_item , df_food, on=['food_id'])

In [57]:
columns_interest = ['user_id',
 'food_id',
 'serving_id',
 'time_upload',
 'name',
 'amount',
 'base',
 'size',
 'fv_credit',
 'calories',
 'protein',
 'total_fat',
 'total_carbohydrate',
 'sugars',
 'fiber',
 'calcium',
 'sodium',
 'saturated_fatty_acids',
 'cholesterol']

df_food = df_food[columns_interest]
df_recipe = df_recipe[columns_interest]

In [58]:
df_food_all = df_food.append(df_recipe)

In [59]:
df_food_all.head(5)

Unnamed: 0,user_id,food_id,serving_id,time_upload,name,amount,base,size,fv_credit,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calcium,sodium,saturated_fatty_acids,cholesterol
0,343,1001004,191059,2013-05-10 13:19:00,"Blue, bleu",2.0,100.0,17.0,0.0,120.02,7.276,9.7716,0.7956,0.17,0.0,179.52,474.3,6.34746,25.5
1,105,1001006,191069,2012-11-10 15:26:16,Brie,0.5,100.0,128.0,0.0,213.76,13.28,17.7152,0.288,0.288,0.0,117.76,402.56,11.1424,64.0
2,113,1001006,191070,2012-11-11 23:09:44,Brie,1.5,100.0,144.0,0.0,721.44,44.82,59.7888,0.972,0.972,0.0,397.44,1358.64,37.6056,216.0
3,200,1001006,191067,2012-12-15 23:04:37,Brie,1.0,100.0,17.0,0.0,56.78,3.5275,4.7056,0.0765,0.0765,0.0,31.28,106.93,2.9597,17.0
4,352,1001006,191067,2013-05-20 21:49:14,Brie,1.0,100.0,17.0,0.0,56.78,3.5275,4.7056,0.0765,0.0765,0.0,31.28,106.93,2.9597,17.0


In [60]:
df_food_all.to_csv('Result/Food/food_all.csv', index=False)

In [61]:
# aggregate day-level nutrients (sum)
df_food_all_userlevel = ps.sqldf("SELECT user_id, time_upload as upload_time, sum(calories) as calories, sum(protein) as protein, sum(total_fat) as total_fat, sum(total_carbohydrate) as total_carbohydrate, sum(sugars) as sugars, sum(fiber) as fiber, sum(calcium) as calciumm, sum(sodium) as sodium, sum(saturated_fatty_acids) as saturated_fatty_acids, sum(cholesterol) as cholesterol, sum(fv_credit) as fv_credit FROM df_food_all group by user_id, time_upload")

In [62]:
df_food_all_userlevel.to_csv('Result/Food/food_all_userlevel.csv', index=False)

### Tag Variables (by gender, IQR)

In [63]:
df_food = pd.read_csv('Result/Food/food_all_userlevel.csv')
df_food.head(5)

Unnamed: 0,user_id,upload_time,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calciumm,sodium,saturated_fatty_acids,cholesterol,fv_credit
0,1,2012-07-18 14:56:44,541.099476,16.88377,18.53822,75.987435,37.605236,2.53822,229.602094,1112.691099,8.064586,240.0,1.0
1,1,2012-07-18 15:33:44,150.0,1.0,9.0,15.0,0.0,2.0,0.0,310.0,1.5,0.0,0.0
2,1,2012-07-19 16:43:30,889.87013,33.558442,44.993506,87.863636,9.0,7.558442,315.194805,1455.909091,14.659091,90.0,0.0
3,1,2012-07-19 16:52:49,280.0,0.0,0.0,76.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0
4,1,2012-07-20 09:40:04,450.0,27.5,37.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [64]:
def outliers(df, column, IQR_level):
    l = list(df[column])
    for i in range(0, len(l)): 
        l[i] = float(l[i])
    
    #outlier percentage
    q1 = np.percentile(l, 25)  
    q3 = np.percentile(l, 75)
    IQR = q3-q1
    low = q1 - IQR_level*IQR
    high = q3 + IQR_level*IQR
    
    return([low, high])

# add gender
df_redcap = pd.read_csv('Raw Data/mbc2_redcap.csv') 
df_gender = df_redcap[['Participant ID', 'Sex']]
df_gender.columns = ['study_id', 'gender']
df_user_info = df_user[['study_id','user_id']]
df_info = df_gender.merge(df_user_info, on = 'study_id')
df_info['gender'] = df_info['gender'].astype('string').str[0]

#merge gender with info
df_food_gender = df_food.merge(df_info, on = 'user_id')

#remove user's ID
df_food_gender = df_food_gender.drop(['user_id'], axis=1)
df_food_gender.head(5)

Unnamed: 0,upload_time,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calciumm,sodium,saturated_fatty_acids,cholesterol,fv_credit,study_id,gender
0,2012-07-18 14:56:44,541.099476,16.88377,18.53822,75.987435,37.605236,2.53822,229.602094,1112.691099,8.064586,240.0,1.0,1436,1
1,2012-07-18 15:33:44,150.0,1.0,9.0,15.0,0.0,2.0,0.0,310.0,1.5,0.0,0.0,1436,1
2,2012-07-19 16:43:30,889.87013,33.558442,44.993506,87.863636,9.0,7.558442,315.194805,1455.909091,14.659091,90.0,0.0,1436,1
3,2012-07-19 16:52:49,280.0,0.0,0.0,76.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,1436,1
4,2012-07-20 09:40:04,450.0,27.5,37.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1436,1


In [65]:
# split by gender 
df_male = df_food_gender[df_food_gender['gender'] == '1']
df_female = df_food_gender[df_food_gender['gender'] == '2']

#get outliers info
cal_male = outliers(df_male, 'calories', 1.5)
cal_female = outliers(df_female, 'calories', 1.5)
cal_male2 = outliers(df_male, 'calories', 3.0)
cal_female2 = outliers(df_female, 'calories', 3.0)

fv_male = outliers(df_male, 'fv_credit', 1.5)
fv_female = outliers(df_female, 'fv_credit', 1.5)
fv_male2 = outliers(df_male, 'fv_credit', 3.0)
fv_female2 = outliers(df_female, 'fv_credit', 3.0)

fat_male = outliers(df_male, 'total_fat', 1.5)
fat_female = outliers(df_female, 'total_fat', 1.5)
fat_male2 = outliers(df_male, 'total_fat', 3.0)
fat_female2 = outliers(df_female, 'total_fat', 3.0)

In [66]:
high_male1 = cal_male[1]
high_male2 = cal_male2[1]
high_female1= cal_female[1]
high_female2 = cal_female2[1]

def outliers_cal(row):
    if (row['gender'] == '1'): #male
        if(row['calories'] > high_male2):
            val = 2
        elif(row['calories'] > high_male1):
            val = 1
        else:
            val = 0
    else: #female
        if(row['calories'] > high_female2):
            val = 2
        elif(row['calories'] > high_female1):
            val = 1
        else:
            val = 0
    return val

df_food_gender['calories_outlier'] = df_food_gender.apply(outliers_cal, axis=1)

In [67]:
high_male1 = fv_male[1]
high_male2 = fv_male2[1]
high_female1= fv_female[1]
high_female2 = fv_female2[1]

def outliers_fv(row):
    if (row['gender'] == '1'): #male
        if(row['fv_credit'] > high_male2):
            val = 2
        elif(row['fv_credit'] > high_male1):
            val = 1
        else:
            val = 0
    else: #female
        if(row['fv_credit'] > high_female2):
            val = 2
        elif(row['fv_credit'] > high_female1):
            val = 1
        else:
            val = 0
    return val

df_food_gender['fv_outlier'] = df_food_gender.apply(outliers_fv, axis=1)

In [68]:
high_male1 = fat_male[1]
high_male2 = fat_male2[1]
high_female1= fat_female[1]
high_female2 = fat_female2[1]

def outliers_fat(row):
    if (row['gender'] == '1'): #male
        if(row['total_fat'] > high_male2):
            val = 2
        elif(row['total_fat'] > high_male1):
            val = 1
        else:
            val = 0
    else: #female
        if(row['total_fat'] > high_female2):
            val = 2
        elif(row['total_fat'] > high_female1):
            val = 1
        else:
            val = 0
    return val

df_food_gender['fat_outlier'] = df_food_gender.apply(outliers_fat, axis=1)

In [69]:
df_food_gender.head(5)

Unnamed: 0,upload_time,calories,protein,total_fat,total_carbohydrate,sugars,fiber,calciumm,sodium,saturated_fatty_acids,cholesterol,fv_credit,study_id,gender,calories_outlier,fv_outlier,fat_outlier
0,2012-07-18 14:56:44,541.099476,16.88377,18.53822,75.987435,37.605236,2.53822,229.602094,1112.691099,8.064586,240.0,1.0,1436,1,0,2,0
1,2012-07-18 15:33:44,150.0,1.0,9.0,15.0,0.0,2.0,0.0,310.0,1.5,0.0,0.0,1436,1,0,0,0
2,2012-07-19 16:43:30,889.87013,33.558442,44.993506,87.863636,9.0,7.558442,315.194805,1455.909091,14.659091,90.0,0.0,1436,1,0,0,0
3,2012-07-19 16:52:49,280.0,0.0,0.0,76.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,1436,1,0,0,0
4,2012-07-20 09:40:04,450.0,27.5,37.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1436,1,0,0,0


In [70]:
df_food_gender.to_csv('Result/Food/food_final.csv',  index=False)