In [56]:
import pandas as pd
import numpy as np 
import sys, os
import importlib.util

hp = os.path.abspath('../helper_functions.py')
spec_a = importlib.util.spec_from_file_location("helper_functions", hp)
helper_func = importlib.util.module_from_spec(spec_a)
spec_a.loader.exec_module(helper_func)

#need to get our product JSON from parent folder
p = os.path.abspath('../product_info_JSON.py')
spec = importlib.util.spec_from_file_location("product_info_JSON", p)
json_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(json_module)
#print(json_module.PRODUCT_INFORMATION)

# ---------------------------- End of Lookup Table work ---------------------------------

# Create the Abbott product table
Ncare_products_original = pd.read_csv("../../Data/Ncare/Ncare_scrape_data.csv",  usecols= json_module.PRODUCT_INFORMATION.keys()) #This could be replaced with output from other parser
Ncare_products = Ncare_products_original.copy() #to be used by future analysis
Ncare_products.drop_duplicates(keep='first', inplace=True)


In [57]:
Ncare_products.head()

Unnamed: 0,url,store,name,price,size_or_weight,availability,item_id,description,ingredients,allergin_info,...,Sizes,Form,Flavours,Case of X,Case of Y,clinical_indications,benefits,feature_table_rows,usage,entry_date
0,https://www.ncare.net.au/nutrition-products/dy...,Ncare,RESOURCE ThickenUp 1.2g,,,,12151076,Nestle Health Science RESOURCE ThickenUp Clear...,"Maltodextrin (Corn, Potato), Thickener (Xantha...",,...,,,Neutral,,,,,,- First add desired quantity of powder into em...,26/05/2020 10:18:42
1,https://www.ncare.net.au/nutrition-products/dy...,Ncare,RESOURCE ThickenUp 125g,,,,12132987,Nestle Health Science RESOURCE ThickenUp Clear...,"Maltodextrin (Corn, Potato), Thickener (Xantha...",,...,,,Neutral,,,,,,- Use the dosage scoop included in the tin.\n-...,26/05/2020 10:18:42
2,https://www.ncare.net.au/nutrition-products/dy...,Ncare,RESOURCE ThickenUp 900g,,,,12114005,Nestle Health Science RESOURCE ThickenUp Clear...,"Maltodextrin (Corn, Potato), Thickener (Xantha...",,...,,,Neutral,,,,,,- Use the dosage scoop included in the tin.\n-...,26/05/2020 10:18:42
3,https://www.ncare.net.au/nutrition-products/op...,Ncare,OPTIFAST VLCD Shake 18,,,,12317536,,"Vanilla Flavour: Skimmed Milk Powder (31%), Mi...",,...,,,"Chocolate, Vanilla",,,,,,Add one sachet of OPTIFAST® VLCD™ to 200-250mL...,26/05/2020 10:18:42
4,https://www.ncare.net.au/nutrition-products/op...,Ncare,OPTIFAST VLCD Bars,,,,12371262,These delicious and convenient Bars are one th...,"Chocolate Bar: Milk Proteins, Milk Chocolate (...",,...,,,"Cappuccino, Berry Crunch, Cranberry, Assorted,...",,,,,,Instructions\nOpen wrapper and consumer bar. O...,26/05/2020 10:18:42


In [58]:
#fix up id
Ncare_products['item_id'] = Ncare_products['item_id'].str.replace("CASE", '', regex=False)
Ncare_products['item_id'] = Ncare_products['item_id'].str.replace("EACH", '', regex=False)

#removing alternate name for now, better regex will fix this in the future
Ncare_products['ingredients'] = Ncare_products['ingredients'].str.replace("\\(.*?\\)", '', regex=True)
Ncare_products['ingredients'] = Ncare_products['ingredients'].str.replace(".", ',', regex=False)
Ncare_products['ingredients'] = Ncare_products['ingredients'].str.replace("May contain", '', regex=False)
Ncare_products['ingredients'] = Ncare_products['ingredients'].str.replace("Contains", '', regex=False)
#Ncare_products['ingredients']

In [59]:
df = pd.DataFrame(columns=['item_id','ingredient'])
for i in range(len(Ncare_products)):
    row = Ncare_products.iloc[i]
    ingredients = row['ingredients'].split(',')
    ingredients = [i.strip() for i in ingredients if i != '\n ' or i != '' or i != None]
    #print(ingredients)
    item_id = row['item_id'] 
    df2 = pd.DataFrame({'item_id': [item_id]*len(ingredients), 'ingredient': ingredients} )
    df = pd.concat([df, df2])

    
df = df[df['ingredient'] != '']
df

Unnamed: 0,item_id,ingredient
0,12151076,Maltodextrin
1,12151076,Thickener
2,12151076,Mineral Salt
3,12151076,milk
0,12132987,Maltodextrin
...,...,...
10,12297832,Choline Chloride
11,12297832,Taurine
12,12297832,L-carnitine
13,12297832,Anti-Forming Agent


In [60]:
df.to_csv("../../Data/Ncare/ingredients.csv", index=False)

In [61]:
Ncare_products['Flavours'] = Ncare_products['Flavours'].fillna('')
Ncare_products['Flavours'].isnull().sum()

0

In [62]:


df_flav = pd.DataFrame(columns=['item_id','flavour'])
for i in range(len(Ncare_products)):
    row_f = Ncare_products.iloc[i]
    flav = (row_f['Flavours'].split(','))
    item_id = row_f['item_id']
    df2 = pd.DataFrame({'item_id': [item_id]*len(flav), 'flavour': flav} )
    df_flav = pd.concat([df_flav, df2])
    


In [63]:
df_flav.to_csv('../../Data/Ncare/flavours.csv', index=False)

In [64]:
Ncare_products['benefits']

0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
6    NaN
7    NaN
8    NaN
9    NaN
10   NaN
11   NaN
12   NaN
13   NaN
14   NaN
15   NaN
16   NaN
17   NaN
18   NaN
19   NaN
Name: benefits, dtype: float64

In [65]:
Ncare_products.head()

Unnamed: 0,url,store,name,price,size_or_weight,availability,item_id,description,ingredients,allergin_info,...,Sizes,Form,Flavours,Case of X,Case of Y,clinical_indications,benefits,feature_table_rows,usage,entry_date
0,https://www.ncare.net.au/nutrition-products/dy...,Ncare,RESOURCE ThickenUp 1.2g,,,,12151076,Nestle Health Science RESOURCE ThickenUp Clear...,"Maltodextrin , Thickener , Mineral Salt , mil...",,...,,,Neutral,,,,,,- First add desired quantity of powder into em...,26/05/2020 10:18:42
1,https://www.ncare.net.au/nutrition-products/dy...,Ncare,RESOURCE ThickenUp 125g,,,,12132987,Nestle Health Science RESOURCE ThickenUp Clear...,"Maltodextrin , Thickener , Mineral Salt , mil...",,...,,,Neutral,,,,,,- Use the dosage scoop included in the tin.\n-...,26/05/2020 10:18:42
2,https://www.ncare.net.au/nutrition-products/dy...,Ncare,RESOURCE ThickenUp 900g,,,,12114005,Nestle Health Science RESOURCE ThickenUp Clear...,"Maltodextrin , Thickener , Mineral Salt , mil...",,...,,,Neutral,,,,,,- Use the dosage scoop included in the tin.\n-...,26/05/2020 10:18:42
3,https://www.ncare.net.au/nutrition-products/op...,Ncare,OPTIFAST VLCD Shake 18,,,,12317536,,"Vanilla Flavour: Skimmed Milk Powder , Milk Pr...",,...,,,"Chocolate, Vanilla",,,,,,Add one sachet of OPTIFAST® VLCD™ to 200-250mL...,26/05/2020 10:18:42
4,https://www.ncare.net.au/nutrition-products/op...,Ncare,OPTIFAST VLCD Bars,,,,12371262,These delicious and convenient Bars are one th...,"Chocolate Bar: Milk Proteins, Milk Chocolate ...",,...,,,"Cappuccino, Berry Crunch, Cranberry, Assorted,...",,,,,,Instructions\nOpen wrapper and consumer bar. O...,26/05/2020 10:18:42


In [66]:
# ------------- Setup dataframe and columns -----------------

del_cols = ['price', 'size_or_weight', 'availability', 'allergin_info', 'Case of X', 'Case of Y',
           'Flavours', 'Sizes', 'Form', 'ingredients', 'clinical_indications', 'benefits', 'feature_table_rows', 'footnotes']

for c in del_cols:
    if c in Ncare_products.columns:
        del Ncare_products[c]
        


In [67]:

del_cols = list()

for i in range(1, 4):
    del_cols.append('serving_size_' + str(i))
    del_cols.append('nutrient_table_' + str(i))
    del_cols.append('vitamin_table_' + str(i))
    del_cols.append('mineral_table_' + str(i))
    
for c in del_cols:
    if c in Ncare_products.columns:
        del Ncare_products[c]


In [68]:
Ncare_products.head()

Unnamed: 0,url,store,name,item_id,description,usage,entry_date
0,https://www.ncare.net.au/nutrition-products/dy...,Ncare,RESOURCE ThickenUp 1.2g,12151076,Nestle Health Science RESOURCE ThickenUp Clear...,- First add desired quantity of powder into em...,26/05/2020 10:18:42
1,https://www.ncare.net.au/nutrition-products/dy...,Ncare,RESOURCE ThickenUp 125g,12132987,Nestle Health Science RESOURCE ThickenUp Clear...,- Use the dosage scoop included in the tin.\n-...,26/05/2020 10:18:42
2,https://www.ncare.net.au/nutrition-products/dy...,Ncare,RESOURCE ThickenUp 900g,12114005,Nestle Health Science RESOURCE ThickenUp Clear...,- Use the dosage scoop included in the tin.\n-...,26/05/2020 10:18:42
3,https://www.ncare.net.au/nutrition-products/op...,Ncare,OPTIFAST VLCD Shake 18,12317536,,Add one sachet of OPTIFAST® VLCD™ to 200-250mL...,26/05/2020 10:18:42
4,https://www.ncare.net.au/nutrition-products/op...,Ncare,OPTIFAST VLCD Bars,12371262,These delicious and convenient Bars are one th...,Instructions\nOpen wrapper and consumer bar. O...,26/05/2020 10:18:42


In [69]:
df_flav.to_csv('../../Data/Ncare/Ncare_products.csv', index=False)

In [70]:
#nutrition tables
Ncare_nutr = pd.read_csv("../../Data/Ncare/Nutrition_tables/Ncare9517630_nutrition_table.csv") #This could be replaced with output from other parser


In [71]:
Ncare_nutr

Unnamed: 0.1,Unnamed: 0,ARGINAID® Arginine Powder Orange,ARGINAID® Arginine Powder Orange.1,ARGINAID® Arginine Powder Orange.2,ARGINAID® Arginine Powder Orange.3,ARGINAID® Arginine Powder Orange.4,ARGINAID® Arginine Powder Orange.5,ARGINAID® Arginine Powder Orange.6,ARGINAID® Arginine Powder Orange.7,ARGINAID® Arginine Powder Orange.8,ARGINAID® Arginine Powder Orange.9,ARGINAID® Arginine Powder Orange.10,ARGINAID® Arginine Powder Orange.11
0,,Servings Per Pack: 14 Serving Size: 9.2g (Powder),Average Quantityper Serving,Average Quantityper 100g,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.0,Energy,105 kJ,,,,,,,,,,
2,1.0,,25 kcal,,,,,,,,,,
3,2.0,Protein,,,,,,,,,,,
4,3.0,- L-Arginine,4.5 g,,,,,,,,,,
5,4.0,Total Nitrogen,1.4 g,,,,,,,,,,
6,5.0,Carbohydrate,2.0 g,,,,,,,,,,
7,6.0,Fat,0.0 g,,,,,,,,,,
8,7.0,- Saturated,0.0 g,,,,,,,,,,
9,8.0,Sodium,30.0 mg,,,,,,,,,,


In [72]:
Ncare_nutr.iloc[:, 2]

0                           Average Quantityper Serving
1                                                105 kJ
2                                               25 kcal
3                                                   NaN
4                                                 4.5 g
5                                                 1.4 g
6                                                 2.0 g
7                                                 0.0 g
8                                                 0.0 g
9                                               30.0 mg
10                                               5.0 mg
11                                             156.0 mg
12                                              60.0 mg
13                                                  NaN
14    Contains 4.5g per packet of the amino acid L-a...
Name: ARGINAID® Arginine Powder Orange.1, dtype: object

In [73]:
import re

#nutrition tables
full_df = pd.DataFrame(columns = ['nutrient', 'measure',' value', 'item_id'])
for it_id in Ncare_products['item_id'].values:
    pth = '../../Data/Ncare/Nutrition_tables/Ncare' + str(it_id) + '_nutrition_table.csv'
    Ncare_nutr = pd.read_csv("../../Data/Ncare/Nutrition_tables/Ncare9517630_nutrition_table.csv") 
    nutrient = Ncare_nutr.iloc[:, 1]
    nutrient_list = list()

    quantity = Ncare_nutr.iloc[:, 2]
    quantity_list = list()
    measure_list = list()
    value_list = list()

    for i in range(1, len(nutrient)):
        #print(type(nutrient.iloc[i]))
        if (isinstance(nutrient.iloc[i],str)):
            nutrient_list.append(nutrient.iloc[i].lstrip('-'))
            v = str(quantity.iloc[i]).strip()
            val = [i for i in v if (i.isnumeric()) or i == '.']
            val = ''.join(val)
            measures = ''.join(re.findall('[a-zA-Z]+',v)) 
            quantity_list.append(str(quantity.iloc[i]).strip())
            measure_list.append(measures)
            value_list.append(val)

    nutrient_list.pop(len(nutrient_list)-1)
    quantity_list.pop(len(quantity_list)-1)
    measure_list.pop(len(measure_list)-1)
    value_list.pop(len(value_list)-1)

    df = pd.DataFrame({'nutrient': nutrient_list, 'measure':measure_list,' value':value_list, 'item_id': str(it_id)})
    full_df = pd.concat([full_df, df])
                                 
full_df

Unnamed: 0,nutrient,measure,value,item_id
0,Energy,kJ,105,12151076
1,Protein,,,12151076
2,L-Arginine,g,4.5,12151076
3,Total Nitrogen,g,1.4,12151076
4,Carbohydrate,g,2.0,12151076
...,...,...,...,...
6,Saturated,g,0.0,12297832
7,Sodium,mg,30.0,12297832
8,Potassium,mg,5.0,12297832
9,Vitamin C,mg,156.0,12297832


In [237]:
full_df.to_csv('../../Data/Ncare/nutrients.csv', index=False)