In [1]:
import difflib
import re
import numpy as np
import pandas as pd
import ast
from fuzzywuzzy import fuzz 

import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
# Reads the food with 6,000 rows
with open("../Reference and Datasets/foodList.txt") as f:
    foodList = f.read().splitlines()
    
# Reads the large food data with 200,000 rows
food_nutrient_large = pd.read_csv('../NutrientData/Food Data/food_nutrients_dict.csv')

# Reads categorized food with 16,000 rows
food_categ = pd.read_csv('../NutrientData/food_categorized_nurtrients_w_name.csv')

In [61]:
# Reads expiration date
expr = pd.read_csv('../Reference and Datasets/expiration_data.csv')
expr_dates = dict(zip(expr['name'], expr['expireIn']))

# Cleans up the one nan float value in expr_names
expr_names = list(expr_dates.keys())
epxr_names = [e for e in expr_names if type(e) != float] 

# Maps from category index to expr date
expr_categ = {
    1: 2,
    2: 7,
    3: 10,
    4: 365,
    5: 7,
    6: 3,
    7: 3,
    8: 240,
    9: 3,
    10: 2,
    11: 7,
    12: 180,
    13: 3,
    14: 30,
    15: 3,
    16: 365,
    17: 3,
    18: 5,
    19: 180,
    20: 30, # pasta
    21: 7, #FAST FOOD
    22: 3,
    23: 365,
    25: 2,
    26: 7
}

In [3]:
# Cleans up foodList
temp = list(filter(lambda x: x not in ['baby', 'producer', 'red', '85% lean', 'baked', 'leg','greater than 3% juice', 'family style'
                                      ,'polish', 'greek', 'on the border', 'tlc', 'low calorie', 'milk producer', 'producer milk', 'green', 'grade'], foodList))
foodList = temp


In [4]:
# Creates a series of food_names with comma, for regex extracting purposes
food_comma = food_categ['food_name'] + ','

# Extracts the food's generic names 
food_categ['generic'] = food_comma.str.extract(r'^([^,]*),')[0]

In [5]:
# Get rid of commas, colons, and other special characters 
food_clean_name = food_categ['food_name'].str.replace(r'[^a-zA-Z0-9 ]', "").str.replace(r' +', ' ')

In [6]:
# Compiles a list of foodnames with 200k data and with 16k data respectively
foodlist_large = food_nutrient_large['name'].tolist()
foodlist_categ = food_clean_name.tolist()
foodlist_generic = food_categ['generic'].tolist()

In [8]:
# RegEx generator to match with decimals surrounded by indices
regexp = re.compile(r'^[^a-z]+\d\.\d')
findFoodName = re.compile("([^\.]+) \d+\.")

In [9]:
# sample receipts. Input 4 works the best.
inp_one = [ '1805 University AVe. 2.53', 'Berkeley CA 94703 2.53', 'Store # 186 (510) 204-9074 2.53', 'UPEN 8:0OAM 10 9:0OPM DAILY 2.53', '21 SEASONI NG SALUE 2,202 2.53', '0 SUNFL OWER 2.53', 'SEA SALI FINE CRYSTALS 26.502 2.53', 'SOURDOUGT BATARD 2.53', 'GRAPES GREEN SEEDI ESS 2LB 2.53', 'BIRTHDAY CARO 2.53', 'BBQ CUT SALMON 2.53', 'SLCD SAL AMI PPRED COLUMUS 2.53', 'ORCANTC CHICKL TENDERS 2.53', 'PECORTNO ROMAW 2.53', 'SPICE INO BIACK PEPPER 207 2.53', '8.0 2.53', '5.75 2.53', '1.99 2.53', '3.49 2.53', '5.49 2.53', '1.38 2.53', 'EGGS EXIRA LARGE CAGE FRE BRU 2.53', 'BACON ABr UNCURED DAY RUBBED 2.53', 'POTATO RUSSET EACH 2.53', '@ 0.69/A 2.53', '2EA 2.53', 'AVDCADO EACH HASS 2.53', '@1.59/EA 2.53', '0.98 2.53', 'A- LAR 1C EACIH A9/t) UNCA 2.53', '0.49/EA 2.53', 'IRSH KFRRYOOD UNSA I 2.53', '2LA 2.53', 'BUTTLR 2.53', 'BuftA EACII 2.53' ]
inp_two = [ 'LIUR TED 2.53','Regular Price 2.53','Card Savings 2.53','6.49 S 2.53','8.49 2.53','2.00- 2.53','SEAFOOD 2.53','SALMON ATLANTIC 2.53','SMOKED SALMON LOX 2.53','13.98 S 2.53','9.99 S 2.53','PRODUCE 2.53','1 2.53','QTY 2.53','CUCUMBERS 2.53','1.00 S 2.53','0.39 1b @ $1.99 /lb 2.53','WT 2.53','0.78 S 2.53','RED ROMA TOMATOES 2.53','3.99 S 2.53','SWT YOUNG COCONUT 2.53','BOK CHOY BABY 2.53','GINGER ROOT 2.53','0.51 lb @ $2.99 /1b 2.53','1.52 S 2.53','WT 2.53','0.10 lb @ $3.99 /lb 2.53','0.40 S 2.53','3.98 S 2.53','WT 2.53','2 QTY PORTABELLA 2.53','0.21 lb @ $2.99 /lb 2.53','WT 2.53','0.63 S 2.53','6.99 S 2.53','SHALLOTS 2.53','0 ORGNC GAPES RED 2.53','1.68 Ib @ $0.99 /1b 2.53','WT 2.53','1.66 S 2.53','ORG BANANAS 2.53','DELI 2.53','6.80 2.53','PORT SALUT WHEEL 2.53' ]
inp_three = [ 'GROCERY 2.53','99 S 2.53','HIME SHALLOTS NOR1 2.53','549 2.53','O50- 2.53','Regular Price 2.53','Card Savings 2.53','DYNASTY RICE 5 LB 2.53','KASHI CRL GOLEA 2.53','Regular Price 2.53','Card Savings 2.53','2 QTY DE CECCO 8 2.53','KIKKOMAN GRGNC SO 2.53','Regular Price 2.53','Card Savings 2.53','PANDA EXPRESS MNDR 2.53','Resular Price 2.53','Card Savinss 2.53','MARINONNECRN R 2.53','LUNDBERG RTCE HT 2.53','LUNDBERG RICE 2.53','GROCERY COUPON 2.53','Resular Price 2.53','Card Savinas 2.53','0TY MP PRENIUN 2.53','1195 2.53','395 S 2.53','4.99 2.53','1.00- 2.53','2.95 S 2.53','3.99 2.53','1.00- 2.53','350S 2.53','0.01 2.53','0.0t- 2.53','NP TONAT0 ASIL 2.53','200S 2.53','GROCERY 2.53' ]
inp_four = [ 'PRODUCE 2.53','MSHRM PORTABE! LO 2.53','S FARMS SPACH 2.53','Regular Price 2.53','Card Savings 2.53','2 QTY CANTALQUPE 2.53','Regular 2.53','Card Savings 2.53','3.99 c 2.53','1.99 2.53','0.32- 2.53','Price 2.53','5.98 2.53','0.98- 2.53','O. 66 lb @ $i.49 /lb 2.53','JUt 30 YLW ONIONS 2.53','GREEN SDLSS GRAPES 2.53','CABBAGE SAVOY 2.53','GINGER ROOT 2.53','0.98 2.53','7.71 2.53','5.63 2.53','WT 2.53','WT 2.53','WT 2.53','2.58 lb @ $2 99 /lb 2.53','2.83 lb @ $1..99 /lb 2.53','0.41 lb @ $3.99 /1b 2.53','1.00 lb @ $2.99 /1b 2.53','1.64 2.53','WT 2.53','TOMATOES ON VINE 2.53','RASPBERRIES RED 2.53','WT 2.53','2.99 2.53','6.9: 2.53','5.99 2.53','0 ORGNC SWEET CORN 2.53','1 QTY 2.53','0 ORGNC MSHRH LIHTE 2.53','0 ORGNC FRESH HERB 2.53','3.50 2.53','5.00 2.53','ADDITIONAL DISCOUNTS 2.53','5.0 2.53','BASKET $5 OFf 0un Brands 2.53' ]


In [136]:
def findExpr(name, categ):
    bestRatio = 0
    best = ""
    for e in epxr_names:
        r = fuzz.ratio(name, e)
        if r > bestRatio:
            bestRatio = r
            best = e
    if bestRatio <= 60:
        return name, expr_categ.get(categ,7)
    return best, expr_dates[best]

# TEST: findExpr('ginger root', 11)

('ginger root', 7)

In [10]:
# Finds the best match of the food item on the receipt from the database. Outputs a list of dictionaries.
all_scores = []
def matchReceipt(inp):  
    food = []
    for line in inp:
        if regexp.search(line):
            if findFoodName.search(line):
                words = findFoodName.search(line).group(0)
                bestRatio = 0
                pos = 0
                name = ""
                for i, f in enumerate(foodlist_categ):
                    r = fuzz.token_set_ratio(words, f)
                    if r > bestRatio:
                        pos = i
                        name = f
                        bestRatio = r
                if bestRatio > 50:
                    d = food_categ.loc[pos, 'nutrients']
                    gen = food_categ.loc[pos, 'generic']
                    food.append([words, name, gen, ast.literal_eval(d)])  
    return food

In [138]:
def advanceMatchReceipt(inp):
    final = []
    clean_orig_pair = {}
    for line in inp:
        if regexp.search(line):
            if findFoodName.search(line):
                orig = findFoodName.search(line).group(0)
                bestRatio = 0
                name = ""
                for i, f in enumerate(foodList):
                    r = fuzz.token_set_ratio(orig, f)
                    if r > bestRatio:
                        name = f
                        bestRatio = r
                if bestRatio > 65:
                    clean_orig_pair[name] = [orig, bestRatio]
    for clean in clean_orig_pair.keys():
        bestRatio = 0
        pos = 0
        name = ""
        for i, f in enumerate(foodlist_categ):
            r = fuzz.ratio(clean, f)*0.5 + fuzz.token_set_ratio(clean_orig_pair[clean][0].lower(), f)*0.5
            if r > bestRatio:
                pos = i
                name = f
                bestRatio = r
        if bestRatio > 60:
            gen = food_categ.loc[pos, 'generic']
            categ = int(food_categ.loc[pos, 'food_category_id'])
            e = findExpr(gen, categ)
            d = food_categ.loc[pos, 'nutrients']
            nutri = ast.literal_eval(d)
            final.append([clean_orig_pair[clean][0], clean, nutri, e[1]])
#             final.append([clean_orig_pair[clean][0], clean, name, e])
    return final 

In [139]:
advanceMatchReceipt(inp_two)

[['SMOKED SALMON LOX 2.',
  'salmon',
  {'Energy': 117.0,
   'Folate, DFE': 2.0,
   'Retinol': 26.0,
   'Vitamin A, RAE': 26.0,
   'Protein': 18.28,
   'Ash': 2.62,
   'Iron, Fe': 0.85,
   'Magnesium, Mg': 18.0,
   'Phosphorus, P': 164.0,
   'Sodium, Na': 2000.0,
   'Copper, Cu': 0.23,
   'Manganese, Mn': 0.02,
   'Vitamin A, IU': 88.0,
   'Thiamin': 0.02,
   'Riboflavin': 0.1,
   'Folate, total': 2.0,
   'Vitamin B-12': 3.26,
   'Folate, food': 2.0,
   'Tryptophan': 0.2,
   'Threonine': 0.8,
   'Methionine': 0.54,
   'Phenylalanine': 0.71,
   'Tyrosine': 0.62,
   'Alanine': 1.11,
   'Glutamic acid': 2.73,
   'Glycine': 0.88,
   'Proline': 0.65,
   'Cholesterol': 23.0,
   'Fatty acids, total saturated': 0.93,
   '14:0': 0.18,
   '22:6 n-3 (DHA)': 0.27,
   '16:1': 0.3,
   '20:5 n-3 (EPA)': 0.18,
   '22:5 n-3 (DPA)': 0.07,
   'Total lipid (fat)': 4.32,
   'Water': 72.0,
   'Calcium, Ca': 11.0,
   'Potassium, K': 175.0,
   'Zinc, Zn': 0.31,
   'Niacin': 4.72,
   'Pantothenic acid': 0.87,


In [126]:
[e for e in epxr_names if e == 'tomato']
food_categ[food_categ['generic'] == 'ginger root']
list(food_categ['food_category_id'].unique())

[11.0,
 7.0,
 1.0,
 4.0,
 16.0,
 12.0,
 9.0,
 18.0,
 2.0,
 6.0,
 5.0,
 15.0,
 19.0,
 25.0,
 13.0,
 10.0,
 14.0,
 23.0,
 17.0,
 20.0,
 21.0,
 22.0,
 3.0,
 24.0,
 8.0]

In [65]:
matchReceipt(inp_two)

[['LIUR TED 2.',
  'butter salted',
  'butter',
  {'4:0': 3.23,
   '6:0': 2.01,
   '10:0': 2.53,
   '16:0': 21.7,
   '18:0': 10.0,
   '20:0': 0.14,
   '20:1': 0.1,
   '18:1 t': 2.98,
   '18:2 CLAs': 0.27,
   '16:1 c': 0.96,
   '18:3 n-3 c,c,c (ALA)': 0.32,
   '17:0': 0.56,
   '18:2 i': 0.3,
   '18:1 c': 16.98,
   '18:2 n-6 c,c': 2.17,
   '8:0': 1.19,
   '12:0': 2.59,
   '14:0': 7.44,
   'Iron, Fe': 0.02,
   'Magnesium, Mg': 2.0,
   'Phosphorus, P': 24.0,
   'Cholesterol': 215.0,
   'Vitamin B-12': 0.17,
   'Potassium, K': 24.0,
   'Zinc, Zn': 0.09,
   'Fluoride, F': 2.8,
   'Sodium, Na': 643.0,
   'Folate, total': 3.0,
   'Energy': 2999.0,
   'Folate, DFE': 3.0,
   'Vitamin E (alpha-tocopherol)': 2.32,
   '18:1': 19.96,
   '18:2': 2.73,
   '18:3': 0.32,
   'Fatty acids, total monounsaturated': 21.02,
   'Fatty acids, total polyunsaturated': 3.04,
   'Fatty acids, total trans-polyenoic': 0.3,
   'Fatty acids, total trans': 3.28,
   'Fatty acids, total saturated': 51.37,
   'Vitamin K (p

In [58]:
advanceMatchReceipt(["ORANGE 2.0", "RAW APPLE 3.0"])

TypeError: object of type 'float' has no len()

In [46]:
fuzz.ratio('apple', 'apple juice beverage') <  fuzz.ratio('apple', 'apple')


True

In [14]:
#         for w in words:
#             r = 0
#             pos = 0
#             name = ""
#             for i, f in enumerate(foodlist_categ):
#                 s = difflib.SequenceMatcher(lambda x: x == ".", w.lower(), f)
#                 if s.ratio() > r:
#                     r = s.ratio()
#                     pos = i
#                     name = f
#             if r > 0.7:
#                 d = food_categ.loc[pos, 'nutrients']
#                 food.append((name, ast.literal_eval(d)))   

In [31]:
expr.shape

(455, 2)

In [32]:
len(foodList)

6750

In [59]:
expr.head()

Unnamed: 0,name,expireIn
0,apple juice canned,540
1,apples applesauce canned,720
2,apricots canned,720
3,baby cereal del monte oatmeal based,30
4,baby cereal del monte rice based,60
