In [1]:
% matplotlib inline
import numpy as np
import pandas as pd

In [2]:
# Load the burrito reviews from a CSV file located in the working directory.
df = pd.read_csv('burritos_01022018.csv')

In [3]:
df.head()

Unnamed: 0,Location,Burrito,Date,Neighborhood,Address,URL,Yelp,Google,Chips,Cost,...,Nopales,Lobster,Queso,Egg,Mushroom,Bacon,Sushi,Avocado,Corn,Zucchini
0,Donato's taco shop,California,1/18/2016,Miramar,6780 Miramar Rd,http://donatostacoshop.net/,3.5,4.2,,6.49,...,,,,,,,,,,
1,Oscar's Mexican food,California,1/24/2016,San Marcos,225 S Rancho Santa Fe Rd,http://www.yelp.com/biz/oscars-mexican-food-sa...,3.5,3.3,,5.45,...,,,,,,,,,,
2,Oscar's Mexican food,Carnitas,1/24/2016,,,,,,,4.85,...,,,,,,,,,,
3,Oscar's Mexican food,Carne asada,1/24/2016,,,,,,,5.25,...,,,,,,,,,,
4,Pollos Maria,California,1/27/2016,Carlsbad,3055 Harding St,http://pollosmaria.com/,4.0,3.8,x,6.59,...,,,,,,,,,,


Oh wow, this dataset is ugly. Let's clean it up. The ingredient columns on the right side use an `'x'` / `NaN` encoding that needs to be converted to 1 / 0.

In [4]:
df.columns

Index(['Location', 'Burrito', 'Date', 'Neighborhood', 'Address', 'URL', 'Yelp',
       'Google', 'Chips', 'Cost', 'Hunger', 'Mass (g)', 'Density (g/mL)',
       'Length', 'Circum', 'Volume', 'Tortilla', 'Temp', 'Meat', 'Fillings',
       'Meat:filling', 'Uniformity', 'Salsa', 'Synergy', 'Wrap', 'overall',
       'Rec', 'Reviewer', 'Notes', 'Unreliable', 'NonSD', 'Beef', 'Pico',
       'Guac', 'Cheese', 'Fries', 'Sour cream', 'Pork', 'Chicken', 'Shrimp',
       'Fish', 'Rice', 'Beans', 'Lettuce', 'Tomato', 'Bell peper', 'Carrots',
       'Cabbage', 'Sauce', 'Salsa.1', 'Cilantro', 'Onion', 'Taquito',
       'Pineapple', 'Ham', 'Chile relleno', 'Nopales', 'Lobster', 'Queso',
       'Egg', 'Mushroom', 'Bacon', 'Sushi', 'Avocado', 'Corn', 'Zucchini'],
      dtype='object')

In [5]:
# Columns that are recoded to 0/1 flags by the cleaning step:
# 'Chips' plus every column from 'Beef' through 'Zucchini'.
ingredients = [
    'Chips',
    'Beef', 'Pico', 'Guac', 'Cheese', 'Fries', 'Sour cream',
    'Pork', 'Chicken', 'Shrimp', 'Fish',
    'Rice', 'Beans', 'Lettuce', 'Tomato', 'Bell peper', 'Carrots', 'Cabbage',
    'Sauce', 'Salsa.1', 'Cilantro', 'Onion',
    'Taquito', 'Pineapple', 'Ham', 'Chile relleno', 'Nopales', 'Lobster',
    'Queso', 'Egg', 'Mushroom', 'Bacon', 'Sushi', 'Avocado', 'Corn', 'Zucchini',
]

In [6]:
def fix_ingredients(arg):
    """Recode one raw ingredient cell as a 0/1 indicator.

    Parameters
    ----------
    arg : object
        Raw cell value: an ``'x'`` marker, the number 1, or anything else
        (missing cells arrive as NaN).

    Returns
    -------
    int
        1 when the cell marks the ingredient as present, otherwise 0.
    """
    # Markers are typed by hand, so accept 'X' and stray whitespace
    # in addition to a bare lowercase 'x'.
    if isinstance(arg, str):
        return 1 if arg.strip().lower() == 'x' else 0
    # Already-encoded values (1, 1.0, True) map to 1, so applying the
    # function to its own output leaves the data unchanged.
    return 1 if arg == 1 else 0

In [7]:
# Swap the raw 'x'/NaN ingredient columns for their 0/1 recodings; the recoded
# columns carry a "_1h" suffix and are appended after the remaining columns.
df = (
    df.drop(ingredients, axis=1)
      .join(df[ingredients].applymap(fix_ingredients).add_suffix('_1h'))
)

That fixes some of our encoding woes. Next, let's round the prices off to something less precise and more bin-able. First, let's see how many rows don't have a price; we will likely just drop them.

In [8]:
df.shape

(385, 66)

In [9]:
# Remove the rows (axis=0) that have no recorded price. inplace=True modifies df
# directly instead of returning a new frame. The surviving rows keep their
# original index labels, so labels and row positions no longer line up.
df.dropna(axis=0, subset=['Cost'], inplace=True)

In [10]:
# Bin prices to whole dollars by running the builtin round() over every price.
df['Cost'] = df['Cost'].map(round)

In [11]:
df.head()

Unnamed: 0,Location,Burrito,Date,Neighborhood,Address,URL,Yelp,Google,Cost,Hunger,...,Nopales_1h,Lobster_1h,Queso_1h,Egg_1h,Mushroom_1h,Bacon_1h,Sushi_1h,Avocado_1h,Corn_1h,Zucchini_1h
0,Donato's taco shop,California,1/18/2016,Miramar,6780 Miramar Rd,http://donatostacoshop.net/,3.5,4.2,6,3.0,...,0,0,0,0,0,0,0,0,0,0
1,Oscar's Mexican food,California,1/24/2016,San Marcos,225 S Rancho Santa Fe Rd,http://www.yelp.com/biz/oscars-mexican-food-sa...,3.5,3.3,5,3.5,...,0,0,0,0,0,0,0,0,0,0
2,Oscar's Mexican food,Carnitas,1/24/2016,,,,,,5,1.5,...,0,0,0,0,0,0,0,0,0,0
3,Oscar's Mexican food,Carne asada,1/24/2016,,,,,,5,2.0,...,0,0,0,0,0,0,0,0,0,0
4,Pollos Maria,California,1/27/2016,Carlsbad,3055 Harding St,http://pollosmaria.com/,4.0,3.8,7,4.0,...,0,0,0,0,0,0,0,0,0,0


The built-in `round` function suits this case pretty well.

Let's also fill in the missing Yelp/Google ratings with the column means. That is not very informative, but it makes the data usable with sklearn.

In [12]:
# Replace missing ratings with the mean of the respective rating column.
rating_cols = ['Yelp', 'Google']
df[rating_cols] = df[rating_cols].fillna(df[rating_cols].mean())

In [13]:
from sklearn.neighbors import NearestNeighbors

In [27]:
# Brute-force cosine-distance neighbour search; every query returns 5 matches
# and n_jobs=-1 asks scikit-learn to use all available cores.
burrito_friend = NearestNeighbors(
    n_neighbors=5,
    algorithm='brute',
    metric='cosine',
    n_jobs=-1,
)

In [28]:
# Model inputs: the recoded ingredient flags, followed by price and both ratings.
features = [name + "_1h" for name in ingredients] + ['Cost', 'Yelp', 'Google']

In [29]:
# Fit the neighbour search on the feature columns only (ingredient flags, cost, ratings).
burrito_friend.fit(df[features])

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=-1, n_neighbors=5, p=2, radius=1.0)

In [30]:
# Row position (not index label) of the burrito to find neighbours for.
q_num = 5
# kneighbors needs 2-D input. Indexing with a list keeps the row as a one-row
# DataFrame with numeric dtypes, so no reshape is needed; Series.reshape is
# deprecated in pandas and a single row taken as a Series would be object-dtype.
scores, cases = burrito_friend.kneighbors(df[features].iloc[[q_num]])

  This is separate from the ipykernel package so we can avoid doing imports until


In [31]:
scores[0]

array([ 0.        ,  0.02377312,  0.02945989,  0.02945989,  0.02945989])

In [32]:
cases[0]

array([  5, 110, 219, 216, 217])

In [35]:
# Print a short summary of every neighbour and keep the summaries for ranking.
# `cases` holds row positions, hence iloc rather than loc.
summary_cols = ['Location','Burrito', 'Cost', 'Yelp', 'Google']
recs = []
for case in cases[0]:
    rec = df.iloc[case][summary_cols]
    print(rec)
    print()
    recs.append(rec)

Location     Pollos Maria
Burrito     combo chicken
Cost                    7
Yelp              3.89878
Google            4.17439
Name: 5, dtype: object

Location    Goody's
Burrito      Custom
Cost              8
Yelp        3.89878
Google      4.17439
Name: 110, dtype: object

Location    Lucha Libre North Park
Burrito                  Holy Moly
Cost                             8
Yelp                       3.89878
Google                     4.17439
Name: 221, dtype: object

Location    Lucha Libre North Park
Burrito                  Holy Moly
Cost                             8
Yelp                       3.89878
Google                     4.17439
Name: 218, dtype: object

Location    Lucha Libre North Park
Burrito                  Holy Moly
Cost                             8
Yelp                       3.89878
Google                     4.17439
Name: 219, dtype: object



Well, those are the k nearest neighbors.

Let's see if we can incorporate ratings into our recs.

In [36]:
# Rank the recommendations by the average of their Yelp and Google ratings,
# best-rated first (reverse=True); ties keep their nearest-neighbour order.
sorted(recs, key=lambda rec: (rec['Yelp'] + rec['Google']) / 2, reverse=True)

[Location     Pollos Maria
 Burrito     combo chicken
 Cost                    7
 Yelp              3.89878
 Google            4.17439
 Name: 5, dtype: object, Location    Goody's
 Burrito      Custom
 Cost              8
 Yelp        3.89878
 Google      4.17439
 Name: 110, dtype: object, Location    Lucha Libre North Park
 Burrito                  Holy Moly
 Cost                             8
 Yelp                       3.89878
 Google                     4.17439
 Name: 221, dtype: object, Location    Lucha Libre North Park
 Burrito                  Holy Moly
 Cost                             8
 Yelp                       3.89878
 Google                     4.17439
 Name: 218, dtype: object, Location    Lucha Libre North Park
 Burrito                  Holy Moly
 Cost                             8
 Yelp                       3.89878
 Google                     4.17439
 Name: 219, dtype: object]

Looks good. Let's pickle our dataset and k-nearest-neighbors estimator.

In [38]:
import pickle

In [None]:
# Pickle writes bytes, so the file has to be opened for writing in binary mode
# (the default mode is text/read). The with-block closes the handle so the
# data is flushed to disk.
with open('clean-burrito-pandas.pkl', 'wb') as pkl_file:
    pickle.dump(df, pkl_file)
p