In [53]:
%load_ext autoreload
%autoreload 2

In [110]:
# coding: utf-8
import pickle
import pandas as pd
import spacy
import joblib
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

from utils import NlpLite, DocLite, ModelWrapper

class NotEnoughDataException(Exception):
    pass

# TODO - this should be easier to specify.
CAT_LABEL, HOW = [('catRegion_label', 'Regional'), ('cat152_label', 'Svd')][0]  # Change to [0] for Regional

In [111]:
# Initialize all models.
model = joblib.load('models/model_withCluster_Svd')
nlp = NlpLite('models/nlplite')

In [115]:
import copy

cols = (pd.read_csv('yelp_dataset/businesses_withOtherBusinesses_withCluster_{}.csv'.format(HOW), nrows=1)
          .drop(['business_id', 'categories', 'rating'], axis=1)
          .columns
          .tolist())

print(cols)

def preprocess_user_data(business_preferences, business_category):
    business_preferences = copy.deepcopy(business_preferences)
    catmap_keys = sorted(df[CAT_LABEL].map({v:k for k,v in catmap.items()}).unique())

    vecs1 = [nlp(c1) for c1 in business_category.split(',')]
    vecs2 = [nlp(c2) for c2 in catmap_keys]
    idx = pd.np.argmax([max(c1.similarity(c2) for c1 in vecs1) for c2 in vecs2])
    most_similar_business_category = catmap_keys[idx]

    print("Your business is most similar to the category:", most_similar_business_category, "({})".format(idx))

    # Update business_preferences dictionary to include category label for a more accurate prediction.
    business_preferences[CAT_LABEL] = idx
    
    new = pd.DataFrame(business_preferences, index=[0]).assign(category=[business_category])
    return new.drop(model.fdict.keys(), 1).join(new.apply(model.fdict)).reindex(cols, axis=1).T.squeeze()

['Alcohol', 'BikeParking', 'BusinessAcceptsCreditCards', 'Caters', 'GoodForKids', 'HasTV', 'NoiseLevel', 'OutdoorSeating', 'RestaurantsAttire', 'RestaurantsDelivery', 'RestaurantsGoodForGroups', 'RestaurantsPriceRange2', 'RestaurantsReservations', 'RestaurantsTableService', 'RestaurantsTakeOut', 'WheelchairAccessible', 'WiFi', 'elevation', 'catRegion_label']


### New Business Details

In [113]:
# Replace with user input 
# TODO: don't throw keyerror if key is not present here.
business_category = 'Italian' 
business_preferences = {
    'Alcohol': 'none',
    'BikeParking':0,
    'BusinessAcceptsCreditCards':1,
    'Caters':0,
    'GoodForKids':0,
    'NoiseLevel':'loud',
    'OutdoorSeating':1,
    'RestaurantsAttire':'formal',
    'RestaurantsDelivery':1,
    'RestaurantsGoodForGroups':0,
    'RestaurantsPriceRange2':2,
    'RestaurantsReservations':1,
    'RestaurantsTableService':0,
    'RestaurantsTakeOut':1,
    'WheelchairAccessible':0,
    'WiFi':'no',
    # CAT_LABEL: business_category
}

In [114]:
new = preprocess_user_data(business_preferences, business_category)
new

KeyError: 'catRegion_label'

In [106]:
# Replace with model prediction.
business_predicted_rating = model.clf.predict([new.tolist()])[0]
#business_predicted_rating = 4 # This value will come from phase 1 of the project
business_predicted_rating

6.771461

### Suggesting Improvements

In [50]:
# Attributes to suggest improvements on.
suggested_attributes_list = [
    'Alcohol','BikeParking','BusinessAcceptsCreditCards','Caters','GoodForKids',
    'HasTV','NoiseLevel','OutdoorSeating','RestaurantsAttire','RestaurantsDelivery',
    'RestaurantsGoodForGroups','RestaurantsPriceRange2','RestaurantsReservations',
    'RestaurantsTableService','RestaurantsTakeOut','WheelchairAccessible','WiFi'
]

In [64]:
# Read the id of the business entered by the user 
catmap = pd.read_csv(
        'yelp_dataset/catmap_{}.csv'.format(HOW), header=None, index_col=[0], squeeze=True
    ).to_dict()

df = (pd.read_csv('yelp_dataset/businesses_withCluster_{}.csv'.format(HOW))
        .drop('business_id', 1)
        .dropna(subset=[CAT_LABEL]))
df['rating'] = MinMaxScaler((1, 10)).fit_transform(df['rating'].values[:, None])

df.head()

Unnamed: 0,Alcohol,BikeParking,BusinessAcceptsCreditCards,Caters,GoodForKids,HasTV,NoiseLevel,OutdoorSeating,RestaurantsAttire,RestaurantsDelivery,...,RestaurantsPriceRange2,RestaurantsReservations,RestaurantsTableService,RestaurantsTakeOut,WheelchairAccessible,WiFi,categories,elevation,rating,cat152_label
0,,0.0,1.0,,1.0,1.0,average,0.0,casual,0.0,...,2.0,1.0,,1.0,,,"Tours, Breweries, Pizza, Restaurants, Food, Ho...",1076.0,6.335367,29.0
1,none,0.0,1.0,1.0,1.0,0.0,,1.0,casual,0.0,...,2.0,0.0,0.0,1.0,1.0,no,"Chicken Wings, Burgers, Caterers, Street Vendo...",992.0,6.203374,15.0
2,beer_and_wine,1.0,0.0,0.0,1.0,1.0,average,0.0,casual,0.0,...,2.0,1.0,1.0,0.0,,free,"Breakfast & Brunch, Restaurants, French, Sandw...",67.0,6.183439,13.0
3,,1.0,1.0,0.0,,,,1.0,,,...,1.0,,,1.0,1.0,free,"Coffee & Tea, Food",462.0,6.541526,5.0
4,,,,,,,,,,,...,1.0,,,,,,"Food, Bakeries",113.0,7.02435,9.0


In [66]:
m = (df[CAT_LABEL].eq(catmap[most_similar_business_category]) 
     & df['RestaurantsPriceRange2']
            .sub(business_preferences['RestaurantsPriceRange2'])
            .abs()
            .lt(2))
all_businesses_in_category = df[m]

# consider only ratings that are greater than the predicted business rating
all_better_businesses_in_category = all_businesses_in_category[
    all_businesses_in_category['rating'] > business_predicted_rating]

# Take top 10 percent of the ratings, or 20 businesses, whichever is higher.
k = max(20, int(len(all_better_businesses_in_category) * 0.1)) 

# best_k = all_better_businesses_in_category.nlargest(k, 'rating', keep='first')
# print("Cluster Size:", len(best_k))
best_k = (all_better_businesses_in_category
            .sort_values('rating', ascending=False).reset_index(drop=True))
best_k.head()

Unnamed: 0,Alcohol,BikeParking,BusinessAcceptsCreditCards,Caters,GoodForKids,HasTV,NoiseLevel,OutdoorSeating,RestaurantsAttire,RestaurantsDelivery,...,RestaurantsPriceRange2,RestaurantsReservations,RestaurantsTableService,RestaurantsTakeOut,WheelchairAccessible,WiFi,categories,elevation,rating,cat152_label
0,,,,,1.0,,,,,1.0,...,1.0,0.0,,1.0,,,"Restaurants, Pizza",195.0,9.149428,29.0
1,none,1.0,1.0,1.0,1.0,0.0,,1.0,,0.0,...,2.0,0.0,0.0,1.0,1.0,no,"Restaurants, Pizza",378.0,9.077596,29.0
2,,1.0,1.0,,1.0,0.0,average,1.0,casual,1.0,...,2.0,0.0,,1.0,,,"Pizza, Restaurants, Gluten-Free",619.0,8.584493,29.0
3,none,1.0,1.0,,1.0,0.0,quiet,0.0,casual,0.0,...,2.0,0.0,0.0,1.0,,,"Restaurants, Food, Do-It-Yourself Food, Pizza",307.0,8.58074,29.0
4,none,0.0,1.0,,1.0,0.0,quiet,0.0,casual,1.0,...,2.0,0.0,,1.0,,,"Restaurants, American (New), Pizza",157.0,8.390077,29.0


In [67]:
if not len(best_k):
    raise NotEnoughDataException("Not enough data for prediction.")

# print(best_k[['rating'] + suggested_attributes_list].head())

best_output = {}
for attribute in suggested_attributes_list:
    t = best_k.dropna(subset=[attribute])
    best_output[attribute] = (
        t.assign(rating=t.rating * 1/pd.np.arange(1, len(t)+1))
          .groupby(attribute).rating
          .sum()
          .sort_values(ascending=False)
          .index[0])

diff = {}
for key, value in best_output.items():
    if(business_preferences[key] != value):
        diff[key] = [business_preferences[key], value]

print("These are areas you can improve on: ")
improved_df = pd.DataFrame.from_dict(diff, orient='index', columns=['Current','Improved'])
print(improved_df)

These are areas you can improve on: 
                         Current Improved
BikeParking                    0        1
Caters                         0        1
GoodForKids                    0        1
HasTV                          1        0
NoiseLevel                  loud  average
OutdoorSeating                 1        0
RestaurantsAttire         formal   casual
RestaurantsGoodForGroups       0        1
RestaurantsReservations        1        0
WheelchairAccessible           0        1


In [105]:
import copy 

business_preferences2 = copy.deepcopy(business_preferences)
business_preferences2.update(improved_df['Improved'].to_dict())

model.clf.predict([preprocess_user_data(business_preferences2, business_category).tolist()])[0]

Your business is most similar to the category: Pizza (11)


9.218099