In [663]:
import pandas as pd
import boto3
from StringIO import StringIO
from sklearn.feature_extraction.text import CountVectorizer
from helper_functions import split_and_add_to_set, clean_value
from text_processing import run_sklearn_nmf, custom_tokenizer
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
pd.options.mode.chained_assignment = None
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Init s3 client

In [664]:
s3 = boto3.client('s3') 

### Load Tickets

In [665]:
tickets_csv_string = s3.get_object(Bucket='braydencleary-data', Key='feastly/cleaned/tickets.csv')['Body'].read().decode('utf-8')
tickets = pd.read_csv(StringIO(tickets_csv_string), header=0, delimiter='|')
tickets['meal_created_date'] = pd.to_datetime(tickets['meal_created_date'])
tickets['meal_date'] = pd.to_datetime(tickets['meal_date'])

### Load Meals

In [666]:
meals_csv_string = s3.get_object(Bucket='braydencleary-data', Key='feastly/cleaned/meals.csv')['Body'].read().decode('utf-8')
meals = pd.read_csv(StringIO(meals_csv_string), header=0)

### Load Cooks

In [667]:
cooks_csv_string = s3.get_object(Bucket='braydencleary-data', Key='feastly/cleaned/cooks.csv')['Body'].read().decode('utf-8')
cooks = pd.read_csv(StringIO(cooks_csv_string), header=0)

### Load Menus

In [668]:
menus_csv_string = s3.get_object(Bucket='braydencleary-data', Key='feastly/cleaned/menus.csv')['Body'].read().decode('utf-8')
menus = pd.read_csv(StringIO(menus_csv_string), header=0)

### Load Venues

In [669]:
venues_csv_string = s3.get_object(Bucket='braydencleary-data', Key='feastly/cleaned/venues.csv')['Body'].read().decode('utf-8')
venues = pd.read_csv(StringIO(venues_csv_string), header=0)

### Load Menu Dishes

In [670]:
menus_dishes_csv_string = s3.get_object(Bucket='braydencleary-data', Key='feastly/cleaned/menu_dishes.csv')['Body'].read().decode('utf-8')
menu_dishes = pd.read_csv(StringIO(menus_dishes_csv_string), header=0)

### Load Menu Course Counts

In [671]:
menu_course_counts_csv_string = s3.get_object(Bucket='braydencleary-data', Key='feastly/cleaned/menu_course_counts.csv')['Body'].read().decode('utf-8')
menu_course_counts = pd.read_csv(StringIO(menu_course_counts_csv_string), header=0, delimiter='|')

### Load Meal Addon Counts

In [672]:
meal_addon_counts_csv_string = s3.get_object(Bucket='braydencleary-data', Key='feastly/cleaned/meal_addon_counts.csv')['Body'].read().decode('utf-8')
meal_addon_counts = pd.read_csv(StringIO(meal_addon_counts_csv_string), header=0, delimiter='|')

### Load meal_inferred_types

In [673]:
meal_inferred_types_csv_string = s3.get_object(Bucket='braydencleary-data', Key='feastly/cleaned/meal_inferred_types.csv')['Body'].read().decode('utf-8')
meal_inferred_types = pd.read_csv(StringIO(meal_inferred_types_csv_string), header=0, delimiter='|')

### Load meal_is_interactive 

In [674]:
meal_is_interactive_csv_string = s3.get_object(Bucket='braydencleary-data', Key='feastly/cleaned/meal_is_interactive.csv')['Body'].read().decode('utf-8')
meal_is_interactive = pd.read_csv(StringIO(meal_is_interactive_csv_string), header=0, delimiter='|')

### Load meal_cuisine_types

In [675]:
meal_cuisine_types_csv_string = s3.get_object(Bucket='braydencleary-data', Key='feastly/cleaned/meal_cuisine_types.csv')['Body'].read().decode('utf-8')
meal_cuisine_types = pd.read_csv(StringIO(meal_cuisine_types_csv_string), header=0, delimiter='|')

### Filter meals to only include those present in tickets df

In [676]:
meals = meals[meals.id.isin(tickets['meal_id'].unique())]

### Set initial feature matrix (to be expanded on as notebook progresses)

##### Including meal_id, menu_id, percentage_of_seats_sold, sold (target), days_to_sell, and meal_date in feature matrix now but will remove later

In [677]:
X = tickets[['meal_id', 'master_menu_id', 'percentage_of_seats_sold', 'sold', 'meal_date', 'days_to_sell', 'number_of_seats', 'ticket_price']]

### Calculate listed_days feature and add to feature matrix

In [678]:
tickets['listed_days'] = (tickets['meal_date'] - tickets['meal_created_date'])
tickets['listed_days'] = tickets['listed_days'].apply(lambda listed_days: listed_days.days)
X['meal_listed_days'] = tickets['listed_days']

### Calculate meal day of week dummies and add to feature matrix

In [679]:
tickets['meal_day_of_week'] = tickets['meal_date'].apply(lambda x: x.weekday())
X['meal_is_on_monday'] = tickets['meal_day_of_week'].apply(lambda x: 1 if x == 0 else 0)
X['meal_is_on_tuesday'] = tickets['meal_day_of_week'].apply(lambda x: 1 if x == 1 else 0)
X['meal_is_on_wednesday'] = tickets['meal_day_of_week'].apply(lambda x: 1 if x == 2 else 0)
X['meal_is_on_thursday'] = tickets['meal_day_of_week'].apply(lambda x: 1 if x == 3 else 0)
X['meal_is_on_friday'] = tickets['meal_day_of_week'].apply(lambda x: 1 if x == 4 else 0)
X['meal_is_on_saturday'] = tickets['meal_day_of_week'].apply(lambda x: 1 if x == 5 else 0)
X['meal_is_on_sunday'] = tickets['meal_day_of_week'].apply(lambda x: 1 if x == 6 else 0)
X['meal_is_on_weekday'] = tickets['meal_day_of_week'].apply(lambda x: 1 if x < 5 else 0)

### Calculate cook days on platform and add to feature matrix

In [680]:
def compute_cook_days_on_platform(row):
    if (row['meal_date'] - row['cook_joined_date']).days < 0:
        return None
    else:
        return (row['meal_date'] - row['cook_joined_date']).days

In [681]:
cooks['joined_date'] = pd.to_datetime(cooks['joined_date'])
X['cook_joined_date'] = cooks['joined_date']
X['cook_days_on_platform'] = X.apply(compute_cook_days_on_platform, axis=1)
X['couldnt_compute_cook_days_on_platform'] = X['cook_days_on_platform'].apply(lambda x: 0 if x > 0 else 1)
X['cook_days_on_platform'].fillna(X['cook_days_on_platform'].mean(), inplace=True)
del X['cook_joined_date']

### Add dummies for venue style to feature matrix

In [682]:
unqiue_venue_style = set()

venues['venue_style'].apply(split_and_add_to_set, args=(unqiue_venue_style, ',', ))

for category in unqiue_venue_style:
    if len(category) > 0:
        column_name = 'is_venue_style_' + clean_value(category)
        venues[column_name] = venues['venue_style'].apply(lambda x: 1 if category in x.split(',') else 0)

In [683]:
X = pd.merge(X, meals[['id','venue_id']], left_on='meal_id', right_on='id', how='inner')
del X['id']
X = pd.merge(X, venues[['id', 'is_venue_style_pop-up-space', 'is_venue_style_apartment', 'is_venue_style_restaurant', 'is_venue_style_farm', 'is_venue_style_house', 'is_venue_style_brown-stone']], left_on='venue_id', right_on='id', how='inner')
del X['id']
del X['venue_id']


### Add menu course count columns to feature matrix

In [684]:
X = pd.merge(X, meals[['id','menu_id']], left_on='meal_id', right_on='id', how='inner')
del X['id']
X = pd.merge(X, menu_course_counts[['menu_id', 'count_of_first_courses', 'count_of_second_courses', 'count_of_third_courses', 'count_of_appetizers', 'count_of_desserts', 'count_of_small_plates', 'count_of_entrees', 'count_of_beverages']], on='menu_id', how='inner')
del X['menu_id']

### Add addon counts/info to feature matrix (this is a bit questionable becuase I only have meal_addon_counts data for about 15% of meals...definitely remove and see if performance improves without)

In [685]:
X = pd.merge(X, meal_addon_counts[['meal_id', 'count_of_addons', 'total_price_of_addons', 'max_price_of_addons', 'min_price_of_addons']], on='meal_id', how='left')
X['missing_count_of_addons'] = X['count_of_addons'].apply(lambda x: 0 if x > 0 else 1)
X['count_of_addons'].fillna(X['count_of_addons'].mean(), inplace=True)
X['missing_total_price_of_addons'] = X['total_price_of_addons'].apply(lambda x: 0 if x > 0 else 1)
X['total_price_of_addons'].fillna(X['total_price_of_addons'].mean(), inplace=True)
X['missing_max_price_of_addons'] = X['max_price_of_addons'].apply(lambda x: 0 if x > 0 else 1)
X['max_price_of_addons'].fillna(X['max_price_of_addons'].mean(), inplace=True)
X['missing_min_price_of_addons'] = X['min_price_of_addons'].apply(lambda x: 0 if x > 0 else 1)
X['min_price_of_addons'].fillna(X['min_price_of_addons'].mean(), inplace=True)

### Add cooking experience years to feature matrix

In [686]:
cooks['cooking_experience_years'].fillna('missing', inplace=True)

unique_cooking_experience_years_values = set()

cooks['cooking_experience_years'].apply(split_and_add_to_set, args=(unique_cooking_experience_years_values, ',', ))

for category in unique_cooking_experience_years_values:
    if len(category) > 0:
        column_name = 'is_cooking_experience_years_' + clean_value(category)
        cooks[column_name] = cooks['cooking_experience_years'].apply(lambda x: 1 if category in x.split(',') else 0)

In [687]:
X = pd.merge(X, meals[['id','cook_id']], left_on='meal_id', right_on='id', how='inner')
del X['id']
X = pd.merge(X, cooks[['id', 'is_cooking_experience_years_8+', 'is_cooking_experience_years_1-3', 'is_cooking_experience_years_4-7', 'is_cooking_experience_years_0', 'is_cooking_experience_years_missing']], left_on='cook_id', right_on='id', how='inner')
del X['id']
del X['cook_id']

### Add cook referrer dummies to feature matrix

In [688]:
cooks['referrer'].fillna('missing', inplace=True)

unique_referrer_values = set()

cooks['referrer'].apply(split_and_add_to_set, args=(unique_referrer_values, ',', ))

for category in unique_referrer_values:
    if len(category) > 0:
        column_name = 'is_cook_referrer_' + clean_value(category)
        cooks[column_name] = cooks['referrer'].apply(lambda x: 1 if category in x.split(',') else 0)

In [689]:
X = pd.merge(X, meals[['id','cook_id']], left_on='meal_id', right_on='id', how='inner')
del X['id']
X = pd.merge(X, cooks[['id', 'is_cook_referrer_google', 'is_cook_referrer_missing', 'is_cook_referrer_job', 'is_cook_referrer_other', 'is_cook_referrer_social', 'is_cook_referrer_news', 'is_cook_referrer_referral', 'is_cook_referrer_meal', 'is_cook_referrer_friend']], left_on='cook_id', right_on='id', how='inner')
del X['id']
del X['cook_id']

### Adding cooking experience (qualitiative) dummies to feature matrix

In [690]:
cooks['cooking_experience'].fillna('missing', inplace=True)

unique_cooking_experience_values = set()

cooks['cooking_experience'].apply(split_and_add_to_set, args=(unique_cooking_experience_values, ' or ', ))

for category in unique_cooking_experience_values:
    if len(category) > 0:
        column_name = 'is_cooking_experience_' + clean_value(category)
        cooks[column_name] = cooks['cooking_experience'].apply(lambda x: 1 if category in x.split(',') else 0)

In [691]:
X = pd.merge(X, meals[['id','cook_id']], left_on='meal_id', right_on='id', how='inner')
del X['id']
X = pd.merge(X, cooks[['id', 'is_cooking_experience_current-chef', 'is_cooking_experience_avid', 'is_cooking_experience_former-chef', 'is_cooking_experience_missing', 'is_cooking_experience_home_cook', 'is_cooking_experience_ownerf', 'is_cooking_experience_novice', 'is_cooking_experience_private', 'is_cooking_experience_entrepreneur', 'is_cooking_experience_caterer', 'is_cooking_experience_chef_de_partie', 'is_cooking_experience_personal_chef', 'is_cooking_experience_chef_de_cuisine', 'is_cooking_experience_chef-in-traning', 'is_cooking_experience_commis', 'is_cooking_experience_sous_chef']], left_on='cook_id', right_on='id', how='inner')
del X['id']
del X['cook_id']

### Add cooks cooking reason dummies to feature matrix

In [692]:
cooks['is_cooking_reason_meet'] = cooks['is_reason_meet']
cooks['is_cooking_reason_brand'] = cooks['is_reason_brand']
cooks['is_cooking_reason_money'] = cooks['is_reason_money']

In [693]:
X = pd.merge(X, meals[['id','cook_id']], left_on='meal_id', right_on='id', how='inner')
del X['id']
X = pd.merge(X, cooks[['id', 'is_cooking_reason_meet', 'is_cooking_reason_brand', 'is_cooking_reason_money']], left_on='cook_id', right_on='id', how='inner')
del X['id']
del X['cook_id']

### Add menu_style dummies to feature matrix

In [694]:
unique_menu_style_values = set()

menus['menu_style'].apply(split_and_add_to_set, args=(unique_menu_style_values, ','))

for category in unique_menu_style_values:
    if len(category) > 0:
        column_name = 'is_menu_style_' + clean_value(category)
        menus[column_name] = menus['menu_style'].apply(lambda x: 1 if category in x.split(',') else 0)

In [695]:
X = pd.merge(X, meals[['id','menu_id']], left_on='meal_id', right_on='id', how='inner')
del X['id']
X = pd.merge(X, menus[['id', 'is_menu_style_not-defined', 'is_menu_style_finedining-elegant', 'is_menu_style_semicasual-upscale', 'is_menu_style_casual-homestyle']], left_on='menu_id', right_on='id', how='inner')
del X['id']
del X['menu_id']

### Add is_interactive dummy to feature matrix

In [696]:
X = pd.merge(X, meal_is_interactive[['meal_id','is_interactive']], on='meal_id', how='inner')

### Add inferred_type meal type to feature matrix (breakfast, brunch, lunch, dinner)

In [697]:
X = pd.merge(X, meal_inferred_types[['meal_id', 'is_inferred_by_text_columns_breakfast', 'is_inferred_by_text_columns_brunch', 'is_inferred_by_text_columns_lunch', 'is_inferred_by_text_columns_dinner']], on='meal_id', how='inner')

### Filter meals from feature matrix that have no cuisine type (134 out of 4800)

In [698]:
meals['cuisine_type'].fillna('', inplace=True)
meals_with_cuisine_type = meals[meals['cuisine_type'].apply(lambda x: len(x) > 0)]['id']
X = X[X['meal_id'].isin(meals_with_cuisine_type)]

### Add dummies for meal_cuisine_types in feature_matrix

##### First level of cuisine type

In [699]:
meal_cuisine_types['ct1'].fillna('', inplace=True)

unique_meal_ct1_values = set()

meal_cuisine_types['ct1'].apply(split_and_add_to_set, args=(unique_meal_ct1_values, ','))

for category in unique_meal_ct1_values:
    if len(category) > 0:
        column_name = 'is_ct1_' + clean_value(category)
        meal_cuisine_types[column_name] = meal_cuisine_types['ct1'].apply(lambda x: 1 if category in x.split(',') else 0)

###### Second level of cuisine type

In [700]:
meal_cuisine_types['ct2'].fillna('', inplace=True)

unique_meal_ct2_values = set()

meal_cuisine_types['ct2'].apply(split_and_add_to_set, args=(unique_meal_ct2_values, ','))

for category in unique_meal_ct2_values:
    if len(category) > 0:
        column_name = 'is_ct2_' + clean_value(category)
        meal_cuisine_types[column_name] = meal_cuisine_types['ct2'].apply(lambda x: 1 if category in x.split(',') else 0)

###### Third level of cuisine type

In [701]:
meal_cuisine_types['ct3'].fillna('', inplace=True)

unique_meal_ct3_values = set()

meal_cuisine_types['ct3'].apply(split_and_add_to_set, args=(unique_meal_ct3_values, ','))

for category in unique_meal_ct3_values:
    if len(category) > 0:
        column_name = 'is_ct3_' + clean_value(category)
        meal_cuisine_types[column_name] = meal_cuisine_types['ct3'].apply(lambda x: 1 if category in x.split(',') else 0)

In [702]:
X = pd.merge(X, meal_cuisine_types[['meal_id', 'is_ct1_portuguese', 'is_ct1_irish', 'is_ct1_mexican', 'is_ct1_chinese', 'is_ct1_german', 'is_ct1_chamorro', 'is_ct1_central_american', 'is_ct1_mediterranean', 'is_ct1_japanese', 'is_ct1_singaporean', 'is_ct1_desserts_bakeries', 'is_ct1_ecuadorian', 'is_ct1_persian', 'is_ct1_asian', 'is_ct1_latin_american', 'is_ct1_spanish', 'is_ct1_ice_cream_gelato', 'is_ct1_barbecue', 'is_ct1_cafe', 'is_ct1_brunch', 'is_ct1_paleo', 'is_ct1_caribbean', 'is_ct1_argentinian', 'is_ct1_vietnamese', 'is_ct1_tapas_small_plates', 'is_ct1_american', 'is_ct1_nordic', 'is_ct1_south_american', 'is_ct1_gastropub_food', 'is_ct1_peruvian', 'is_ct1_indian', 'is_ct1_guatemalan', 'is_ct1_brazilian', 'is_ct1_korean', 'is_ct1_health_food', 'is_ct1_european', 'is_ct1_indonesian', 'is_ct1_lao', 'is_ct1_hawaiian', 'is_ct1_jewish', 'is_ct1_african', 'is_ct1_middle_eastern', 'is_ct1_french', 'is_ct1_asian_noodle_soup', 'is_ct1_vegan', 'is_ct1_russian', 'is_ct1_thai', 'is_ct1_australian', 'is_ct1_other', 'is_ct1_balkan', 'is_ct1_cuban', 'is_ct1_filipino', 'is_ct1_east_european', 'is_ct1_seafood', 'is_ct1_turkish', 'is_ct1_malaysian', 'is_ct1_british', 'is_ct1_salvadorian', 'is_ct1_north_african', 'is_ct1_greek', 'is_ct1_burmese', 'is_ct1_hispanic', 'is_ct1_pizza', 'is_ct1_cajun_creole', 'is_ct1_north_american', 'is_ct1_californian', 'is_ct1_vegetarian', 'is_ct1_soul_food', 'is_ct1_italian', 'is_ct2_portuguese', 'is_ct2_irish', 'is_ct2_cajun_creole', 'is_ct2_chinese', 'is_ct2_german', 'is_ct2_hispanic', 'is_ct2_central_american', 'is_ct2_mediterranean', 'is_ct2_japanese', 'is_ct2_singaporean', 'is_ct2_spanish', 'is_ct2_ecuadorian', 'is_ct2_pizza', 'is_ct2_persian', 'is_ct2_asian', 'is_ct2_latin_american', 'is_ct2_barbecue', 'is_ct2_ice_cream_gelato', 'is_ct2_mexican', 'is_ct2_cafe', 'is_ct2_thai', 'is_ct2_caribbean', 'is_ct2_turkish', 'is_ct2_tapas_small_plates', 'is_ct2_burmese', 'is_ct2_desserts_bakeries', 'is_ct2_south_american', 'is_ct2_gastropub_food', 'is_ct2_peruvian', 'is_ct2_indian', 'is_ct2_korean', 'is_ct2_colombian', 'is_ct2_european', 'is_ct2_british', 'is_ct2_indonesian', 'is_ct2_balkan', 'is_ct2_srilankan', 'is_ct2_hawaiian', 'is_ct2_jewish', 'is_ct2_taiwanese', 'is_ct2_african', 'is_ct2_middle_eastern', 'is_ct2_vegan', 'is_ct2_asian_noodle_soup', 'is_ct2_seafood', 'is_ct2_french', 'is_ct2_polynesian', 'is_ct2_russian', 'is_ct2_brunch', 'is_ct2_australian', 'is_ct2_cuban', 'is_ct2_filipino', 'is_ct2_vegetarian', 'is_ct2_vietnamese', 'is_ct2_malaysian', 'is_ct2_lao', 'is_ct2_health_food', 'is_ct2_north_african', 'is_ct2_greek', 'is_ct2_american', 'is_ct2_east_european', 'is_ct2_nordic', 'is_ct2_north_american', 'is_ct2_italian', 'is_ct2_other', 'is_ct2_soul_food', 'is_ct2_californian', 'is_ct3_portuguese', 'is_ct3_mexican', 'is_ct3_chinese', 'is_ct3_thai', 'is_ct3_peruvian', 'is_ct3_central_american', 'is_ct3_mediterranean', 'is_ct3_japanese', 'is_ct3_singaporean', 'is_ct3_persian', 'is_ct3_asian', 'is_ct3_latin_american', 'is_ct3_spanish', 'is_ct3_ice_cream_gelato', 'is_ct3_barbecue', 'is_ct3_south_american', 'is_ct3_caribbean', 'is_ct3_vietnamese', 'is_ct3_tapas_small_plates', 'is_ct3_desserts_bakeries', 'is_ct3_gastropub_food', 'is_ct3_other', 'is_ct3_korean', 'is_ct3_health_food', 'is_ct3_european', 'is_ct3_indonesian', 'is_ct3_indian', 'is_ct3_hawaiian', 'is_ct3_jewish', 'is_ct3_taiwanese', 'is_ct3_north_american', 'is_ct3_middle_eastern', 'is_ct3_french', 'is_ct3_asian_noodle_soup', 'is_ct3_vegan', 'is_ct3_german', 'is_ct3_brunch', 'is_ct3_cuban', 'is_ct3_filipino', 'is_ct3_vegetarian', 'is_ct3_turkish', 'is_ct3_malaysian', 'is_ct3_north_african', 'is_ct3_hispanic', 'is_ct3_american', 'is_ct3_pizza', 'is_ct3_african', 'is_ct3_californian', 'is_ct3_seafood', 'is_ct3_soul_food', 'is_ct3_italian']], on='meal_id', how='inner')

### Add features from about column on menus

In [703]:
menus['about'].fillna('', inplace=True)
menus_to_analyze = menus[menus['id'].isin(X['master_menu_id'])]

In [704]:
menu_about_vectorizer = CountVectorizer(tokenizer=custom_tokenizer, ngram_range=(1,1))

In [705]:
menu_about_vect = menu_about_vectorizer.fit_transform(menus_to_analyze['about'])
menu_about_feature_names = menu_about_vectorizer.get_feature_names()
menu_about_latent_features = run_sklearn_nmf(menu_about_vect, menu_about_feature_names, 7)

components: [[  7.66159001e-05   0.00000000e+00   0.00000000e+00 ...,   1.95350974e-02
    1.57127431e-03   5.33907334e-03]
 [  3.33656966e-04   0.00000000e+00   0.00000000e+00 ...,   6.09958021e-06
    0.00000000e+00   9.07189089e-05]
 [  0.00000000e+00   6.91203507e-04   5.97519737e-03 ...,   0.00000000e+00
    2.37662200e-02   5.87578423e-03]
 ..., 
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  5.98406767e-04   3.29990779e-03   0.00000000e+00 ...,   0.00000000e+00
    7.25793541e-04   4.58818950e-03]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]]
(7, 9770)
[u'chinese', u'rice', u'menu', u'table', u'taste', u'come', u'experience', u'cuisine', u'flavor', u'dish']
[u'kamayan', u'local', u'food', u'school', u'philippine', u'gather', u'explore', u'salo', u'ingredient', u'filipino']
[u'new', u'people', u'love', u'wine', u'share', u'come', u'join', u'good'

In [706]:
labels_for_menu_about_latent_features = ['menu_about_related_to_chinese_food', 'menu_about_related_to_philippino_food', 'menu_about_related_to_street_food', 'menu_about_related_to_a_celebration', 'menu_about_related_to_jewish_food', 'menu_about_related_to_sushi', 'menu_about_related_to_italian_food']

In [707]:
menu_about_latent_features_with_menu_ids = np.insert(menu_about_latent_features, 0, np.array([menus_to_analyze.id.values]), axis=1)

In [708]:
menu_about_latent_features_column_names = ['menu_id'] + labels_for_menu_about_latent_features

In [709]:
menu_about_latent_features = pd.DataFrame(menu_about_latent_features_with_menu_ids, columns=menu_about_latent_features_column_names)

In [710]:
X = pd.merge(X, menu_about_latent_features, left_on='master_menu_id', right_on='menu_id', how='left')
del X['menu_id']

### Add features from menu dishes description column

In [711]:
menu_dishes['about'].fillna('', inplace=True)
menu_dishes_to_analyze = menu_dishes[menu_dishes['menu_id'].isin(X['master_menu_id'])]

In [712]:
menu_dishes_about_vectorizer = CountVectorizer(tokenizer=custom_tokenizer, ngram_range=(1,1))

In [713]:
menu_dishes_about_vect = menu_dishes_about_vectorizer.fit_transform(menu_dishes_to_analyze['about'])
menu_dishes_about_feature_names = menu_dishes_about_vectorizer.get_feature_names()
menu_dishes_about_latent_features = run_sklearn_nmf(menu_dishes_about_vect, menu_dishes_about_feature_names, 10)

components: [[  0.00000000e+00   0.00000000e+00   0.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   2.13047831e-06 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00 ...,   2.74124841e-03
    0.00000000e+00   3.13897499e-03]
 ..., 
 [  2.72872073e-03   0.00000000e+00   0.00000000e+00 ...,   0.00000000e+00
    2.69820192e-02   0.00000000e+00]
 [  0.00000000e+00   6.78038115e-04   1.46118969e-04 ...,   0.00000000e+00
    2.26177905e-02   1.13072004e-03]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00 ...,   0.00000000e+00
    4.17221492e-04   0.00000000e+00]]
(10, 10706)
[u'olive', u'ginger', u'roasted', u'oil', u'red', u'pepper', u'tomato', u'green', u'onion', u'garlic']
[u'vinegar', u'sugar', u'dessert', u'spicy', u'curry', u'toast', u'cake', u'sweet', u'milk', u'coconut']
[u'good', u'cheese', u'home', u'learn', u'dinner', u'house', u'use', u'roll', u'sushi', 

In [714]:
labels_for_menu_dishes_about_latent_features = ['menu_dishes_about_related_to_salad_ingredients', 'menu_dishes_about_related_to_curry_ingredients', 'menu_dishes_about_related_to_sushi_making', 'menu_dishes_about_related_to_bread_drinks', 'menu_dishes_about_related_to_beefy_potato', 'menu_dishes_about_related_to_stir_fry', 'menu_dishes_related_to_dessert', 'menu_dishes_about_related_to_cake_noodles', 'menu_dishes_about_related_to_organic_ingredients', 'menu_dishes_about_related_to_marinated_stir_fry']

In [715]:
menu_dishes_about_latent_features_with_menu_ids = np.insert(menu_dishes_about_latent_features, 0, np.array([menu_dishes_to_analyze.menu_id.values]), axis=1)


In [716]:
menu_dishes_about_latent_features_column_names = ['menu_id'] + labels_for_menu_dishes_about_latent_features

In [717]:
menu_dishes_about_latent_features = pd.DataFrame(menu_dishes_about_latent_features_with_menu_ids, columns=menu_dishes_about_latent_features_column_names)

In [718]:
at_menu_id_granularity = menu_dishes_about_latent_features.groupby('menu_id').sum().reset_index()

In [719]:
X = pd.merge(X, at_menu_id_granularity, left_on='master_menu_id', right_on='menu_id', how='left')
del X['menu_id']

In [730]:
# X = X[(X.ticket_price > 10) & (X.ticket_price < 100)]
price_ceiling_to_ignore = 100
price_floor_to_ignore = 10
meal_ids_to_ignore_because_price_outlier = X[(X.ticket_price < price_floor_to_ignore) | (X.ticket_price > price_ceiling_to_ignore)].meal_id.unique()
X = X[~X['meal_id'].isin(meal_ids_to_ignore_because_price_outlier)]

### Model Time!

In [731]:
y = X.sold
del X['sold']

In [732]:
X_train, X_validate, y_train, y_validate = train_test_split(X.values, y)

In [733]:
removing_features_train = X_train[:, :5]
removing_features_validate = X_validate[:, :5]
X_train = np.delete(X_train, [0, 1, 2, 3, 4], axis=1)
X_validate = np.delete(X_validate, [0, 1, 2, 3, 4], axis=1)

In [734]:
clf = RandomForestClassifier(n_estimators = 100, max_depth = 30)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=30, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [735]:
sum(abs(removing_features_validate[:, 2] - clf.predict_proba(X_validate)[:, 1]))

1081.239454779333

In [736]:
sorted_features = reversed(sorted([(index, x) for index, x in enumerate(clf.feature_importances_)], key=lambda x: x[1]))

In [737]:
for feature in sorted_features:
    importance = feature[1]
    print feature[0]
    print X.columns[feature[0] + 5], importance

1
ticket_price 0.140438363591
2
meal_listed_days 0.0781022098788
0
number_of_seats 0.0622712640131
279
menu_dishes_about_related_to_organic_ingredients 0.0270403951013
275
menu_dishes_about_related_to_beefy_potato 0.0250052294362
273
menu_dishes_about_related_to_sushi_making 0.0242625399691
280
menu_dishes_about_related_to_marinated_stir_fry 0.0237618147227
11
cook_days_on_platform 0.0233656449668
271
menu_dishes_about_related_to_salad_ingredients 0.0232073710957
272
menu_dishes_about_related_to_curry_ingredients 0.0228752146833
267
menu_about_related_to_a_celebration 0.022585845932
277
menu_dishes_related_to_dessert 0.022250856196
269
menu_about_related_to_sushi 0.0221581780892
276
menu_dishes_about_related_to_stir_fry 0.0221138796597
274
menu_dishes_about_related_to_bread_drinks 0.0216220352084
278
menu_dishes_about_related_to_cake_noodles 0.0213750232531
264
menu_about_related_to_chinese_food 0.0207802211695
266
menu_about_related_to_street_food 0.0190782639252
265
menu_about_relate

In [738]:
model = LogisticRegression()

In [739]:
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [740]:
# sum(abs(percentage_seats_sold_validate - model.predict_proba(X_validate)[:, 1]))

In [741]:
def predict_optimal_price(row_in_validation_set_to_predict):
    price_probabilities = []
    for price in range(1, 100):
        something = np.array(X_validate[row_in_validation_set_to_predict])
        something[1] = price
        price_probabilities.append((price, clf.predict_proba(np.array([something]))[0][1]))
    return sorted(price_probabilities, key=lambda x: x[1])[-1][0]

In [750]:
def predict_optimal_price_by_maximizing_rev(row_in_validation_set_to_predict):
    price_probabilities = []
    for price in range(1, 100):
        something = np.array(X_validate[row_in_validation_set_to_predict])
        something[1] = price
        price_probabilities.append((price, (float(price) * (clf.predict_proba(np.array([something]))[0][1] * X_validate[row_in_validation_set_to_predict][0]))))
    return sorted(price_probabilities, key=lambda x: x[1])[-1][0]

In [752]:
for i in range(20602):
    print i, predict_optimal_price(i), predict_optimal_price_by_maximizing_rev(i)

 0 75 99
1 11 99
2 56 99
3 74 99
4 35 99
5 23 99
6 24 99
7 33 99
8 63 99
9 57 99
10 45 99
11 22 99
12 79 99
13 47 99
14 11 88
15 36 99
16 27 99
17 52 99
18 42 99
19 21 99
20 20 99
21 40 99
22 49 99
23 61 99
24 33 99
25 11 99
26 90 99
27 21 99
28 56 99
29 23 99
30 51 99
31 67 99
32 57 99
33 44 99
34 24 99
35 12 99
36 84 99
37 55 99
38 50 99
39 57 99
40 52 99
41 11 99
42 75 99
43 10 99
44 55 99
45 45 99
46 57 99
47 90 99
48 80 99
49 63 99
50 24 99
51 38 99
52 82 99
53 40 99
54 31 99
55 23 99
56 56 99
57 23 99
58 59 99
59 65 99
60 39 99
61

KeyboardInterrupt: 

In [746]:
# 0 32
# 1 82
# 2 81
# 3 11
# 4 82
# 5 37
# 6 40
removing_features_validate[1]
X_validate[1]




array([30, 82.5, 26, 0, 0, 0, 0, 0, 1, 0, 0, 892.0418871252205, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 2, 1, 1, 1.8540423839554572,
       31.69749332424303, 22.5200886313275, 14.635026418953453, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0.07411615692729151, 0.0

In [747]:
X

Unnamed: 0,meal_id,master_menu_id,percentage_of_seats_sold,meal_date,days_to_sell,number_of_seats,ticket_price,meal_listed_days,meal_is_on_monday,meal_is_on_tuesday,...,menu_dishes_about_related_to_salad_ingredients,menu_dishes_about_related_to_curry_ingredients,menu_dishes_about_related_to_sushi_making,menu_dishes_about_related_to_bread_drinks,menu_dishes_about_related_to_beefy_potato,menu_dishes_about_related_to_stir_fry,menu_dishes_related_to_dessert,menu_dishes_about_related_to_cake_noodles,menu_dishes_about_related_to_organic_ingredients,menu_dishes_about_related_to_marinated_stir_fry
0,11191,8361,0.440000,2017-10-10,1.0,25,46.2,12,0,1,...,0.098724,0.203599,0.156903,0.036212,0.155680,0.173484,0.000000,0.429644,0.019230,0.392627
1,11191,8361,0.440000,2017-10-10,1.0,25,46.2,12,0,1,...,0.098724,0.203599,0.156903,0.036212,0.155680,0.173484,0.000000,0.429644,0.019230,0.392627
2,11191,8361,0.440000,2017-10-10,8.0,25,46.2,12,0,1,...,0.098724,0.203599,0.156903,0.036212,0.155680,0.173484,0.000000,0.429644,0.019230,0.392627
3,11191,8361,0.440000,2017-10-10,8.0,25,46.2,12,0,1,...,0.098724,0.203599,0.156903,0.036212,0.155680,0.173484,0.000000,0.429644,0.019230,0.392627
4,11191,8361,0.440000,2017-10-10,8.0,25,46.2,12,0,1,...,0.098724,0.203599,0.156903,0.036212,0.155680,0.173484,0.000000,0.429644,0.019230,0.392627
5,11191,8361,0.440000,2017-10-10,8.0,25,46.2,12,0,1,...,0.098724,0.203599,0.156903,0.036212,0.155680,0.173484,0.000000,0.429644,0.019230,0.392627
6,11191,8361,0.440000,2017-10-10,8.0,25,46.2,12,0,1,...,0.098724,0.203599,0.156903,0.036212,0.155680,0.173484,0.000000,0.429644,0.019230,0.392627
7,11191,8361,0.440000,2017-10-10,9.0,25,46.2,12,0,1,...,0.098724,0.203599,0.156903,0.036212,0.155680,0.173484,0.000000,0.429644,0.019230,0.392627
8,11191,8361,0.440000,2017-10-10,9.0,25,46.2,12,0,1,...,0.098724,0.203599,0.156903,0.036212,0.155680,0.173484,0.000000,0.429644,0.019230,0.392627
9,11191,8361,0.440000,2017-10-10,11.0,25,46.2,12,0,1,...,0.098724,0.203599,0.156903,0.036212,0.155680,0.173484,0.000000,0.429644,0.019230,0.392627
