In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import scipy.stats as scs
import numpy as np
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

In [2]:
tickets = pd.read_csv('./tickets.csv', delimiter='|')

In [3]:
meal_price_outliers = list(tickets[(tickets.ticket_price < 5) | (tickets.ticket_price > 200)].meal_id)

def is_meal_with_crazy_ticket_price(row):
    if row['meal_id'] in meal_price_outliers:
        return False
    else:
        return True

tickets = tickets[tickets.apply(is_meal_with_crazy_ticket_price, axis=1)]

In [4]:
X = tickets[['percentage_of_seats_sold', 'meal_id','number_of_seats', 'ticket_price', 'sold']]

In [5]:
meals = pd.read_csv('./cleaned/meals.csv', header=None, names=['Id',
 'Cook Id', 'Venue Id', 'Menu Id', 'Is Cancelled', 'Is Active', 'Title', 'Meal Date', 'Day of Date Added', 'Is Public?', 'Number Of Seats', 'Venue Style', 'Venue Capacity', 'Area Id', 'Menu Style', 'Cuisine Type', 'Meal Categories', 'is_category_kosher', 'is_category_paleo', 'is_category_organic', 'is_category_vegetarian', 'is_category_vegan', 'is_category_gluten_free', 'is_category_raw', 'is_category_halal', 'is_category_local', 'is_cuisine_type_portuguese', 'is_cuisine_type_irish', 'is_cuisine_type_cajun_creole', 'is_cuisine_type_polynesian', 'is_cuisine_type_chinese', 'is_cuisine_type_peruvian', 'is_cuisine_type_chamorro', 'is_cuisine_type_belgian', 'is_cuisine_type_central_american', 'is_cuisine_type_mediterranean', 'is_cuisine_type_japanese', 'is_cuisine_type_mexican', 'is_cuisine_type_singaporean', 'is_cuisine_type_ecuadorian', 'is_cuisine_type_persian', 'is_cuisine_type_lao', 'is_cuisine_type_asian', 'is_cuisine_type_latin_american', 'is_cuisine_type_spanish', 'is_cuisine_type_ice_cream_gelato', 'is_cuisine_type_barbecue', 'is_cuisine_type_cafe', 'is_cuisine_type_thai', 'is_cuisine_type_paleo', 'is_cuisine_type_caribbean', 'is_cuisine_type_health_food', 'is_cuisine_type_argentinian', 'is_cuisine_type_hispanic', 'is_cuisine_type_tapas_small_plates', 'is_cuisine_type_european', 'is_cuisine_type_desserts_bakeries', 'is_cuisine_type_south_american', 'is_cuisine_type_gastropub_food', 'is_cuisine_type_other', 'is_cuisine_type_guatemalan', 'is_cuisine_type_brazilian', 'is_cuisine_type_korean', 'is_cuisine_type_salvadorian', 'is_cuisine_type_pizza', 'is_cuisine_type_indonesian', 'is_cuisine_type_balkan', 'is_cuisine_type_srilankan', 'is_cuisine_type_indian', 'is_cuisine_type_hawaiian', 'is_cuisine_type_jewish', 'is_cuisine_type_taiwanese', 'is_cuisine_type_african', 'is_cuisine_type_middle_eastern', 'is_cuisine_type_french', 'is_cuisine_type_asian_noodle_soup', 'is_cuisine_type_vegan', 'is_cuisine_type_german', 'is_cuisine_type_russian', 'is_cuisine_type_vietnamese', 'is_cuisine_type_brunch', 'is_cuisine_type_australian', 'is_cuisine_type_cuban', 'is_cuisine_type_filipino', 'is_cuisine_type_vegetarian', 'is_cuisine_type_turkish', 'is_cuisine_type_malaysian', 'is_cuisine_type_british', 'is_cuisine_type_colombian', 'is_cuisine_type_north_african', 'is_cuisine_type_greek', 'is_cuisine_type_burmese', 'is_cuisine_type_east_european', 'is_cuisine_type_nordic', 'is_cuisine_type_north_american', 'is_cuisine_type_american', 'is_cuisine_type_italian', 'is_cuisine_type_seafood', 'is_cuisine_type_soul_food', 'is_cuisine_type_californian'])

In [6]:
meals['meal_date'] = pd.to_datetime(meals['Meal Date'])
meals['created_date'] = pd.to_datetime(meals['Day of Date Added'])
meals['meal_year'] = meals['meal_date'].apply(lambda x: x.year)
meals['meal_month'] = meals['meal_date'].apply(lambda x: x.month)
meals['meal_day_of_week'] = meals['meal_date'].apply(lambda x: x.weekday())
meals['meal_is_on_weekday'] = meals['meal_day_of_week'].apply(lambda x: 1 if x < 5 else 0)
meals['listed_days'] = (meals['meal_date'] - meals['created_date']).apply(lambda x: x.days if x.days > 0 else 0)

In [7]:
meals = meals[['Id', 'Venue Id', 'Menu Id', 'meal_date', 'meal_is_on_weekday', 'listed_days']]

In [8]:
menus = pd.read_csv('./cleaned/menus.csv', header=None, names=['Id', 'Cook Id', 'Title', 'About', 'Cuisine Type', 'Day of Date Added', 'Menu Style', 'Count of distinct Menu Dish Id', 'is_menus_cuisine_type_portuguese', 'is_menus_cuisine_type_irish', 'is_menus_cuisine_type_cajun_creole', 'is_menus_cuisine_type_polynesian', 'is_menus_cuisine_type_chinese', 'is_menus_cuisine_type_peruvian', 'is_menus_cuisine_type_chamorro', 'is_menus_cuisine_type_belgian', 'is_menus_cuisine_type_central_american', 'is_menus_cuisine_type_mediterranean', 'is_menus_cuisine_type_japanese', 'is_menus_cuisine_type_mexican', 'is_menus_cuisine_type_singaporean', 'is_menus_cuisine_type_ecuadorian', 'is_menus_cuisine_type_persian', 'is_menus_cuisine_type_lao', 'is_menus_cuisine_type_asian', 'is_menus_cuisine_type_latin_american', 'is_menus_cuisine_type_spanish', 'is_menus_cuisine_type_ice_cream_gelato', 'is_menus_cuisine_type_barbecue', 'is_menus_cuisine_type_cafe', 'is_menus_cuisine_type_thai', 'is_menus_cuisine_type_paleo', 'is_menus_cuisine_type_caribbean', 'is_menus_cuisine_type_health_food', 'is_menus_cuisine_type_argentinian', 'is_menus_cuisine_type_hispanic', 'is_menus_cuisine_type_tapas_small_plates', 'is_menus_cuisine_type_european', 'is_menus_cuisine_type_desserts_bakeries', 'is_menus_cuisine_type_south_american', 'is_menus_cuisine_type_gastropub_food', 'is_menus_cuisine_type_other', 'is_menus_cuisine_type_guatemalan', 'is_menus_cuisine_type_brazilian', 'is_menus_cuisine_type_korean', 'is_menus_cuisine_type_colombian', 'is_menus_cuisine_type_pizza', 'is_menus_cuisine_type_indonesian', 'is_menus_cuisine_type_balkan', 'is_menus_cuisine_type_srilankan', 'is_menus_cuisine_type_indian', 'is_menus_cuisine_type_hawaiian', 'is_menus_cuisine_type_jewish', 'is_menus_cuisine_type_taiwanese', 'is_menus_cuisine_type_african', 'is_menus_cuisine_type_middle_eastern', 'is_menus_cuisine_type_vegan', 'is_menus_cuisine_type_asian_noodle_soup', 'is_menus_cuisine_type_french', 'is_menus_cuisine_type_german', 'is_menus_cuisine_type_russian', 'is_menus_cuisine_type_vietnamese', 'is_menus_cuisine_type_brunch', 'is_menus_cuisine_type_australian', 'is_menus_cuisine_type_cuban', 'is_menus_cuisine_type_filipino', 'is_menus_cuisine_type_vegetarian', 'is_menus_cuisine_type_turkish', 'is_menus_cuisine_type_malaysian', 'is_menus_cuisine_type_british', 'is_menus_cuisine_type_salvadorian', 'is_menus_cuisine_type_north_african', 'is_menus_cuisine_type_greek', 'is_menus_cuisine_type_burmese', 'is_menus_cuisine_type_east_european', 'is_menus_cuisine_type_nordic', 'is_menus_cuisine_type_north_american', 'is_menus_cuisine_type_american', 'is_menus_cuisine_type_italian', 'is_menus_cuisine_type_seafood', 'is_menus_cuisine_type_soul_food', 'is_menus_cuisine_type_californian'])

In [9]:
menus['course_count'] = menus['Count of distinct Menu Dish Id']
menus = menus[['Id', 'Cook Id', 'is_menus_cuisine_type_portuguese', 'is_menus_cuisine_type_irish', 'is_menus_cuisine_type_cajun_creole', 'is_menus_cuisine_type_polynesian', 'is_menus_cuisine_type_chinese', 'is_menus_cuisine_type_peruvian', 'is_menus_cuisine_type_chamorro', 'is_menus_cuisine_type_belgian', 'is_menus_cuisine_type_central_american', 'is_menus_cuisine_type_mediterranean', 'is_menus_cuisine_type_japanese', 'is_menus_cuisine_type_mexican', 'is_menus_cuisine_type_singaporean', 'is_menus_cuisine_type_ecuadorian', 'is_menus_cuisine_type_persian', 'is_menus_cuisine_type_lao', 'is_menus_cuisine_type_asian', 'is_menus_cuisine_type_latin_american', 'is_menus_cuisine_type_spanish', 'is_menus_cuisine_type_ice_cream_gelato', 'is_menus_cuisine_type_barbecue', 'is_menus_cuisine_type_cafe', 'is_menus_cuisine_type_thai', 'is_menus_cuisine_type_paleo', 'is_menus_cuisine_type_caribbean', 'is_menus_cuisine_type_health_food', 'is_menus_cuisine_type_argentinian', 'is_menus_cuisine_type_hispanic', 'is_menus_cuisine_type_tapas_small_plates', 'is_menus_cuisine_type_european', 'is_menus_cuisine_type_desserts_bakeries', 'is_menus_cuisine_type_south_american', 'is_menus_cuisine_type_gastropub_food', 'is_menus_cuisine_type_other', 'is_menus_cuisine_type_guatemalan', 'is_menus_cuisine_type_brazilian', 'is_menus_cuisine_type_korean', 'is_menus_cuisine_type_colombian', 'is_menus_cuisine_type_pizza', 'is_menus_cuisine_type_indonesian', 'is_menus_cuisine_type_balkan', 'is_menus_cuisine_type_srilankan', 'is_menus_cuisine_type_indian', 'is_menus_cuisine_type_hawaiian', 'is_menus_cuisine_type_jewish', 'is_menus_cuisine_type_taiwanese', 'is_menus_cuisine_type_african', 'is_menus_cuisine_type_middle_eastern', 'is_menus_cuisine_type_vegan', 'is_menus_cuisine_type_asian_noodle_soup', 'is_menus_cuisine_type_french', 'is_menus_cuisine_type_german', 'is_menus_cuisine_type_russian', 'is_menus_cuisine_type_vietnamese', 'is_menus_cuisine_type_brunch', 'is_menus_cuisine_type_australian', 'is_menus_cuisine_type_cuban', 'is_menus_cuisine_type_filipino', 'is_menus_cuisine_type_vegetarian', 'is_menus_cuisine_type_turkish', 'is_menus_cuisine_type_malaysian', 'is_menus_cuisine_type_british', 'is_menus_cuisine_type_salvadorian', 'is_menus_cuisine_type_north_african', 'is_menus_cuisine_type_greek', 'is_menus_cuisine_type_burmese', 'is_menus_cuisine_type_east_european', 'is_menus_cuisine_type_nordic', 'is_menus_cuisine_type_north_american', 'is_menus_cuisine_type_american', 'is_menus_cuisine_type_italian', 'is_menus_cuisine_type_seafood', 'is_menus_cuisine_type_soul_food', 'is_menus_cuisine_type_californian', 'course_count']]

In [10]:
expanded_meals = meals.merge(menus, how='inner', left_on='Menu Id', right_on='Id', suffixes=['_meal', '_menu'])

In [11]:
cooks = pd.read_csv('./cleaned/cooks.csv', header=None, names=['Id', 'Cuisine Types', 'Day of Date Joined', 'Cooking Experience', 'Cooking Experience Years', 'Date of Application', 'Reasons For Cooking', 'Referrer', 'is_cooks_cuisine_type_portuguese', 'is_cooks_cuisine_type_cajun_creole', 'is_cooks_cuisine_type_chinese', 'is_cooks_cuisine_type_peruvian', 'is_cooks_cuisine_type_chamorro', 'is_cooks_cuisine_type_central_american', 'is_cooks_cuisine_type_mediterranean', 'is_cooks_cuisine_type_japanese', 'is_cooks_cuisine_type_mexican', 'is_cooks_cuisine_type_ecuadorian', 'is_cooks_cuisine_type_pizza', 'is_cooks_cuisine_type_persian', 'is_cooks_cuisine_type_asian', 'is_cooks_cuisine_type_latin_american', 'is_cooks_cuisine_type_spanish', 'is_cooks_cuisine_type_ice_cream_gelato', 'is_cooks_cuisine_type_barbecue', 'is_cooks_cuisine_type_brunch', 'is_cooks_cuisine_type_paleo', 'is_cooks_cuisine_type_caribbean', 'is_cooks_cuisine_type_argentinian', 'is_cooks_cuisine_type_vietnamese', 'is_cooks_cuisine_type_tapas_small_plates', 'is_cooks_cuisine_type_burmese', 'is_cooks_cuisine_type_desserts_bakeries', 'is_cooks_cuisine_type_south_american', 'is_cooks_cuisine_type_gastropub_food', 'is_cooks_cuisine_type_other', 'is_cooks_cuisine_type_brazilian', 'is_cooks_cuisine_type_korean', 'is_cooks_cuisine_type_colombian', 'is_cooks_cuisine_type_european', 'is_cooks_cuisine_type_indonesian', 'is_cooks_cuisine_type_lao', 'is_cooks_cuisine_type_indian', 'is_cooks_cuisine_type_hawaiian', 'is_cooks_cuisine_type_jewish', 'is_cooks_cuisine_type_taiwanese', 'is_cooks_cuisine_type_african', 'is_cooks_cuisine_type_middle_eastern', 'is_cooks_cuisine_type_french', 'is_cooks_cuisine_type_asian_noodle_soup', 'is_cooks_cuisine_type_greek', 'is_cooks_cuisine_type_vegan', 'is_cooks_cuisine_type_german', 'is_cooks_cuisine_type_russian', 'is_cooks_cuisine_type_thai', 'is_cooks_cuisine_type_australian', 'is_cooks_cuisine_type_balkan', 'is_cooks_cuisine_type_filipino', 'is_cooks_cuisine_type_vegetarian', 'is_cooks_cuisine_type_turkish', 'is_cooks_cuisine_type_malaysian', 'is_cooks_cuisine_type_british', 'is_cooks_cuisine_type_health_food', 'is_cooks_cuisine_type_north_african', 'is_cooks_cuisine_type_hispanic', 'is_cooks_cuisine_type_american', 'is_cooks_cuisine_type_east_european', 'is_cooks_cuisine_type_nordic', 'is_cooks_cuisine_type_north_american', 'is_cooks_cuisine_type_californian', 'is_cooks_cuisine_type_seafood', 'is_cooks_cuisine_type_soul_food', 'is_cooks_cuisine_type_italian', 'is_reason_meet', 'is_reason_brand', 'is_reason_money'])

In [12]:
cooks['Cooking Experience Years'].fillna('missing', inplace=True)
cooks['Cooking Experience'].fillna('missing', inplace=True)
cooks['Referrer'].fillna('missing', inplace=True)
cooks['cook_applied_date'] = pd.to_datetime(cooks['Date of Application'])
cooks['cook_joined_date'] = pd.to_datetime(cooks['Day of Date Joined'])

In [13]:
def split_and_add_to_set(column_values, unique_structure, split_character):
    for val in column_values.split(split_character):
        unique_structure.add(val)

def clean_value(val):
    return val.replace('/', '_').replace(' ', '_').lower()

In [14]:
unique_cooking_experience_years_values = set()

cooks['Cooking Experience Years'].apply(split_and_add_to_set, args=(unique_cooking_experience_years_values, ',', ))

for category in unique_cooking_experience_years_values:
    if len(category) > 0:
        column_name = 'is_cooking_experience_years_' + clean_value(category)
        cooks[column_name] = cooks['Cooking Experience Years'].apply(lambda x: 1 if category in x.split(',') else 0)

In [15]:
unique_cooking_experience_values = set()

cooks['Cooking Experience'].apply(split_and_add_to_set, args=(unique_cooking_experience_values, ' or ', ))

for category in unique_cooking_experience_values:
    if len(category) > 0:
        column_name = 'is_cooking_experience_' + clean_value(category)
        cooks[column_name] = cooks['Cooking Experience'].apply(lambda x: 1 if category in x.split(',') else 0)

In [16]:
unique_referrer_values = set()

cooks['Referrer'].apply(split_and_add_to_set, args=(unique_referrer_values, ',', ))

for category in unique_referrer_values:
    if len(category) > 0:
        column_name = 'is_cook_referrer_' + clean_value(category)
        cooks[column_name] = cooks['Referrer'].apply(lambda x: 1 if category in x.split(',') else 0)

In [17]:
cooks['cooks_missing_applied_date'] = cooks['cook_applied_date'].isnull().apply(lambda x: 1 if x else 0)

In [18]:
cooks['is_cooks_join_reason_meet'] = cooks['is_reason_meet']
cooks['is_cooks_join_reason_brand'] = cooks['is_reason_brand']
cooks['is_cooks_join_reason_money'] = cooks['is_reason_money']

In [19]:
cooks = cooks[['Id', 'cook_joined_date', 'cook_applied_date', 'cooks_missing_applied_date', 'is_cooking_experience_years_8+', 'is_cooking_experience_years_1-3', 'is_cooking_experience_years_4-7', 'is_cooking_experience_years_0', 'is_cooking_experience_years_missing', 'is_cooking_experience_current-chef', 'is_cooking_experience_avid', 'is_cooking_experience_former-chef', 'is_cooking_experience_missing', 'is_cooking_experience_home_cook', 'is_cooking_experience_ownerf', 'is_cooking_experience_novice', 'is_cooking_experience_private', 'is_cooking_experience_entrepreneur', 'is_cooking_experience_caterer', 'is_cooking_experience_chef_de_partie', 'is_cooking_experience_personal_chef', 'is_cooking_experience_chef_de_cuisine', 'is_cooking_experience_chef-in-traning', 'is_cooking_experience_commis', 'is_cooking_experience_sous_chef', 'is_cook_referrer_google', 'is_cook_referrer_missing', 'is_cook_referrer_job', 'is_cook_referrer_other', 'is_cook_referrer_social', 'is_cook_referrer_news', 'is_cook_referrer_referral', 'is_cook_referrer_meal', 'is_cook_referrer_friend', 'is_cooks_join_reason_meet', 'is_cooks_join_reason_brand', 'is_cooks_join_reason_money']]

In [20]:
expanded_meals = expanded_meals.merge(cooks, how='inner', left_on='Cook Id', right_on='Id', suffixes=['_meal2', '_cook'])

In [21]:
venues = pd.read_csv('./cleaned/venues.csv', header=None, names=['Id', 'Name', 'Venue Style', 'Location Id', 'Area Id', 'Max Seats', 'Monthly Services', 'Owner Id', 'Address', 'Day of Date Added', 'Neighborhood', 'Zipcode'])
venues['Name'].fillna('missing', inplace=True)

In [22]:
unqiue_venue_style = set()

venues['Venue Style'].apply(split_and_add_to_set, args=(unqiue_venue_style, ',', ))

for category in unqiue_venue_style:
    if len(category) > 0:
        column_name = 'is_venue_style_' + clean_value(category)
        venues[column_name] = venues['Venue Style'].apply(lambda x: 1 if category in x.split(',') else 0)

In [23]:
popular_venues = ['TheGarage', 'theNewberry', 'theLab', 'theUnion', 'theBocca', 'theTradesman', 'Foundation Cafe', 'Private Location', 'sound and savor', 'theCommons', 'The Humboldt House', 'theGreenhouse', 'Olea', 'R.T.B. At Dabba', 'Golden State Room', 'Tournant', 'Picnic on Third', 'Frances', 'theClub', 'Home', 'My Home']

In [24]:
for venue in popular_venues:
    venues['is_venue_name_' + venue] = venues['Name'].apply(lambda x: 1 if x.strip() == venue else 0)

In [25]:
venues = venues[['Id','is_venue_style_pop-up-space', 'is_venue_style_apartment', 'is_venue_style_restaurant','is_venue_style_farm', 'is_venue_style_house', 'is_venue_style_brown-stone', 'is_venue_name_TheGarage', 'is_venue_name_theNewberry', 'is_venue_name_theLab', 'is_venue_name_theUnion', 'is_venue_name_theBocca', 'is_venue_name_theTradesman', 'is_venue_name_Foundation Cafe', 'is_venue_name_Private Location', 'is_venue_name_sound and savor', 'is_venue_name_theCommons', 'is_venue_name_The Humboldt House', 'is_venue_name_theGreenhouse', 'is_venue_name_Olea', 'is_venue_name_R.T.B. At Dabba', 'is_venue_name_Golden State Room', 'is_venue_name_Tournant', 'is_venue_name_Picnic on Third', 'is_venue_name_Frances', 'is_venue_name_theClub', 'is_venue_name_Home', 'is_venue_name_My Home']]

In [26]:
expanded_meals = expanded_meals.merge(venues, how='inner', left_on='Venue Id', right_on='Id', suffixes=['_meal3', '_venue'])

In [27]:
expanded_meals = expanded_meals[['Id_meal', 'cook_joined_date', 'meal_date', 'meal_is_on_weekday', 'listed_days', 'is_menus_cuisine_type_portuguese', 'is_menus_cuisine_type_irish', 'is_menus_cuisine_type_cajun_creole', 'is_menus_cuisine_type_polynesian', 'is_menus_cuisine_type_chinese', 'is_menus_cuisine_type_peruvian', 'is_menus_cuisine_type_chamorro', 'is_menus_cuisine_type_belgian', 'is_menus_cuisine_type_central_american', 'is_menus_cuisine_type_mediterranean', 'is_menus_cuisine_type_japanese', 'is_menus_cuisine_type_mexican', 'is_menus_cuisine_type_singaporean', 'is_menus_cuisine_type_ecuadorian', 'is_menus_cuisine_type_persian', 'is_menus_cuisine_type_lao', 'is_menus_cuisine_type_asian', 'is_menus_cuisine_type_latin_american', 'is_menus_cuisine_type_spanish', 'is_menus_cuisine_type_ice_cream_gelato', 'is_menus_cuisine_type_barbecue', 'is_menus_cuisine_type_cafe', 'is_menus_cuisine_type_thai', 'is_menus_cuisine_type_paleo', 'is_menus_cuisine_type_caribbean', 'is_menus_cuisine_type_health_food', 'is_menus_cuisine_type_argentinian', 'is_menus_cuisine_type_hispanic', 'is_menus_cuisine_type_tapas_small_plates', 'is_menus_cuisine_type_european', 'is_menus_cuisine_type_desserts_bakeries', 'is_menus_cuisine_type_south_american', 'is_menus_cuisine_type_gastropub_food', 'is_menus_cuisine_type_other', 'is_menus_cuisine_type_guatemalan', 'is_menus_cuisine_type_brazilian', 'is_menus_cuisine_type_korean', 'is_menus_cuisine_type_colombian', 'is_menus_cuisine_type_pizza', 'is_menus_cuisine_type_indonesian', 'is_menus_cuisine_type_balkan', 'is_menus_cuisine_type_srilankan', 'is_menus_cuisine_type_indian', 'is_menus_cuisine_type_hawaiian', 'is_menus_cuisine_type_jewish', 'is_menus_cuisine_type_taiwanese', 'is_menus_cuisine_type_african', 'is_menus_cuisine_type_middle_eastern', 'is_menus_cuisine_type_vegan', 'is_menus_cuisine_type_asian_noodle_soup', 'is_menus_cuisine_type_french', 'is_menus_cuisine_type_german', 'is_menus_cuisine_type_russian', 'is_menus_cuisine_type_vietnamese', 'is_menus_cuisine_type_brunch', 'is_menus_cuisine_type_australian', 'is_menus_cuisine_type_cuban', 'is_menus_cuisine_type_filipino', 'is_menus_cuisine_type_vegetarian', 'is_menus_cuisine_type_turkish', 'is_menus_cuisine_type_malaysian', 'is_menus_cuisine_type_british', 'is_menus_cuisine_type_salvadorian', 'is_menus_cuisine_type_north_african', 'is_menus_cuisine_type_greek', 'is_menus_cuisine_type_burmese', 'is_menus_cuisine_type_east_european', 'is_menus_cuisine_type_nordic', 'is_menus_cuisine_type_north_american', 'is_menus_cuisine_type_american', 'is_menus_cuisine_type_italian', 'is_menus_cuisine_type_seafood', 'is_menus_cuisine_type_soul_food', 'is_menus_cuisine_type_californian', 'course_count', 'cook_applied_date', 'cooks_missing_applied_date', 'is_cooking_experience_years_8+', 'is_cooking_experience_years_1-3', 'is_cooking_experience_years_4-7', 'is_cooking_experience_years_0', 'is_cooking_experience_years_missing', 'is_cooking_experience_current-chef', 'is_cooking_experience_avid', 'is_cooking_experience_former-chef', 'is_cooking_experience_missing', 'is_cooking_experience_home_cook', 'is_cooking_experience_ownerf', 'is_cooking_experience_novice', 'is_cooking_experience_private', 'is_cooking_experience_entrepreneur', 'is_cooking_experience_caterer', 'is_cooking_experience_chef_de_partie', 'is_cooking_experience_personal_chef', 'is_cooking_experience_chef_de_cuisine', 'is_cooking_experience_chef-in-traning', 'is_cooking_experience_commis', 'is_cooking_experience_sous_chef', 'is_cook_referrer_google', 'is_cook_referrer_missing', 'is_cook_referrer_job', 'is_cook_referrer_other', 'is_cook_referrer_social', 'is_cook_referrer_news', 'is_cook_referrer_referral', 'is_cook_referrer_meal', 'is_cook_referrer_friend', 'is_cooks_join_reason_meet', 'is_cooks_join_reason_brand', 'is_cooks_join_reason_money', 'is_venue_style_pop-up-space', 'is_venue_style_apartment', 'is_venue_style_restaurant', 'is_venue_style_farm', 'is_venue_style_house', 'is_venue_style_brown-stone', 'is_venue_name_TheGarage', 'is_venue_name_theNewberry', 'is_venue_name_theLab', 'is_venue_name_theUnion', 'is_venue_name_theBocca', 'is_venue_name_theTradesman', 'is_venue_name_Foundation Cafe', 'is_venue_name_Private Location', 'is_venue_name_sound and savor', 'is_venue_name_theCommons', 'is_venue_name_The Humboldt House', 'is_venue_name_theGreenhouse', 'is_venue_name_Olea', 'is_venue_name_R.T.B. At Dabba', 'is_venue_name_Golden State Room', 'is_venue_name_Tournant', 'is_venue_name_Picnic on Third', 'is_venue_name_Frances', 'is_venue_name_theClub', 'is_venue_name_Home', 'is_venue_name_My Home']]

### Days on platform

In [28]:
def compute_cook_days_on_platform(row):
    if (row['meal_date'] - row['cook_joined_date']).days < 0:
        return None
    else:
        return (row['meal_date'] - row['cook_joined_date']).days

expanded_meals['cook_days_on_platform'] = expanded_meals.apply(compute_cook_days_on_platform, axis=1)

In [29]:
expanded_meals['couldnt_compute_cook_days_on_platform'] = expanded_meals['cook_days_on_platform'].apply(lambda x: 0 if x > 0 else 1)

In [30]:
expanded_meals['cook_days_on_platform'].fillna(int(expanded_meals['cook_days_on_platform'].median()), inplace=True)

In [31]:
X = X.merge(expanded_meals, how='inner', left_on='meal_id', right_on='Id_meal')
del X['Id_meal']
del X['meal_id']
del X['meal_date']
del X['cook_applied_date']
del X['cook_joined_date']
X['meal_is_on_weekday'].fillna(0, inplace=True)
X['is_menus_cuisine_type_portuguese'].fillna(0, inplace=True)
X['is_menus_cuisine_type_irish'].fillna(0, inplace=True)
X['is_menus_cuisine_type_cajun_creole'].fillna(0, inplace=True)
X['is_menus_cuisine_type_polynesian'].fillna(0, inplace=True)
X['is_menus_cuisine_type_chinese'].fillna(0, inplace=True)
X['is_menus_cuisine_type_peruvian'].fillna(0, inplace=True)
X['is_menus_cuisine_type_chamorro'].fillna(0, inplace=True)
X['is_menus_cuisine_type_belgian'].fillna(0, inplace=True)
X['is_menus_cuisine_type_central_american'].fillna(0, inplace=True)
X['is_menus_cuisine_type_mediterranean'].fillna(0, inplace=True)
X['is_menus_cuisine_type_japanese'].fillna(0, inplace=True)
X['is_menus_cuisine_type_mexican'].fillna(0, inplace=True)
X['is_menus_cuisine_type_singaporean'].fillna(0, inplace=True)
X['is_menus_cuisine_type_ecuadorian'].fillna(0, inplace=True)
X['is_menus_cuisine_type_persian'].fillna(0, inplace=True)
X['is_menus_cuisine_type_lao'].fillna(0, inplace=True)
X['is_menus_cuisine_type_asian'].fillna(0, inplace=True)
X['is_menus_cuisine_type_latin_american'].fillna(0, inplace=True)
X['is_menus_cuisine_type_spanish'].fillna(0, inplace=True)
X['is_menus_cuisine_type_ice_cream_gelato'].fillna(0, inplace=True)
X['is_menus_cuisine_type_barbecue'].fillna(0, inplace=True)
X['is_menus_cuisine_type_cafe'].fillna(0, inplace=True)
X['is_menus_cuisine_type_thai'].fillna(0, inplace=True)
X['is_menus_cuisine_type_paleo'].fillna(0, inplace=True)
X['is_menus_cuisine_type_caribbean'].fillna(0, inplace=True)
X['is_menus_cuisine_type_health_food'].fillna(0, inplace=True)
X['is_menus_cuisine_type_argentinian'].fillna(0, inplace=True)
X['is_menus_cuisine_type_hispanic'].fillna(0, inplace=True)
X['is_menus_cuisine_type_tapas_small_plates'].fillna(0, inplace=True)
X['is_menus_cuisine_type_european'].fillna(0, inplace=True)
X['is_menus_cuisine_type_desserts_bakeries'].fillna(0, inplace=True)
X['is_menus_cuisine_type_south_american'].fillna(0, inplace=True)
X['is_menus_cuisine_type_gastropub_food'].fillna(0, inplace=True)
X['is_menus_cuisine_type_other'].fillna(0, inplace=True)
X['is_menus_cuisine_type_guatemalan'].fillna(0, inplace=True)
X['is_menus_cuisine_type_brazilian'].fillna(0, inplace=True)
X['is_menus_cuisine_type_korean'].fillna(0, inplace=True)
X['is_menus_cuisine_type_colombian'].fillna(0, inplace=True)
X['is_menus_cuisine_type_pizza'].fillna(0, inplace=True)
X['is_menus_cuisine_type_indonesian'].fillna(0, inplace=True)
X['is_menus_cuisine_type_balkan'].fillna(0, inplace=True)
X['is_menus_cuisine_type_srilankan'].fillna(0, inplace=True)
X['is_menus_cuisine_type_indian'].fillna(0, inplace=True)
X['is_menus_cuisine_type_hawaiian'].fillna(0, inplace=True)
X['is_menus_cuisine_type_jewish'].fillna(0, inplace=True)
X['is_menus_cuisine_type_taiwanese'].fillna(0, inplace=True)
X['is_menus_cuisine_type_african'].fillna(0, inplace=True)
X['is_menus_cuisine_type_middle_eastern'].fillna(0, inplace=True)
X['is_menus_cuisine_type_vegan'].fillna(0, inplace=True)
X['is_menus_cuisine_type_asian_noodle_soup'].fillna(0, inplace=True)
X['is_menus_cuisine_type_french'].fillna(0, inplace=True)
X['is_menus_cuisine_type_german'].fillna(0, inplace=True)
X['is_menus_cuisine_type_russian'].fillna(0, inplace=True)
X['is_menus_cuisine_type_vietnamese'].fillna(0, inplace=True)
X['is_menus_cuisine_type_brunch'].fillna(0, inplace=True)
X['is_menus_cuisine_type_australian'].fillna(0, inplace=True)
X['is_menus_cuisine_type_cuban'].fillna(0, inplace=True)
X['is_menus_cuisine_type_filipino'].fillna(0, inplace=True)
X['is_menus_cuisine_type_vegetarian'].fillna(0, inplace=True)
X['is_menus_cuisine_type_turkish'].fillna(0, inplace=True)
X['is_menus_cuisine_type_malaysian'].fillna(0, inplace=True)
X['is_menus_cuisine_type_british'].fillna(0, inplace=True)
X['is_menus_cuisine_type_salvadorian'].fillna(0, inplace=True)
X['is_menus_cuisine_type_north_african'].fillna(0, inplace=True)
X['is_menus_cuisine_type_greek'].fillna(0, inplace=True)
X['is_menus_cuisine_type_burmese'].fillna(0, inplace=True)
X['is_menus_cuisine_type_east_european'].fillna(0, inplace=True)
X['is_menus_cuisine_type_nordic'].fillna(0, inplace=True)
X['is_menus_cuisine_type_north_american'].fillna(0, inplace=True)
X['is_menus_cuisine_type_american'].fillna(0, inplace=True)
X['is_menus_cuisine_type_italian'].fillna(0, inplace=True)
X['is_menus_cuisine_type_seafood'].fillna(0, inplace=True)
X['is_menus_cuisine_type_soul_food'].fillna(0, inplace=True)
X['is_menus_cuisine_type_californian'].fillna(0, inplace=True)
X['is_cooking_experience_years_8+'].fillna(0, inplace=True)
X['is_cooking_experience_years_1-3'].fillna(0, inplace=True)
X['is_cooking_experience_years_4-7'].fillna(0, inplace=True)
X['is_cooking_experience_years_0'].fillna(0, inplace=True)
X['is_cooking_experience_years_missing'].fillna(0, inplace=True)
X['is_cooking_experience_current-chef'].fillna(0, inplace=True)
X['is_cooking_experience_avid'].fillna(0, inplace=True)
X['is_cooking_experience_former-chef'].fillna(0, inplace=True)
X['is_cooking_experience_missing'].fillna(0, inplace=True)
X['is_cooking_experience_home_cook'].fillna(0, inplace=True)
X['is_cooking_experience_ownerf'].fillna(0, inplace=True)
X['is_cooking_experience_novice'].fillna(0, inplace=True)
X['is_cooking_experience_private'].fillna(0, inplace=True)
X['is_cooking_experience_entrepreneur'].fillna(0, inplace=True)
X['is_cooking_experience_caterer'].fillna(0, inplace=True)
X['is_cooking_experience_chef_de_partie'].fillna(0, inplace=True)
X['is_cooking_experience_personal_chef'].fillna(0, inplace=True)
X['is_cooking_experience_chef_de_cuisine'].fillna(0, inplace=True)
X['is_cooking_experience_chef-in-traning'].fillna(0, inplace=True)
X['is_cooking_experience_commis'].fillna(0, inplace=True)
X['is_cooking_experience_sous_chef'].fillna(0, inplace=True)
X['is_cook_referrer_google'].fillna(0, inplace=True)
X['is_cook_referrer_missing'].fillna(0, inplace=True)
X['is_cook_referrer_job'].fillna(0, inplace=True)
X['is_cook_referrer_other'].fillna(0, inplace=True)
X['is_cook_referrer_social'].fillna(0, inplace=True)
X['is_cook_referrer_news'].fillna(0, inplace=True)
X['is_cook_referrer_referral'].fillna(0, inplace=True)
X['is_cook_referrer_meal'].fillna(0, inplace=True)
X['is_cook_referrer_friend'].fillna(0, inplace=True)
X['is_cooks_join_reason_meet'].fillna(0, inplace=True)
X['is_cooks_join_reason_brand'].fillna(0, inplace=True)
X['is_cooks_join_reason_money'].fillna(0, inplace=True)
X['is_venue_style_pop-up-space'].fillna(0, inplace=True)
X['is_venue_style_apartment'].fillna(0, inplace=True)
X['is_venue_style_restaurant'].fillna(0, inplace=True)
X['is_venue_style_farm'].fillna(0, inplace=True)
X['is_venue_style_house'].fillna(0, inplace=True)
X['is_venue_style_brown-stone'].fillna(0, inplace=True)
X['is_venue_name_TheGarage'].fillna(0, inplace=True)
X['is_venue_name_theNewberry'].fillna(0, inplace=True)
X['is_venue_name_theLab'].fillna(0, inplace=True)
X['is_venue_name_theUnion'].fillna(0, inplace=True)
X['is_venue_name_theBocca'].fillna(0, inplace=True)
X['is_venue_name_theTradesman'].fillna(0, inplace=True)
X['is_venue_name_Foundation Cafe'].fillna(0, inplace=True)
X['is_venue_name_Private Location'].fillna(0, inplace=True)
X['is_venue_name_sound and savor'].fillna(0, inplace=True)
X['is_venue_name_theCommons'].fillna(0, inplace=True)
X['is_venue_name_The Humboldt House'].fillna(0, inplace=True)
X['is_venue_name_theGreenhouse'].fillna(0, inplace=True)
X['is_venue_name_Olea'].fillna(0, inplace=True)
X['is_venue_name_R.T.B. At Dabba'].fillna(0, inplace=True)
X['is_venue_name_Golden State Room'].fillna(0, inplace=True)
X['is_venue_name_Tournant'].fillna(0, inplace=True)
X['is_venue_name_Picnic on Third'].fillna(0, inplace=True)
X['is_venue_name_Frances'].fillna(0, inplace=True)
X['is_venue_name_theClub'].fillna(0, inplace=True)
X['is_venue_name_Home'].fillna(0, inplace=True)
X['is_venue_name_My Home'].fillna(0, inplace=True)
X['course_count'].fillna(0, inplace=True)

In [32]:
y = X.sold
# del X.sold
del X['sold']

In [33]:
X_train, X_validate, y_train, y_validate = train_test_split(X.values, y)

In [34]:
percentage_seats_sold_train = X_train[:, 0]
percentage_seats_sold_validate = X_validate[:, 0]

In [35]:
X_train = np.delete(X_train, [0], axis=1)
X_validate = np.delete(X_validate, [0], axis=1)
del X['percentage_of_seats_sold']

In [36]:
model = LogisticRegression()

In [37]:
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [38]:
sum(abs(percentage_seats_sold_validate - model.predict_proba(X_validate)[:, 1]))

4372.3783389110331

In [39]:
def plot_pmf(dist, x):
    plt.scatter(x, dist.pmf(x))
    plt.vlines(x, 0, dist.pmf(x))
    plt.ylim(ymin=0)
    plt.xlabel('number of seats')
    plt.ylabel('P(number of seats)')
    plt.show()

In [40]:
for index, x in enumerate(X_validate):
    n, p = (x[0], probability_predictions[index][1])
    binom = scs.distributions.binom(n,p)
    x = np.arange(1, n, 1)
    plot_pmf(binom, x)
    if index == 30:
        break

NameError: name 'probability_predictions' is not defined

### I'm a bit confused as to how to interpret my model

### With this logistic regression, we can't ever expect it to get everything right because we're sending it mixed signals. It is almost guaranteed that rows in the feature matrix with the exact same column values will have different target values.

### Re how to set an initial price suggestion, maybe I'm figuring out how to say "similar meals were priced between blah and blah" and then finding the optimal price in that range?

### Maybe I should limit the scope of the problem to be first time chefs?

### Or maybe I should scope the problem to be chefs cooking a menu for the first time?

### Feature Ideas:
- avg chef review leading up to meal (leakage warning)
- days between chef signup and meal 
- meal sequence number (leakage warning)
- meal menu sequence number (leakage warning)
- something with the description column on a menu 
    - something with tfidf/count vectorizer and nmf to get latent features
- min price for chef (leakage warning)
- max price for chef (leakage warning)
- avg price for chef (leakage warning)

In [None]:
clf = RandomForestClassifier(n_estimators = 100, max_depth = 30)
clf.fit(X_train, y_train)

### Calculate distance...how far is my predicted probability from the percentage of seats sold?

In [None]:
sum(abs(percentage_seats_sold_validate - clf.predict_proba(X_validate)[:, 1]))

In [None]:
sorted_features = reversed(sorted([(index, x) for index, x in enumerate(clf.feature_importances_)], key=lambda x: x[1]))

In [None]:
for feature in sorted_features:
    importance = feature[1]
    print X.columns[feature[0]], importance

### distance metric between the true fraction of seats sold and the predicted probability of selling a seat

### What is my baseline? Baseline could be total tickets sold / total number of seats. Calculate that mean - actual for each and sum

### separate model to predict the price that chef would want as start value

In [None]:
predicted_probabilities = clf.predict_proba(X_validate)[:, 1].reshape(clf.predict_proba(X_validate)[:, 1].shape[0], 1)

In [None]:
actuals = percentage_seats_sold_validate.reshape(percentage_seats_sold_validate.shape[0], 1)

In [None]:
list(np.concatenate((predicted_probabilities, actuals), axis=1))

### days on platform

In [None]:
model

### See if probability of selling decreases as price increases (sadly, it doesn't)

In [None]:
for price in range(1, 100):
    something = np.array(X_validate[6])
    something[1] = price
    print price, clf.predict_proba(np.array([something]))[0][1]

In [None]:
X.columns[-2]

In [None]:
menus

In [None]:
X_validate[0]

In [None]:
clf.classes_

In [None]:
X

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF
from nltk.stem.porter import PorterStemmer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from nltk.corpus import words
from sklearn.feature_extraction import stop_words

stemmer = PorterStemmer()

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

In [None]:
stop_words = list(set(stop_words.ENGLISH_STOP_WORDS))
stop_words.append("(', u')")
stop_words.append("-")
stop_words.append(":")
stop_words.append(".")
stop_words.append("!")
stop_words.append("\u2019")
stop_words.append("\u2022")
stop_words.append("'s")
stop_words.append("--")
stop_words.append(",")
stop_words.append("ll")
stop_words.append(">")
stop_words.append("<")
stop_words.append("'")
stop_words.append("(")
stop_words.append(")")
stop_words.append("...")
stop_words.append("ll")
stop_words.append("s")
stop_words.append("est")
stop_words.append("''")
stop_words.append("dolor")
stop_words.append("ut")
stop_words.append("11")
stop_words.append("?")
stop_words.append("baia")
stop_words.append("al")
stop_words.append("dri")
stop_words.append("'ll")
stop_words.append("203")
stop_words.append("&")
stop_words.append("tu")

vectorizer = TfidfVectorizer(tokenizer=tokenize, stop_words=stop_words)

vect = vectorizer.fit_transform(X)

feature_names = vectorizer.get_feature_names()

In [None]:
def implement_sklearn_nmf(term_doc_matrix, feature_names):
    nmf = NMF(3)
    W = nmf.fit_transform(term_doc_matrix)
    print ('components: {}'.format(nmf.components_))
    print(nmf.components_.shape)
    indices = np.argsort(nmf.components_, axis=1)[:, -20:]
    for row in indices:
        print ([feature_names[i] for i in row])
    return W

latent_features = implement_sklearn_nmf(vect, feature_names)

In [None]:
tickets.groupby(['meal_id', 'ticket_price']).count()

In [42]:
something = pd.read_csv('./meal_seats.csv', delimiter='|', names=['meal_id', 'meal_created_date', 'meal_date', 'ticket_price',  'seats_available', 'seats_sold',  'percentage_seats_sold'])
something['meal_year'] = pd.to_datetime(something.meal_date).apply(lambda x: x.year)
something = something[(something.ticket_price < 200) & (something.ticket_price > 5) & (something.meal_year == 2016)]

In [44]:
ids = [8032, 7490, 7266, 6886, 6884, 6450, 6448, 6270, 6268, 6266, 6112, 6087, 5193, 4751, 4463, 4187, 4185, 4183, 4016, 3991, 8033, 7487, 7267, 7198, 6885, 6451, 6449, 6271, 6269, 6267, 6110, 6086, 5194, 4752, 4462, 4186, 4182, 4180]