# In [1]:
import pandas as pd




# Load the raw per-item export.
data = pd.read_csv("../data/original_data.csv")

# Columns that are not used by the rest of this script ("country" holds a
# single value only, so it carries no information).
unused_columns = [
    "country",
    "data_availability_status_id",
    "is_bulk",
    "status",
    "last_status",
    "state",
    "state_id",
    "waiter_id",
    "country_id",
]
data = data.drop(columns=unused_columns)

# Night Club contributes only ~90 rows, so it is excluded.
data = data[data["bar_type"] != "Night Club"]
# Optional further restriction to one city (currently disabled):
# data = data[data["city"] == "Toronto"]

# Parse the order timestamps (fractional seconds included).
data["order_time"] = pd.to_datetime(data["order_time"], format="%Y-%m-%d %H:%M:%S.%f")


# 1. Add `total_orders_category_id_X` feature (X = 1.0 ... 6.0)

# For every row, store how many rows of the same `order_id` belong to
# category X. The count is computed by comparing `category_id` with X
# directly instead of going through temporary one-hot columns, so a category
# that never occurs in the data yields a column of zeros rather than a
# KeyError on a missing "category_id_X" dummy column.
category_ids = [float(i) for i in range(1, 7)]
for category_id in category_ids:
    in_category = data["category_id"] == category_id
    data["total_orders_category_id_" + str(category_id)] = (
        in_category.groupby(data["order_id"]).transform("sum")
    )


# 2. Add `total_orders` feature (excluding category 5)
total_categories_ids = ["total_orders_category_id_" + str(float(i)) for i in range(1, 7)]
total_categories_ids_to_sum = [
    column for column in total_categories_ids
    if column != "total_orders_category_id_5.0"
]
# Vectorised column-wise sum instead of a Python-level loop over every row.
# skipna=False keeps the behaviour of the built-in sum(): a missing per-category
# count makes the total missing too, rather than being treated as 0.
data["total_orders"] = data[total_categories_ids_to_sum].sum(axis=1, skipna=False)


# 3. Add `day_of_week` feature
# English weekday name ("Monday", ...) of each order timestamp.
data["day_of_week"] = data["order_time"].dt.day_name()


# 4. Add `period_of_day` feature
#    ('breakfast', 'lunch', 'afternoon', 'dinner', 'hang_out', 'night')
# Helper column: hour of the day (0-23) taken from the order timestamp.
data["order_hour"] = data["order_time"].dt.hour

def period_of_day(hour):
    """Map an hour of the day to a named period.

    Periods (start inclusive, end exclusive):
    6-11 'breakfast', 11-14 'lunch', 14-18 'afternoon', 18-21 'dinner',
    21-23 'hang_out'; anything from 23 onwards or before 6 is 'night'.

    Returns None when no comparison holds (e.g. for NaN).
    """
    daytime_bands = (
        (6, 11, 'breakfast'),
        (11, 14, 'lunch'),
        (14, 18, 'afternoon'),
        (18, 21, 'dinner'),
        (21, 23, 'hang_out'),
    )
    for start, end, label in daytime_bands:
        if hour >= start and hour < end:
            return label
    if hour >= 23 or hour < 6:
        return 'night'
    return None

# Label every row with its period of the day, then discard the helper column.
data["period_of_day"] = data["order_hour"].map(lambda hour: period_of_day(int(hour)))
del data["order_hour"]

# 5. Add `is_weekend` feature
# Friday is counted as part of the weekend here, alongside Saturday and Sunday.
weekend = ["Friday", "Saturday", "Sunday"]
data["is_weekend"] = data["day_of_week"].isin(weekend)


# Titles rejected by filter_titles() are collected here for later inspection:
# stand-alone sauce items, and zero-priced "sub" (substitution) items.
sauces = set()
sub = set()

# A title containing "sauce" is kept when it also names one of these foods ...
foods_w_sauce = ["fingers", "spaghetti", "poutine", "wings", "pate", "bowl", "fries", "rigatoni", "pasta",
                "linguini", "frite"]
# ... and is rejected (without being recorded in `sauces`) when it contains
# one of these fragments.
remove_w_sauce = ["no wing", "for wing", "on", "side"]
whitelist = set('.abcdefghijklmnopqrstuvwxyz \\//')

# Substrings that mark a row as a modifier/note rather than a real menu item.
# ("-no " is not listed: every title containing it also contains "no ".)
_MODIFIER_MARKERS = ("no ", "no.", "side ", "+", "dip", "blue cheese", "bbq", "n/c", "s/o")
# Exact (lower-cased) titles that are dropped.
_MODIFIER_TITLES = frozenset(
    ['', 'garlic.aioli', 'gluten', 'hot n honey', 'honey garlic', 'kids.', 'to go']
)


# drop irrelevant titles
def filter_titles(row):
    """Return True when the row's title looks like a real menu item.

    `row` must provide "title" and "sales_before_tax". The title is matched
    case-insensitively. Side effects: zero-priced titles containing "sub" are
    added to `sub`; rejected titles containing "sauce" that match neither
    `foods_w_sauce` nor `remove_w_sauce` are added to `sauces`.

    A missing / non-string title (e.g. the NaN pandas produces for an empty
    CSV cell) is rejected instead of raising AttributeError on `.lower()`.
    """
    raw_title = row["title"]
    if not isinstance(raw_title, str):
        return False
    title = raw_title.lower()
    price = row["sales_before_tax"]

    # The order of the checks below matters for what ends up in `sub` and
    # `sauces`, so it follows the original precedence.
    if "xtra" in title:
        return False
    if "sub" in title and price == 0:
        sub.add(title)
        return False
    if "add" in title:
        return False
    if "sauce" in title:
        if any(food in title for food in foods_w_sauce):
            return True
        if not any(fragment in title for fragment in remove_w_sauce):
            sauces.add(title)
        return False
    if any(marker in title for marker in _MODIFIER_MARKERS):
        return False
    return title not in _MODIFIER_TITLES
    
    
# Keep only the rows whose title passes filter_titles().
keep_row = data.apply(filter_titles, axis=1)
data = data[keep_row]

# Fragments that mark a dish as sharable. They are matched against the
# lower-cased title, so every entry must itself be lower case ("Focaccia" with
# a capital F could never match).
sharable_foods = [
    "pizza", "cake", "hot pot", "nachos", "guac", "wings", "focaccia", "bread", "fries", "pretzels",
    "quesadilla", "nuts", "fondue", "calamari", "fingers sauced", "chicken fingers", "chkn fingers",
    "quesa stack",
]


def find_sharable(title):
    """Return 1 if `title` names a sharable dish, else 0.

    The title is lower-cased and its dots are replaced by spaces before the
    fragments in `sharable_foods` are searched for as substrings.
    """
    title = title.lower().replace(".", " ")
    return int(any(food in title for food in sharable_foods))

# Flag (1/0) rows whose title names a sharable dish.
data["sharable"] = data["title"].apply(find_sharable)




# Both flags are 1/0 and are matched case-insensitively, consistent with the
# other title matchers in this script (filter_titles, find_sharable); a
# case-sensitive test would miss titles such as "Kids Burger".
# na=False treats a missing title as "no match" instead of propagating NaN.

# meal with kids
data["kids_meal"] = data["title"].str.contains("kid", case=False, regex=False, na=False).astype(int)
# birthday
data["birthday"] = data["title"].str.contains("birthday", case=False, regex=False, na=False).astype(int)

# Collapse the per-item rows into one row per order.
#
# Fixes relative to the previous aggregation spec:
# * The list-valued columns use the builtin `list`; the string "list" is not
#   an aggregation name.
# * 'bar_type_id' and 'total_orders' were each listed twice. For
#   'total_orders' the later "sum" silently replaced "mean"; the value is
#   already a per-order count repeated on every row of the order, so "mean"
#   is kept (matching the per-category counts used for `num_drinks`).
# * 'day_of_week' holds weekday names, which have no mean, 'order_hour' is
#   dropped earlier in this script, and 'order_day_of_week' is never created.
#   All three are derived below from the aggregated order timestamp instead.
order_aggregations = {
    'bar_id': "mean",
    'order_time': "max",
    'sales_inc_tax': "sum",
    'guest_count': "mean",
    'bar_type_id': "mean",
    'total_orders': "mean",
    'item_qty': "sum",
    'total_orders_category_id_1.0': "mean",
    'total_orders_category_id_2.0': "mean",
    'total_orders_category_id_3.0': "mean",
    'total_orders_category_id_4.0': "mean",
    'total_orders_category_id_5.0': "mean",
    'total_orders_category_id_6.0': "mean",
    'kids_meal': "max",
    'birthday': "max",
    'bar_type': "max",
    'beer_volume': "sum",
    'beer_brand_id': list,
    'beer_serving_type_id': list,
}
table = data.groupby('order_id', as_index=False).agg(order_aggregations)

# Time features of the order, taken from its latest item timestamp
# ('order_time' is aggregated with "max" above).
table['day_of_week'] = table['order_time'].dt.day_name()
table['order_hour'] = table['order_time'].dt.hour
table['order_day_of_week'] = table['order_time'].dt.dayofweek  # Monday=0 ... Sunday=6

# Categories 1, 3, 4 and 6 are the ones added up as drinks.
drink_columns = [
    'total_orders_category_id_1.0',
    'total_orders_category_id_3.0',
    'total_orders_category_id_4.0',
    'total_orders_category_id_6.0',
]
table['num_drinks'] = table[drink_columns].sum(axis=1, skipna=False)

'''
def f(x):
    if x['guest_count'] == 0:
        food = x["total_orders_category_id_2.0"]
        drinks = x["num_drinks"]
        if food > 0: return food
        else: return drinks
    else: return x['guest_count']

till_10_guests['guest_count'] = till_10_guests.apply(f, axis=1)

Counting guests: when guest_count is zero, fall back to the number of food items, or to the number of drinks if no food was ordered.


'''

# Write the per-item rows and the per-order table to CSV, without the index.
for frame, csv_path in (
    (data, "df_per_item_august.csv"),
    (table, "df_per_table_august.csv"),
):
    frame.to_csv(csv_path, index=False)


# KeyboardInterrupt: