# Occasion Classifier using Guest Count Bins and Heuristics

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:
import os
import sys

# Absolute path of the parent of the current working directory.
module_path = os.path.abspath(os.path.join('..'))

# Build the directory with os.path.join (portable separators) and register it
# only once, so re-running this cell does not stack duplicate sys.path entries.
heuristics_dir = os.path.join(module_path, "heuristics")
if heuristics_dir not in sys.path:
    sys.path.append(heuristics_dir)

In [26]:
# Display the resolved parent-directory path as a sanity check on the relative '..' lookup.
module_path

'/Users/danielastepanov/Documents/Bar-Ilan/b_semester18/DS_project/BeerW-BigData-Project'

In [3]:
from occasion_classifier import shrink_orders_to_table
from occasion_classifier import classify

In [5]:
def _read_orders_with_guests(csv_path):
    """Read one pre-processed orders CSV and drop the rows whose guest_count is 0."""
    orders = pd.read_csv(csv_path)
    no_guests = orders.guest_count == 0
    return orders[~no_guests]


hockey = _read_orders_with_guests("../data/hockey_3_text_processed.csv")
silvester = _read_orders_with_guests("../data/silvester_3_text_processed.csv")
valentine = _read_orders_with_guests("../data/valentine_3_text_processed.csv")

# Stack the three datasets row-wise; each frame keeps its own row index.
data = pd.concat([hockey, silvester, valentine], axis=0)

## Using clean (pre-processed) data and bars we deemed trustworthy, we use the Guest_Count feature to divide our orders into the following bins:

### Bins:
### - 1
### - 2
### - 3-5
### - 6+

## For each bin, we describe the chosen occasions

## Occasions common to all bins:
* **BREAKFAST**
* **LUNCH**
* **DINNER**
* **DRINKING**
* **UNK**

### Category 1:
1. **LUNCH** - time of day must be `lunch time`, and must contain at least 1 large meal and 0 or 1 drinks.
2. **MUNCH** - time of day must be `late_night`, contains only meals that are not large
3. **DINNER** - time of day either `dinner` or `late_night`, contains at least 1 large meal, and up to 2 drinks
4. **CASUAL_DRINK** - any time of day, ratio of food to drinks less than 1.5. Up to 3 drinks can have <=1.2 L of beer.
5. **DRINKING** - any time of day, ratio of drinks to food larger than 1.5.
6. **NOT_1** - either too many items ordered in a single step, or "kid" in one of the titles.
7. **UNK** - not handled by any of the rules above

### Category 2:
1. **LUNCH** 
2. **DINNER** 
3. **DRINKING**
4. **ROMANTIC_DATE**
5. **FANCY_DATE**
6. **MALES_ONLY**
7. **JUST_EATING**
8. **BIRTHDAY**
9. **KIDS**
10. **NOT_2**
11. **UNK**

### Category 3-5:
1. **Family Event**
2. **Drinking**
3. **Breakfast**
4. **Lunch**
5. **Dinner**
6. **Social Gathering**
7. **After Work**

### Category 6:
1. 

------------------------------------------

In [6]:
# Occasion label names, one list per guest-count bin.
labels_1 = [
    "LUNCH",
    "MUNCH",
    "DINNER",
    "DRINKING",
    "CASUAL_DRINK",
    "NOT_1",
    "UNK",
]
labels_2 = [
    "LUNCH",
    "DINNER",
    "DRINKING",
    "ROMANTIC_DATE",
    "FANCY_DATE",
    "MALES_ONLY",
    "JUST_EATING",
    "BIRTHDAY",
    "KIDS",
    "NOT_2",
    "UNK",
]
labels_35 = [
    "FAMILY_EVENT",
    "DRINKING",
    "BREAKFAST",
    "LUNCH",
    "DINNER",
    "SOCIAL_GATHERING",
    "AFTER_WORK",
]
# NOTE(review): these are title-case with spaces, unlike the UPPER_SNAKE names
# in the lists above -- confirm which spelling the classifier emits for this bin.
labels_6 = [
    "Family Event",
    "Drinking",
    "Breakfast",
    "Lunch",
    "Dinner",
    "Social Gathering",
    "After Work",
]

## We annotated ~50 tables for dev, and 25 for test, for each category bin.


# DEV results

In [6]:
import csv

# results[bin] -> {"correct": int, "incorrect": int}. Bins are the header
# strings found in the annotation file (e.g. '1', '3-5', '6').
results = {}
# table_results[bin] -> list of (order_id, annotated occasion, predicted occasion).
table_results = {}
total_correct = 0
total_incorrect = 0

# The csv module asks for files passed to csv.reader to be opened with newline="".
with open("../heuristics/dev.csv", newline="") as f:
    csv_reader = csv.reader(f, delimiter=',')
    current_cat = None
    for row in csv_reader:
        if not row:
            # csv.reader yields [] for a blank line (e.g. a trailing newline);
            # there is nothing to score, and row[0] would raise IndexError.
            continue

        if len(row) == 1:
            # A single-field row is a section header naming the bin that the
            # following annotation rows belong to.
            current_cat = row[0]
            results[current_cat] = {"correct": 0, "incorrect": 0}
            table_results[current_cat] = []
            continue

        if current_cat is None:
            # Fail with a readable message instead of a KeyError further down.
            raise ValueError(
                "dev.csv: annotation row %r appears before any bin header" % (row,)
            )

        # Annotation rows: first field is the order id, second the annotated occasion.
        order_id = int(row[0])
        annotated_occasion = row[1]

        orders = data[data.order_id == order_id]
        table = shrink_orders_to_table(orders)
        pred_occasion = classify(table)
        table_results[current_cat].append((order_id, annotated_occasion, pred_occasion))

        if pred_occasion == annotated_occasion:
            results[current_cat]["correct"] += 1
            total_correct += 1
        else:
            results[current_cat]["incorrect"] += 1
            total_incorrect += 1


# Cumulative accuracy over all guest count bins and all occasions:


In [7]:
# Overall accuracy across every bin, expressed as a percentage (0-100).
n_scored = total_incorrect + total_correct
total_correct / n_scored * 100

85.0

## Accuracy for Category 1

In [8]:
# Fraction (0-1, not a percentage) of bin '1' tables whose prediction matched the annotation.
bin_tally = results['1']
bin_tally["correct"] / (bin_tally["correct"] + bin_tally["incorrect"])

0.9215686274509803

## Accuracy for Category 3-5

In [9]:
# Fraction (0-1, not a percentage) of bin '3-5' tables whose prediction matched the annotation.
bin_tally = results['3-5']
bin_tally["correct"] / (bin_tally["correct"] + bin_tally["incorrect"])

0.85

## Accuracy for Category 6

In [10]:
# Fraction (0-1, not a percentage) of bin '6' tables whose prediction matched the annotation.
bin_tally = results['6']
bin_tally["correct"] / (bin_tally["correct"] + bin_tally["incorrect"])

0.8571428571428571

----------------------------------------------

# TEST Results

In [11]:
import csv

# results[bin] -> {"correct": int, "incorrect": int}. Bins are the header
# strings found in the annotation file (e.g. '1', '3-5', '6').
# These names are rebound here, replacing any tallies held from an earlier run.
results = {}
# table_results[bin] -> list of (order_id, annotated occasion, predicted occasion).
table_results = {}
total_correct = 0
total_incorrect = 0

# The csv module asks for files passed to csv.reader to be opened with newline="".
with open("../heuristics/test.csv", newline="") as f:
    csv_reader = csv.reader(f, delimiter=',')
    current_cat = None
    for row in csv_reader:
        if not row:
            # csv.reader yields [] for a blank line (e.g. a trailing newline);
            # there is nothing to score, and row[0] would raise IndexError.
            continue

        if len(row) == 1:
            # A single-field row is a section header naming the bin that the
            # following annotation rows belong to.
            current_cat = row[0]
            results[current_cat] = {"correct": 0, "incorrect": 0}
            table_results[current_cat] = []
            continue

        if current_cat is None:
            # Fail with a readable message instead of a KeyError further down.
            raise ValueError(
                "test.csv: annotation row %r appears before any bin header" % (row,)
            )

        # Annotation rows: first field is the order id, second the annotated occasion.
        order_id = int(row[0])
        annotated_occasion = row[1]

        orders = data[data.order_id == order_id]
        table = shrink_orders_to_table(orders)
        pred_occasion = classify(table)
        table_results[current_cat].append((order_id, annotated_occasion, pred_occasion))

        if pred_occasion == annotated_occasion:
            results[current_cat]["correct"] += 1
            total_correct += 1
        else:
            results[current_cat]["incorrect"] += 1
            total_incorrect += 1


# Cumulative TEST accuracy over all guest count bins and all occasions:


In [12]:
# Overall accuracy across every bin, expressed as a percentage (0-100).
n_scored = total_incorrect + total_correct
total_correct / n_scored * 100

82.55813953488372

## Accuracy for Category 1

In [13]:
# Fraction (0-1, not a percentage) of bin '1' tables whose prediction matched the annotation.
bin_tally = results['1']
bin_tally["correct"] / (bin_tally["correct"] + bin_tally["incorrect"])

0.7916666666666666

## Accuracy for Category 3-5

In [14]:
# Fraction (0-1, not a percentage) of bin '3-5' tables whose prediction matched the annotation.
bin_tally = results['3-5']
bin_tally["correct"] / (bin_tally["correct"] + bin_tally["incorrect"])

0.8666666666666667

## Accuracy for Category 6

In [15]:
# Fraction (0-1, not a percentage) of bin '6' tables whose prediction matched the annotation.
bin_tally = results['6']
bin_tally["correct"] / (bin_tally["correct"] + bin_tally["incorrect"])

0.8125

## Test accuracies are roughly 80% and above (about 79-87% for the bins shown, 82.6% overall). To really trust these numbers, a much larger test set should be collected.

------------------------------------

## Let's try our classifiers on each dataset

# Hockey

In [16]:
# Disabled: the code below is wrapped in a string literal, so it does not run.
# When enabled it classifies every hockey order and writes one row per order
# (order_id, guest_count, occasion) to hockey_occasions.csv.
# NOTE(review): `results` and `current_cat` are assigned in the snippet but never used there.
'''
hockey_ids = list(hockey.order_id.value_counts().keys())
results = {}
current_cat = 0
hockey_results = []
hockey_tables = shrink_orders_to_table(hockey)
for order in hockey_ids:
    table = hockey_tables[hockey_tables.index == order]
    pred_occasion = classify(table)
    hockey_results.append((order, table.guest_count.iloc[0], pred_occasion))
hockey_results = pd.DataFrame(hockey_results, columns=["order_id", "guest_count", "occasion"])
hockey_results.to_csv("hockey_occasions.csv", index=False)
'''

In [21]:
# Load the saved per-order predictions; this file must already exist in the working directory.
hockey_occasions = pd.read_csv("hockey_occasions.csv")

# Valentine

In [17]:
# Disabled: the code below is wrapped in a string literal, so it does not run.
# When enabled it classifies every valentine order and writes one row per order
# (order_id, guest_count, occasion) to valentine_occasions.csv.
# NOTE(review): `results` and `current_cat` are assigned in the snippet but never used there.
'''
valentine_ids = list(valentine.order_id.value_counts().keys())
results = {}
current_cat = 0
val_results = []
valentine_tables = shrink_orders_to_table(valentine)
for order in valentine_ids:
    table = valentine_tables[valentine_tables.index == order]
    pred_occasion = classify(table)
    val_results.append((order, table.guest_count.iloc[0], pred_occasion))
val_results = pd.DataFrame(val_results, columns=["order_id", "guest_count", "occasion"])
val_results.to_csv("valentine_occasions.csv", index=False)
'''

In [22]:
# Load the saved per-order predictions; this file must already exist in the working directory.
valentine_occasions = pd.read_csv("valentine_occasions.csv")

# Silvester

In [None]:
# Disabled: the code below is wrapped in a string literal, so it does not run.
# When enabled it classifies every silvester order and writes one row per order
# (order_id, guest_count, occasion) to silvester_occasions.csv.
# NOTE(review): `results` and `current_cat` are assigned in the snippet but never used there.
'''
silvester_ids = list(silvester.order_id.value_counts().keys())
results = {}
current_cat = 0
sil_results = []
sil_tables = shrink_orders_to_table(silvester)
for order in silvester_ids:
    table = sil_tables[sil_tables.index == order]
    pred_occasion = classify(table)
    sil_results.append((order, table.guest_count.iloc[0], pred_occasion))
sil_results = pd.DataFrame(sil_results, columns=["order_id", "guest_count", "occasion"])
sil_results.to_csv("silvester_occasions.csv", index=False)
'''

In [23]:
# Load the saved per-order predictions; this file must already exist in the working directory.
silvester_occasions = pd.read_csv("silvester_occasions.csv")

# Guest count distributions

In [24]:
# Relative frequency of each guest_count value, ten most common first.
hockey_occasions["guest_count"].value_counts(normalize=True).head(10)

1     0.545749
2     0.275550
3     0.087486
4     0.046452
5     0.018579
6     0.010326
7     0.004897
8     0.003129
9     0.002142
10    0.001428
Name: guest_count, dtype: float64

In [25]:
# Relative frequency of each guest_count value, ten most common first.
valentine_occasions["guest_count"].value_counts(normalize=True).head(10)

1     0.483146
2     0.338730
3     0.091992
4     0.047210
5     0.016255
6     0.009228
7     0.004177
8     0.002752
9     0.002040
10    0.001230
Name: guest_count, dtype: float64

In [26]:
# Relative frequency of each guest_count value, ten most common first.
silvester_occasions["guest_count"].value_counts(normalize=True).head(10)

1     0.467163
2     0.302354
3     0.107757
4     0.067130
5     0.025907
6     0.012960
7     0.006099
8     0.003701
9     0.002287
10    0.001331
Name: guest_count, dtype: float64

# Occasions Distribution

In [27]:
# Relative frequency of each predicted occasion, ten most common first.
hockey_occasions["occasion"].value_counts(normalize=True).head(10)

DINNER              0.233938
LUNCH               0.155577
CASUAL_DRINK        0.139844
UNK                 0.115337
DRINKING            0.105792
NOT_1               0.056064
JUST_EATING         0.040909
FAMILY_EVENT        0.039708
SOCIAL_GATHERING    0.018386
FANCY_DATE          0.017842
Name: occasion, dtype: float64

In [28]:
# Relative frequency of each predicted occasion, ten most common first.
valentine_occasions["occasion"].value_counts(normalize=True).head(10)

DINNER           0.254444
LUNCH            0.165496
UNK              0.113331
CASUAL_DRINK     0.104685
DRINKING         0.088431
JUST_EATING      0.052877
NOT_1            0.052262
FAMILY_EVENT     0.043681
FANCY_DATE       0.027232
ROMANTIC_DATE    0.020432
Name: occasion, dtype: float64

In [29]:
# Relative frequency of each predicted occasion, ten most common first.
silvester_occasions["occasion"].value_counts(normalize=True).head(10)

DINNER              0.250042
LUNCH               0.147219
UNK                 0.113357
CASUAL_DRINK        0.098525
DRINKING            0.081767
FAMILY_EVENT        0.071732
NOT_1               0.060379
JUST_EATING         0.054987
FANCY_DATE          0.024839
SOCIAL_GATHERING    0.020376
Name: occasion, dtype: float64