In [476]:
import re
import time, datetime
from collections import Counter

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline, Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier

import matplotlib.pyplot as plt

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
import json
import string

In [477]:
# Load the labelled drink records; the rows of interest sit under the "drinks" key.
TAG_DATA_PATH = "tag-data.json"
data = pd.read_json(TAG_DATA_PATH)
datalist = data['drinks'].tolist()


In [478]:
# preprocess tag data
#
# Each drink gets a fresh record instead of being edited in place.  The
# in-place loop was not re-runnable: a second execution appended the 'hours'
# tags to 'flavors' again and indexed [1] into ingredient entries that had
# already been flattened to strings (yielding their second character).

def prepare_tagged_drink(drink):
    """Return a training-ready copy of one drink record.

    * ``flavors`` is the original flavor list followed by the ``hours`` tags,
      so both are learned as a single label set.
    * ``ingredients`` keeps only the element at index 1 of every entry
      (presumably the ingredient name of an (amount, name) pair -- confirm
      against tag-data.json).

    All other keys are carried over unchanged.  The input dict and the lists
    it holds are not modified.
    """
    prepared = dict(drink)
    prepared['flavors'] = list(drink['flavors']) + list(drink['hours'])
    prepared['ingredients'] = [entry[1] for entry in drink['ingredients']]
    return prepared

# Not folded into the labels (kept for reference): drink['tags'], drink['type']

# Always derive from the raw column so re-running this cell is harmless.
datalist = [prepare_tagged_drink(drink) for drink in data['drinks']]

print(datalist[0]['flavors'], datalist[0]['ingredients'])

['Bubbly', 'Fruity/Citrus-forward', 'Sweet', 'Dinner/Paired with food', 'Evening'] ['Johnnie Walker Double Black Scotch Whisky', 'Lemon juice', 'Grapefruit juice', 'Citrus Oleo-Saccharum', 'Victory Prima Pils Pilsner', 'Grapefruit half-wheel']


In [479]:

# Features (X) are every column except 'flavors'; the label lists become y.
X = pd.DataFrame(datalist)
y = X.pop('flavors')
print(X.shape, y.shape)

# Flatten the per-drink label lists and count how often each label occurs.
t = []
for row in y:
    t.extend(row)
flavors = Counter(t)
print(flavors, len(flavors))


(2092, 6) (2092,)
Counter({'Evening': 1443, 'Sweet': 1062, 'Afternoon': 985, 'Fruity/Citrus-forward': 804, 'Spirit-forward': 572, 'Dinner/Paired with food': 424, 'Sour': 406, 'Bubbly': 259, 'Bitter': 221, 'Aperitif': 210, 'Spicy': 148, 'Creamy': 146, 'Morning/Brunch': 128, 'Digestif': 120, 'Salty/Savory': 104, 'Nightcap': 89, 'Herbaceous': 33}) 17


In [480]:
# data preprocessing

# Create the label binarizer here: `mlb` was used without ever being defined,
# so this cell raised NameError after a kernel restart.
mlb = MultiLabelBinarizer()
yBin = mlb.fit_transform(y)  # one indicator column per distinct label
print(yBin)

# Hold out 10% of the drinks for evaluation.  The fixed seed makes the split,
# and therefore every score printed further down, repeatable between runs.
SPLIT_SEED = 42
xTr, xTe, yTr, yTe = train_test_split(X, yBin, test_size=0.10, random_state=SPLIT_SEED)

print(xTr.shape, yTr.shape)
print("Loaded:","xTr",xTr.shape,"yTr",yTr.shape,"xTe",xTe.shape)
assert xTr.shape[1] == xTe.shape[1]

def preprocess_meta(xTr):
    """Build the numeric feature frame fed to the pipeline.

    Parameters
    ----------
    xTr : pandas.DataFrame
        Drink records.  Only the ``ingredients`` column is read; each cell is
        expected to be a sized collection of ingredients.

    Returns
    -------
    pandas.DataFrame
        A new single-column frame, ``ingredientCount``, holding the number of
        ingredients per row and sharing ``xTr``'s index.  ``xTr`` is not
        modified.
    """
    def count(cell):
        # A missing cell (None or NaN) is treated as "no ingredients".
        if cell is None or (isinstance(cell, float) and cell != cell):
            return 0
        return len(cell)

    # Build the result directly.  The earlier version ran dropna(axis=1) first,
    # which removes 'ingredients' itself as soon as one cell is missing (and so
    # failed with KeyError), and then wrote a new column into that derived frame.
    counts = [count(cell) for cell in xTr['ingredients']]
    return pd.DataFrame({'ingredientCount': counts}, index=xTr.index)

# build one text document per drink
def preprocess_text(X):
    """Return a 1-D numpy array with one string per row of ``X``.

    Each string is the row's ``name``, a space, and the row's ``ingredients``
    joined with single spaces, so the name is vectorized as part of the same
    document as the ingredient list.
    """
    documents = []
    for _index, row in X.iterrows():
        joined_ingredients = ' '.join(row['ingredients'])
        documents.append(str(row['name'] + ' ' + joined_ingredients))

    return np.array(documents)

# Preprocess the text data: get_text_data
# The function is passed directly instead of through a lambda wrapper: the call
# is the same, but a FunctionTransformer holding a lambda cannot be pickled.
get_text_data = FunctionTransformer(preprocess_text, validate=False)

# Preprocess the numeric data: get_meta_data
# (derives the metadata features from the raw columns)
get_meta_data = FunctionTransformer(preprocess_meta, validate=False)

# Run each branch once on the training frame as a sanity check before the
# transformers are wired into the pipeline.
xTr_text = get_text_data.transform(xTr)
xTr_meta = get_meta_data.transform(xTr)
print("Final Metadata Columns:",list(xTr_meta))
print("Processed:",
      "xTr_text",xTr_text.shape,
      "xTr_meta",xTr_meta.shape,
      "yTr",yTr.shape)

[[0 0 0 ... 0 0 1]
 [1 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 1]
 ...
 [0 1 0 ... 0 1 0]
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(1882, 6) (1882, 17)
Loaded: xTr (1882, 6) yTr (1882, 17) xTe (210, 6)
Final Metadata Columns: ['ingredientCount']
Processed: xTr_text (1882,) xTr_meta (1882, 1) yTr (1882, 17)


In [481]:
# model configuration

# Word-level TF-IDF for the text branch; norm=None turns off per-document
# normalisation of the resulting vectors.
vectorizer = TfidfVectorizer(analyzer='word', norm=None)

# Base estimator; the pipeline cell wraps it in OneVsRestClassifier.
default_model = LogisticRegression(solver='lbfgs', max_iter=1000)  # best(?)
# Alternatives kept for reference:
#   default_model = LinearSVC(max_iter=100000)
#   default_model = BernoulliNB()
#   default_model = RandomForestClassifier()

In [482]:
# add more features

start_time=time.time()

# Two feature branches are concatenated side by side: the numeric metadata and
# the TF-IDF vectors of the name + ingredient text.  One copy of the base
# classifier is then trained per label.
pipe = Pipeline([
    ('union', FeatureUnion( # add both text and metadata to xTr
        transformer_list = [
            ('numeric_features', Pipeline([
                ('selector', get_meta_data)
            ])),
            ('text_features', Pipeline([
                ('selector', get_text_data),
                ('vectorizer', vectorizer)
            ]))
        ]
    )),
    ('clf', OneVsRestClassifier(default_model))
])

param_grid = {'union__text_features__vectorizer__max_features': [10000, 30000],
              # 'clf__estimator__C': [0.1, 1]
             }

# Metric used to rank the grid candidates; named once so the report below
# cannot drift from what was actually optimised.
SCORING = 'f1_samples'
grid = GridSearchCV(pipe, param_grid, cv=6, scoring=SCORING)
grid4 = grid.fit(xTr, yTr)

# best_score_ is the mean cross-validated SCORING value of the winning
# candidate -- it was previously labelled "Accuracy", which it is not.
print('Best CV {} score:'.format(SCORING),grid4.best_score_)
print('Best params:',grid4.best_params_)
# Show the refitted winner rather than the unfitted template (`.estimator`).
print('Estimator:',grid4.best_estimator_)

end_time=time.time()
print("Pipeline time:",end_time-start_time)


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'reca

Accuracy score: 0.5817685030756232
Best params: {'union__text_features__vectorizer__max_features': 10000}
Estimator: Pipeline(memory=None,
     steps=[('union', FeatureUnion(n_jobs=None,
       transformer_list=[('numeric_features', Pipeline(memory=None,
     steps=[('selector', FunctionTransformer(accept_sparse=False, check_inverse=True,
          func=<function <lambda> at 0x12ba48ae8>, inv_kw_args=None,
          inverse_func=None, kw_ar...e=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None))])
Pipeline time: 21.07375192642212


In [483]:
# Evaluate the tuned pipeline on the held-out split.
pred = grid4.predict(xTe)
pred_inv = mlb.inverse_transform(pred)   # indicator rows -> tuples of label names
actual = mlb.inverse_transform(yTe)

# Attach predicted and true labels next to the features for eyeballing.
xTe = xTe.assign(label=pred_inv, actual=actual)
print(xTe[:10])


precision, recall, fscore, support = metrics.precision_recall_fscore_support(yTe, pred, average='weighted')
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('f1score: {}'.format(fscore))
# `support` is not printed: with an averaged score (average='weighted') the
# function returns None for it, so the old "support: None" line carried no
# information.


                     hours                                        ingredients  \
242   [Afternoon, Evening]  [Sotol, Pineapple juice, Fresh lemon juice, Ri...   
71             [Afternoon]  [Great King St. Glasgow Blend scotch, Del Magu...   
773            [Afternoon]  [El Gobernador pisco, Dry white wine, Fresh pi...   
15    [Afternoon, Evening]  [Hamilton 86 demerara rum, Plantation O.F.T.D....   
711             [Aperitif]  [American rye whiskey, Sweet vermouth, Bénédic...   
600   [Afternoon, Evening]  [Angostura bitters, Orgeat, Fresh lemon juice,...   
199              [Evening]  [Booth's dry gin, Montanaro bianco vermouth, C...   
1156             [Evening]  [Bourbon, Ginger syrup*, Fresh lemon juice, Al...   
1904  [Afternoon, Evening]  [Templeton rye whiskey, Carpano Antica Formula...   
794                     []  [Woodford Reserve Double Oaked bourbon, Banana...   

                                  name  \
242                    Matador Norteño   
71                      

  'precision', 'predicted', average, warn_for)


In [484]:
# run on actual drink list
with open('../app/static/drinks-with-related.json', 'r') as infile:
    jsonData = json.load(infile)

# The file is only needed for parsing; everything else runs after it is closed.
drinklst = jsonData['drinks']
drinks = pd.DataFrame(drinklst)

# Predict a label set for every drink and store it in a new 'tags' column.
pred = mlb.inverse_transform(grid4.predict(drinks))
drinks['tags'] = pred
print(drinks[:10])
    

        categories                                        description  \
0               []  Place one ice cube in the glass and add 1 1/2 ...   
1               []  Fill a pint glass with ice. Pour vodka and cof...   
2      [halloween]  Shake with ice and strain into chilled cocktai...   
3               []  Bring all syrup ingredients to a boil in a sma...   
4               []  In a blender, combine ice, rum, limeade concen...   
5               []  Stir the orange juice, pineapple juice, simple...   
6               []  Muddle the sugar into the lime wedges in an ol...   
7               []  Cut half a lime into pieces, place in a shaker...   
8      [afternoon]  Put ice cubes in collins glass and add lime vo...   
9  [party, sports]  Rub rim of cocktail glass with rind of lemon o...   

                                         ingredients  \
0  [1 1/2 oz  Advocaat, 8-10 oz cold  Lemonade, 1...   
1  [ice cubes, or as needed, 2 fluid ounces vodka...   
2  [1 oz  Blanco tequila, 3/

In [485]:
def _is_missing(value):
    """Return True for None and float NaN, the markers of an absent cell."""
    return value is None or (isinstance(value, float) and value != value)


def _format_ingredient(item):
    """Render one ingredient entry, or return None if it should be skipped.

    A two-element list/tuple ``(a, b)`` becomes ``b + " " + a`` and is skipped
    when ``a`` is empty (presumably a is the name and b the amount -- confirm
    against the source data).  Anything else is kept as-is unless it is the
    empty string.

    The type is checked explicitly.  Relying on tuple-unpacking to fail, as
    before, also "succeeded" on any two-character string and split it into
    its two characters.
    """
    if isinstance(item, (list, tuple)) and len(item) == 2:
        a, b = item
        return None if a == "" else b + " " + a
    return None if item == "" else item


def build_drink_record(drink):
    """Convert one row of ``drinks`` into the dict written to the output JSON."""
    formatted = (_format_ingredient(item) for item in drink['ingredients'])
    ing = [text for text in formatted if text is not None]

    # Prefer 'instructions' only when this row actually has a value.  Once the
    # column exists it is present on every row, so the old membership test
    # replaced real descriptions with NaN for drinks lacking instructions.
    description = drink.get('instructions')
    if _is_missing(description):
        description = drink.get('description', description)

    return {
        'name' : drink['name'],
        'description' : description,
        'src' : drink['src'],
        'ingredients' : ing,
        'rating' : drink['rating'],
        'reviews' : drink.get('reviews', []),
        'categories' : drink.get('categories', []),
        'related' : drink.get('related', []),
        'tags' : drink.get('tags', [])
    }


jsondata = {'drinks': [build_drink_record(row) for _idx, row in drinks.iterrows()]}

In [486]:
# Write the tagged drink records out as JSON.
OUTPUT_PATH = '../app/static/drinks-with-related-and-tags.json'
with open(OUTPUT_PATH, 'w') as outfile:
    json.dump(jsondata, outfile)