In [128]:
import re
import time, datetime
from collections import Counter

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline, Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier

import matplotlib.pyplot as plt

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
import json
print(os.listdir("./"))

['mr-boston-flattened.csv', 'temp', 'preprocessor.ipynb', 'flavor-data.json', 'prices.csv', 'mr-boston-flattened-categories.csv', 'all_drinks.csv', 'classifier.ipynb', '.ipynb_checkpoints', 'flavor-scraper.ipynb', 'SVD ingredients demo.ipynb']


In [97]:
data = pd.read_json("flavor-data.json")
datalist = data['drinks'].tolist()
X = pd.DataFrame(datalist)
y = X.pop('flavors')
print(X.shape, y.shape)

t = [x for row in y for x in row]
flavors = Counter(t)
print(flavors, len(flavors))

(609, 2) (609,)
Counter({'Sweet': 399, 'Fruity': 162, 'Sour': 58, 'Spicy': 33, 'Savory': 26, 'Hot': 22, 'Frozen': 10, 'Beginner': 7}) 8


In [133]:
# data preprocessing
mlb = MultiLabelBinarizer()
yBin = mlb.fit_transform(y)
print(yBin)
xTr, xTe, yTr, yTe = train_test_split(X, yBin, test_size=0.10)

print(xTr.shape, yTr.shape)
print("Loaded:","xTr",xTr.shape,"yTr",yTr.shape,"xTe",xTe.shape)
assert(xTr.shape[1] == xTe.shape[1])

def preprocess_meta(xTr):
    X = xTr.dropna(axis=1)
    # X.loc[:,('created')] = pd.to_datetime(X.loc[:,'created']).dt.time.apply(lambda x: x.hour*60 + x.minute)
    # X.loc[:,('has_link')] = X.loc[:,'text'].str.contains('http').astype(int)
    X.loc[:,('ingredientCount')] = len(X.loc[:,'ingredients'])
    return X[[]]

# remove/replace useless text
def preprocess_text(txt_df):
    # print(txt_df)
    new_txt_df = txt_df.apply(lambda lst: ' '.join(lst))
    return new_txt_df

# Preprocess the text data: get_text_data
get_text_data = FunctionTransformer(
    lambda x: preprocess_text(x['ingredients']), # text column from xTr
    validate=False
)
# Preprocess the numeric data: get_numeric_data
get_meta_data = FunctionTransformer(
    # use useful metadata and create/modify features from xTr cols
    lambda x: preprocess_meta(x),
    validate=False)
print("Final Metadata Columns:",list(get_meta_data.transform(xTr)))
print("Processed:",
      "xTr_text",get_text_data.transform(xTr).shape,
      "xTr_meta",get_meta_data.transform(xTr).shape,
      "yTr",yTr.shape)

[[0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]
 ...
 [0 1 1 ... 0 0 1]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]]
(548, 2) (548, 8)
Loaded: xTr (548, 2) yTr (548, 8) xTe (61, 2)
Final Metadata Columns: []
Processed: xTr_text (548,) xTr_meta (548, 0) yTr (548, 8)


In [139]:
# model configuration

# vectorize unlabeled testing and training data text
vectorizer = TfidfVectorizer(
    min_df=1, # min word count
    ngram_range=(1, 3), # use unigrams, bigrams, trigrams
    stop_words='english', # remove articles, pronouns, common words
    norm=None,
    sublinear_tf=False,
)

# default model
default_model = LogisticRegression(solver='lbfgs', max_iter=1000); #best(?)
# default_model = LinearSVC(max_iter=100000); #slowest
# default_model = BernoulliNB() #fastest

In [140]:
# add more features

start_time=time.time()
pipe = Pipeline([
    ('union', FeatureUnion( # add both text and metadata to xTr
        transformer_list = [
            ('numeric_features', Pipeline([
                ('selector', get_meta_data)
            ])),
            ('text_features', Pipeline([
                ('selector', get_text_data),
                ('vectorizer', vectorizer)
            ]))
        ]
    )),
    ('clf', OneVsRestClassifier(default_model))
])

param_grid = {'union__text_features__vectorizer__max_features': [10000, 30000],
              'clf__estimator__C': [0.1, 1]
             } 
grid = GridSearchCV(pipe, param_grid, cv=6, scoring='f1_samples')
grid4 = grid.fit(xTr, yTr)

print('Accuracy score:',grid4.best_score_)
print('Best params:',grid4.best_params_)
print('Estimator:',grid4.estimator)

end_time=time.time()
print("Pipeline time:",end_time-start_time)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

Accuracy score: 0.6170316301703163
Best params: {'clf__estimator__C': 0.1, 'union__text_features__vectorizer__max_features': 10000}
Estimator: Pipeline(memory=None,
     steps=[('union', FeatureUnion(n_jobs=None,
       transformer_list=[('numeric_features', Pipeline(memory=None,
     steps=[('selector', FunctionTransformer(accept_sparse=False, check_inverse=True,
          func=<function <lambda> at 0x12a290510>, inv_kw_args=None,
          inverse_func=None, kw_ar...e=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None))])
Pipeline time: 4.684762001037598


  'precision', 'predicted', average, warn_for)


In [113]:
pred = grid4.predict(xTe)
pred = mlb.inverse_transform(pred)
actual = mlb.inverse_transform(yTe)
xTe['label'] = pred
xTe['actual'] = actual
print(xTe[:10])

                                           ingredients  \
455  [1/2 part DeKuyper® Pucker® Sour Apple Schnapp...   
387  [1 part Jim Beam® Original, 1 part DeKuyper® B...   
525  [1/2 part Red Stag by Jim Beam® Black Cherry, ...   
488  [1 1/2 parts DeKuyper® Pucker® Berry Fusion Sc...   
520  [2 parts Maker's Mark® Bourbon, 1 part Lime Ju...   
23   [12 parts (Pinot Grigio or Sauvignon Blanc) Wh...   
199  [1 1/2 parts Cruzan® 9 Spiced Rum, 3/4 part Ve...   
563  [1 part JDK & Sons™ Crave Chocolate Cherry Liq...   
591  [1 part EFFEN® Raspberry Vodka, 2 parts Lemonade]   
574  [1 part DeKuyper® Peachtree® Schnapps Liqueur,...   

                          name            label           actual  
455             Apple Daiquiri         (Sweet,)   (Frozen, Sour)  
387        Toffee Drop Martini         (Sweet,)         (Sweet,)  
525      Cinnamon Sugar Cookie         (Sweet,)         (Sweet,)  
488           Boo-Berry Scream        (Fruity,)        (Fruity,)  
520       Maker's Mark® Mo

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [144]:
# run on actual drink list
with open('../app/static/drinks-with-related.json', 'r') as infile:  
    jsonData = json.load(infile)
    drinklst = jsonData['drinks']
    drinks = pd.DataFrame(drinklst)
    
    pred = grid4.predict(drinks)
    pred = mlb.inverse_transform(pred)
    
    drinks['tags'] = pred
    print(drinks[:10])
    

        categories                                        description  \
0               []  Place one ice cube in the glass and add 1 1/2 ...   
1               []  Fill a pint glass with ice. Pour vodka and cof...   
2      [halloween]  Shake with ice and strain into chilled cocktai...   
3               []  Bring all syrup ingredients to a boil in a sma...   
4               []  In a blender, combine ice, rum, limeade concen...   
5               []  Stir the orange juice, pineapple juice, simple...   
6               []  Muddle the sugar into the lime wedges in an ol...   
7               []  Cut half a lime into pieces, place in a shaker...   
8      [afternoon]  Put ice cubes in collins glass and add lime vo...   
9  [party, sports]  Rub rim of cocktail glass with rind of lemon o...   

                                         ingredients  \
0  [1 1/2 oz  Advocaat, 8-10 oz cold  Lemonade, 1...   
1  [ice cubes, or as needed, 2 fluid ounces vodka...   
2  [1 oz  Blanco tequila, 3/

In [161]:
jsondata = {}
jsondata['drinks'] = []
for i, drink in drinks.iterrows():
    try:
        ing = [(b +" " + a) for (a,b) in drink['ingredients'] if a != ""]
    except ValueError:
        ing = [a for a in drink['ingredients'] if a != ""]
        
    if 'instructions' in drink:
        drink['description'] = drink['instructions']
    
    jsondata['drinks'].append({
        'name' : drink['name'],
        'description' : drink['description'],
        'src' : drink['src'],
        'ingredients' : ing,
        'rating' : drink['rating'],
        'reviews' : drink.get('reviews', []),
        'categories' : drink.get('categories', []),
        'related' : drink.get('related', []),
        'tags' : drink.get('tags', [])
    })

In [162]:
with open('../app/static/drinks-with-related-and-tags.json', 'w') as outfile:  
    json.dump(jsondata, outfile)