# Setup

In [1]:
# Import libraries 
import yaml
import string
import requests

# Recall from the APIs for Data lab that including passwords in code is a terrible practice. 
# So we include a yaml file.
config_file = open('GCP_model_details.yaml', 'r')
config = yaml.safe_load(config_file)

In [2]:
from google.cloud import aiplatform

In [3]:
def endpoint_predict_sample(
    project: str, location: str, instances: list, endpoint: str
):
    aiplatform.init(project=project, location=location)

    endpoint = aiplatform.Endpoint(endpoint)

    prediction = endpoint.predict(instances=instances)
    return prediction

# PCA for Predictions

In [4]:
# Importing data manipulation libraries and PCA tools

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction import DictVectorizer
from sklearn.decomposition import KernelPCA
import itertools

In [5]:
%%bigquery base

SELECT 
    idDrink,
    strDrink,
    strIngredient1,
    strIngredient2,
    strIngredient3,
    strIngredient4,
    strIngredient5,
    strIngredient6,
    strIngredient7,
    strIngredient8,
    strIngredient9,
    strIngredient10,
    strIngredient11,
    strIngredient12,
    strIngredient13,
    strIngredient14,
    strIngredient15
FROM `bookish-journey-343419.cocktails_dataset.cocktails-table`

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 853.54query/s] 
Downloading: 100%|██████████| 425/425 [00:00<00:00, 507.13rows/s]


In [6]:
# Drop empty columns
base = base.drop(['strIngredient12','strIngredient13','strIngredient14','strIngredient15'],axis=1)

# Replace NaN with ""
base = base.replace(np.nan,"")

# Create a ner variable, ingredients, that has all the previous ingredients together. 
base['ingredients'] = base[['strIngredient1','strIngredient2','strIngredient3','strIngredient4','strIngredient5',
        'strIngredient6','strIngredient7','strIngredient8','strIngredient9','strIngredient10','strIngredient11']].agg(','.join, axis=1)

# Function to transform letters to lowercase.
def lower(text):
    text=text.lower()
    return text

# Pass all words throw the funtion and append them
ingredients_low=[]
for i in base.ingredients:
    il=lower(i)
    ingredients_low.append(il)

# Lowercase
base['ingredients']=ingredients_low

# This function convert a list of ingredients into a dictionary, note: every ingredient gets a 1.
# this mean that the value of every key is 1. key:value
def convert_to_dict(lst):
    d = {} #empty dict
    for ingre in lst:
        d[ingre] = 1
    return d

# We use the function to convert every row into a dictionary. 
# 'vodka': 1, 'lime juice': 1... this will help us later to create a one hot encoding.
base['bagofwords'] = base.ingredients.str.split(',').apply(convert_to_dict)

# One Hot Encoding
# To find similarities between dishes and cluster cocktails using their ingredients, we will represent a recipe by a one-hot encoded vector 
# of its ingredients. We will be establishing a vocabulary of ingredients using a method ‘DictVectorizer’ provided in the sklearn library

# DictVectorizer:This transformer turns lists of mappings (dict-like objects) of feature names to feature values into Numpy arrays or scipy.sparse matrices for use with scikit-learn estimators.
# sparse, default=True. Whether transform should produce scipy.sparse matrices. In this case we set it as False.

vector_dict = DictVectorizer(sparse = False)

# fit_transform() is used on the training data so that we can scale the training data and also learn the scaling parameters of that data. 
#The fit method is calculating the mean and variance of each of the features present in our data. 
#The transform method is transforming all the features using the respective mean and variance.
# We past every dictionary into a list.
X = vector_dict.fit_transform(base["bagofwords"].tolist())

# We select the column strDrink(name of the drink) from de dataset
y = base.strDrink

# Using Kernel PCA
# kernel = "cosine": This is called cosine similarity, because Euclidean (L2) normalization projects the vectors onto the unit sphere, and their dot product is then the cosine of the angle between the points denoted by the vectors.

kpca = KernelPCA(n_components=6,kernel="cosine", n_jobs=2)

# Using the transform method we can use the same mean and variance as it is calculated from our training data to transform our test data. 
#Thus, the parameters learned by our model using the training data will help us to transform our test data.
x_pca = kpca.fit_transform(X)

# Online Prediction for K-Means

In [7]:
# Making sure the pca is the correct number of dimensions
kpca = KernelPCA(n_components=6,kernel="cosine", n_jobs=2)
x_pca = kpca.fit_transform(X)

In [8]:
# Recommendations
def cluster_recomm_kmeans(observation,n_return=5):
    cluster = endpoint_predict_sample(project = config['project_id'], \
                                      location = config['region'], \
                                      instances = [observation.tolist()], \
                                      endpoint = str(config['kmeans_endpoint_id']))
    cluster_map = pd.DataFrame()
    cluster_map['cluster'] = np.array(endpoint_predict_sample(project = config['project_id'], \
                                                     location = config['region'], \
                                                     instances = x_pca.tolist(), \
                                                     endpoint = str(config['kmeans_endpoint_id']))[0])
    in_cluster = cluster_map[cluster_map.cluster == cluster[0][0]].sample(n=n_return)
    return y[in_cluster.index]

In [9]:
# We create a new observation based on a previous cocktail we had, and add two different ingredients that weren't in the original cocktail. 
new_obs = X[1]
new_obs[4] = 1
new_obs[5] = 1

X_test_kernel_pca = kpca.fit(X).transform(np.array([new_obs]))

In [10]:
# We can compare the new PCA to the one of the cocktail we were basing ourselves on. 
new_obs = np.squeeze(X_test_kernel_pca)
new_obs

array([ 0.32430237,  0.06565846, -0.0462426 , -0.09555177, -0.0216276 ,
       -0.04625695])

In [11]:
x_pca[1]

array([ 0.37078919,  0.09149095, -0.06096403, -0.11225486, -0.02721124,
       -0.05151923])

In [12]:
# Online Experiment
cluster_recomm_kmeans(new_obs)

364         Gin Rickey
378        Clover Club
134    Flying Dutchman
133         Gin Squirt
352         Martinez 2
Name: strDrink, dtype: object

# Online Prediction for Mean Shift

In [13]:
# Making sure the pca is the correct number of dimensions
kpca = KernelPCA(n_components=2,kernel="cosine", n_jobs=2)
x_pca = kpca.fit_transform(X)

In [14]:
# Recommendations
def cluster_recomm_meanshift(observation,n_return=5):
    cluster = endpoint_predict_sample(project = config['project_id'], \
                                      location = config['region'], \
                                      instances = [observation.tolist()], \
                                      endpoint = str(config['meanshift_endpoint_id']))
    cluster_map = pd.DataFrame()
    cluster_map['cluster'] = np.array(endpoint_predict_sample(project = config['project_id'], \
                                                     location = config['region'], \
                                                     instances = x_pca.tolist(), \
                                                     endpoint = str(config['meanshift_endpoint_id']))[0])
    in_cluster = cluster_map[cluster_map.cluster == cluster[0][0]].sample(n=n_return)
    return y[in_cluster.index]

In [19]:
# We create a new observation based on a previous cocktail we had, and add two different ingredients that weren't in the original cocktail. 
new_obs = X[1]
new_obs[4] = 1
new_obs[5] = 1
new_obs[6] = 1

X_test_kernel_pca = kpca.fit(X).transform(np.array([new_obs]))

In [20]:
# We can compare the new PCA to the one of the cocktail we were basing ourselves on. 
new_obs = np.squeeze(X_test_kernel_pca)
new_obs

array([0.30638482, 0.05796297])

In [21]:
x_pca[1]

array([0.32430237, 0.06565846])

In [22]:
# Online Experiment
cluster_recomm_meanshift(new_obs)

374       Munich Mule
73          Alexander
292                A1
339    Ramos Gin Fizz
407          Aviation
Name: strDrink, dtype: object