# Imports and GCP Setup

In [1]:
# Import libraries 
import yaml
import string
import requests

# Recall from the APIs for Data lab that including passwords in code is a terrible practice. 
# So we include a yaml file.
config_file = open('GCP_model_details.yaml', 'r')
config = yaml.safe_load(config_file)

In [2]:
!source GCP_modeling_setup.sh

Setting up envirement variables
Done

Setting project up
Updated property [core/project].
Done

Enabling APIs
Operation "operations/acat.p2-679997314711-f37c00a6-ea11-40ad-9fbe-7a3f142cb679" finished successfully.
Done

Making a bucket, if it exists, returns error
Creating gs://bookish-journey-clustering/...
ServiceException: 409 A Cloud Storage bucket named 'bookish-journey-clustering' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.
Done



## Modeling

In [3]:
import numpy as np
import pandas as pd
import pathlib

In [4]:
# From sklearn importing DictVectorizer: Transforms lists of feature-value mappings to vectors.
# Importing KernelPCA: Kernel Principal component analysis (KPCA).
# Importing itertools provides various functions that work on iterators. 

from sklearn.feature_extraction import DictVectorizer
from sklearn.decomposition import KernelPCA
import itertools

In [5]:
%%bigquery base

SELECT 
    idDrink,
    strDrink,
    strIngredient1,
    strIngredient2,
    strIngredient3,
    strIngredient4,
    strIngredient5,
    strIngredient6,
    strIngredient7,
    strIngredient8,
    strIngredient9,
    strIngredient10,
    strIngredient11,
    strIngredient12,
    strIngredient13,
    strIngredient14,
    strIngredient15
FROM `bookish-journey-343419.cocktails_dataset.cocktails-table`

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 651.29query/s] 
Downloading: 100%|██████████| 425/425 [00:00<00:00, 493.97rows/s]


In [6]:
base.tail()

Unnamed: 0,idDrink,strDrink,strIngredient1,strIngredient2,strIngredient3,strIngredient4,strIngredient5,strIngredient6,strIngredient7,strIngredient8,strIngredient9,strIngredient10,strIngredient11,strIngredient12,strIngredient13,strIngredient14,strIngredient15
420,178359,Kiwi Martini,Kiwi,Sugar Syrup,Vodka,Kiwi,,,,,,,,,,,
421,17181,Dirty Martini,Vodka,Dry Vermouth,Olive Brine,Lemon,Olive,,,,,,,,,,
422,178349,Snowday,Vodka,Amaro Montenegro,Ruby Port,Blood Orange,Angostura Bitters,Orange Peel,,,,,,,,,
423,178343,Michelada,Beer,Tomato Juice,Lime Juice,Hot Sauce,Worcestershire Sauce,Soy Sauce,,,,,,,,,
424,178368,Planter’s Punch,Dark Rum,Orange Juice,Pineapple Juice,Grenadine,Sugar Syrup,Angostura Bitters,,,,,,,,,


In [7]:
# Drop empty columns
base = base.drop(['strIngredient12','strIngredient13','strIngredient14','strIngredient15'],axis=1)

## Feature Engineering

In [8]:
# Replace NaN with ""
base = base.replace(np.nan,"")

In [9]:
# Create a ner variable, ingredients, that has all the previous ingredients together. 
base['ingredients'] = base[['strIngredient1','strIngredient2','strIngredient3','strIngredient4','strIngredient5',
        'strIngredient6','strIngredient7','strIngredient8','strIngredient9','strIngredient10','strIngredient11']].agg(','.join, axis=1)
base.ingredients.head()

0      Irish whiskey,Coffee,Sugar,Whipped cream,,,,,,,
1    Gin,Triple sec,Pineapple juice,Grenadine,Pinea...
2    Gin,Carbonated water,Powdered sugar,Orange spi...
3                           Sugar,Lime,Cachaca,,,,,,,,
4      Dry Vermouth,Gin,Anis,Bitters,Orange peel,,,,,,
Name: ingredients, dtype: object

In [10]:
# Function to transform letters to lowercase.

def lower(text):
    text=text.lower()
    return text

# Pass all words throw the funtion and append them
ingredients_low=[]
for i in base.ingredients:
    il=lower(i)
    ingredients_low.append(il)

base['ingredients']=ingredients_low
base.ingredients

0        irish whiskey,coffee,sugar,whipped cream,,,,,,,
1      gin,triple sec,pineapple juice,grenadine,pinea...
2      gin,carbonated water,powdered sugar,orange spi...
3                             sugar,lime,cachaca,,,,,,,,
4        dry vermouth,gin,anis,bitters,orange peel,,,,,,
                             ...                        
420                   kiwi,sugar syrup,vodka,kiwi,,,,,,,
421     vodka,dry vermouth,olive brine,lemon,olive,,,,,,
422    vodka,amaro montenegro,ruby port,blood orange,...
423    beer,tomato juice,lime juice,hot sauce,worcest...
424    dark rum,orange juice,pineapple juice,grenadin...
Name: ingredients, Length: 425, dtype: object

In [11]:
# This function convert a list of ingredients into a dictionary, note: every ingredient gets a 1.
# this mean that the value of every key is 1. key:value

def convert_to_dict(lst):
    d = {} #empty dict
    for ingre in lst:
        d[ingre] = 1
    return d

# We use the function to convert every row into a dictionary. 
# 'vodka': 1, 'lime juice': 1... this will help us later to create a one hot encoding.
base['bagofwords'] = base.ingredients.str.split(',').apply(convert_to_dict)
print(base.bagofwords)

0      {'irish whiskey': 1, 'coffee': 1, 'sugar': 1, ...
1      {'gin': 1, 'triple sec': 1, 'pineapple juice':...
2      {'gin': 1, 'carbonated water': 1, 'powdered su...
3           {'sugar': 1, 'lime': 1, 'cachaca': 1, '': 1}
4      {'dry vermouth': 1, 'gin': 1, 'anis': 1, 'bitt...
                             ...                        
420     {'kiwi': 1, 'sugar syrup': 1, 'vodka': 1, '': 1}
421    {'vodka': 1, 'dry vermouth': 1, 'olive brine':...
422    {'vodka': 1, 'amaro montenegro': 1, 'ruby port...
423    {'beer': 1, 'tomato juice': 1, 'lime juice': 1...
424    {'dark rum': 1, 'orange juice': 1, 'pineapple ...
Name: bagofwords, Length: 425, dtype: object


In [12]:
# One Hot Encoding
# To find similarities between dishes and cluster cocktails using their ingredients, we will represent a recipe by a one-hot encoded vector 
# of its ingredients. We will be establishing a vocabulary of ingredients using a method ‘DictVectorizer’ provided in the sklearn library

# DictVectorizer:This transformer turns lists of mappings (dict-like objects) of feature names to feature values into Numpy arrays or scipy.sparse matrices for use with scikit-learn estimators.
# sparse, default=True. Whether transform should produce scipy.sparse matrices. In this case we set it as False.

vector_dict = DictVectorizer(sparse = False)

# fit_transform() is used on the training data so that we can scale the training data and also learn the scaling parameters of that data. 
#The fit method is calculating the mean and variance of each of the features present in our data. 
#The transform method is transforming all the features using the respective mean and variance.
# We past every dictionary into a list.
X = vector_dict.fit_transform(base["bagofwords"].tolist())

# We select the column strDrink(name of the drink) from the dataset
y = base.strDrink

## Training a Model: Mean Shift Clustering

In [13]:
# Using Kernel PCA
# kernel = "cosine": This is called cosine similarity, because Euclidean (L2) normalization projects the vectors onto the unit sphere, and their dot product is then the cosine of the angle between the points denoted by the vectors.

kpca = KernelPCA(n_components=2,kernel="cosine", n_jobs=2)

# Using the transform method we can use the same mean and variance as it is calculated from our training data to transform our test data. 
#Thus, the parameters learned by our model using the training data will help us to transform our test data.
x_pca = kpca.fit_transform(X)

In [14]:
from sklearn.cluster import MeanShift
ms = MeanShift().fit(x_pca)

In [15]:
# Recommendations
def cluster_recomm(index,algorithm = ms,n_return = 5):
    cluster = algorithm.predict(x_pca[index].reshape(1, -1))[0]
    cluster_map = pd.DataFrame()
    cluster_map['cluster'] = algorithm.labels_
    in_cluster = cluster_map[cluster_map.cluster == cluster].sample(n=n_return, random_state = 42)
    return y[in_cluster.index]

In [16]:
cluster_recomm(index = 6,algorithm = ms,n_return = 3)

47         Bumble Bee
170      Whiskey Sour
118    Nutty Irishman
Name: strDrink, dtype: object

## Saving Model

In [17]:
import joblib
from google.cloud import storage
import datetime

In [18]:
# Export the model to a file
model = 'model.joblib'
joblib.dump(ms, model)

['model.joblib']

In [19]:
# Public bucket holding the census data
bucket = storage.Client().bucket(config['bucket_name'])

# Upload the model to GCS
bucket = storage.Client().get_bucket(bucket)
blob = bucket.blob("MeanShift/" + model)
blob.upload_from_filename(model)

# Sending Model to Vertex AI

In [20]:
!source GCP_Deploying_MeanShift.sh

Setting up envirement variables
Done

Uploading Model
Using endpoint [https://us-central1-aiplatform.googleapis.com/]
Waiting for operation [4040404584366080000]...done.                            
Done

Creating endpoint
Using endpoint [https://us-central1-aiplatform.googleapis.com/]
Waiting for operation [4088255330406891520]...done.                            
Created Vertex AI endpoint: projects/679997314711/locations/us-central1/endpoints/1240996784038215680.
Done

Setting IDs
Using endpoint [https://us-central1-aiplatform.googleapis.com/]
Using endpoint [https://us-central1-aiplatform.googleapis.com/]
Done

Deploying Model
Using endpoint [https://us-central1-aiplatform.googleapis.com/]
Waiting for operation [7583048641246396416]...done.                            
Deployed a model to the endpoint 1240996784038215680. Id of the deployed model: 3687524505049104384.
Done



# Metrics

The **Bandwidth** estimates the best number of clusters by fitting the model. This method generates the number of labels, i.e, the number of estimated clusters by reducing the value of *Bandwidth*. The number of labels generated is is the homologue to clusters estimated in the elbow method. Fitting with MeanShift, el number of estimated clusters is 4.


In [21]:
from sklearn.cluster import estimate_bandwidth

# Compute clustering with MeanShift

# The following bandwidth can be automatically detected using
bandwidth = estimate_bandwidth(x_pca, quantile=0.2)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(x_pca)
labels = ms.labels_
cluster_centers = ms.cluster_centers_

labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

print("number of estimated clusters : %d" % n_clusters_)

number of estimated clusters : 4
