###Imports

In [None]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

import spacy
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

import pickle

### EDA/Cleaning

In [None]:
url = 'https://raw.githubusercontent.com/Build-Week-Med-Cabinet-10/data-science/master/cannabis.csv'

df = pd.read_csv(url)

df.head()

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


In [None]:
df.columns = df.columns.str.lower()

print(df.shape)
print(df.isna().sum())

(2351, 6)
strain          0
type            0
rating          0
effects         0
flavor         46
description    33
dtype: int64


In [None]:
df = df.dropna().reset_index(drop=True)
df.shape

(2277, 6)

In [None]:
# Split effects and flavors into additional parameters
import itertools

def criteria(s):
  temp = s.apply(lambda x: x.split(','))
  return set(itertools.chain(*temp))

effects = criteria(df['effects'])
flavors = criteria(df['flavor'])

print(f'There are {len(effects)} effects and {len(flavors)} flavors')

There are 16 effects and 50 flavors


In [None]:
effects

{'Aroused',
 'Creative',
 'Dry',
 'Energetic',
 'Euphoric',
 'Focused',
 'Giggly',
 'Happy',
 'Hungry',
 'Mouth',
 'None',
 'Relaxed',
 'Sleepy',
 'Talkative',
 'Tingly',
 'Uplifted'}

In [None]:
flavors

{'Ammonia',
 'Apple',
 'Apricot',
 'Berry',
 'Blue',
 'Blueberry',
 'Butter',
 'Cheese',
 'Chemical',
 'Chestnut',
 'Citrus',
 'Coffee',
 'Diesel',
 'Earthy',
 'Flowery',
 'Fruit',
 'Grape',
 'Grapefruit',
 'Honey',
 'Lavender',
 'Lemon',
 'Lime',
 'Mango',
 'Menthol',
 'Mint',
 'Minty',
 'None',
 'Nutty',
 'Orange',
 'Peach',
 'Pear',
 'Pepper',
 'Pine',
 'Pineapple',
 'Plum',
 'Pungent',
 'Rose',
 'Sage',
 'Skunk',
 'Spicy/Herbal',
 'Strawberry',
 'Sweet',
 'Tar',
 'Tea',
 'Tobacco',
 'Tree',
 'Tropical',
 'Vanilla',
 'Violet',
 'Woody'}

In [None]:
# Combine columns to instantiate a corpus

doc = df['effects'] + df['flavor'] + df['description']

doc[0]

'Creative,Energetic,Tingly,Euphoric,RelaxedEarthy,Sweet,Citrus$100 OG is a 50/50 hybrid strain that packs a strong punch. The name supposedly refers to both its strength and high price when it first started showing up in Hollywood. As a plant, $100 OG tends to produce large dark green buds with few stems. Users report a strong body effect of an indica for pain relief with the more alert, cerebral feeling thanks to its sativa side.'

###NLP

In [None]:
# Instantiate a Vectorizer
nlp = spacy.load("en_core_web_lg")

def get_word_vectors(docs):
    return [nlp(doc).vector for doc in docs]

In [None]:
# Convert to vector matrix
X = get_word_vectors(doc)

In [None]:
# fit model for quick results
nn = NearestNeighbors(n_neighbors=3, algorithm='ball_tree')
nn.fit(X)

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                 radius=1.0)

In [None]:
# test using strings
query = nlp('Happy, Citrus')
result = nn.kneighbors(query.vector.reshape(1,-1))

In [None]:
result

(array([[2.18494946, 2.33560439, 2.35080486]]), array([[1681, 1092,  977]]))

In [None]:
# obtain results by looking up index in dataframe
test = df.iloc[list(result[1][0])]
test

Unnamed: 0,strain,type,rating,effects,flavor,description
1681,Q3,sativa,3.6,"Happy,Tingly,Uplifted,Giggly,Energetic","Tree,Fruit,Citrus,Orange",A sativa dominant strain.
1092,Jamba-Juice,hybrid,5.0,"Tingly,Giggly,Happy,Talkative,Energetic","Earthy,Sweet,Berry",Jamba Juice is a unique clone-only offering fr...
977,Hawaiian-Delight,indica,3.8,"Happy,Relaxed,Aroused,Uplifted,Euphoric","Pungent,Sweet,Diesel","Hawaiian Delight has a strong and musky aroma,..."


In [None]:
# convert to list to supply to transform into other(json) objects
test['strain'].tolist()

['Q3', 'Jamba-Juice', 'Hawaiian-Delight']

In [None]:
# Second test using individual vector embeddings
happy = nlp('Happy')
citrus = nlp('Citrus')

In [None]:
query2 = happy.vector + citrus.vector
result2 = nn.kneighbors(hca.reshape(1,-1))

In [None]:
result2

(array([[8.0945148 , 8.12922302, 8.16511646]]), array([[ 978, 1557, 1486]]))

In [None]:
# Better results
df.iloc[list(result2[1][0])]

Unnamed: 0,strain,type,rating,effects,flavor,description
978,Hawaiian-Diesel,sativa,4.2,"Uplifted,Creative,Euphoric,Happy,Relaxed","Diesel,Citrus,Tropical",Hawaiian Diesel is the tropical mix of an Aloh...
1557,Pineapple-Purple-Skunk,hybrid,4.5,"Relaxed,Happy,Uplifted,Euphoric,Aroused","Sweet,Citrus,Pineapple","Bred by MTG Seeds, Pineapple Purple Skunk is a..."
1486,Orange-Cookies,hybrid,4.6,"Relaxed,Happy,Uplifted,Euphoric,Creative","Citrus,Orange,Sweet",Orange Cookies bred by Franchise Genetics is a...


###Pickles!

In [None]:
# Get embeddings for effects and flavors
# Circumvents the need to load spacy in api
e = {}
for effect in effects:
  e[effect] = nlp(effect).vector

f = {}
for flavor in flavors:
  f[flavor] = nlp(flavor).vector

In [64]:
# Dump pickles
parent_directory = Path().resolve().parent

pickle.dump(e, open(os.path.join(parent_directory, 'effects.pkl'), 'wb'))
pickle.dump(f, open(os.path.join(parent_directory, 'flavors.pkl'), 'wb'))
pickle.dump(nn, open(os.path.join(parent_directory, 'knn.pkl'), 'wb'))


###Use Case


In [65]:
#1. Web team sends this user query in some format(probably json)
from_web = {'effect':'Aroused', 'flavor': 'Sweet'} #json like object 
#2. Use info to get vectors from pickled dictionaries
effect = e['Aroused']
flavor = f['Sweet']
#3. Generate query vector by adding these vectors
query = effect + flavor

In [67]:
#4. Run knn model using query vector. Needs to be reshaped
result = nn.kneighbors(query.reshape(1,-1))

In [72]:
#5. Result object will have the index location of recomendations to lookup in df
df.iloc[result[1][0]]['strain'].tolist()

['Hawaiian-Delight', 'Cheese-Candy', 'Sweet-Deep-Grapefruit']