In [29]:
# takes 3 minutes in Colab
#
# run, then restart runtime and it should be in the kernel,
# after which "import spacy" works
#
# !python -m spacy download en_core_web_lg

Collecting en_core_web_lg==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9MB)
[K     |████████████████████████████████| 827.9MB 2.1MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-cp36-none-any.whl size=829180944 sha256=769f2244602dd22a943a63b66f0f6fbd8f4e84d44f4f25d4a5045d7ada31c23e
  Stored in directory: /tmp/pip-ephem-wheel-cache-tf2u4p10/wheels/2a/c1/a6/fc7a877b1efca9bc6a089d6f506f16d3868408f9ff89f8dbfc
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


## Exploratory Data Analysis and Cleaning

In [1]:
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import spacy
from spacy.tokenizer import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
leafly = pd.read_csv('https://raw.githubusercontent.com/Build-Week-Med-Cabinet-6/DS/mark-dev/data/cannabis.csv')

In [3]:
leafly.head()

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


In [4]:
leafly.shape

(2351, 6)

In [5]:
leafly['Strain'].nunique()

2350

In [6]:
leafly.isnull().sum()

Strain          0
Type            0
Rating          0
Effects         0
Flavor         46
Description    33
dtype: int64

## Modeling

In [7]:
# Set up spacy tokenizer


#!python -m spacy download en_core_web_lg
nlp = spacy.load('en_core_web_lg')
tokenizer = Tokenizer(nlp.vocab)

sample_text = "Text data needs to be processed to prepare it for machine learning models!"

[token.text for token in tokenizer(sample_text)]

['Text',
 'data',
 'needs',
 'to',
 'be',
 'processed',
 'to',
 'prepare',
 'it',
 'for',
 'machine',
 'learning',
 'models!']

In [8]:
leafly['Description'].isnull().sum()

33

In [10]:
import numpy as np
leafly.replace('None',np.NaN,inplace=True)

In [11]:
leafly = leafly.dropna()

In [12]:
# 2351 --> 2277.  I'm happy to just wholesale drop anything with missing values

leafly.shape

(2163, 6)

In [13]:
# Make tokens out of descriptions

tokens = []
for desc in tokenizer.pipe(leafly['Description'], batch_size=500):
    desc_tokens = [token.text for token in desc]
    tokens.append(desc_tokens)
leafly['tokens'] = tokens
leafly['tokens'].head()

0    [$100, OG, is, a, 50/50, hybrid, strain, that,...
1    [The, ‘98, Aloha, White, Widow, is, an, especi...
2    [1024, is, a, sativa-dominant, hybrid, bred, i...
3    [13, Dawgs, is, a, hybrid, of, G13, and, Chemd...
4    [Also, known, as, Kosher, Tangie,, 24k, Gold, ...
Name: tokens, dtype: object

In [14]:
leafly.head()

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description,tokens
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...,"[$100, OG, is, a, 50/50, hybrid, strain, that,..."
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...,"[The, ‘98, Aloha, White, Widow, is, an, especi..."
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...,"[1024, is, a, sativa-dominant, hybrid, bred, i..."
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...,"[13, Dawgs, is, a, hybrid, of, G13, and, Chemd..."
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%...","[Also, known, as, Kosher, Tangie,, 24k, Gold, ..."


## TF-IDF Vectorizer

In [15]:
def tokenize(document):
    
    doc = nlp(document)
    
    return [token.lemma_.strip() for token in doc if (token.is_stop != True) and (token.is_punct != True)]

In [16]:


# Instantiate vectorizer object
tfidf = TfidfVectorizer(stop_words='english', 
                        ngram_range=(1,2),
                        max_df=.97,
                        min_df=3,
                        tokenizer=tokenize)

# Create a vocabulary and get word counts per listing
dtm = tfidf.fit_transform(leafly['Description'])

# Get feature names to use a dataframe column headers
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

# View Feature Matrix as DataFrame
dtm.head()

Unnamed: 0,Unnamed: 1,Unnamed: 2,1st,8,afghani,aroma,black,blend,blue,blueberry,...,yoda,zealand,zest,zestful,zesty,zesty lemon,zion,zombie,zombie og,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.028345,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.038187,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.030593,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
dtm.shape

(2163, 8340)

## Fit a KNearestNeighbors Model on dtm

In [18]:
#from sklearn.neighbors import NearestNeighbors

# Fit on dtm
nn = NearestNeighbors(n_neighbors=20, algorithm='kd_tree')
nn.fit(dtm)

NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=20, p=2,
                 radius=1.0)

In [19]:
# Query Using kneighbors 
nn.kneighbors([dtm.iloc[0]])

(array([[0.        , 1.28354555, 1.28999472, 1.30751895, 1.31149959,
         1.31691786, 1.31792323, 1.31849079, 1.31862296, 1.31960226,
         1.32253938, 1.32642939, 1.32662427, 1.32953381, 1.33112344,
         1.33364671, 1.3375809 , 1.33799352, 1.33911045, 1.33984016]]),
 array([[   0, 1738,  219,  376, 2055,  631, 1381,  889, 2036, 1791,  503,
           48,  974,  208, 1539,   41, 1100, 1534, 1312, 1289]]))

In [25]:
# user input("Describe your ideal strain!")

ideal_strain = ["""
earthy full lavored hybrid with fuit over tones
"""]

In [26]:
# Query the user's ideal strain. 2nd array is their top 20 strains

new = tfidf.transform(ideal_strain)
nn.kneighbors(new.todense())

(array([[1.2791434 , 1.29560779, 1.30798674, 1.31335276, 1.31400403,
         1.31539701, 1.31938311, 1.32299936, 1.32307984, 1.32672534,
         1.3283781 , 1.3283936 , 1.33063945, 1.33269761, 1.33495885,
         1.34075064, 1.3421498 , 1.34245085, 1.34289047, 1.34358965]]),
 array([[1598,  398, 1006,  436, 2136, 2095, 1674, 1134,  612, 1126,  673,
         1535,  381,  127, 1073, 1374,  263,  617,  122, 2038]]))

In [32]:
!pwd

/home/joe/Desktop/school/ds-project/app/ml


In [34]:
# serializ the fitted model and the fitted estimator for use in the web app
import pickle
with open('../pickles/isaac_tf.pickle','wb') as fp:
    pickle.dump(tfidf,fp)
with open('../pickles/isaac_nn.pickle','wb') as fp2:
    pickle.dump(nn,fp2)

In [None]:
# And then do (pseudocode)
#
# for number in nn.kneighbors(new.todense())[1]:
#     return that strain
#      Send to web so they can visualize the Top 20 list.
#



In [31]:
# Inspect most relevant strain
leafly['Description'][2038]

'Sunshine #4 is an award-winning hybrid bred by Bodhi Seeds. This blend of Chemdawg 4 and Sunshine Daydream genetics took the prize for Best Hybrid Flower at the 2015 High Times Medical Cup in Michigan.'

In [24]:
# Inspect 2nd most relevant strain
leafly['Description'][1698]

'Purple Kush is a pure indica strain that emerged from the Oakland area of California as the result of a Hindu Kush and Purple Afghani cross. Its aroma is subtle and earthy with sweet overtones typical of Kush varieties. Blissful, long-lasting euphoria blankets the mind while physical relaxation rids the body of pain, sleeplessness, and stress. Purple Kush will grow wide rather than tall, and will be ready for harvest following an 8 week flowering time.'

In [None]:
#
##
###
###     IGNORE BELOW
###
##
#

In [None]:
# Another dataset. Will probably ignore.

kushy = pd.read_csv('https://raw.githubusercontent.com/kushyapp/cannabis-dataset/master/Dataset/Strains/strains-kushy_api.2017-11-14.csv')

In [11]:
kushy.head()

Unnamed: 0,id,status,sort,name,slug,image,description,type,crosses,breeder,effects,ailment,flavor,location,terpenes,thc,thca,thcv,cbd,cbda,cbdv,cbn,cbg,cbgm,cbgv,cbc,cbcv,cbv,cbe,cbt,cbl
0,1,1,0,100 OG,,,<p>This strain is named after it's high price ...,Hybrid,,Old School Breeder's Association,Focused,Depression,Citrus,,Limonene,127,0.0,0.0,16,0.0,0.0,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,1,0,707 Headband,,,,Hybrid,378272.0,Unknown Breeder,,,,,,127,,,0,,,0,,,,,,,,,
2,3,1,0,A-10,,,,Indica,0.0,,"Relaxed, Happy, Uplifted, Energetic, Sleepy, D...","Stress, Insomnia, Pain, Muscle Spasms, Depression","Citrus, Sweet",,,0,,,0,,,0,,,,,,,,,
3,4,1,0,Acapulco Gold,,,,Sativa,0.0,,"Happy, Euphoric, Uplifted, Relaxed, Creative, ...","Depression, Stress, Pain, Lack of Appetite","Earthy, Citrus",,,0,,,0,,,0,,,,,,,,,
4,5,1,0,Afghani Bullrider,,,,Hybrid,0.0,Unknown Breeder,"Uplifted, Relaxed, Happy, Euphoric, Dry Mouth,...","Stress, Depression, Insomnia, Pain","Sweet, Pine, Earthy",,,127,,,0,,,0,,,,,,,,,
