# 1 Initial data exploration

In [1]:
# import libraries
import pandas as pd
import numpy as np

In [2]:
# import datasets from urls
leafly_csv_url = 'https://raw.githubusercontent.com/med-cabinet-5/data-science/master/data/cannabis.csv'
kushy_strains_csv_url = 'https://raw.githubusercontent.com/kushyapp/cannabis-dataset/master/Dataset/Strains/strains-kushy_api.2017-11-14.csv'

In [3]:
# save to dataframe
leafly = pd.read_csv(leafly_csv_url)
kushy = pd.read_csv(kushy_strains_csv_url)

In [4]:
leafly.head(3)

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...


In [5]:
kushy.head(3)

Unnamed: 0,id,status,sort,name,slug,image,description,type,crosses,breeder,...,cbn,cbg,cbgm,cbgv,cbc,cbcv,cbv,cbe,cbt,cbl
0,1,1,0,100 OG,,,<p>This strain is named after it's high price ...,Hybrid,,Old School Breeder's Association,...,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,1,0,707 Headband,,,,Hybrid,378272.0,Unknown Breeder,...,0,,,,,,,,,
2,3,1,0,A-10,,,,Indica,0.0,,...,0,,,,,,,,,


In [6]:
kushy.isnull().sum()

id                0
status            0
sort              0
name              1
slug           9524
image          9524
description    9523
type             24
crosses        8960
breeder         545
effects        8509
ailment        8553
flavor         8553
location       8903
terpenes       9523
thc               0
thca            889
thcv            889
cbd               0
cbda            889
cbdv            889
cbn               0
cbg             889
cbgm            889
cbgv            889
cbc             889
cbcv            889
cbv             889
cbe             889
cbt             889
cbl             889
dtype: int64

#### **Note:** Most of the information is in chemical analysis, which we don't care about, but we do gain ~1000 ailment observations

In [7]:
# shape of dataframe
leafly.shape

(2351, 6)

In [8]:
# null values
leafly.isnull().sum()

Strain          0
Type            0
Rating          0
Effects         0
Flavor         46
Description    33
dtype: int64

In [9]:
# drop a small portion of the data to maintain our ability to provide standardized output
leafly = leafly.dropna(subset=['Flavor', 'Description'])

In [10]:
# The only merge-able data of interest
kushy_clean = kushy[['name', 'ailment']]

In [11]:
# shape of clean kush data
kushy_clean.shape

(9524, 2)

In [12]:
# null values in clean dataset
kushy_clean.isnull().sum()

name          1
ailment    8553
dtype: int64

In [13]:
# drop null values
kushy_clean = kushy_clean.dropna()

In [14]:
# kushy clean shape 
kushy_clean.shape

(971, 2)

In [15]:
# merge data
merged = pd.merge(leafly, kushy_clean, left_on='Strain', right_on='name', how='left' )

In [16]:
merged.shape

(2280, 8)

In [17]:
# print a description
merged.loc[19]['Description']

'A-10 has an earthy, hashy taste that provides a very heavy body stone. \xa0Frequently used to treat insomnia and chronic pain.'

In [18]:
# print an ailment
merged.loc[19]['ailment']

'Stress, Insomnia, Pain, Muscle Spasms, Depression'

In [19]:
# Dropping the rating and retaining only string information
merged_strings = merged.drop(['Rating'], axis=1)

In [20]:
# merge nan values with ''
merged_strings = merged_strings.fillna('')

In [21]:
# check for null values
merged_strings.isnull().sum()

Strain         0
Type           0
Effects        0
Flavor         0
Description    0
name           0
ailment        0
dtype: int64

In [22]:
# check data types
merged_strings.dtypes

Strain         object
Type           object
Effects        object
Flavor         object
Description    object
name           object
ailment        object
dtype: object

In [23]:
# Concatenating all text such that it is easy to process for our NLP model
merged_strings['all_text'] = merged_strings.apply(lambda x: ' '.join(x), axis=1)

In [24]:
# print 'all_text' example
merged_strings['all_text'][0]

'100-Og hybrid Creative,Energetic,Tingly,Euphoric,Relaxed Earthy,Sweet,Citrus $100 OG is a 50/50 hybrid strain that packs a strong punch. The name supposedly refers to both its strength and high price when it first started showing up in Hollywood. As a plant, $100 OG tends to produce large dark green buds with few stems. Users report a strong body effect of an indica for pain relief with the more alert, cerebral feeling thanks to its sativa side.  '

# 2 Natural Language Processing

In [27]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

In [28]:
nlp = spacy.load("en_core_web_lg")

In [29]:
def lemma_producer(text):
    """
    Tokenizes string and returning a list of lemmas.
    """
    lemmas = []
    processed_text = nlp(text)
    
    for token in processed_text:
        if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_!= 'PRON'):
            lemmas.append(token.lemma_)
    return ' '.join(lemmas)       

In [30]:
merged_strings['lemmas'] = merged_strings['all_text'].apply(lemma_producer)

In [31]:
merged_strings['lemmas'][0]

'100-og hybrid Creative energetic tingly euphoric Relaxed Earthy Sweet Citrus $ 100 OG 50/50 hybrid strain pack strong punch supposedly refer strength high price start show Hollywood plant $ 100 OG tend produce large dark green bud stem user report strong body effect indica pain relief alert cerebral feeling thank sativa  '

In [32]:
tfidf = TfidfVectorizer(stop_words="english", min_df=0.025, max_df=0.98, ngram_range=(1,3))

In [34]:
dtm = tfidf.fit_transform(merged_strings['lemmas'])

In [35]:
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

In [36]:
dtm.head()

Unnamed: 0,10,10 week,1st,20,50,60,70,80,active,activity,...,user,variety,way,week,white,white widow,widow,win,woody,yield
0,0.0,0.0,0.0,0.0,0.400535,0.0,0.0,0.0,0.0,0.0,...,0.20087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.115176,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.414382,0.530228,0.527328,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.144146,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.152048,0.0
4,0.0,0.0,0.0,0.0,0.0,0.172131,0.0,0.0,0.0,0.0,...,0.182326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 3 Modeling

In [37]:
from sklearn.neighbors import NearestNeighbors

In [38]:
model = NearestNeighbors(n_neighbors=3, algorithm='kd_tree')
model.fit(dtm)

NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                 radius=1.0)

In [63]:
# test_string = ['I have fibromyalgia and I want pain relief']
test_string = ['I have trouble focusing.']

In [64]:
test_string = tfidf.transform(test_string)

In [65]:
test_string = test_string.todense()

In [66]:
predictions = model.kneighbors(test_string)

In [67]:
predictions # (distance, index)

(array([[1., 1., 1.]]), array([[ 699,  713, 2145]]))

In [68]:
predictions[1][0][0] # best match

699

In [69]:
best_match = predictions[1][0][0]

In [70]:
merged_strings.iloc[best_match]

Strain                                                Dream-Star
Type                                                      hybrid
Effects             Euphoric,Creative,Uplifted,Talkative,Relaxed
Flavor                                        Earthy,Woody,Sweet
Description    A cross between Blue Dream and Stardawg, Dream...
name                                                            
ailment                                                         
all_text       Dream-Star hybrid Euphoric,Creative,Uplifted,T...
lemmas         Dream Star hybrid Euphoric Creative uplifted t...
Name: 699, dtype: object

In [71]:
recommended_strain = merged_strings.iloc[best_match]

In [72]:
recommended_strain.drop(['name', 'ailment', 'all_text', 'lemmas']).to_dict()

{'Strain': 'Dream-Star',
 'Type': 'hybrid',
 'Effects': 'Euphoric,Creative,Uplifted,Talkative,Relaxed',
 'Flavor': 'Earthy,Woody,Sweet',
 'Description': 'A cross between Blue Dream and Stardawg, Dream Star is a sativa-dominant hybrid bred by Oaksterdam Seed Co. Its aroma is sweet and fruity, with sour accents that hint at Dream Star’s Chemdawg lineage. This strain’s psychoactive onset begins in the head and evens out over time into a mellow full-body calm. Dream Star is used by patients to treat a variety of symptoms and conditions including headaches, pain, depression, multiple sclerosis, and Parkinson’s. This hybrid might come as a challenge to novice growers, but cultivators of this strain should wait nine weeks for indoor plants to flower.'}

In [73]:
returned_values = recommended_strain.drop(['name', 'ailment', 'all_text', 'lemmas']).to_dict()

In [74]:
returned_values

{'Strain': 'Dream-Star',
 'Type': 'hybrid',
 'Effects': 'Euphoric,Creative,Uplifted,Talkative,Relaxed',
 'Flavor': 'Earthy,Woody,Sweet',
 'Description': 'A cross between Blue Dream and Stardawg, Dream Star is a sativa-dominant hybrid bred by Oaksterdam Seed Co. Its aroma is sweet and fruity, with sour accents that hint at Dream Star’s Chemdawg lineage. This strain’s psychoactive onset begins in the head and evens out over time into a mellow full-body calm. Dream Star is used by patients to treat a variety of symptoms and conditions including headaches, pain, depression, multiple sclerosis, and Parkinson’s. This hybrid might come as a challenge to novice growers, but cultivators of this strain should wait nine weeks for indoor plants to flower.'}