# Initial Data Exploration

In [None]:
import pandas as pd
import numpy as np

### The Kaggle Data

In [17]:
leafly_csv_url = 'https://raw.githubusercontent.com/med-cabinet-5/data-science/master/data/cannabis.csv'

### Supplementary data referencing ailments from the app 'Kushy'

In [18]:
kushy_strains_csv_url = 'https://raw.githubusercontent.com/kushyapp/cannabis-dataset/master/Dataset/Strains/strains-kushy_api.2017-11-14.csv'

In [19]:
leafly = pd.read_csv(leafly_csv_url)
kushy = pd.read_csv(kushy_strains_csv_url)

In [20]:
leafly.head()

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


In [21]:
kushy.head()

Unnamed: 0,id,status,sort,name,slug,image,description,type,crosses,breeder,...,cbn,cbg,cbgm,cbgv,cbc,cbcv,cbv,cbe,cbt,cbl
0,1,1,0,100 OG,,,<p>This strain is named after it's high price ...,Hybrid,,Old School Breeder's Association,...,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,1,0,707 Headband,,,,Hybrid,378272.0,Unknown Breeder,...,0,,,,,,,,,
2,3,1,0,A-10,,,,Indica,0.0,,...,0,,,,,,,,,
3,4,1,0,Acapulco Gold,,,,Sativa,0.0,,...,0,,,,,,,,,
4,5,1,0,Afghani Bullrider,,,,Hybrid,0.0,Unknown Breeder,...,0,,,,,,,,,


In [86]:
kushy.isnull().sum()

id                0
status            0
sort              0
name              1
slug           9524
image          9524
description    9523
type             24
crosses        8960
breeder         545
effects        8509
ailment        8553
flavor         8553
location       8903
terpenes       9523
thc               0
thca            889
thcv            889
cbd               0
cbda            889
cbdv            889
cbn               0
cbg             889
cbgm            889
cbgv            889
cbc             889
cbcv            889
cbv             889
cbe             889
cbt             889
cbl             889
dtype: int64

### Most of the information is in chemical analysis, which we don't care about, but we do gain ~1000 ailment observations

In [24]:
leafly.shape

(2351, 6)

In [69]:
leafly.isnull().sum()

Strain          0
Type            0
Rating          0
Effects         0
Flavor         46
Description    33
dtype: int64

### In order to maintain our ability to provide standardized output we'll drop a small portion of the data

In [None]:
leafly = leafly.dropna(subset=['Flavor', 'Description'])

### The only merge-able data of interest

In [27]:
kushy_clean = kushy[['name', 'ailment']]

In [37]:
kushy_clean.shape

(9524, 2)

In [36]:
kushy_clean.isnull().sum()

name          1
ailment    8553
dtype: int64

In [44]:
kushy_clean = kushy_clean.dropna()

In [46]:
kushy_clean.shape

(971, 2)

In [70]:
merged = pd.merge(leafly, kushy_clean, left_on='Strain', right_on='name', how='left' )

In [72]:
merged.shape

(2354, 8)

In [84]:
merged.loc[19]['Description']

'A-10 has an earthy, hashy taste that provides a very heavy body stone. \xa0Frequently used to treat insomnia and chronic pain.'

In [85]:
merged.loc[19]['ailment']

'Stress, Insomnia, Pain, Muscle Spasms, Depression'

### Dropping the rating and retaining only string information

In [120]:
merged_strings = merged.drop(['Rating'], axis=1)

In [121]:
merged_strings = merged_strings.fillna('')

In [101]:
merged_strings.isnull().sum()

Strain         0
Type           0
Effects        0
Flavor         0
Description    0
name           0
ailment        0
dtype: int64

In [98]:
merged_strings.dtypes

Strain         object
Type           object
Effects        object
Flavor         object
Description    object
name           object
ailment        object
dtype: object

### Concatenating all text such that it is easy to process for our NLP model

In [122]:
merged_strings['all_text'] = merged_strings.apply(lambda x: ' '.join(x), axis=1)

In [111]:
merged_strings['all_text'][0]

'100-Og hybrid Creative,Energetic,Tingly,Euphoric,Relaxed Earthy,Sweet,Citrus $100 OG is a 50/50 hybrid strain that packs a strong punch. The name supposedly refers to both its strength and high price when it first started showing up in Hollywood. As a plant, $100 OG tends to produce large dark green buds with few stems. Users report a strong body effect of an indica for pain relief with the more alert, cerebral feeling thanks to its sativa side.   100-OghybridCreative,Energetic,Tingly,Euphoric,RelaxedEarthy,Sweet,Citrus$100 OG is a 50/50 hybrid strain that packs a strong punch. The name supposedly refers to both its strength and high price when it first started showing up in Hollywood. As a plant, $100 OG tends to produce large dark green buds with few stems. Users report a strong body effect of an indica for pain relief with the more alert, cerebral feeling thanks to its sativa side.100-Og hybrid Creative,Energetic,Tingly,Euphoric,Relaxed Earthy,Sweet,Citrus $100 OG is a 50/50 hybrid

# NLP Time

In [112]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

In [113]:
nlp = spacy.load("en_core_web_lg")

In [114]:
def lemma_producer(text):
    """
    tokenizes string, returning a list of lemmas
    """
    lemmas = []
    processed_text = nlp(text)
    
    for token in processed_text:
        if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_!= 'PRON'):
            lemmas.append(token.lemma_)
    return lemmas        

In [126]:
tfidf = TfidfVectorizer(tokenizer=lemma_producer, min_df=0.025, max_df=0.98, ngram_range=(1,3))

In [127]:
dtm = tfidf.fit_transform(merged_strings['all_text'])

In [128]:
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

In [129]:
dtm.head()

Unnamed: 0,Unnamed: 1,Unnamed: 2,1,10,10 week,1st,20,7,70,8,8 9,8 9 week,8 week,9,9 week,active,activity,afghani,alien,anxiety,appetite,aroma,arouse,average,award,balance,balanced,begin,berry,berry sweet,best,big,black,blend,blue,blue dream,blueberry,body,body buzz,body effect,breed,breeder,bright,bring,bubba,bubba kush,bud,buzz,california,calm,candy,cannabis,cannabis cup,cannabis strain,carry,cbd,cerebral,cerebral effect,cheese,chemdawg,chemical,cherry,choice,chronic,chronic pain,citrus,citrus earthy,citrus sweet,classic,clear,coast,coat,coffee,colorado,combination,combine,come,complex,consumer,consumption,content,cookie,couch,cover,create,create cross,creative,creative earthy,creative energetic,creative euphoric,creative focus,creativity,cross,crystal,cup,cut,cycle,dark,day,daytime,...,sedative,seed,seek,sensation,sense,sensi,short,similar,skunk,skunk 1,skunky,sleep,sleepy,sleepy euphoric,sleepy happy,sleepy happy euphoric,sleepy uplifted,smell,smoke,smooth,social,sour,sour diesel,space,spasm,spice,spicy,spicy herbal,star,state,stay,sticky,stimulate,stimulation,strain,strain breed,strain cross,strain ’s,strawberry,stress,stress depression,strong,structure,subtle,super,sweet,sweet berry,sweet citrus,sweet earthy,sweet flowery,sweet pungent,symptom,take,talkative,tall,taste,tend,terpene,terpene profile,thc,thc content,thick,time,tingly,trainwreck,treat,trichome,tropical,true,typically,undertone,unique,uplift,uplift earthy,uplift happy,uplifted,uplifted creative,uplifted energetic,uplifted euphoric,uplifted focus,uplifted happy,uplifted relaxed,uplifting,use,user,variety,way,week,week flower,white,white widow,widow,win,woody,x,yield,Unnamed: 198,Unnamed: 199,’,’s
0,0.083471,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.117706,0.0,0.203685,0.0,0.0,0.0,0.0,0.0,0.0,0.118573,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.156578,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.117628,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.109463,0.0,0.223952,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.219026,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.065835,0.0,0.0,0.0,0.0,0.0,0.0,0.316783,0.0,0.0,0.0,0.080006,0.0,0.208183,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.223381,0.0,0.0,0.0,0.0,0.0,0.0,0.153892,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.235423,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.125172,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.09324,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05914,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.059576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111394,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.054999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111394,0.0,0.110047,0.0,0.0,...,0.0,0.062394,0.0,0.0,0.0,0.0,0.0,0.0,0.077244,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.087915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.091873,0.0,0.0,0.0,0.0,0.0,0.072385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.360276,0.460997,0.573094,0.0,0.0,0.0,0.0,0.063619,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.11623,0.17555,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.094161,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.106822,0.0,0.0,0.0,0.0,0.0,0.208375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2691,0.15151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.114672,0.0,0.0,0.0,0.18796,0.0,0.068822,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.127958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.123927,0.0,0.198419,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.137418,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142087,0.0,0.0,0.0,0.0,0.0,0.0
3,0.078069,0.108576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.185951,0.0,0.0,0.0,0.0,0.0,0.0,0.157612,0.0,0.0,0.0,0.0,0.214535,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.187654,0.0,0.0,0.0,0.110089,0.229681,0.0,0.126375,0.0,0.0,0.186966,0.0,0.0,0.0,0.149735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.217596,0.0,0.0,0.0,0.0,0.0,0.220031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.137454,0.0,0.204759,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.187308,0.0,0.061575,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.074829,0.0,0.0,0.172277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.143933,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.086794,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.154489,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.078335,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.183712,0.183842,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.12989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.114765,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.189801,0.171038,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.189801,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.051411,0.0,0.0,0.0,0.0,0.0,0.0,0.123688,0.0,0.0,0.0,0.062477,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.118292,0.189165,0.0,0.0,0.0,0.0,0.116162,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166407,0.0,0.0,0.0,0.072468,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.183842,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.098879,0.0,0.0,0.0


# Modeling

In [131]:
from sklearn.neighbors import NearestNeighbors

In [132]:
model = NearestNeighbors(n_neighbors=3, algorithm='kd_tree')
model.fit(dtm)

NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=None, n_neighbors=3, p=2, radius=1.0)

In [166]:
test_string = ['I have no appetite and want to feel euphoria. Also I suffer from insomnia and like fruity flavors']

In [145]:
test_string = tfidf.transform(test_string)

In [146]:
test_string = test_string.todense()

In [147]:
predictions = model.kneighbors(test_string)

In [148]:
predictions # (distance, index)

(array([[1.20593005, 1.21232781, 1.21298629]]),
 array([[ 808, 2270, 2075]], dtype=int64))

In [150]:
predictions[1][0][0] # best match

808

In [151]:
best_match = predictions[1][0][0]

In [152]:
merged_strings.iloc[best_match]

Strain                                            Fruity-Pebbles
Type                                                      hybrid
Effects                   Happy,Relaxed,Uplifted,Euphoric,Giggly
Flavor                                      Sweet,Tropical,Berry
Description    Fruity Pebbles (AKA Fruity Pebbles OG) by Alie...
name                                                            
ailment                                                         
all_text       Fruity-Pebbles hybrid Happy,Relaxed,Uplifted,E...
Name: 824, dtype: object

In [153]:
recommended_strain = merged_strings.iloc[best_match]

In [163]:
recommended_strain.drop(['name', 'ailment', 'all_text']).to_dict()

{'Strain': 'Fruity-Pebbles',
 'Type': 'hybrid',
 'Effects': 'Happy,Relaxed,Uplifted,Euphoric,Giggly',
 'Flavor': 'Sweet,Tropical,Berry',
 'Description': 'Fruity Pebbles (AKA Fruity Pebbles OG) by Alien Genetics was a limited-time offering from the breeder. This sweet hybrid takes genetics from Green Ribbon, Granddaddy Purple, and Tahoe Alien\xa0to create a tropical, berry flavor reminiscent of the cereal. The euphoric effects will keep you happy when you’re stressed and help you catch some sleep when faced with insomnia. Sit back, relax, and pour yourself a bowl of Fruity Pebbles!'}

In [164]:
returned_values = recommended_strain.drop(['name', 'ailment', 'all_text']).to_dict()

In [167]:
test_string

['I have no appetite and want to feel euphoria. Also I suffer from insomnia and like fruity flavors']

In [168]:
returned_values

{'Strain': 'Fruity-Pebbles',
 'Type': 'hybrid',
 'Effects': 'Happy,Relaxed,Uplifted,Euphoric,Giggly',
 'Flavor': 'Sweet,Tropical,Berry',
 'Description': 'Fruity Pebbles (AKA Fruity Pebbles OG) by Alien Genetics was a limited-time offering from the breeder. This sweet hybrid takes genetics from Green Ribbon, Granddaddy Purple, and Tahoe Alien\xa0to create a tropical, berry flavor reminiscent of the cereal. The euphoric effects will keep you happy when you’re stressed and help you catch some sleep when faced with insomnia. Sit back, relax, and pour yourself a bowl of Fruity Pebbles!'}