# Initial Data Exploration

In [4]:
import pandas as pd
import numpy as np

### The Kaggle Data

In [5]:
leafly_csv_url = 'https://raw.githubusercontent.com/med-cabinet-5/data-science/master/data/cannabis.csv'

### Supplementary data referencing ailments from the app 'Kushy'

In [6]:
kushy_strains_csv_url = 'https://raw.githubusercontent.com/kushyapp/cannabis-dataset/master/Dataset/Strains/strains-kushy_api.2017-11-14.csv'

In [7]:
leafly = pd.read_csv(leafly_csv_url)
kushy = pd.read_csv(kushy_strains_csv_url)

In [8]:
leafly.head()

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


In [9]:
kushy.head()

Unnamed: 0,id,status,sort,name,slug,image,description,type,crosses,breeder,...,cbn,cbg,cbgm,cbgv,cbc,cbcv,cbv,cbe,cbt,cbl
0,1,1,0,100 OG,,,<p>This strain is named after it's high price ...,Hybrid,,Old School Breeder's Association,...,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,1,0,707 Headband,,,,Hybrid,378272.0,Unknown Breeder,...,0,,,,,,,,,
2,3,1,0,A-10,,,,Indica,0.0,,...,0,,,,,,,,,
3,4,1,0,Acapulco Gold,,,,Sativa,0.0,,...,0,,,,,,,,,
4,5,1,0,Afghani Bullrider,,,,Hybrid,0.0,Unknown Breeder,...,0,,,,,,,,,


In [10]:
kushy.isnull().sum()

id                0
status            0
sort              0
name              1
slug           9524
image          9524
description    9523
type             24
crosses        8960
breeder         545
effects        8509
ailment        8553
flavor         8553
location       8903
terpenes       9523
thc               0
thca            889
thcv            889
cbd               0
cbda            889
cbdv            889
cbn               0
cbg             889
cbgm            889
cbgv            889
cbc             889
cbcv            889
cbv             889
cbe             889
cbt             889
cbl             889
dtype: int64

### Most of the information is in chemical analysis, which we don't care about, but we do gain ~1000 ailment observations

In [11]:
leafly.shape

(2351, 6)

In [12]:
leafly.isnull().sum()

Strain          0
Type            0
Rating          0
Effects         0
Flavor         46
Description    33
dtype: int64

### In order to maintain our ability to provide standardized output we'll drop a small portion of the data

In [13]:
leafly = leafly.dropna(subset=['Flavor', 'Description'])

### The only merge-able data of interest

In [14]:
kushy_clean = kushy[['name', 'ailment']]

In [15]:
kushy_clean.shape

(9524, 2)

In [16]:
kushy_clean.isnull().sum()

name          1
ailment    8553
dtype: int64

In [17]:
kushy_clean = kushy_clean.dropna()

In [18]:
kushy_clean.shape

(971, 2)

In [19]:
merged = pd.merge(leafly, kushy_clean, left_on='Strain', right_on='name', how='left' )

In [20]:
merged.shape

(2280, 8)

In [21]:
merged.loc[19]['Description']

'A-10 has an earthy, hashy taste that provides a very heavy body stone. \xa0Frequently used to treat insomnia and chronic pain.'

In [22]:
merged.loc[19]['ailment']

'Stress, Insomnia, Pain, Muscle Spasms, Depression'

### Dropping the rating and retaining only string information

In [23]:
merged_strings = merged.drop(['Rating'], axis=1)

In [24]:
merged_strings = merged_strings.fillna('')

In [25]:
merged_strings.isnull().sum()

Strain         0
Type           0
Effects        0
Flavor         0
Description    0
name           0
ailment        0
dtype: int64

In [26]:
merged_strings.dtypes

Strain         object
Type           object
Effects        object
Flavor         object
Description    object
name           object
ailment        object
dtype: object

### Concatenating all text such that it is easy to process for our NLP model

In [27]:
merged_strings['all_text'] = merged_strings.apply(lambda x: ' '.join(x), axis=1)

In [28]:
merged_strings['all_text'][0]

'100-Og hybrid Creative,Energetic,Tingly,Euphoric,Relaxed Earthy,Sweet,Citrus $100 OG is a 50/50 hybrid strain that packs a strong punch. The name supposedly refers to both its strength and high price when it first started showing up in Hollywood. As a plant, $100 OG tends to produce large dark green buds with few stems. Users report a strong body effect of an indica for pain relief with the more alert, cerebral feeling thanks to its sativa side.  '

# NLP Time

In [29]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

In [30]:
nlp = spacy.load("en_core_web_lg")

In [40]:
def lemma_producer(text):
    """
    tokenizes string, returning a list of lemmas
    """
    lemmas = []
    processed_text = nlp(text)
    
    for token in processed_text:
        if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_!= 'PRON'):
            lemmas.append(token.lemma_)
    return ' '.join(lemmas)       

In [41]:
merged_strings['lemmas'] = merged_strings['all_text'].apply(lemma_producer)

In [42]:
merged_strings['lemmas'][0]

'100-Og hybrid Creative energetic Tingly Euphoric Relaxed Earthy Sweet Citrus $ 100 og 50/50 hybrid strain pack strong punch supposedly refer strength high price start show Hollywood plant $ 100 og tend produce large dark green bud stem user report strong body effect indica pain relief alert cerebral feeling thank sativa  '

In [43]:
tfidf = TfidfVectorizer(stop_words="english", min_df=0.025, max_df=0.98, ngram_range=(1,3))

In [44]:
dtm = tfidf.fit_transform(merged_strings['lemmas'])

In [45]:
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

In [None]:
dtm.head()

# Modeling

In [46]:
from sklearn.neighbors import NearestNeighbors

In [47]:
model = NearestNeighbors(n_neighbors=3, algorithm='kd_tree')
model.fit(dtm)

NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=None, n_neighbors=3, p=2, radius=1.0)

In [48]:
test_string = ['I have fibromyalgia and I want pain relief']

In [49]:
test_string = tfidf.transform(test_string)

In [50]:
test_string = test_string.todense()

In [51]:
predictions = model.kneighbors(test_string)

In [52]:
predictions # (distance, index)

(array([[1.07434981, 1.1410936 , 1.14179703]]),
 array([[1622, 1451,  662]], dtype=int64))

In [53]:
predictions[1][0][0] # best match

1622

In [54]:
best_match = predictions[1][0][0]

In [55]:
merged_strings.iloc[best_match]

Strain                                              Purple-Arrow
Type                                                      hybrid
Effects                 Happy,Uplifted,Focused,Energetic,Relaxed
Flavor                                       Sweet,Citrus,Earthy
Description    When it comes to knocking out pain, no medical...
name                                                            
ailment                                                         
all_text       Purple-Arrow hybrid Happy,Uplifted,Focused,Ene...
lemmas         Purple Arrow hybrid Happy Uplifted Focused ene...
Name: 1622, dtype: object

In [57]:
recommended_strain = merged_strings.iloc[best_match]

In [58]:
recommended_strain.drop(['name', 'ailment', 'all_text', 'lemmas']).to_dict()

{'Strain': 'Purple-Arrow',
 'Type': 'hybrid',
 'Effects': 'Happy,Uplifted,Focused,Energetic,Relaxed',
 'Flavor': 'Sweet,Citrus,Earthy',
 'Description': 'When it comes to knocking out pain, no medical strain hits the target quite like Purple Arrow. This hybrid provides effective relief for severe pain while simultaneously inducing a sense of uplift and euphoria. Extremely well-rounded, Purple Arrow is potent without causing that over-medicated feeling of some pain relief strains. The uniqueness of this strain is complemented by its earthy aroma. Fragrant, herbal, and a little sweet, this strain tastes almost as good as it feels. When you need immediate relief and would like to stay off the couch, Purple Arrow is a fantastic choice.',
 'lemmas': 'Purple Arrow hybrid Happy Uplifted Focused energetic Relaxed Sweet Citrus Earthy come knock pain medical strain hit target like Purple Arrow hybrid provide effective relief severe pain simultaneously induce sense uplift euphoria extremely rounde

In [59]:
returned_values = recommended_strain.drop(['name', 'ailment', 'all_text', 'lemmas']).to_dict()

In [60]:
returned_values

{'Strain': 'Purple-Arrow',
 'Type': 'hybrid',
 'Effects': 'Happy,Uplifted,Focused,Energetic,Relaxed',
 'Flavor': 'Sweet,Citrus,Earthy',
 'Description': 'When it comes to knocking out pain, no medical strain hits the target quite like Purple Arrow. This hybrid provides effective relief for severe pain while simultaneously inducing a sense of uplift and euphoria. Extremely well-rounded, Purple Arrow is potent without causing that over-medicated feeling of some pain relief strains. The uniqueness of this strain is complemented by its earthy aroma. Fragrant, herbal, and a little sweet, this strain tastes almost as good as it feels. When you need immediate relief and would like to stay off the couch, Purple Arrow is a fantastic choice.'}