In [1]:
# !unzip cannabis-strains.zip

Archive:  cannabis-strains.zip
  inflating: cannabis.csv            


## Data

In [137]:
import pandas as pd

In [138]:
df = pd.read_csv('data/cannabis.csv')

In [139]:
df.head()

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


In [140]:
import numpy as np

for desc in df['Description']:
    if desc == 'None':
        desc = np.nan

In [141]:
# Find nulls
df['Description'].isna().sum()

33

In [142]:
df.shape

(2351, 6)

In [143]:
df = df.dropna()

In [144]:
# Bring it down to description and strain
features = ['Description']
target = 'Strain'

X = df[features]
y = df[[target]]

In [145]:
# Convert all descriptions to strings
X['Description'] = X['Description'].apply(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


## Tokenize & Vectorize

In [146]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA

In [147]:
import spacy
nlp = spacy.load("en_core_web_md")

In [148]:
from spacy.tokenizer import Tokenizer

In [149]:
# Instantiate tokenizer
tokenizer = Tokenizer(nlp.vocab)

In [150]:
# Define Stop Words
STOP_WORDS = nlp.Defaults.stop_words.union([' ', '  ', 'the', 'like', 'i', 'for', 'I', "i've", 'we', '\n', '\n\n', 'my', 'this'])

In [151]:
# Initialize tokens list
tokens = []

In [152]:
# Define tokenize function
def tokenize(series):
    for doc in tokenizer.pipe(series, batch_size = 500):
        # Initialize doc tokens
        doc_tokens = []

        # Filter for stop words
        for token in doc:
            if token.text not in STOP_WORDS:
                doc_tokens.append(token.text.lower())

        # Append doc_tokens to tokens
        tokens.append(doc_tokens)
    return tokens

In [153]:
# Apply tokenize function
tokens = tokenize(X['Description'])
X['Tokens'] = tokens

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


## Vector Representation

In [154]:
# Clean up text
clean_descriptions = []

# Loop over tokens
for token_list in X['Tokens']:
    clean_description = ' '.join(token_list)
    clean_descriptions.append(clean_description)

In [155]:
# Send to X
X['clean_descriptions'] = clean_descriptions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [156]:
# Instantiate Vectorizer
tfidf = TfidfVectorizer(stop_words = 'english')

In [157]:
# Create vocab and get word counts
sparse = tfidf.fit_transform(X['clean_descriptions'])

In [158]:
# send the matrix to a DataFrame
tfidf_dtm = pd.DataFrame(sparse.todense(), columns = tfidf.get_feature_names())

## KNN Model

In [159]:
# Instantiate nearest neighbors model
nn = NearestNeighbors(n_neighbors=5, algorithm='ball_tree')
nn.fit(tfidf_dtm)

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

## Make Predictions (API)

In [160]:
# Create a fake weed review
fake_input = """this indica is great for stress relief and insomnia. 
It sends me to sleep every time! Very nice body high and nive flavor!"""

fake = pd.Series(fake_review)

In [161]:
# Transform
fake_sparse = tfidf.transform(fake)

# Densify
fake_tfidf = pd.DataFrame(fake_sparse.todense())

In [162]:
# The bottom array is the index of the recommendations
nn.kneighbors([fake_tfidf][0], n_neighbors=10)

(array([[1.        , 1.        , 1.        , 1.        , 1.26731976,
         1.28302907, 1.28674518, 1.2879383 , 1.28827035, 1.30055434]]),
 array([[1606, 1698, 1605, 1607, 1423, 1677, 1829, 1400, 1655,  641]]))

In [164]:
print(y['Strain'][1606])
print(X['Description'][1606])
print('')
print(y['Strain'][1698])
print(X['Description'][1698])
print('')
print(y['Strain'][1605])
print(X['Description'][1605])
print('')
print(y['Strain'][1607])
print(X['Description'][1607])
print('')
print(y['Strain'][1423])
print(X['Description'][1423])
print('')
print(y['Strain'][1677])
print(X['Description'][1677])
print('')
print(y['Strain'][1829])
print(X['Description'][1829])
print('')
print(y['Strain'][1400])
print(X['Description'][1400])
print('')
print(y['Strain'][1655])
print(X['Description'][1655])
print('')
print(y['Strain'][641])
print(X['Description'][641])
print('')

Pineapple-Thai
Pineapple Thai is a strain that sets itself apart from the rest with a high 5% CBD content. This flower’s attractiveness comes from its ability to administer powerful pain relief without sedation, embodying true hybrid effects. Although Pineapple Thai is an option for daytime use, its strength may lead to a nighttime preference. Patients who desire the powerful medicinal effects of cannabis without the psychoactive inundation will find Pineapple Thai to be a staple in their arsenal.

Purple-Kush
Purple Kush is a pure indica strain that emerged from the Oakland area of California as the result of a Hindu Kush and Purple Afghani cross. Its aroma is subtle and earthy with sweet overtones typical of Kush varieties. Blissful, long-lasting euphoria blankets the mind while physical relaxation rids the body of pain, sleeplessness, and stress. Purple Kush will grow wide rather than tall, and will be ready for harvest following an 8 week flowering time.

Pineapple-Super-Silver-Haz

## Pickle Model

In [165]:
import pickle

In [166]:
# Export Pickle File
filename = 'knn_01.pkl'
pickle.dump(nn, open(filename, 'wb'))