In [1]:
# !unzip cannabis-strains.zip

## Data

In [26]:
import pickle

import pandas as pd

In [27]:
# Read the cleaned dataset CSV into a DataFrame (path is relative to the notebook's directory)
df = pd.read_csv('../data/clean_can.csv')

In [28]:
# Preview the first rows to check which columns were read
df.head()

Unnamed: 0.1,Unnamed: 0,index,Strain,Type,Rating,Effects,Flavor,Description
0,0,0,100-og,hybrid,4.0,"creative,energetic,tingly,euphoric,relaxed","earthy,sweet,citrus",$100 og is a 50/50 hybrid strain that packs a ...
1,1,1,98-white-widow,hybrid,4.7,"relaxed,aroused,creative,happy,energetic","flowery,violet,diesel",the ‘98 aloha white widow is an especially pot...
2,2,2,1024,sativa,4.4,"uplifted,happy,relaxed,energetic,creative","spicy/herbal,sage,woody",1024 is a sativa-dominant hybrid bred in spain...
3,3,3,13-dawgs,hybrid,4.2,"tingly,creative,hungry,relaxed,uplifted","apricot,citrus,grapefruit",13 dawgs is a hybrid of g13 and chemdawg genet...
4,4,4,24k-gold,hybrid,4.6,"happy,relaxed,euphoric,uplifted,talkative","citrus,earthy,orange","also known as kosher tangie, 24k gold is a 60%..."


## Tokenize

In [29]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA

from spacy.tokenizer import Tokenizer

In [30]:
import spacy
# Load spaCy's "en_core_web_md" pipeline; the cells shown here use its vocab
# (for the tokenizer) and its default stop-word set.
nlp = spacy.load("en_core_web_md")

In [31]:
# Instantiate a tokenizer from the pipeline's vocab only.
# NOTE(review): no prefix/suffix/infix rules are passed, so this presumably splits
# on whitespace only (punctuation such as commas stays attached) — confirm against
# the spaCy Tokenizer documentation.
tokenizer = Tokenizer(nlp.vocab)

In [32]:
# Stop words: spaCy's defaults extended with whitespace tokens and a few extra filler words
STOP_WORDS = nlp.Defaults.stop_words | {
    ' ', '  ', '\n', '\n\n',
    'the', 'like', 'for', 'this',
    'i', 'I', "i've", 'we', 'my',
}

In [33]:
# Define tokenize function
def tokenize(series):
    """Tokenize every text in ``series`` and drop stop words.

    Parameters
    ----------
    series : iterable of str
        Texts to tokenize (for example a DataFrame column). They are streamed
        through the module-level ``tokenizer`` in batches of 500.

    Returns
    -------
    list of list of str
        One list per input text, in input order, holding the lower-cased
        tokens that are not in the module-level ``STOP_WORDS``.
    """
    # Collect into a local list: the function no longer relies on (or appends to)
    # a pre-existing global `tokens`, so repeated calls cannot accumulate results.
    all_tokens = []

    for doc in tokenizer.pipe(series, batch_size=500):
        doc_tokens = []

        for token in doc:
            lowered = token.text.lower()
            # Check the raw text (as before) AND the lower-cased form, so a
            # capitalised stop word such as "The" is filtered instead of being
            # emitted as "the".
            if token.text not in STOP_WORDS and lowered not in STOP_WORDS:
                doc_tokens.append(lowered)

        all_tokens.append(doc_tokens)

    return all_tokens

In [34]:
# Apply tokenize function to the Flavor column
tokens = []  # start from an empty list so re-running this cell never carries over old results
tokens = tokenize(df['Flavor'])
# Keep the per-row token lists on the frame next to the original columns
df['Tokens'] = tokens

In [35]:
# Loop to clean up: drop any stop words still present in the token lists.
# Each list is rewritten in place via slice assignment, so other references to the
# same list objects see the cleaned content. Calling .remove() while iterating the
# same list (the previous approach) skips the element after every removal, which
# let consecutive stop words survive.
for token_list in tokens:
    token_list[:] = [token for token in token_list if token not in STOP_WORDS]

## Vector Representation

In [36]:
# Rebuild one space-joined string per row from that row's token list
clean_descriptions = [' '.join(row_tokens) for row_tokens in df['Tokens']]

In [37]:
# Store the joined strings as a new column on the frame
df['clean_descriptions'] = clean_descriptions

In [44]:
# Build a TF-IDF vectorizer with English stop words removed and learn its
# vocabulary from the cleaned text; fit() hands back the fitted vectorizer itself.
tfidf = TfidfVectorizer(stop_words='english').fit(df['clean_descriptions'])

In [47]:
# Pickle tfidf Vectorizer
filename = '../models/flavor_vectorizer_01.pkl'
# A context manager guarantees the file is flushed and closed, even if dump() raises
with open(filename, 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

In [48]:
# Transform the cleaned text into a TF-IDF document-term matrix.
# The vocabulary is not created here: transform() reuses what the vectorizer learned in fit().
sparse = tfidf.transform(df['clean_descriptions'])

In [49]:
# Send the matrix to a DataFrame with one column per vocabulary term.
# get_feature_names() was replaced by get_feature_names_out() in newer scikit-learn
# releases, so use whichever accessor this installation provides.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
# toarray() yields a plain ndarray rather than the np.matrix returned by todense()
tfidf_dtm = pd.DataFrame(sparse.toarray(), columns=feature_names)

## KNN Model

In [50]:
# Instantiate nearest neighbors model: 5 neighbours per query by default, ball-tree search
flavor_nn = NearestNeighbors(n_neighbors=5, algorithm='ball_tree')
# Fit on the TF-IDF frame; as the last expression of the cell this also displays the fitted estimator
flavor_nn.fit(tfidf_dtm)

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

## Make Predictions (API)

In [1]:
import pickle
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Import model and Vectorizer
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
# Context managers make sure both file handles are closed once loading finishes.
with open("../models/flavor_knn_01.pkl", 'rb') as knn_file:
    flavor_knn = pickle.load(knn_file)
with open("../models/flavor_vectorizer_01.pkl", "rb") as vectorizer_file:
    tfidf = pickle.load(vectorizer_file)

In [3]:
# Get data: the frame whose rows are looked up by position when recommendations are returned
df = pd.read_csv('../data/clean_can.csv')

In [4]:
# Example request: a comma-separated list of flavors used to exercise the recommender
fake_input = """flowery, violet, diesel"""

In [8]:
def get_5_recommendations(request, n_neighbors=5):
    """Return the rows of ``df`` that are nearest to ``request`` in TF-IDF space.

    Parameters
    ----------
    request : str
        Flavor text to match, e.g. ``"flowery, violet, diesel"``. It is wrapped
        in a ``pd.Series`` before vectorizing; only the neighbours of the first
        entry are returned.
    n_neighbors : int, default 5
        How many neighbours to request from the fitted model.

    Returns
    -------
    pandas.DataFrame
        The rows of the module-level ``df`` selected by position, in the order
        the indices come back from ``flavor_knn.kneighbors``.
    """
    # Vectorize the request with the already-fitted TF-IDF vectorizer
    request_sparse = tfidf.transform(pd.Series(request))

    # Densify into a one-row frame for the neighbour search
    request_tfidf = pd.DataFrame(request_sparse.toarray())

    # kneighbors returns (distances, indices); keep the index row of the single query
    _, neighbor_indices = flavor_knn.kneighbors(request_tfidf, n_neighbors=n_neighbors)
    top_indices = neighbor_indices[0].tolist()

    # The indices are row positions, hence iloc rather than loc
    return df.iloc[top_indices]

In [9]:
# Test request function with the example flavor string
top5 = get_5_recommendations(fake_input)

In [10]:
# Display the recommended rows
top5

Unnamed: 0.1,Unnamed: 0,index,Strain,Type,Rating,Effects,Flavor,Description
1,1,1,98-white-widow,hybrid,4.7,"relaxed,aroused,creative,happy,energetic","flowery,violet,diesel",the ‘98 aloha white widow is an especially pot...
1738,1742,1796,rolls-choice,hybrid,4.8,"relaxed,happy,uplifted,energetic,euphoric","diesel,earthy,violet",rolls choice by royal choice farms is a sativa...
1963,1967,2028,suicide-girl,hybrid,4.8,"talkative,euphoric,uplifted,relaxed,tingly","violet,sweet,citrus",suicide girl by calyx gardens is the cleverly ...
1532,1532,1575,petrolia-headstash,indica,4.6,"relaxed,euphoric,uplifted,happy,aroused","pungent,violet,woody",the coveted petrolia headstash by reeferman se...
1253,1253,1283,leonidas,sativa,4.0,"focused,aroused,uplifted,creative,energetic","grape,lavender,violet",leonidas is a super silver haze variation prod...


## Pickle Model

In [56]:
import pickle

In [57]:
# Export Pickle File
filename = '../models/flavor_knn_01.pkl'
# A context manager guarantees the file is flushed and closed, even if dump() raises
with open(filename, 'wb') as model_file:
    pickle.dump(flavor_nn, model_file)