In [1]:
# !unzip cannabis-strains.zip

## Data

In [2]:
import pickle

import pandas as pd

In [9]:
# Load the cleaned strain data (the path is relative to the notebook's working directory)
df = pd.read_csv('../data/clean_can.csv')

In [10]:
df.head()

Unnamed: 0.1,Unnamed: 0,index,Strain,Type,Rating,Effects,Flavor,Description
0,0,0,100-og,hybrid,4.0,"creative,energetic,tingly,euphoric,relaxed","earthy,sweet,citrus",$100 og is a 50/50 hybrid strain that packs a ...
1,1,1,98-white-widow,hybrid,4.7,"relaxed,aroused,creative,happy,energetic","flowery,violet,diesel",the ‘98 aloha white widow is an especially pot...
2,2,2,1024,sativa,4.4,"uplifted,happy,relaxed,energetic,creative","spicy/herbal,sage,woody",1024 is a sativa-dominant hybrid bred in spain...
3,3,3,13-dawgs,hybrid,4.2,"tingly,creative,hungry,relaxed,uplifted","apricot,citrus,grapefruit",13 dawgs is a hybrid of g13 and chemdawg genet...
4,4,4,24k-gold,hybrid,4.6,"happy,relaxed,euphoric,uplifted,talkative","citrus,earthy,orange","also known as kosher tangie, 24k gold is a 60%..."


## Tokenize & Vectorize

In [11]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA

import spacy
from spacy.tokenizer import Tokenizer

In [12]:
# Load the spaCy model; its vocab and default stop words are used by the cells below
nlp = spacy.load("en_core_web_md")

In [13]:
# Instantiate tokenizer from the pipeline's vocab only.
# NOTE(review): no prefix/suffix/infix rules are passed here, so this presumably
# splits on whitespace alone and leaves punctuation attached to words — confirm.
tokenizer = Tokenizer(nlp.vocab)

In [14]:
# Stop words: spaCy's defaults extended with whitespace tokens and a few extra words
EXTRA_STOP_WORDS = [
    ' ', '  ', 'the', 'like', 'i', 'for', 'I', "i've",
    'we', '\n', '\n\n', 'my', 'this', 'as',
]
STOP_WORDS = nlp.Defaults.stop_words.union(EXTRA_STOP_WORDS)

In [15]:
# Define tokenize function
def tokenize(series):
    """Tokenize each document and drop stop words.

    Parameters
    ----------
    series : iterable of str
        Texts to tokenize. They are streamed through the module-level
        ``tokenizer`` in batches of 500.

    Returns
    -------
    list of list of str
        One list of lower-cased tokens per input document, in input order,
        with every token found in ``STOP_WORDS`` removed.
    """
    # Build the result locally: the function does not depend on (or append
    # to) a global ``tokens`` list, so it works on a fresh kernel and
    # repeated calls cannot accumulate output from earlier runs.
    all_tokens = []
    for doc in tokenizer.pipe(series, batch_size=500):
        doc_tokens = []
        for token in doc:
            lowered = token.text.lower()
            # Test both the raw and the lower-cased form so capitalised
            # stop words such as "The" are filtered as well.
            if token.text in STOP_WORDS or lowered in STOP_WORDS:
                continue
            doc_tokens.append(lowered)
        all_tokens.append(doc_tokens)
    return all_tokens

In [16]:
# Apply tokenize function.
# Start from an empty list so re-running this cell never carries over
# tokens from an earlier run.
tokens = []
tokens = tokenize(df['Description'])
# Keep the per-document token lists alongside their descriptions
df['Tokens'] = tokens

In [17]:
# Remove any remaining stop words from every token list.
# Each list is rebuilt and written back with slice assignment instead of
# calling ``remove`` while iterating: removing during iteration shifts the
# remaining items left, so the element after each removed token is skipped
# (and ``remove`` deletes the first match rather than the current item).
# Slice assignment mutates the existing list objects, so anything else that
# holds references to them sees the cleaned tokens as well.
for token_list in tokens:
    token_list[:] = [token for token in token_list if token not in STOP_WORDS]

## Vector Representation

In [18]:
# Rebuild each description as a single space-separated string of its tokens
clean_descriptions = [' '.join(doc_tokens) for doc_tokens in df['Tokens']]

In [19]:
# Store the cleaned descriptions as a new DataFrame column
df['clean_descriptions'] = clean_descriptions

In [23]:
# Build the TF-IDF vectorizer and fit it on the cleaned descriptions in one step
tfidf = TfidfVectorizer(stop_words='english').fit(df['clean_descriptions'])

In [25]:
# Pickle the fitted vectorizer.
# The context manager closes the file handle as soon as the dump finishes,
# instead of leaving it open until the object is garbage collected.
filename = 'vectorizer_02.pkl'
with open(filename, 'wb') as pickle_file:
    pickle.dump(tfidf, pickle_file)

In [26]:
# Transform the cleaned descriptions with the already-fitted vectorizer
# (no vocabulary is learned here; the result is converted to dense form below)
sparse = tfidf.transform(df['clean_descriptions'])

In [27]:
# Send the matrix to a DataFrame with one column per vocabulary term.
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(), so use whichever accessor this version provides.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
tfidf_dtm = pd.DataFrame(sparse.toarray(), columns=feature_names)

## KNN Model

In [28]:
# Instantiate nearest neighbors model (5 neighbours, ball-tree index) and fit it
# on the dense TF-IDF frame. No metric is passed, so the estimator's default
# applies (the recorded output for this cell shows minkowski with p=2).
nn = NearestNeighbors(n_neighbors=5, algorithm='ball_tree')
nn.fit(tfidf_dtm)

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

## Make Predictions (API)

In [1]:
import pickle
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
# Get data: the same CSV that was used to fit the vectorizer and model above,
# so row positions returned by the model can be looked up in this frame
df = pd.read_csv('../data/clean_can.csv')

In [6]:
# Import model and vectorizer.
# NOTE: unpickling can execute arbitrary code from the file, so only load
# pickle files that were produced by this project.
# The context managers close each file handle right after loading.
with open("knn_02.pkl", 'rb') as model_file:
    nn = pickle.load(model_file)
with open("vectorizer_02.pkl", "rb") as vectorizer_file:
    tfidf = pickle.load(vectorizer_file)

In [7]:
# Create a fake weed review
fake_input = """nice cherry is an indica-dominant strain that captures the flavorful qualities of its cherry parent and the relaxing attributes of mr. nice. with an aroma of sweet skunk, pine, and berry, nice cherry delivers a rush of cerebral energy that lifts the mood while relaxing the body. 
it’ll also bring an edge back to your appetite while providing focus to keep you productive."""

In [8]:
def get_5_recommendations(request):
    """Return the five rows of ``df`` nearest to a free-text request.

    The request is vectorized with the module-level ``tfidf`` vectorizer,
    the module-level ``nn`` model is queried for 5 neighbours, and the
    matching rows of the module-level ``df`` are selected by position.

    Parameters
    ----------
    request
        Text to match, e.g. a single description string.

    Returns
    -------
    The rows of ``df`` at the positions returned by ``nn.kneighbors``.
    """
    # Vectorize the request (wrapped in a Series before transforming)
    query_vector = tfidf.transform(pd.Series(request))

    # Dense frame used for the neighbour search
    query_frame = pd.DataFrame(query_vector.todense())

    # The second item of the kneighbors result holds the neighbour positions
    _, neighbor_positions = nn.kneighbors(query_frame, n_neighbors=5)

    # Look the recommendations up by position
    return df.iloc[neighbor_positions[0].tolist()]

In [11]:
# Test request function
top5 = get_5_recommendations(fake_input)

In [12]:
# Display the recommended strains
top5

Unnamed: 0.1,Unnamed: 0,index,Strain,Type,Rating,Effects,Flavor,Description
1423,1423,1458,nice-cherry,indica,4.6,"happy,talkative,uplifted,relaxed,hungry","sweet,berry,pungent",nice cherry is an indica-dominant strain that ...
1652,1655,1703,purple-mr-nice,indica,4.2,"relaxed,sleepy,euphoric,happy,uplifted","pine,earthy,grape",granddaddy purple crossed with mr. nice. this...
496,496,505,cherry-durban-poison,hybrid,4.3,"uplifted,giggly,sleepy,aroused,euphoric","flowery,citrus,sweet",cherry durban poison is a hybrid strain that b...
498,498,507,cherry-grapefruit,hybrid,4.3,"creative,happy,relaxed,tingly,uplifted","berry,sweet,flowery",cherry grapefruit by kera seeds is a super fru...
502,502,511,cherry-og,hybrid,4.2,"euphoric,relaxed,happy,uplifted,hungry","berry,diesel,sweet",cherry og by emerald triangle seeds is a hybri...


## Pickle Model

In [107]:
import pickle

In [108]:
# Export the fitted nearest-neighbours model as a pickle file.
# The context manager closes the file handle as soon as the dump finishes.
# NOTE(review): the "Make Predictions" section earlier in the notebook loads
# knn_02.pkl, so this export must have been run before that cell — consider
# moving it directly after the model is fitted.
filename = 'knn_02.pkl'
with open(filename, 'wb') as pickle_file:
    pickle.dump(nn, pickle_file)