In [62]:
import pandas as pd
import numpy as np

import spacy
from spacy.tokenizer import Tokenizer
from sklearn.externals import joblib 

nlp = spacy.load("en_core_web_lg")

In [135]:
data = pd.read_csv('./merged.csv', na_values='None')
print(data.shape)
data.head()

(2826, 9)


Unnamed: 0,id,name,race,flavors,positive,negative,medical,Rating,Description
0,1,Afpak,hybrid,"['Earthy', 'Chemical', 'Pine', 'Spicy/Herbal']","['Relaxed', 'Hungry', 'Happy', 'Sleepy', 'Crea...",['Dizzy'],"['Depression', 'Insomnia', 'Pain', 'Stress', '...",4.2,"Afpak, named for its direct Afghani and Pakist..."
1,2,African,sativa,"['Spicy/Herbal', 'Pungent', 'Earthy', 'Pepper']","['Euphoric', 'Happy', 'Creative', 'Energetic',...",['Dry Mouth'],"['Depression', 'Pain', 'Stress', 'Lack of Appe...",3.9,African refers to the indigenous varieties of ...
2,3,Afternoon Delight,hybrid,"['Pepper', 'Flowery', 'Pine', 'Pungent', 'Citr...","['Relaxed', 'Hungry', 'Euphoric', 'Uplifted', ...","['Dizzy', 'Dry Mouth', 'Paranoid']","['Depression', 'Insomnia', 'Pain', 'Stress', '...",4.8,"Afternoon Delight, created by Colorado Seed In..."
3,4,Afwreck,hybrid,"['Pine', 'Earthy', 'Flowery', 'Pungent']","['Relaxed', 'Happy', 'Creative', 'Uplifted', '...","['Dizzy', 'Dry Mouth', 'Paranoid', 'Dry Eyes']","['Pain', 'Stress', 'Headache', 'Fatigue', 'Hea...",4.2,Afwreck is a hybrid cross of Afghani and Train...
4,5,Agent Orange,hybrid,"['Citrus', 'Orange', 'Sweet', 'Earthy']","['Relaxed', 'Euphoric', 'Happy', 'Energetic', ...","['Dizzy', 'Dry Mouth', 'Paranoid', 'Dry Eyes']","['Depression', 'Pain', 'Stress', 'Nausea', 'He...",4.2,Don’t let the name scare you! The only herbici...


In [137]:
data['all_text'] = data['name'] + ' ' + data['race'] + ' ' + data.flavors  + ' ' + data.positive  + ' ' + data.negative  + ' ' + data.medical  + ' ' + data.Description

In [139]:
tokenizer = Tokenizer(nlp.vocab)

In [140]:
tokens = []

for doc in tokenizer.pipe(data['all_text'], batch_size=500):
    
    doc_tokens = []
    
    for token in doc:
        if (token.is_stop == False) & (token.is_punct == False):
            doc_tokens.append(token.text.lower())

    tokens.append(doc_tokens)

data['tokens'] = tokens

In [141]:
data.head()

Unnamed: 0,id,name,race,flavors,positive,negative,medical,Rating,Description,all_text,tokens
0,1,Afpak,hybrid,"['Earthy', 'Chemical', 'Pine', 'Spicy/Herbal']","['Relaxed', 'Hungry', 'Happy', 'Sleepy', 'Crea...",['Dizzy'],"['Depression', 'Insomnia', 'Pain', 'Stress', '...",4.2,"Afpak, named for its direct Afghani and Pakist...","Afpak hybrid ['Earthy', 'Chemical', 'Pine', 'S...","[afpak, hybrid, ['earthy',, 'chemical',, 'pine..."
1,2,African,sativa,"['Spicy/Herbal', 'Pungent', 'Earthy', 'Pepper']","['Euphoric', 'Happy', 'Creative', 'Energetic',...",['Dry Mouth'],"['Depression', 'Pain', 'Stress', 'Lack of Appe...",3.9,African refers to the indigenous varieties of ...,"African sativa ['Spicy/Herbal', 'Pungent', 'Ea...","[african, sativa, ['spicy/herbal',, 'pungent',..."
2,3,Afternoon Delight,hybrid,"['Pepper', 'Flowery', 'Pine', 'Pungent', 'Citr...","['Relaxed', 'Hungry', 'Euphoric', 'Uplifted', ...","['Dizzy', 'Dry Mouth', 'Paranoid']","['Depression', 'Insomnia', 'Pain', 'Stress', '...",4.8,"Afternoon Delight, created by Colorado Seed In...","Afternoon Delight hybrid ['Pepper', 'Flowery',...","[afternoon, delight, hybrid, ['pepper',, 'flow..."
3,4,Afwreck,hybrid,"['Pine', 'Earthy', 'Flowery', 'Pungent']","['Relaxed', 'Happy', 'Creative', 'Uplifted', '...","['Dizzy', 'Dry Mouth', 'Paranoid', 'Dry Eyes']","['Pain', 'Stress', 'Headache', 'Fatigue', 'Hea...",4.2,Afwreck is a hybrid cross of Afghani and Train...,"Afwreck hybrid ['Pine', 'Earthy', 'Flowery', '...","[afwreck, hybrid, ['pine',, 'earthy',, 'flower..."
4,5,Agent Orange,hybrid,"['Citrus', 'Orange', 'Sweet', 'Earthy']","['Relaxed', 'Euphoric', 'Happy', 'Energetic', ...","['Dizzy', 'Dry Mouth', 'Paranoid', 'Dry Eyes']","['Depression', 'Pain', 'Stress', 'Nausea', 'He...",4.2,Don’t let the name scare you! The only herbici...,"Agent Orange hybrid ['Citrus', 'Orange', 'Swee...","[agent, orange, hybrid, ['citrus',, 'orange',,..."


In [142]:
# The data was an Object; I had to do this to make it strings rather than lists.

data['work_plz']=[" ".join(string) for string in data['tokens'].values]

In [144]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
dtm = tfidf.fit_transform(data['work_plz'])
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

# View Feature Matrix as DataFrame
dtm.shape

(2826, 8814)

In [145]:
from sklearn.neighbors import NearestNeighbors

# Fit on DTM
nn = NearestNeighbors(n_neighbors=5, algorithm='kd_tree')
nn.fit(dtm)

NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

In [147]:
nn.kneighbors([dtm.iloc[1000]])  # The second set of values are the suggested strains.

(array([[0.        , 1.22555965, 1.24215367, 1.25453501, 1.26453768]]),
 array([[1000, 1994, 1646, 1686, 1297]]))

In [149]:
test_phrase = ["""
This is a test to see if the model predicts properly
"""]

In [150]:
new = tfidf.transform(test_phrase)

In [151]:
nn.kneighbors(new.todense())  # The second set of values are the suggested strains.

(array([[1.33120341, 1.33758175, 1.34028308, 1.34882354, 1.35019649]]),
 array([[2192, 2580, 2042, 2589, 2413]]))

In [153]:
joblib.dump(nn, 'test_compressed.pkl', compress=3)

['test_compressed.pkl']

In [154]:
joblib.dump(tfidf, "test_tfidf.pkl")

['test_tfidf.pkl']