In [1]:
import pandas as pd
import numpy as np

import spacy
from spacy.tokenizer import Tokenizer

from sklearn.externals import joblib 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors


nlp = spacy.load("en_core_web_lg")

In [116]:
data = pd.read_csv('../merged.csv', na_values='None')
# data = data.set_index('name')
print(data.shape)
data.head()

(2826, 9)


Unnamed: 0,id,name,race,flavors,positive,negative,medical,Rating,Description
0,1,Afpak,hybrid,"['Earthy', 'Chemical', 'Pine', 'Spicy/Herbal']","['Relaxed', 'Hungry', 'Happy', 'Sleepy', 'Crea...",['Dizzy'],"['Depression', 'Insomnia', 'Pain', 'Stress', '...",4.2,"Afpak, named for its direct Afghani and Pakist..."
1,2,African,sativa,"['Spicy/Herbal', 'Pungent', 'Earthy', 'Pepper']","['Euphoric', 'Happy', 'Creative', 'Energetic',...",['Dry Mouth'],"['Depression', 'Pain', 'Stress', 'Lack of Appe...",3.9,African refers to the indigenous varieties of ...
2,3,Afternoon Delight,hybrid,"['Pepper', 'Flowery', 'Pine', 'Pungent', 'Citr...","['Relaxed', 'Hungry', 'Euphoric', 'Uplifted', ...","['Dizzy', 'Dry Mouth', 'Paranoid']","['Depression', 'Insomnia', 'Pain', 'Stress', '...",4.8,"Afternoon Delight, created by Colorado Seed In..."
3,4,Afwreck,hybrid,"['Pine', 'Earthy', 'Flowery', 'Pungent']","['Relaxed', 'Happy', 'Creative', 'Uplifted', '...","['Dizzy', 'Dry Mouth', 'Paranoid', 'Dry Eyes']","['Pain', 'Stress', 'Headache', 'Fatigue', 'Hea...",4.2,Afwreck is a hybrid cross of Afghani and Train...
4,5,Agent Orange,hybrid,"['Citrus', 'Orange', 'Sweet', 'Earthy']","['Relaxed', 'Euphoric', 'Happy', 'Energetic', ...","['Dizzy', 'Dry Mouth', 'Paranoid', 'Dry Eyes']","['Depression', 'Pain', 'Stress', 'Nausea', 'He...",4.2,Don’t let the name scare you! The only herbici...


In [87]:
data.isnull().sum()

id               0
race             0
flavors          0
positive         0
negative         0
medical          0
Rating         475
Description    480
dtype: int64

In [117]:
data['all_text'] = data['race'] + ' ' + data.flavors  + ' ' + data.positive  + ' ' + data.negative  + ' ' + data.medical + ' ' + str(data.Description)

In [118]:
tokenizer = Tokenizer(nlp.vocab)

In [119]:
tokens = []

for doc in tokenizer.pipe(data['all_text'], batch_size=500):
    
    doc_tokens = []
    
    for token in doc:
        if (token.is_stop == False) & (token.is_punct == False):
            doc_tokens.append(token.text.lower())

    tokens.append(doc_tokens)

data['tokens'] = tokens

In [120]:
data.head()

Unnamed: 0,id,name,race,flavors,positive,negative,medical,Rating,Description,all_text,tokens
0,1,Afpak,hybrid,"['Earthy', 'Chemical', 'Pine', 'Spicy/Herbal']","['Relaxed', 'Hungry', 'Happy', 'Sleepy', 'Crea...",['Dizzy'],"['Depression', 'Insomnia', 'Pain', 'Stress', '...",4.2,"Afpak, named for its direct Afghani and Pakist...","hybrid ['Earthy', 'Chemical', 'Pine', 'Spicy/H...","[hybrid, ['earthy',, 'chemical',, 'pine',, 'sp..."
1,2,African,sativa,"['Spicy/Herbal', 'Pungent', 'Earthy', 'Pepper']","['Euphoric', 'Happy', 'Creative', 'Energetic',...",['Dry Mouth'],"['Depression', 'Pain', 'Stress', 'Lack of Appe...",3.9,African refers to the indigenous varieties of ...,"sativa ['Spicy/Herbal', 'Pungent', 'Earthy', '...","[sativa, ['spicy/herbal',, 'pungent',, 'earthy..."
2,3,Afternoon Delight,hybrid,"['Pepper', 'Flowery', 'Pine', 'Pungent', 'Citr...","['Relaxed', 'Hungry', 'Euphoric', 'Uplifted', ...","['Dizzy', 'Dry Mouth', 'Paranoid']","['Depression', 'Insomnia', 'Pain', 'Stress', '...",4.8,"Afternoon Delight, created by Colorado Seed In...","hybrid ['Pepper', 'Flowery', 'Pine', 'Pungent'...","[hybrid, ['pepper',, 'flowery',, 'pine',, 'pun..."
3,4,Afwreck,hybrid,"['Pine', 'Earthy', 'Flowery', 'Pungent']","['Relaxed', 'Happy', 'Creative', 'Uplifted', '...","['Dizzy', 'Dry Mouth', 'Paranoid', 'Dry Eyes']","['Pain', 'Stress', 'Headache', 'Fatigue', 'Hea...",4.2,Afwreck is a hybrid cross of Afghani and Train...,"hybrid ['Pine', 'Earthy', 'Flowery', 'Pungent'...","[hybrid, ['pine',, 'earthy',, 'flowery',, 'pun..."
4,5,Agent Orange,hybrid,"['Citrus', 'Orange', 'Sweet', 'Earthy']","['Relaxed', 'Euphoric', 'Happy', 'Energetic', ...","['Dizzy', 'Dry Mouth', 'Paranoid', 'Dry Eyes']","['Depression', 'Pain', 'Stress', 'Nausea', 'He...",4.2,Don’t let the name scare you! The only herbici...,"hybrid ['Citrus', 'Orange', 'Sweet', 'Earthy']...","[hybrid, ['citrus',, 'orange',, 'sweet',, 'ear..."


In [121]:
# The data was an Object; I had to do this to make it strings rather than lists.

data['work_plz']=[" ".join(string) for string in data['tokens'].values]

In [122]:
tfidf = TfidfVectorizer()
dtm = tfidf.fit_transform(data['work_plz'])
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

# View Feature Matrix as DataFrame
dtm.head()

Unnamed: 0,10,11,12,13,14,15,16,17,18,19,...,woody,xj,xxx,yeti,yoda,you,zamaldelica,zelly,zeus,zombie
0,0.023229,0.023229,0.023229,0.046457,0.023229,0.023229,0.023229,0.023229,0.023229,0.023229,...,0.0,0.023229,0.069686,0.023229,0.046457,0.046457,0.023229,0.023229,0.023229,0.023229
1,0.023149,0.023149,0.023149,0.046299,0.023149,0.023149,0.023149,0.023149,0.023149,0.023149,...,0.0,0.023149,0.069448,0.023149,0.046299,0.046299,0.023149,0.023149,0.023149,0.023149
2,0.022967,0.022967,0.022967,0.045934,0.022967,0.022967,0.022967,0.022967,0.022967,0.022967,...,0.0,0.022967,0.068902,0.022967,0.045934,0.045934,0.022967,0.022967,0.022967,0.022967
3,0.023206,0.023206,0.023206,0.046413,0.023206,0.023206,0.023206,0.023206,0.023206,0.023206,...,0.0,0.023206,0.069619,0.023206,0.046413,0.046413,0.023206,0.023206,0.023206,0.023206
4,0.023292,0.023292,0.023292,0.046584,0.023292,0.023292,0.023292,0.023292,0.023292,0.023292,...,0.0,0.023292,0.069876,0.023292,0.046584,0.046584,0.023292,0.023292,0.023292,0.023292


In [123]:
dtm['id'] = None
dtm['id'] = data.name
dtm = dtm.set_index('id')
dtm.head()

Unnamed: 0,10,11,12,13,14,15,16,17,18,19,...,xj,xxx,yeti,yoda,you,zamaldelica,zelly,zeus,zombie,id
0,0.023229,0.023229,0.023229,0.046457,0.023229,0.023229,0.023229,0.023229,0.023229,0.023229,...,0.023229,0.069686,0.023229,0.046457,0.046457,0.023229,0.023229,0.023229,0.023229,Afpak
1,0.023149,0.023149,0.023149,0.046299,0.023149,0.023149,0.023149,0.023149,0.023149,0.023149,...,0.023149,0.069448,0.023149,0.046299,0.046299,0.023149,0.023149,0.023149,0.023149,African
2,0.022967,0.022967,0.022967,0.045934,0.022967,0.022967,0.022967,0.022967,0.022967,0.022967,...,0.022967,0.068902,0.022967,0.045934,0.045934,0.022967,0.022967,0.022967,0.022967,Afternoon Delight
3,0.023206,0.023206,0.023206,0.046413,0.023206,0.023206,0.023206,0.023206,0.023206,0.023206,...,0.023206,0.069619,0.023206,0.046413,0.046413,0.023206,0.023206,0.023206,0.023206,Afwreck
4,0.023292,0.023292,0.023292,0.046584,0.023292,0.023292,0.023292,0.023292,0.023292,0.023292,...,0.023292,0.069876,0.023292,0.046584,0.046584,0.023292,0.023292,0.023292,0.023292,Agent Orange


In [125]:
dtm.head()

Unnamed: 0_level_0,10,11,12,13,14,15,16,17,18,19,...,woody,xj,xxx,yeti,yoda,you,zamaldelica,zelly,zeus,zombie
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afpak,0.023229,0.023229,0.023229,0.046457,0.023229,0.023229,0.023229,0.023229,0.023229,0.023229,...,0.0,0.023229,0.069686,0.023229,0.046457,0.046457,0.023229,0.023229,0.023229,0.023229
African,0.023149,0.023149,0.023149,0.046299,0.023149,0.023149,0.023149,0.023149,0.023149,0.023149,...,0.0,0.023149,0.069448,0.023149,0.046299,0.046299,0.023149,0.023149,0.023149,0.023149
Afternoon Delight,0.022967,0.022967,0.022967,0.045934,0.022967,0.022967,0.022967,0.022967,0.022967,0.022967,...,0.0,0.022967,0.068902,0.022967,0.045934,0.045934,0.022967,0.022967,0.022967,0.022967
Afwreck,0.023206,0.023206,0.023206,0.046413,0.023206,0.023206,0.023206,0.023206,0.023206,0.023206,...,0.0,0.023206,0.069619,0.023206,0.046413,0.046413,0.023206,0.023206,0.023206,0.023206
Agent Orange,0.023292,0.023292,0.023292,0.046584,0.023292,0.023292,0.023292,0.023292,0.023292,0.023292,...,0.0,0.023292,0.069876,0.023292,0.046584,0.046584,0.023292,0.023292,0.023292,0.023292


In [128]:
# Fit on DTM
nn = NearestNeighbors(n_neighbors=5, algorithm='ball_tree')
nn.fit(dtm)

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

In [129]:
data.iloc[552]

id                                                           643
name                                                    Damnesia
race                                                      hybrid
flavors                     ['Pepper', 'Spicy/Herbal', 'Earthy']
positive       ['Relaxed', 'Happy', 'Energetic', 'Talkative',...
negative                               ['Dry Mouth', 'Dry Eyes']
medical        ['Depression', 'Pain', 'Stress', 'Lack of Appe...
Rating                                                       4.2
Description    A spin-off of the Amsterdam staple Amnesia Haz...
all_text       hybrid ['Pepper', 'Spicy/Herbal', 'Earthy'] ['...
tokens         [hybrid, ['pepper',, 'spicy/herbal',, 'earthy'...
work_plz       hybrid ['pepper', 'spicy/herbal', 'earthy'] ['...
Name: 552, dtype: object

In [130]:
nn.kneighbors([dtm.iloc[1]])  # The second set of values are the suggested strains.

(array([[0.        , 0.19318651, 0.20225117, 0.2156507 , 0.22227077]]),
 array([[   1,  552, 1955,  633,  494]]))

In [131]:
test_phrase = ["""
I want to feel good
"""]

In [132]:
new = tfidf.transform(test_phrase)

In [133]:
nn.kneighbors(new.todense())  # The second set of values are the suggested strains.

(array([[1., 1., 1., 1., 1.]]), array([[1046, 1460,   55,  755, 1835]]))

In [67]:
from sklearn.externals import joblib 
joblib.dump(tfidf, 'baseline_tfidf.pkl')

['baseline_tfidf.pkl']

In [68]:
joblib.dump(tfidf, 'baseline_tfidf.pkl')

['baseline_tfidf.pkl']

In [81]:
class strainSuggester():
    """
    Generates 5 suggested strains based on input text
    """
    def __init__(self):
        self.tfidf = joblib.load('baseline_tfidf.pkl')
        self.test_model = joblib.load('baseline_model.pkl')
    
    def suggestStrain(self, input, neighbors=5):
        vectorized = (tfidf.transform(input)).todense()
        results = nn.kneighbors(vectorized, neighbors)
        return results[1]

In [82]:
ss = strainSuggester()
ss.suggestStrain(test_phrase)

array([[1046, 1460,   55,  755, 1835]])