In [20]:
import pandas as pd
can = pd.read_csv('can.csv')
print(can.shape)
can.head()

(2351, 6)


Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


In [21]:
can.isnull().sum()

Strain          0
Type            0
Rating          0
Effects         0
Flavor         46
Description    33
dtype: int64

In [22]:
# Replace any nan values with whitespace (will be stripped in tokenization)
can.Flavor = can.Flavor.fillna(' ')
can.Description = can.Description.fillna('')

In [23]:
# Create new series with information

can['total_text'] = can.Strain + can.Effects + can.Flavor + can['Description']

In [25]:
# Inialize spaCy model and tokenizer
import spacy
from spacy.tokenizer import Tokenizer
nlp = spacy.load("en_core_web_lg")
tokenizer = Tokenizer(nlp.vocab)

In [29]:
stop_words = nlp.Defaults.stop_words.union({'$', '', None,'...'})

In [30]:
tokens = []
    # Return lower case tokens and ignore stop words/punctuation
for doc in tokenizer.pipe(can['total_text'], batch_size=5):
    
    doc_tokens = []
    
    for token in doc:
        if (token.text not in stop_words) & (token.is_punct == False) & (token.is_space == False) & (token.is_digit == False):
                    doc_tokens.append(token.text.lower())

    tokens.append(doc_tokens)
    
can['spaCy_tokens'] = tokens
can['spaCy_tokens'].head()
       

0    [100-ogcreative,energetic,tingly,euphoric,rela...
1    [98-white-widowrelaxed,aroused,creative,happy,...
2    [1024uplifted,happy,relaxed,energetic,creative...
3    [13-dawgstingly,creative,hungry,relaxed,uplift...
4    [24k-goldhappy,relaxed,euphoric,uplifted,talka...
Name: spaCy_tokens, dtype: object

In [None]:
# Save cleaned df to use in database

# can.to_csv('canabis.csv')

In [3]:
import pandas as pd

canabis = pd.read_csv('canabis.csv')
canabis.head()


Unnamed: 0.1,Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description,total_text,spaCy_tokens
0,0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...,"100-OgCreative,Energetic,Tingly,Euphoric,Relax...","['100-ogcreative,energetic,tingly,euphoric,rel..."
1,1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...,"98-White-WidowRelaxed,Aroused,Creative,Happy,E...","['98-white-widowrelaxed,aroused,creative,happy..."
2,2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...,"1024Uplifted,Happy,Relaxed,Energetic,Creative1...","['1024uplifted,happy,relaxed,energetic,creativ..."
3,3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...,"13-DawgsTingly,Creative,Hungry,Relaxed,Uplifte...","['13-dawgstingly,creative,hungry,relaxed,uplif..."
4,4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%...","24K-GoldHappy,Relaxed,Euphoric,Uplifted,Talkat...","['24k-goldhappy,relaxed,euphoric,uplifted,talk..."


### Vector representation

In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,TfidfTransformer

In [5]:
def gather_data(df_column):
    """ Produces List of Documents from a df column    
    returns list of strings 
    """
    
    #data = []
    
#     for row in df_column:
#         data.append(' '.join(str(row))
    data = [''.join(str(item)) for item in df_column]
        
    
    return data

In [6]:
data = gather_data(canabis['spaCy_tokens'])
data[0]

"['100-ogcreative,energetic,tingly,euphoric,relaxed$100', 'og', '50/50', 'hybrid', 'strain', 'packs', 'strong', 'punch.', 'supposedly', 'refers', 'strength', 'high', 'price', 'started', 'showing', 'hollywood.', 'plant,', '$100', 'og', 'tends', 'produce', 'large', 'dark', 'green', 'buds', 'stems.', 'users', 'report', 'strong', 'body', 'effect', 'indica', 'pain', 'relief', 'alert,', 'cerebral', 'feeling', 'thanks', 'sativa', 'side.earthy,sweet,citrus']"

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# create the transformer
vect = CountVectorizer()

# build vocab

vect.fit(data)

# transform text
dtm = vect.transform(data)
print(dtm.shape)

(2351, 12066)


In [8]:
# Tunning Parameters

# Instantiate vectorizer object
tfidf = TfidfVectorizer(stop_words='english',
                         ngram_range = (1,3),
                        max_df=.98,
                        min_df=0.025)
                        #tokenizer=tokenize)

# Create a vocabulary and get word counts per document
dtm = tfidf.fit_transform(data)


# Get feature names to use as dataframe column headers
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

# View Feature Matrix as DataFrame
dtm.head()

Unnamed: 0,10,20,50,60,70,active,afghani,alien,anxiety,appetite,...,way,week,weeks,white,white widow,widow,won,woody,yield,yields
0,0.0,0.0,0.41917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.473365,0.484928,0.482294,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.152928,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181631,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.161667,0.0,0.0
4,0.0,0.0,0.0,0.188204,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
from sklearn.neighbors import NearestNeighbors

# Fit on TF-IDF Vectors
nn  = NearestNeighbors(n_neighbors=5, algorithm='kd_tree')
nn.fit(dtm)

NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [15]:
# Test model

fake_input = ['hybrid happy strawberry']
fake_dense = tfidf.transform(fake_input)
_, fake_output = nn.kneighbors(fake_dense.todense())

fake_output

array([[2000, 2009, 2017, 2012, 1842]], dtype=int64)

In [16]:
# Convert to train information

response = canabis[['Strain', 'Effects', 'Flavor', 'Description']].iloc[fake_output[0]]
response

Unnamed: 0,Strain,Effects,Flavor,Description
2000,Strawberry-Amnesia,"Creative,Relaxed,Uplifted,Euphoric,Energetic","Strawberry,Sweet,Earthy",A powerful and uplifting flower from Dinafem S...
2009,Strawberry-Fields,"Relaxed,Happy,Uplifted,Hungry,Euphoric","Strawberry,Sweet,Berry",Strawberry Fields by Sagarmatha Seeds is an in...
2017,Strawberry-Satori,"Uplifted,Creative,Relaxed,Happy,Energetic","Strawberry,Berry,Pungent","Strawberry Satori is a mostly sativa strain, f..."
2012,Strawberry-Kush,"Relaxed,Happy,Sleepy,Uplifted,Euphoric","Strawberry,Sweet,Pungent","Strong and sweet, Strawberry Kush is one well-..."
1842,Sequoia-Strawberry,"Uplifted,Relaxed,Energetic,Euphoric,Focused","Strawberry,Sweet,Pine",Sequoia Strawberry by SinCity Seeds is a sativ...


In [17]:
# Example conversion to JSON

response.to_json(orient='records')

'[{"Strain":"Strawberry-Amnesia","Effects":"Creative,Relaxed,Uplifted,Euphoric,Energetic","Flavor":"Strawberry,Sweet,Earthy","Description":"A powerful and uplifting flower from Dinafem Seeds, Strawberry Amnesia is a strain made in sativa heaven.\\u00a0Bred from Strawberry Cough and Amnesia, this strain delivers the familiar sweet strawberry and earthy flavors of its parents.\\u00a0Having the typical energizing and euphoric effects of a sativa, Strawberry Amnesia also induces the calming body high from its distant indica relatives. The dark green buds of Strawberry Amnesia are very dense and heavily coated in resin, so this potent sativa should be handled with caution.\\u00a0\\u00a0"},{"Strain":"Strawberry-Fields","Effects":"Relaxed,Happy,Uplifted,Hungry,Euphoric","Flavor":"Strawberry,Sweet,Berry","Description":"Strawberry Fields by Sagarmatha Seeds is an indica-dominant hybrid strain that takes the flavorful Strawberry Cough and crosses it with an undisclosed indica parent. This strain

In [None]:
# # Create pickles of the model and the transformer for web deployment

# # Imports
# import pickle
# from sklearn.externals import joblib 

# # Save the model as a pickle file 
# joblib.dump(nn, 'nn02_model.pkl') 

# # Save the transformer as a pickle file
# joblib.dump(tf, 'tf_01.pkl')