# Unit 4 Build Week - Med Cabinet

- Create an NLP model that returns the top three strain recommendations based on what the user is looking for
  - Returned output should be in JSON format
- Host the database in PostgreSQL or SQLite
- Deploy a Heroku app for the front-end web developers to connect to


### Load and clean data

In [None]:
# Load file into Google Colab
#
# google.colab only exists inside a Colab runtime. Guard the import so that
# "Restart & Run All" also works in a local Jupyter kernel, where the CSV is
# expected to already sit next to the notebook.

try:
    from google.colab import files
except ImportError:
    # Not running in Colab: nothing to upload. Keep `uploaded` defined with
    # the same type files.upload() returns (a dict) so later code can rely on it.
    uploaded = {}
else:
    uploaded = files.upload()

In [None]:
# Imports

import pandas as pd
import numpy as np

import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [None]:
# Read the strain CSV into a DataFrame, report its shape and preview the
# first rows.

data = pd.read_csv(filepath_or_buffer="cabinet_strain.csv")
print(data.shape)
data.head(n=5)

(1644, 8)


Unnamed: 0,strain_id,strain_name,strain_type,strain_rating,effects_profile,flavor_profile,strain_description,model_id
0,3535,1024,Sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...,1.0
1,3534,100 OG,Hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...,0.0
2,3536,13 Dawgs,Hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...,
3,3537,24K Gold,Hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%...",2.0
4,3539,303 OG,Indica,4.2,"Relaxed,Happy,Euphoric,Uplifted,Giggly","Citrus,Pungent,Earthy",The indica-dominant 303 OG is a Colorado strai...,4.0


In [None]:
# Clean the data: drop the model_id column and replace NaN values with 0
# (rating) or a readable placeholder string (text columns).

# errors='ignore' keeps this cell re-runnable: on a second execution model_id
# is already gone and a plain drop() would raise KeyError.
data = data.drop(columns='model_id', errors='ignore')

# One fillna() call with a per-column mapping instead of five separate
# replace(np.nan, ...) assignments; columns not listed are left untouched.
data = data.fillna({
    'strain_rating': 0,
    'effects_profile': 'Effects currently unknown',
    'flavor_profile': 'Flavor profile currently unavailable',
    'strain_description': 'Strain description currently unavailable',
    'strain_type': 'Strain type currently unknown',
})

In [None]:
# Normalise the casing of strain_type (first character upper-case, the rest
# lower-case), then list the distinct values that remain.

data = data.assign(strain_type=data['strain_type'].str.capitalize())

data['strain_type'].unique()

array(['Sativa', 'Hybrid', 'Indica', 'Strain type currently unknown'],
      dtype=object)

In [None]:
# Distinct rating values present in the data, in order of first appearance

pd.unique(data['strain_rating'])

array([4.4, 4. , 4.2, 4.6, 4.5, 4.3, 4.7, 5. , 3.8, 4.8, 4.1, 0. , 3.4,
       3.7, 3.9, 4.9, 3.6, 2.8, 3.3, 3.5, 2. , 3. , 3.2])

In [None]:
# Count the missing values left in each column

data.isna().sum()

strain_id             0
strain_name           0
strain_type           0
strain_rating         0
effects_profile       0
flavor_profile        0
strain_description    0
dtype: int64

In [None]:
# Drop any rows that still contain NaN and renumber the index from 0,
# working on a copy of the frame; then show the resulting shape.

data = data.copy().dropna().reset_index(drop=True)
data.shape

(1644, 7)

In [None]:
# Preview the first five rows of the cleaned frame
data.head(n=5)

Unnamed: 0,strain_id,strain_name,strain_type,strain_rating,effects_profile,flavor_profile,strain_description
0,3535,1024,Sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
1,3534,100 OG,Hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
2,3536,13 Dawgs,Hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
3,3537,24K Gold,Hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."
4,3539,303 OG,Indica,4.2,"Relaxed,Happy,Euphoric,Uplifted,Giggly","Citrus,Pungent,Earthy",The indica-dominant 303 OG is a Colorado strai...


In [None]:
# Build the combined profile text: strain type, effects and flavors joined
# with commas into a single string per row.

data['strain_profile'] = data['strain_type'].str.cat(
    [data['effects_profile'], data['flavor_profile']],
    sep=',',
)

In [None]:
# Write the cleaned frame to disk for loading into the database.
# index=True is the to_csv default: the row index is written as the first
# (unnamed) column of the file.

data.to_csv(path_or_buf='med_cabinet_cleaned.csv', index=True)

In [None]:
# Vectorizer object

# spaCy tokenizer objects; nothing in this cell consumes them, they are kept
# so the names stay defined for any other cell that refers to them.
nlp = English()
tokenizer = Tokenizer(nlp.vocab)

# Words that come from the placeholder strings filled in for missing values
# ('Effects currently unknown', 'Flavor profile currently unavailable', ...).
# They carry no information about a strain and should not become features.
my_words = ['unavailable', 'unknown', 'profile', 'currently']
my_stop_words = text.ENGLISH_STOP_WORDS.union(my_words)

# Fix: actually hand the extended stop-word set to the vectorizer. Previously
# stop_words='english' was passed, so my_stop_words was built but never used
# and the placeholder words still showed up as TF-IDF columns.
# sorted() turns the frozenset into a deterministic list, which is the
# collection type TfidfVectorizer documents for custom stop words.
tf = TfidfVectorizer(stop_words=sorted(my_stop_words))

In [None]:
# Create a document-term matrix

# Fit the vectorizer on the combined profile text; the result is a sparse
# TF-IDF matrix with one row per strain.
tfidf_sparse = tf.fit_transform(data['strain_profile'].values.astype('U'))

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
if hasattr(tf, 'get_feature_names_out'):
    feature_names = tf.get_feature_names_out()
else:
    feature_names = tf.get_feature_names()

# toarray() yields a plain ndarray; todense() would return the legacy
# np.matrix type.
dtm = pd.DataFrame(tfidf_sparse.toarray(), columns=feature_names)
print(dtm.shape)
dtm.head()

(1644, 76)


Unnamed: 0,ammonia,apple,apricot,aroused,berry,blue,blueberry,butter,cheese,chemical,chestnut,citrus,coffee,creative,currently,diesel,dry,earthy,effects,energetic,euphoric,flavor,flowery,focused,fruit,giggly,grape,grapefruit,happy,herbal,honey,hungry,hybrid,indica,lavender,lemon,lime,mango,menthol,mint,minty,mouth,nutty,orange,peach,pear,pepper,pine,pineapple,plum,profile,pungent,relaxed,rose,sage,sativa,skunk,sleepy,spicy,strain,strawberry,sweet,talkative,tar,tea,tingly,tobacco,tree,tropical,type,unavailable,unknown,uplifted,vanilla,violet,woody
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.239361,0.0,0.0,0.0,0.0,0.0,0.257072,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.128996,0.365197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.137083,0.0,0.597725,0.280095,0.0,0.0,0.365197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.156628,0.0,0.0,0.33792
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.407797,0.0,0.353197,0.0,0.0,0.0,0.271987,0.0,0.379331,0.213393,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.323322,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.202277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.27854,0.0,0.0,0.0,0.47171,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.635274,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.253561,0.0,0.219612,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.505945,0.0,0.0,0.0,0.254974,0.201037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125773,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.293302,0.0,0.0,0.0,0.0,0.0,0.0,0.143705,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.362791,0.0,0.0,0.0,0.0,0.0,0.24197,0.0,0.0,0.189842,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.169338,0.0,0.0,0.0,0.28764,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.634573,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.179953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.430162,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.205611,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.418591,0.0,0.0,0.0,0.0,0.0,0.279187,0.0,0.0,0.219041,0.0,0.0,0.0,0.0,0.507022,0.0,0.0,0.195383,0.0,0.0,0.0,0.0,0.34531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.430094,0.207631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.237235,0.0,0.0,0.0


In [None]:
# Build a 5-neighbour ball-tree index and fit it on the TF-IDF features

nn = NearestNeighbors(algorithm='ball_tree', n_neighbors=5)
nn.fit(X=dtm)

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [None]:
# Test model

# A sample free-text request, vectorized with the already-fitted TF-IDF
# vocabulary (transform, not fit_transform). Despite its name, user01_dense
# is still a sparse matrix at this point.
user01_input = ['none, pineapple, peach, Depression']
user01_dense = tf.transform(user01_input)

# toarray() gives a plain ndarray; todense() returns np.matrix, which newer
# scikit-learn releases refuse as estimator input.
# return_distance=False returns only the neighbour indices, so nothing has to
# be assigned to `_` (which IPython uses for the last cell output).
user01_output = nn.kneighbors(user01_dense.toarray(), return_distance=False)

user01_output

array([[  29, 1440, 1234,  154,  698]])

In [None]:
# Print output

# kneighbors returns a 2-D array (one row of neighbour indices per query);
# flatten it into a single list of row positions.
list_strains = list(user01_output.ravel())

# The indices are row *positions* in the matrix the model was fitted on, so
# look rows up positionally with iloc. Label-based loc only gives the same
# rows while the frame's index happens to be 0..n-1.
for n in list_strains:
    print(f"{data.iloc[n]}\n")

strain_id                                                          3564
strain_name                                              Afghan Big Bud
strain_type                                                      Indica
strain_rating                                                         4
effects_profile                 Euphoric,Happy,Relaxed,Sleepy,Talkative
flavor_profile                                      Pungent,Lemon,Peach
strain_description    Spawn from Afghani and Big Bud, Afghan Big Bud...
strain_profile        Indica,Euphoric,Happy,Relaxed,Sleepy,Talkative...
Name: 29, dtype: object

strain_id                                                          5113
strain_name                                                Tangelo Kush
strain_type                                                      Hybrid
strain_rating                                                         4
effects_profile                Relaxed,Sleepy,Giggly,Talkative,Uplifted
flavor_profile                         

In [None]:
# Pickle the model for web deployment

# Imports
import pickle

# sklearn.externals.joblib was removed in scikit-learn 0.23; joblib is a
# standalone package. Fall back to the bundled copy only on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Round-trip the fitted model through pickle in memory: serialize it to
# bytes, then load those bytes back into a fresh object.
saved_model = pickle.dumps(nn)
nn_model = pickle.loads(saved_model)

# NOTE(review): only the NearestNeighbors model is written here. Turning a
# user query into features also needs the fitted TF-IDF vectorizer (`tf`);
# confirm it is persisted or rebuilt wherever this model is served.

# Save as a pickle file
joblib.dump(nn_model, 'nn01_model.pkl')