# TFIDF KNN Approach 

In [1]:
import pandas as pd

import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

import pickle

In [2]:
# Build a blank English pipeline and a tokenizer over its vocabulary.
# NOTE(review): `tokenizer` is not referenced by any later cell shown here.
nlp = English()
tokenizer = Tokenizer(nlp.vocab)

In [3]:
# Source of the strain dataset (CSV hosted on GitHub).
CANNABIS_CSV_URL = "https://raw.githubusercontent.com/Build-Week-Med-Cabinet-6/DS/mark-dev/data/cannabis.csv"

# Load the dataset and preview the first rows.
df = pd.read_csv(CANNABIS_CSV_URL)
df.head()

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


In [4]:
# Remove rows containing missing values.
df = df.dropna()
# Discard the row at position 149 of the remaining frame, then renumber the
# index from 0 so that labels equal row positions.
# NOTE(review): the reason for dropping row 149 is not recorded here - confirm.
# NOTE: `df` is rebound to itself, so re-running this cell drops one more row.
df = df.drop(index=df.index[149]).reset_index(drop=True)




In [5]:
# Join each strain's Effects and Flavor strings into one comma-separated
# document stored in a new `Combined` column.
df['Combined'] = df['Effects'].str.cat(df['Flavor'], sep=',')


In [26]:
# Instantiate the TF-IDF vectorizer. It uses the vectorizer's default
# tokenization (no custom tokenizer is passed) and drops English stop words.
dtm_combined_tf = TfidfVectorizer(stop_words='english')


# Learn the vocabulary of the combined effects + flavors text and compute the
# TF-IDF weights (one row per strain, one column per term).
dtm_combined = dtm_combined_tf.fit_transform(df['Combined'].values.astype('U'))

# `get_feature_names` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out` when available and fall back on older releases.
combined_terms = (
    dtm_combined_tf.get_feature_names_out()
    if hasattr(dtm_combined_tf, 'get_feature_names_out')
    else dtm_combined_tf.get_feature_names()
)

# toarray() yields a plain ndarray; todense() yields the discouraged np.matrix.
dtm_combined = pd.DataFrame(dtm_combined.toarray(), columns=combined_terms)
dtm_combined.head()

Unnamed: 0,ammonia,apple,apricot,aroused,berry,blue,blueberry,butter,cheese,chemical,...,tar,tea,tingly,tobacco,tree,tropical,uplifted,vanilla,violet,woody
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.497994,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.363903,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.702087,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.168966,0.0,0.0,0.374893
3,0.0,0.0,0.659428,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.294366,0.0,0.0,0.0,0.145443,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.217903,0.0,0.0,0.0


In [27]:
# Build a 5-nearest-neighbours index (ball tree) over the TF-IDF matrix.
# The fit call is kept as the last expression so the estimator is displayed.
nn = NearestNeighbors(
    n_neighbors=5,
    algorithm='ball_tree',
)
nn.fit(X=dtm_combined)

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [8]:
# Sample query for trying out the model: one document of comma-separated
# terms, wrapped in a list because the vectorizer expects an iterable of
# documents.
ideal_strain = [
    'Creative,Energetic,Tingly,Euphoric,Relaxed,Earthy,Sweet,Citrus',
]


In [9]:
# Vectorize the test query with the already-fitted TF-IDF vocabulary.
new = dtm_combined_tf.transform(ideal_strain)
# Look up the nearest neighbours; kneighbors returns (distances, indices).
# Pass an ndarray via toarray(): todense() produces np.matrix, which
# scikit-learn >= 1.2 rejects with a TypeError.
results = nn.kneighbors(new.toarray())

In [29]:
# `results` is a tuple of arrays; element [1] holds the neighbour row indices,
# and [0] selects the row belonging to the single query document.
results[1][0]

array([   0, 1971,  171,   81, 1255])

In [11]:
# Confirm the container type returned by kneighbors.
type(results)

tuple

In [12]:
# Strain name of the closest match: results[1] is the second element of the
# tuple, [0] its first row, and [0] the first neighbour index in that row.
df.loc[results[1][0][0], 'Strain']


'100-Og'

In [13]:
# Combined effects/flavors text of the closest match.
df.loc[results[1][0][0], 'Combined']

'Creative,Energetic,Tingly,Euphoric,Relaxed,Earthy,Sweet,Citrus'

In [14]:
# Description of the closest match.
df.loc[results[1][0][0], 'Description']

'$100 OG is a 50/50 hybrid strain that packs a strong punch. The name supposedly refers to both its strength and high price when it first started showing up in Hollywood. As a plant, $100 OG tends to produce large dark green buds with few stems. Users report a strong body effect of an indica for pain relief with the more alert, cerebral feeling thanks to its sativa side.'

### Second Value 

In [15]:
# Strain name of the second-closest match: second neighbour index ([1]) in the
# first row ([0]) of the index array (results[1]).
df.loc[results[1][0][1], 'Strain']

'Sunburn'

In [16]:
# Combined effects/flavors text (the matching criteria) of the second-closest
# match.
df.loc[results[1][0][1], 'Combined']

'Creative,Euphoric,Uplifted,Happy,Energetic,Citrus,Earthy,Sweet'

In [17]:
# Description of the second-closest match.
df.loc[results[1][0][1], 'Description']

'Sunburn is a lime green sativa-dominant hybrid that reeks of old, sweet genetics. Sunburn began as Island Sweet Skunk crossed with Rug Burn OG. Island Sweet Skunk’s genetics supposedly crossed the ocean with Vietnam Veterans. Colorado Seed Inc. took this beautifully uplifting cut and stabilized the genetics further with the addition of their Gupta Kush. The blend of sweet, pungent, and floral aromas coalesce to create an all-day sativa that stimulates without too much anxiety or paranoia. \xa0\xa0'

In [18]:
# Imagine doing a detailed return of information afterwards 
# #output results here, rec strains criteria and description
# rec_str = [strains['Strain'][results[1][0][i]] for i in range(5)]
# rec_crit = [strains['Combined'][results[1][0][i]] for i in range(5)]
# rec_str_desc = [strains['Description'][results[1][0][i]] for i in range(5)]

# rec1 = rec_str[0] + ' * ' + rec_crit[0] + ' * ' + rec_str_desc[0]
# rec2 = rec_str[1] + ' * ' + rec_crit[1] + ' * ' + rec_str_desc[1]
# rec3 = rec_str[2] + ' * ' + rec_crit[2] + ' * ' + rec_str_desc[2]
# rec4 = rec_str[3] + ' * ' + rec_crit[3] + ' * ' + rec_str_desc[3]
# rec5 = rec_str[4] + ' * ' + rec_crit[4] + ' * ' + rec_str_desc[4]


### Pickling step here

In [28]:
# Persist the fitted TF-IDF vectorizer and the fitted nearest-neighbours model
# so they can be loaded for prediction.
#pickle.dump(dtm_combined, open('/content/dtm_combined.pkl', 'wb'))
# Both files are written inside `with` blocks so each handle is flushed and
# closed even if pickling fails.
with open('../pickles/dtm_combined_tf.pickle', 'wb') as fp:
    pickle.dump(dtm_combined_tf, fp)
with open('../pickles/nn.pickle', 'wb') as fp:
    pickle.dump(nn, fp)

# Effects

In [20]:
# Instantiate the TF-IDF vectorizer. It uses the vectorizer's default
# tokenization (no custom tokenizer is passed) and drops English stop words.
dtm_effects_tf = TfidfVectorizer(stop_words='english')


# Learn the vocabulary of the Effects column and compute its TF-IDF weights
# (one row per strain, one column per effect term).
dtm_effects = dtm_effects_tf.fit_transform(df['Effects'].values.astype('U'))

# `get_feature_names` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out` when available and fall back on older releases.
effects_terms = (
    dtm_effects_tf.get_feature_names_out()
    if hasattr(dtm_effects_tf, 'get_feature_names_out')
    else dtm_effects_tf.get_feature_names()
)

# toarray() yields a plain ndarray; todense() yields the discouraged np.matrix.
dtm_effects = pd.DataFrame(dtm_effects.toarray(), columns=effects_terms)
dtm_effects.head()

Unnamed: 0,aroused,creative,dry,energetic,euphoric,focused,giggly,happy,hungry,mouth,relaxed,sleepy,talkative,tingly,uplifted
0,0.0,0.459088,0.0,0.491342,0.289761,0.0,0.0,0.0,0.0,0.0,0.276635,0.0,0.0,0.622361,0.0
1,0.689667,0.430388,0.0,0.460625,0.0,0.0,0.0,0.244321,0.0,0.0,0.259341,0.0,0.0,0.0,0.0
2,0.0,0.552204,0.0,0.591,0.0,0.0,0.0,0.313474,0.0,0.0,0.332744,0.0,0.0,0.0,0.369872
3,0.0,0.442401,0.0,0.0,0.0,0.0,0.0,0.0,0.534531,0.0,0.266579,0.0,0.0,0.599738,0.296324
4,0.0,0.0,0.0,0.0,0.346825,0.0,0.0,0.311938,0.0,0.0,0.331114,0.0,0.733009,0.0,0.36806


In [21]:
# Preview the frame, which now carries the added `Combined` column.
df.head()

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description,Combined
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...,"Creative,Energetic,Tingly,Euphoric,Relaxed,Ear..."
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...,"Relaxed,Aroused,Creative,Happy,Energetic,Flowe..."
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...,"Uplifted,Happy,Relaxed,Energetic,Creative,Spic..."
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...,"Tingly,Creative,Hungry,Relaxed,Uplifted,Aprico..."
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%...","Happy,Relaxed,Euphoric,Uplifted,Talkative,Citr..."


# Flavors

In [22]:
# Instantiate the TF-IDF vectorizer. It uses the vectorizer's default
# tokenization (no custom tokenizer is passed) and drops English stop words.
dtm_flavors_tf = TfidfVectorizer(stop_words='english')


# Learn the vocabulary of the Flavor column and compute its TF-IDF weights
# (one row per strain, one column per flavor term).
dtm_flavors = dtm_flavors_tf.fit_transform(df['Flavor'].values.astype('U'))

# `get_feature_names` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out` when available and fall back on older releases.
flavor_terms = (
    dtm_flavors_tf.get_feature_names_out()
    if hasattr(dtm_flavors_tf, 'get_feature_names_out')
    else dtm_flavors_tf.get_feature_names()
)

# toarray() yields a plain ndarray; todense() yields the discouraged np.matrix.
dtm_flavors = pd.DataFrame(dtm_flavors.toarray(), columns=flavor_terms)
dtm_flavors.head()

Unnamed: 0,ammonia,apple,apricot,berry,blue,blueberry,butter,cheese,chemical,chestnut,...,strawberry,sweet,tar,tea,tobacco,tree,tropical,vanilla,violet,woody
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.507516,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.826508,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.421438
3,0.0,0.0,0.756868,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Leafly API EDA 


In [0]:
import pandas as pd 

In [0]:
# Source of the strain dataset (CSV hosted on GitHub).
CANNABIS_CSV_URL = "https://raw.githubusercontent.com/Build-Week-Med-Cabinet-6/DS/mark-dev/data/cannabis.csv"

# Reload the raw dataset for exploration and preview the first rows.
# NOTE: this rebinds `df`, replacing any frame of that name built earlier.
df = pd.read_csv(CANNABIS_CSV_URL)
df.head()

In [0]:
# (rows, columns) of the frame as loaded, before dropping missing values.
df.shape

In [0]:
# Drop every row that contains at least one missing value.
df = df.dropna()


In [0]:
# (rows, columns) after dropping rows with missing values.
df.shape

In [0]:
# Summary statistics for the non-numeric columns only.
df.describe(exclude='number')

In [0]:
pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip


In [0]:

# NOTE(review): presumably this import is what makes `DataFrame.profile_report`
# available - confirm against the pandas-profiling documentation.
import pandas_profiling
# Render the profiling report for the current frame.
df.profile_report()

# Kushy API 

In [0]:
import pandas as pd 

In [0]:
# Kushy strains dataset export (CSV hosted on GitHub).
KUSHY_STRAINS_CSV_URL = "https://raw.githubusercontent.com/kushyapp/cannabis-dataset/master/Dataset/Strains/strains-kushy_api.2017-11-14.csv"

# NOTE: this rebinds `df`, replacing any frame of that name built earlier.
df = pd.read_csv(KUSHY_STRAINS_CSV_URL)

In [0]:
# Preview the first rows of the Kushy dataset.
df.head()

In [0]:
# (rows, columns) of the Kushy dataset.
df.shape

In [0]:
# Total number of missing cells across the whole frame.
df.isna().to_numpy().sum()

In [0]:
# Per-column dtypes and non-null counts.
df.info()

In [0]:
# Percentage of missing values in each column
# (missing count divided by row count, times 100).
df.isnull().sum().div(len(df)).mul(100)


In [0]:
# Summary statistics for the non-numeric columns only.
df.describe(exclude="number")

In [0]:
# Summary statistics for the numeric columns only.
df.describe(include="number")