# TF-IDF KNN Approach

In [0]:
import pandas as pd

import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

import pickle

In [0]:
# Create the spaCy English language object and a tokenizer that shares its vocab.
# NOTE(review): `tokenizer` is not referenced by any later cell visible here.
nlp = English()
tokenizer = Tokenizer(vocab=nlp.vocab)

In [3]:
# Read the strain dataset from the project repository and preview the first rows.
CANNABIS_CSV_URL = "https://raw.githubusercontent.com/Build-Week-Med-Cabinet-6/DS/mark-dev/data/cannabis.csv"
df = pd.read_csv(CANNABIS_CSV_URL)
df.head()

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


In [0]:
# Discard every row that has a missing value.
df = df.dropna()
# Remove the row at position 149 (a duplicate entry), then renumber the index
# so that labels run 0..n-1 again.
# NOTE(review): this cell is not idempotent -- running it twice removes a second row.
df = df.drop(index=df.index[149]).reset_index(drop=True)


In [0]:
# Join each strain's effects and flavors into one comma-separated string so a
# single vectorizer can see both.
df['Combined'] = df['Effects'].str.cat(df['Flavor'], sep=',')


In [6]:
# TF-IDF vectorizer for the combined effects + flavors text; English stop
# words are removed.
dtm_combined_tf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and compute the sparse TF-IDF matrix (one row per strain).
combined_sparse = dtm_combined_tf.fit_transform(df['Combined'].values.astype('U'))

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when present, falling back for older releases.
if hasattr(dtm_combined_tf, 'get_feature_names_out'):
    combined_terms = dtm_combined_tf.get_feature_names_out()
else:
    combined_terms = dtm_combined_tf.get_feature_names()

# toarray() yields a plain ndarray (todense() returns the legacy np.matrix type).
dtm_combined = pd.DataFrame(combined_sparse.toarray(), columns=combined_terms)
dtm_combined.head()

Unnamed: 0,ammonia,apple,apricot,aroused,berry,blue,blueberry,butter,cheese,chemical,chestnut,citrus,coffee,creative,diesel,dry,earthy,energetic,euphoric,flowery,focused,fruit,giggly,grape,grapefruit,happy,herbal,honey,hungry,lavender,lemon,lime,mango,menthol,mint,minty,mouth,nutty,orange,peach,pear,pepper,pine,pineapple,plum,pungent,relaxed,rose,sage,skunk,sleepy,spicy,strawberry,sweet,talkative,tar,tea,tingly,tobacco,tree,tropical,uplifted,vanilla,violet,woody
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.423573,0.0,0.367348,0.0,0.0,0.296072,0.393157,0.231858,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.221355,0.0,0.0,0.0,0.0,0.0,0.0,0.304395,0.0,0.0,0.0,0.497994,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.363903,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.227094,0.343946,0.0,0.0,0.243049,0.0,0.332205,0.0,0.0,0.0,0.0,0.0,0.128916,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.136841,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.702087,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25226,0.0,0.0,0.0,0.269982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.143202,0.388126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.152005,0.0,0.591172,0.0,0.0,0.388126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.168966,0.0,0.0,0.374893
3,0.0,0.0,0.659428,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.250376,0.0,0.217141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.511427,0.0,0.0,0.0,0.262361,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.130844,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.294366,0.0,0.0,0.0,0.145443,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.375113,0.0,0.0,0.0,0.0,0.262199,0.0,0.205331,0.0,0.0,0.0,0.0,0.0,0.0,0.184677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.663356,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.19603,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.433964,0.0,0.0,0.0,0.0,0.0,0.0,0.217903,0.0,0.0,0.0


In [7]:
# Build a ball-tree nearest-neighbour index over the TF-IDF rows; queries
# return the 5 closest strains by default.
nn = NearestNeighbors(algorithm='ball_tree', n_neighbors=5)
nn.fit(dtm_combined)

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [0]:
# Alternative, fuller test query (kept for reference, currently disabled):
# ideal_strain = ['Creative,Energetic,Tingly,Euphoric,Relaxed,Earthy,Sweet,Citrus']


# Test query for the model: a one-element list holding a single
# comma-separated string of desired effects/flavors.
ideal_strain2 = ['Creative,Citrus']

In [0]:
# Vectorize the test query with the already-fitted TF-IDF vocabulary, then
# look up its nearest neighbours.
new = dtm_combined_tf.transform(ideal_strain2)
# toarray() gives a plain ndarray; the np.matrix returned by todense() is
# rejected by scikit-learn >= 1.2 estimators.
results = nn.kneighbors(new.toarray())

In [25]:
# kneighbors returns a tuple of two arrays: (distances, row indices), each
# with one row per query and one column per neighbour.
print(results)

(array([[0.81309268, 0.81309268, 0.81309268, 0.81309268, 0.83043702]]), array([[1915, 2080, 1974,  124,  349]]))


In [11]:
# Confirm the container type returned by kneighbors.
type(results)

tuple

In [29]:
# Split the kneighbors result into its two arrays: distances and row indices.
neighbor_dists, neighbor_idx = results

# Distance from the query to its closest match (smaller means more similar;
# this is a distance, not a confidence percentage).
print('First Tuple:', neighbor_dists[0][0])

# Row index of that closest match.
print('Second Tuple:', neighbor_idx[0][0])

# Strain name stored at that row index.
df.loc[neighbor_idx[0][0], 'Strain']


First Tuple: 0.8130926771833283
Second Tuple: 1915


'Special-K'

In [13]:
# Combined effects + flavors string of the closest match
# (results[1][0][0] is the first entry of the index array).
df['Combined'][results[1][0][0]]

'Happy,Uplifted,Euphoric,Relaxed,Creative,Sweet,Citrus,Earthy'

In [14]:
# Free-text description of the closest match
# (results[1][0][0] is the first entry of the index array).
df['Description'][results[1][0][0]]

'Special K is a hybrid cross between sativa Western Winds and indica Slyder. The plant is tall with substantial girth,\xa0despite its lanky indica influence, with elongated buds.\xa0The effects are long-lasting with a quick onset, beginning with a\xa0physical buzz and\xa0evolving into a heady lift that borders on psychedelic. Special K has been called adventureful, making it a wonderful daytime strain. Indoor or outdoor grows can expect to flower at around 10 weeks.'

In [27]:
# Sanity check: the row at position 1915 should indeed be Special-K.
# NOTE(review): 1915 is hard-coded from the recorded query output above.
df.iloc[1915]

Strain                                                 Special-K
Type                                                      hybrid
Rating                                                       4.5
Effects                 Happy,Uplifted,Euphoric,Relaxed,Creative
Flavor                                       Sweet,Citrus,Earthy
Description    Special K is a hybrid cross between sativa Wes...
Combined       Happy,Uplifted,Euphoric,Relaxed,Creative,Sweet...
Name: 1915, dtype: object

### Second Value 

In [32]:
# kneighbors returns a tuple of two arrays: (distances, row indices), each
# with one row per query and one column per neighbour.
print(results)

(array([[0.81309268, 0.81309268, 0.81309268, 0.81309268, 0.83043702]]), array([[1915, 2080, 1974,  124,  349]]))


In [34]:
# Split the kneighbors result into its two arrays: distances and row indices.
neighbor_dists, neighbor_idx = results

# Distance from the query to the second-closest match (smaller means more
# similar; this is a distance, not a confidence percentage).
print('First Tuple:', neighbor_dists[0][1])

# Row index of the second-closest match.
print('Second Tuple:', neighbor_idx[0][1])

# Strain name stored at that row index.
df.loc[neighbor_idx[0][1], 'Strain']


First Tuple: 0.8130926771833283
Second Tuple: 2080


'The-Sauce'

'The-Sauce'

'Happy,Uplifted,Relaxed,Euphoric,Creative,Earthy,Sweet,Citrus'

'The Sauce is a 60/40 sativa-dominant hybrid bred by Exotic Genetix. Using a backcross of Green Ribbon to pollenate a Gorilla Glue #4 mother, the Northwest breeder created a potent blend that emits a mix of chocolate, lime and diesel flavors. The Sauce took the prize for Judge’s Choice at the 2015 DOPE Cup in Seattle.'

### Fifth Value 

In [0]:
# kneighbors returns a tuple of two arrays: (distances, row indices), each
# with one row per query and one column per neighbour.
print(results)

(array([[0.81309268, 0.81309268, 0.81309268, 0.81309268, 0.83043702]]), array([[1915, 2080, 1974,  124,  349]]))


In [38]:
# Split the kneighbors result into its two arrays: distances and row indices.
neighbor_dists, neighbor_idx = results

# Distance from the query to the fifth (farthest returned) match; smaller
# means more similar -- this is a distance, not a confidence percentage.
print('First Tuple:', neighbor_dists[0][4])

# Row index of the fifth match.
print('Second Tuple:', neighbor_idx[0][4])

# Strain name stored at that row index.
df.loc[neighbor_idx[0][4], 'Strain']


First Tuple: 0.8304370201964505
Second Tuple: 349


'Blukashima'

In [40]:
# Strain name of the fifth match (results[1][0][4] is the fifth entry of the index array).
df['Strain'][results[1][0][4]]

'Blukashima'

In [41]:
# Combined effects + flavors of the fifth match: results[1] is the index array,
# [0] the single query row, [4] the fifth neighbour.
df['Combined'][results[1][0][4]]

'Creative,None'

In [42]:
# Description of the fifth match: results[1] is the index array,
# [0] the single query row, [4] the fifth neighbour.
df['Description'][results[1][0][4]]

'Using a Chernobyl male plant to pollenate their Blue Dream cut, Terraform Genetics created Blukashima. This forcible hybrid inherits stress- and pain-relieving qualities as well as invigorating cerebral effects from both parent strains. Designed with potency in mind, proceed with caution if you’re new to cannabis.'

#### Save this snippet for later


In [0]:
# Disabled sketch of a richer return value: for each of the 5 neighbours,
# collect strain name, combined criteria and description, then join them.
# NOTE: the original draft indexed a frame named `strains`, which is never
# defined in this notebook; `df` is the frame holding these columns.
# rec_str = [df['Strain'][results[1][0][i]] for i in range(5)]
# rec_crit = [df['Combined'][results[1][0][i]] for i in range(5)]
# rec_str_desc = [df['Description'][results[1][0][i]] for i in range(5)]

# rec1 = rec_str[0] + ' * ' + rec_crit[0] + ' * ' + rec_str_desc[0]
# rec2 = rec_str[1] + ' * ' + rec_crit[1] + ' * ' + rec_str_desc[1]
# rec3 = rec_str[2] + ' * ' + rec_crit[2] + ' * ' + rec_str_desc[2]
# rec4 = rec_str[3] + ' * ' + rec_crit[3] + ' * ' + rec_str_desc[3]
# rec5 = rec_str[4] + ' * ' + rec_crit[4] + ' * ' + rec_str_desc[4]


#### Save this snippet for later


In [0]:
# Disabled sketch of a richer return value: for each of the 5 neighbours,
# collect strain name, combined criteria and description, then join them.
# NOTE: the original draft indexed a frame named `strains`, which is never
# defined in this notebook; `df` is the frame holding these columns.
# NOTE(review): this cell repeats the previous "Save this snippet" cell.
# rec_str = [df['Strain'][results[1][0][i]] for i in range(5)]
# rec_crit = [df['Combined'][results[1][0][i]] for i in range(5)]
# rec_str_desc = [df['Description'][results[1][0][i]] for i in range(5)]

# rec1 = rec_str[0] + ' * ' + rec_crit[0] + ' * ' + rec_str_desc[0]
# rec2 = rec_str[1] + ' * ' + rec_crit[1] + ' * ' + rec_str_desc[1]
# rec3 = rec_str[2] + ' * ' + rec_crit[2] + ' * ' + rec_str_desc[2]
# rec4 = rec_str[3] + ' * ' + rec_crit[3] + ' * ' + rec_str_desc[3]
# rec5 = rec_str[4] + ' * ' + rec_crit[4] + ' * ' + rec_str_desc[4]


### Pickling step here

In [0]:
# Persist the TF-IDF document-term matrix and the fitted vectorizer so they
# can be reloaded at prediction time.
# Each file is opened in a `with` block so the handle is flushed and closed
# even if pickling raises.
# NOTE(review): '/content/' is a hard-coded absolute path (Colab layout).
with open('/content/dtm_combined.pkl', 'wb') as dtm_file:
    pickle.dump(dtm_combined, dtm_file)
with open('/content/dtm_combined_tf.pkl', 'wb') as vectorizer_file:
    pickle.dump(dtm_combined_tf, vectorizer_file)

# Effects

In [0]:
# TF-IDF vectorizer for the effects text only; English stop words are removed.
dtm_effects_tf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and compute the sparse TF-IDF matrix (one row per strain).
effects_sparse = dtm_effects_tf.fit_transform(df['Effects'].values.astype('U'))

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when present, falling back for older releases.
if hasattr(dtm_effects_tf, 'get_feature_names_out'):
    effects_terms = dtm_effects_tf.get_feature_names_out()
else:
    effects_terms = dtm_effects_tf.get_feature_names()

# toarray() yields a plain ndarray (todense() returns the legacy np.matrix type).
dtm_effects = pd.DataFrame(effects_sparse.toarray(), columns=effects_terms)
dtm_effects.head()

In [0]:
# Preview the cleaned strain frame (now including the Combined column).
df.head()

# Flavors

In [0]:
# TF-IDF vectorizer for the flavor text only; English stop words are removed.
dtm_flavors_tf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and compute the sparse TF-IDF matrix (one row per strain).
flavors_sparse = dtm_flavors_tf.fit_transform(df['Flavor'].values.astype('U'))

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when present, falling back for older releases.
if hasattr(dtm_flavors_tf, 'get_feature_names_out'):
    flavors_terms = dtm_flavors_tf.get_feature_names_out()
else:
    flavors_terms = dtm_flavors_tf.get_feature_names()

# toarray() yields a plain ndarray (todense() returns the legacy np.matrix type).
dtm_flavors = pd.DataFrame(flavors_sparse.toarray(), columns=flavors_terms)
dtm_flavors.head()

# Leafly API EDA 


In [0]:
import pandas as pd 

In [0]:
# Reload the raw Leafly-derived strain dataset for exploratory analysis.
# This rebinds `df`, replacing the cleaned frame used in the sections above.
LEAFLY_CSV_URL = "https://raw.githubusercontent.com/Build-Week-Med-Cabinet-6/DS/mark-dev/data/cannabis.csv"
df = pd.read_csv(LEAFLY_CSV_URL)
df.head()

In [0]:
# (rows, columns) of the raw frame, before missing values are dropped.
df.shape

In [0]:
# Keep only rows with no missing value in any column.
df = df.dropna(axis=0, how='any')


In [0]:
# (rows, columns) after dropping rows with missing values.
df.shape

In [0]:
# Summary statistics for the non-numeric columns only.
df.describe(exclude='number')

In [0]:
pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip


In [0]:

# Imported here, after the install cell above, because the package may not be
# available beforehand. Presumably this import is what makes
# DataFrame.profile_report() available -- confirm against the package docs.
import pandas_profiling
# Build and display the profiling report for the cleaned frame.
df.profile_report()

# Kushy API 

In [0]:
import pandas as pd 

In [0]:
# Load the Kushy strains export (snapshot dated 2017-11-14); rebinds `df`.
KUSHY_CSV_URL = "https://raw.githubusercontent.com/kushyapp/cannabis-dataset/master/Dataset/Strains/strains-kushy_api.2017-11-14.csv"
df = pd.read_csv(KUSHY_CSV_URL)

In [0]:
# Preview the first rows of the Kushy dataset.
df.head()

In [0]:
# (rows, columns) of the Kushy dataset.
df.shape

In [0]:
# Total number of missing cells across the whole frame.
missing_mask = df.isna().to_numpy()
missing_mask.sum()

In [0]:
# Column dtypes and non-null counts.
df.info()

In [0]:
# Percentage of missing values per column (the mean of the null mask is the
# fraction of missing rows).
df.isnull().mean() * 100


In [0]:
# Summary statistics for the non-numeric columns only.
df.describe(exclude="number")

In [0]:
# Summary statistics for the numeric columns only.
df.describe(include="number")