# TFIDF KNN Approach 

In [0]:
import pandas as pd

import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

import pickle

In [0]:
# Create spaCy's English language object and build a Tokenizer from its
# vocabulary alone.
# NOTE(review): `tokenizer` is not passed to any vectorizer in the visible
# cells - confirm whether it is still needed.
nlp = English()
tokenizer = Tokenizer(nlp.vocab)

In [0]:
# Load the strain dataset from the project repository and preview the first rows.
df = pd.read_csv(
    "https://raw.githubusercontent.com/Build-Week-Med-Cabinet-6/DS/mark-dev/data/cannabis.csv"
)
df.head()

In [0]:
# Remove rows with missing values first: the positional lookup below is taken
# on the already-filtered frame, so the order of these steps matters.
df = df.dropna()
# NOTE(review): position 149 is hard-coded with no explanation in view -
# confirm why this particular row is excluded.
# Afterwards the index is rebuilt so labels run 0..n-1 again.
df = df.drop(index=df.index[149]).reset_index(drop=True)




In [0]:
# Build one comma-separated text field per strain: effects first, then flavors.
df['Combined'] = df['Effects'].str.cat(df['Flavor'], sep=',')


In [0]:
# TF-IDF vectorizer for the combined effects/flavors text, using
# scikit-learn's built-in English stop-word list.
dtm_combined_tf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and compute the TF-IDF weights in one pass.
# astype('U') converts every entry to a unicode string before vectorizing.
dtm_combined = dtm_combined_tf.fit_transform(df['Combined'].values.astype('U'))

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
try:
    combined_terms = dtm_combined_tf.get_feature_names_out()
except AttributeError:
    combined_terms = dtm_combined_tf.get_feature_names()

# toarray() yields a plain ndarray, whereas todense() returns the legacy
# np.matrix type. One column per vocabulary term, one row per strain.
dtm_combined = pd.DataFrame(dtm_combined.toarray(), columns=combined_terms)
dtm_combined.head()

In [0]:
# Index the TF-IDF rows with a ball tree so that each query returns its
# five nearest strains.
nn = NearestNeighbors(algorithm='ball_tree', n_neighbors=5)
nn.fit(dtm_combined)

In [0]:
# Sample query: the desired effects and flavors as one comma-separated
# document, wrapped in a list because it is handed to the vectorizer as a
# collection of documents.
ideal_strain = [
    'Creative,Energetic,Tingly,Euphoric,Relaxed,Earthy,Sweet,Citrus',
]


In [0]:
# Vectorize the query with the already-fitted TF-IDF vocabulary, then look up
# its nearest neighbors.
new = dtm_combined_tf.transform(ideal_strain)
# Convert with toarray(): todense() produces an np.matrix, which
# scikit-learn 1.2+ rejects with a TypeError.
results = nn.kneighbors(new.toarray())

In [0]:
# Results are returned in a tuple of arrays. Per the scikit-learn
# NearestNeighbors.kneighbors documentation, results[0] holds the distances
# and results[1] the indices of the matching rows.
results

In [0]:
# Confirm the container type of the query result.
type(results)

In [0]:
# Pull the strain name from 1st value (0) of the 1st array (0) of the 2nd tuple (1) - the 0 index
df['Strain'][results[1][0][0]]


In [0]:
df['Combined'][results[1][0][0]]

In [0]:
df['Description'][results[1][0][0]]

### Second Value 

In [0]:
# Pull the strain name from 2nd value (1) of the 1st array (0) of the 2nd tuple (1) - the 1972 index
df['Strain'][results[1][0][1]]

In [0]:
# Pull the criteria from 2nd value (1) of the 1st array (0) of the 2nd tuple (1) - the 1972 index
df['Combined'][results[1][0][1]]

In [0]:
# Pull the criteria from 2nd value (1) of the 1st array (0) of the 2nd tuple (1) - the 1972 index
df['Description'][results[1][0][1]]

In [0]:
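# Sketch (not executed): gather the name, criteria, and description for all
# five recommendations and format them for output.
# NOTE: `strains` is not defined in the cells above - the loaded frame is
# `df`, so substitute it before uncommenting.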
# rec_str = [strains['Strain'][results[1][0][i]] for i in range(5)]
# rec_crit = [strains['Combined'][results[1][0][i]] for i in range(5)]
# rec_str_desc = [strains['Description'][results[1][0][i]] for i in range(5)]

# rec1 = rec_str[0] + ' * ' + rec_crit[0] + ' * ' + rec_str_desc[0]
# rec2 = rec_str[1] + ' * ' + rec_crit[1] + ' * ' + rec_str_desc[1]
# rec3 = rec_str[2] + ' * ' + rec_crit[2] + ' * ' + rec_str_desc[2]
# rec4 = rec_str[3] + ' * ' + rec_crit[3] + ' * ' + rec_str_desc[3]
# rec5 = rec_str[4] + ' * ' + rec_crit[4] + ' * ' + rec_str_desc[4]


### Pickling step here

In [0]:
# Pickle the document-term matrix and the fitted vectorizer for use in the
# prediction step. Each file is opened in a context manager so the handle is
# flushed and closed as soon as the dump finishes, instead of being left open.
with open('/content/dtm_combined.pkl', 'wb') as dtm_file:
    pickle.dump(dtm_combined, dtm_file)
with open('/content/dtm_combined_tf.pkl', 'wb') as tf_file:
    pickle.dump(dtm_combined_tf, tf_file)

# Effects

In [0]:
# TF-IDF vectorizer for the effects text only, using scikit-learn's built-in
# English stop-word list.
dtm_effects_tf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and compute the TF-IDF weights in one pass.
# astype('U') converts every entry to a unicode string before vectorizing.
dtm_effects = dtm_effects_tf.fit_transform(df['Effects'].values.astype('U'))

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
try:
    effects_terms = dtm_effects_tf.get_feature_names_out()
except AttributeError:
    effects_terms = dtm_effects_tf.get_feature_names()

# toarray() yields a plain ndarray, whereas todense() returns the legacy
# np.matrix type. One column per vocabulary term, one row per strain.
dtm_effects = pd.DataFrame(dtm_effects.toarray(), columns=effects_terms)
dtm_effects.head()

In [0]:
# Preview the first rows of the working frame again.
df.head()

# Flavors

In [0]:
# TF-IDF vectorizer for the flavor text only, using scikit-learn's built-in
# English stop-word list.
dtm_flavors_tf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and compute the TF-IDF weights in one pass.
# astype('U') converts every entry to a unicode string before vectorizing.
dtm_flavors = dtm_flavors_tf.fit_transform(df['Flavor'].values.astype('U'))

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
try:
    flavor_terms = dtm_flavors_tf.get_feature_names_out()
except AttributeError:
    flavor_terms = dtm_flavors_tf.get_feature_names()

# toarray() yields a plain ndarray, whereas todense() returns the legacy
# np.matrix type. One column per vocabulary term, one row per strain.
dtm_flavors = pd.DataFrame(dtm_flavors.toarray(), columns=flavor_terms)
dtm_flavors.head()

# Leafly API EDA 


In [0]:
import pandas as pd 

In [0]:
# Reload the strain dataset from the project repository; this rebinds `df`
# to the raw, uncleaned data. Then preview the first rows.
df = pd.read_csv("https://raw.githubusercontent.com/Build-Week-Med-Cabinet-6/DS/mark-dev/data/cannabis.csv")
df.head()

In [0]:
# (rows, columns) before any cleaning.
df.shape

In [0]:
# Discard every row that has at least one missing value.
df = df.dropna()


In [0]:
# (rows, columns) after dropping incomplete rows, for comparison with the
# shape shown before the drop.
df.shape

In [0]:
# Summary statistics for the non-numeric columns only.
df.describe(exclude='number')

In [0]:
pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip


In [0]:

# Generate a profiling report for the current frame.
# NOTE(review): profile_report() is not defined anywhere in view; presumably
# importing pandas_profiling registers it on DataFrame - confirm.
import pandas_profiling
df.profile_report()

# Kushy API 

In [0]:
import pandas as pd 

In [0]:
# Load the Kushy strains export (snapshot dated 2017-11-14 per the file name);
# this rebinds `df` to a different dataset than the one used earlier.
df = pd.read_csv("https://raw.githubusercontent.com/kushyapp/cannabis-dataset/master/Dataset/Strains/strains-kushy_api.2017-11-14.csv")

In [0]:
# Preview the first rows.
df.head()

In [0]:
# (rows, columns) of the raw export.
df.shape

In [0]:
# Total number of missing cells across the whole frame.
df.isnull().values.sum()

In [0]:
# Column dtypes and non-null counts.
df.info()

In [0]:
# Share of missing values in each column, expressed as a percentage of rows.
df.isnull().sum().div(len(df)).mul(100)


In [0]:
# Summary statistics for the non-numeric columns.
df.describe(exclude="number")

In [0]:
# Summary statistics for the numeric columns.
df.describe(include="number")