### MedCab4

Notebook co-authored by Brad Brauser and Peggy Krom

In [1]:
# Necessary imports
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import spacy

In [2]:
# Dataset from Peggy's previous MedCab4 build

url1 = 'https://raw.githubusercontent.com/PeggyK1/med_cab3/main/data/strains.csv'
df1 = pd.read_csv(url1)
# Dropping unneeded column
df1 = df1.drop(columns = ['Unnamed: 0'])
#Preping it for tokenizatiopn
df1['name'] = df1['name'].replace('-', ' ', regex=True).str.lower()
# Dropping unneeded collumns
df1 = df1.drop(['id'], axis = 1)
df1.drop(df1.iloc[:, 5:43], inplace = True, axis = 1) 

df1.head()

Unnamed: 0,name,type,effects,ailment,flavor
0,100 og,Hybrid,Focused,Depression,Citrus
1,afghani bullrider,Hybrid,"Uplifted, Relaxed, Happy, Euphoric, Dry Mouth,...","Stress, Depression, Insomnia, Pain","Sweet, Pine, Earthy"
2,aloha,Hybrid,"Energetic, Uplifted, Happy, Creative, Focused,...","Depression, Stress, Pain","Sweet, Citrus"
3,amnesia haze,Hybrid,"Happy, Uplifted, Euphoric, Energetic, Creative...","Stress, Depression, Pain","Citrus, Lemon, Earthy"
4,bc sweet tooth,Hybrid,"Uplifted, Happy, Relaxed, Sleepy, Euphoric, Dr...","Stress, Insomnia, Depression, Nausea, Pain","Sweet, Honey"


In [3]:
# Model I found from a previous study guide
url2 = 'https://raw.githubusercontent.com/bundickm/Study-Guides/master/data/cannabis.csv'
df2 = pd.read_csv(url2)

# Prepping for tokenization
df2['Strain'] = df2['Strain'].replace('-', ' ', regex=True)
df2 = df2.rename(columns = {'Strain':'name'})
df2['name'] = df2['name'].str.lower()

df2.head()

Unnamed: 0,name,Type,Rating,Effects,Flavor,Description
0,100 og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98 white widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13 dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24k gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


In [4]:
# Merging the two datasets
strains = pd.merge(df1, df2, on='name')
# Dropping NaN values
strains = strains.dropna()
# Dropping duplicate and unneeded columns
strains = strains.drop(columns = ['Type', 'Effects', 'Flavor'])

strains.head(10)

Unnamed: 0,name,type,effects,ailment,flavor,Rating,Description
0,100 og,Hybrid,Focused,Depression,Citrus,4.0,$100 OG is a 50/50 hybrid strain that packs a ...
1,aloha,Hybrid,"Energetic, Uplifted, Happy, Creative, Focused,...","Depression, Stress, Pain","Sweet, Citrus",4.2,Aloha is a sativa strain that leaves users wit...
2,amnesia haze,Hybrid,"Happy, Uplifted, Euphoric, Energetic, Creative...","Stress, Depression, Pain","Citrus, Lemon, Earthy",4.3,"With earthy flavors of lemons and citrus, Amne..."
3,bc sweet tooth,Hybrid,"Uplifted, Happy, Relaxed, Sleepy, Euphoric, Dr...","Stress, Insomnia, Depression, Nausea, Pain","Sweet, Honey",4.3,"Developed in British Columbia by BC Bud Depot,..."
4,berkeley,Hybrid,"Talkative, Happy, Uplifted, Energetic, Focused...","Stress, Depression, Lack of Appetite, Pain","Citrus, Sweet",4.3,Berkeley is rumored to be the super-potent bot...
5,berry white,Hybrid,"Relaxed, Happy, Euphoric, Uplifted, Sleepy, Dr...","Stress, Pain, Depression, Insomnia","Berry, Sweet, Blueberry",4.4,Berry White is a hybrid strain that is the off...
6,big bud,Hybrid,"Relaxed, Sleepy, Hungry, Happy, Euphoric, Dry ...","Stress, Pain, Insomnia, Depression, Muscle Spasms",Earthy,3.9,Developed in the USA before being brought to t...
7,big wreck,Hybrid,"Euphoric, Relaxed, Uplifted, Tingly, Happy, Dr...","Stress, Insomnia, Depression, Lack of Appetite...",Earthy,4.1,Big Wreck is the ndica-dominant cross of Big B...
8,black domina,Hybrid,"Relaxed, Sleepy, Euphoric, Happy, Uplifted, Dr...","Stress, Depression, Pain, Insomnia, Lack of Ap...","Pine, Pepper",4.2,Working with four prime examples of Cannabis A...
9,blackberry kush,Hybrid,"Relaxed, Sleepy, Happy, Euphoric, Hungry, Dry ...","Stress, Pain, Insomnia, Depression, Nausea","Berry, Sweet, Earthy",4.3,This mostly indica strain is a mix of Afghani ...


In [5]:
strains['all'] = strains['type'].str.cat(strains['effects'], sep = ", ")
strains['all'] = strains['all'].str.cat(strains['ailment'], sep = ", ")
strains['all'] = strains['all'].str.cat(strains['flavor'], sep = ", ")

strains.head()

Unnamed: 0,name,type,effects,ailment,flavor,Rating,Description,all
0,100 og,Hybrid,Focused,Depression,Citrus,4.0,$100 OG is a 50/50 hybrid strain that packs a ...,"Hybrid, Focused, Depression, Citrus"
1,aloha,Hybrid,"Energetic, Uplifted, Happy, Creative, Focused,...","Depression, Stress, Pain","Sweet, Citrus",4.2,Aloha is a sativa strain that leaves users wit...,"Hybrid, Energetic, Uplifted, Happy, Creative, ..."
2,amnesia haze,Hybrid,"Happy, Uplifted, Euphoric, Energetic, Creative...","Stress, Depression, Pain","Citrus, Lemon, Earthy",4.3,"With earthy flavors of lemons and citrus, Amne...","Hybrid, Happy, Uplifted, Euphoric, Energetic, ..."
3,bc sweet tooth,Hybrid,"Uplifted, Happy, Relaxed, Sleepy, Euphoric, Dr...","Stress, Insomnia, Depression, Nausea, Pain","Sweet, Honey",4.3,"Developed in British Columbia by BC Bud Depot,...","Hybrid, Uplifted, Happy, Relaxed, Sleepy, Euph..."
4,berkeley,Hybrid,"Talkative, Happy, Uplifted, Energetic, Focused...","Stress, Depression, Lack of Appetite, Pain","Citrus, Sweet",4.3,Berkeley is rumored to be the super-potent bot...,"Hybrid, Talkative, Happy, Uplifted, Energetic,..."


In [6]:
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

tfidf = TfidfVectorizer(stop_words='english',
        ngram_range=(1, 3),
        max_features=5000,
    )
nn = KNeighborsClassifier(n_neighbors=10, algorithm='auto')
skf = StratifiedKFold(n_splits=2)
pipeline = Pipeline([
    ('vect', tfidf),
    ('clf', nn)
])
param_grid = {
    'vect__stop_words': [None],
    'vect__ngram_range': [(1,2), (1,3)],
    'vect__min_df': (0, 0.15),
    'vect__max_df': (0.55, 1.0),
}
gs = RandomizedSearchCV(estimator=pipeline, param_distributions=param_grid, cv=skf, n_jobs=-1, verbose=10, return_train_score=True)

In [7]:
feature = strains['all']
target = strains['name']

In [8]:
features = tfidf.fit_transform(feature)
features = pd.DataFrame(features.todense(), columns=tfidf.get_feature_names())

In [9]:
targets = tfidf.fit_transform(target)
targets = pd.DataFrame(targets.todense(), columns=tfidf.get_feature_names())

In [10]:
features.head()

Unnamed: 0,ammonia,ammonia citrus,ammonia citrus coffee,ammonia earthy,ammonia sweet,anxious,anxious depression,anxious depression citrus,anxious depression insomnia,anxious depression lack,anxious depression muscle,anxious depression pain,anxious depression stress,anxious depression sweet,anxious dry,anxious dry mouth,anxious inflammation,anxious inflammation depression,anxious inflammation lack,anxious inflammation pain,anxious inflammation stress,anxious insomnia,anxious insomnia lack,anxious insomnia muscle,anxious insomnia pain,anxious insomnia stress,anxious lack,anxious lack appetite,anxious muscle,anxious muscle spasms,anxious nausea,anxious nausea lack,anxious pain,anxious pain depression,anxious pain inflammation,anxious pain insomnia,anxious pain lack,anxious pain muscle,anxious pain nausea,anxious pain stress,...,uplifted insomnia pain,uplifted paranoid,uplifted paranoid anxious,uplifted paranoid dry,uplifted paranoid muscle,uplifted relaxed,uplifted relaxed anxious,uplifted relaxed creative,uplifted relaxed dry,uplifted relaxed energetic,uplifted relaxed euphoric,uplifted relaxed focused,uplifted relaxed happy,uplifted relaxed hungry,uplifted relaxed insomnia,uplifted relaxed sleepy,uplifted relaxed talkative,uplifted sleepy,uplifted sleepy creative,uplifted sleepy dry,uplifted sleepy euphoric,uplifted sleepy focused,uplifted sleepy happy,uplifted sleepy paranoid,uplifted sleepy tingly,uplifted talkative,uplifted talkative creative,uplifted talkative depression,uplifted talkative dry,uplifted talkative energetic,uplifted talkative euphoric,uplifted talkative focused,uplifted talkative horny,uplifted tingly,uplifted tingly creative,uplifted tingly depression,uplifted tingly dry,uplifted tingly happy,vanilla,vanilla sweet
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.079645,0.162546,0.0,0.0,0.0,0.0,0.0,0.179037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.076254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.067459,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.178115,0.178115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
example = pd.DataFrame({'ailment': ['insomnia'],
                        'type': ['indica'],
                        'effects': ['focused'],
                        'flavor': ['earthy']})

In [12]:
ex = tfidf.fit_transform(example)
ex

<4x4 sparse matrix of type '<class 'numpy.float64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [22]:
gs.fit(strains['all'], strains['name'])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1602s.) Setting batch_size=2.


Fitting 2 folds for each of 8 candidates, totalling 16 fits


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done   7 out of  16 | elapsed:    0.4s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  16 out of  16 | elapsed:    0.7s finished


RandomizedSearchCV(cv=StratifiedKFold(n_splits=2, random_state=None, shuffle=False),
                   error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('vect',
                                              TfidfVectorizer(analyzer='word',
                                                              binary=False,
                                                              decode_error='strict',
                                                              dtype=<class 'numpy.float64'>,
                                                              encoding='utf-8',
                                                              input='content',
                                                              lowercase=True,
                                                              max_df=1.0,
                                                              max_features=5000,
                                                 

In [23]:
gs.predict(example)

array(['biochem', 'biochem', 'biochem', 'biochem'], dtype=object)

In [31]:
example2 = ['Insomnia, Grape']
gs.predict(example2)

array(['california grapefruit'], dtype=object)

In [16]:
from joblib import dump
dump(gs, 'gs_2.joblib', compress=True)

['gs_2.joblib']