In [1]:
import pandas as pd
import re
import string
import matplotlib.pyplot as plt
import numpy as np
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from spacy.tokenizer import Tokenizer
from sklearn.neighbors import NearestNeighbors
import en_core_web_lg

In [2]:
df = pd.read_csv('medical.csv')
df.head()

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description,alments
0,303-og,indica,4.2,"Relaxed,Happy,Euphoric,Uplifted,Giggly","Citrus,Pungent,Earthy",The indica-dominant 303 OG is a Colorado strai...,"['multiple-sclerosis', 'seizures']"
1,818-og,indica,4.7,"Relaxed,Happy,Euphoric,Giggly,Sleepy","Earthy,Diesel,Flowery",Named after the telephone area code of the San...,['muscular-dystrophy']
2,acdc,hybrid,4.5,"Relaxed,Happy,Uplifted,Focused,Euphoric","Earthy,Pine,Woody",ACDC is a sativa-dominant phenotype of the hig...,['muscle-spasms']
3,afghan-haze,hybrid,4.3,"Sleepy,Relaxed,Giggly,Happy,Creative","Earthy,Flowery,Tea",Afghan Haze is a sativa-dominant hybrid that c...,['gastrointestinal-disorder']
4,afghan-skunk,indica,4.3,"Sleepy,Relaxed,Happy,Hungry,Giggly","Skunk,Woody,Sweet",This popular classic strain was originally dev...,['lack-of-appetite']


In [3]:
def make_into_list(words):
    lst = list(words.split(" "))
    return lst

In [4]:
df = df.replace('None', np.nan)
df = df.dropna().reset_index()

In [5]:
df['ailments'] = df['alments'].str.replace(',', ' ')


df.head()

Unnamed: 0,index,Strain,Type,Rating,Effects,Flavor,Description,alments,ailments
0,0,303-og,indica,4.2,"Relaxed,Happy,Euphoric,Uplifted,Giggly","Citrus,Pungent,Earthy",The indica-dominant 303 OG is a Colorado strai...,"['multiple-sclerosis', 'seizures']",['multiple-sclerosis' 'seizures']
1,1,818-og,indica,4.7,"Relaxed,Happy,Euphoric,Giggly,Sleepy","Earthy,Diesel,Flowery",Named after the telephone area code of the San...,['muscular-dystrophy'],['muscular-dystrophy']
2,2,acdc,hybrid,4.5,"Relaxed,Happy,Uplifted,Focused,Euphoric","Earthy,Pine,Woody",ACDC is a sativa-dominant phenotype of the hig...,['muscle-spasms'],['muscle-spasms']
3,3,afghan-haze,hybrid,4.3,"Sleepy,Relaxed,Giggly,Happy,Creative","Earthy,Flowery,Tea",Afghan Haze is a sativa-dominant hybrid that c...,['gastrointestinal-disorder'],['gastrointestinal-disorder']
4,4,afghan-skunk,indica,4.3,"Sleepy,Relaxed,Happy,Hungry,Giggly","Skunk,Woody,Sweet",This popular classic strain was originally dev...,['lack-of-appetite'],['lack-of-appetite']


In [6]:
print(spacy.__version__)  

2.3.2


In [7]:
nlp = spacy.load("en_core_web_lg")
df.head()

# The Tokenizer
tokenizer = Tokenizer(nlp.vocab)

# Make the tokens for description
combined_tokens = []
for txt in tokenizer.pipe(df['ailments'], batch_size=500):
    txt_tokens = [token.text for token in txt]
    combined_tokens.append(txt_tokens)
df['combined_tokens'] = combined_tokens
print(df['combined_tokens'].head())

0    [['multiple-sclerosis',  , 'seizures']]
1                   [['muscular-dystrophy']]
2                        [['muscle-spasms']]
3            [['gastrointestinal-disorder']]
4                     [['lack-of-appetite']]
Name: combined_tokens, dtype: object


In [8]:
def tokenize(document):
    
    doc = nlp(document)
    
    return [token.lemma_.strip() for token in doc if (token.is_stop != True) and (token.is_punct != True)]

In [9]:
# Instantiate vectorizer object
tfidf = TfidfVectorizer(stop_words = 'english',
                       ngram_range = (1,2),
                       max_features = 2000)

# Create a vocabulary and tf-idf score per document
dtm = tfidf.fit_transform(df['ailments'])
                         

# Get feature names to use as dataframe column headers
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())
nn = NearestNeighbors(n_neighbors=4, algorithm='kd_tree')
nn.fit(dtm)


# View Feature Matrix as DataFrame
print(dtm.shape)
dtm.head()

(296, 154)


Unnamed: 0,add,add adhd,adhd,adhd arthritis,adhd bipolar,adhd muscle,adhd spinal,aids,aids muscular,alzheimers,...,spasms,spasticity,spinal,spinal cord,stress,syndrome,tinnitus,tinnitus tourettes,tourettes,tourettes syndrome
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
dtm.shape

(296, 154)

[['gastrointestinal-disorder']]                       16
[['lack-of-appetite']]                                14
[['phantom-limb-pain']]                               10
[['bipolar-disorder']]                                10
[['tourettes-syndrome']]                               9
                                                      ..
[['cachexia',  , 'tourettes-syndrome']]                1
[['anxiety',  , 'insomnia']]                           1
[['alzheimers',  , 'parkinsons']]                      1
[['eye-pressure',  , 'hypertension']]                  1
[['anorexia',  , 'fibromyalgia',  , 'parkinsons']]     1
Name: combined_tokens, Length: 112, dtype: int64

In [14]:
ideal = ["""
lack-of-appetite,bipolar-disorder,hypertension
"""]

# Query the ideal descprition
new = tfidf.transform(ideal)
new

<1x154 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [15]:
nn.kneighbors(new.todense())

(array([[0.92869166, 0.92869166, 0.92869166, 0.92869166]]),
 array([[255, 183, 262, 190]]))

In [17]:
import pickle
# Dump the trained classifier (nn)  with Pickle
pickle_filename = 'ailments_model.pkl2'
pickled_model = open(pickle_filename, 'wb')  # Open the file to save as pkl file
pickle.dump(nn, pickled_model)
pickled_model.close() # Close the pickle instances

In [18]:
# Loading the saved model
ailments_model_pkl2 = open(pickle_filename, 'rb')
ailments_nn_model2 = pickle.load(ailments_model_pkl2)
print ("Loaded model :: ", ailments_nn_model2)  # print to verify

Loaded model ::  NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=4, p=2,
                 radius=1.0)


In [19]:
# Dump the trained classifier (tfidf)  with Pickle
pickle_filename_1 = 'ailments_tfidf.pkl2'
pickled_model_1 = open(pickle_filename_1, 'wb')  # Open the file to save as pkl file
pickle.dump(tfidf, pickled_model_1)
pickled_model_1.close() # Close the pickle instances

In [22]:
# Loading the saved model
ailments_model_pkl_1 = open(pickle_filename_1, 'rb')
ailments_tfidf_model2 = pickle.load(ailments_model_pkl_1)
print ("Loaded model :: ", tfidf_model2)  # print to verify

Loaded model ::  TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=2000,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)


In [23]:
df['combined_tokens'].sample(10)

201                                     [['anorexia']]
52                            [['muscular-dystrophy']]
245                                   [['depression']]
191                                     [['insomnia']]
65                                     [['arthritis']]
270                           [['multiple-sclerosis']]
60                                        [['asthma']]
92                     [['gastrointestinal-disorder']]
235    [['lack-of-appetite',  , 'tourettes-syndrome']]
15                                      [['add-adhd']]
Name: combined_tokens, dtype: object

In [24]:
ideal2 = ['anorexia,muscular-dystropy,insomnia,add-adhd']

In [25]:
new2 = tfidf_model2.transform(ideal2)
new2

<1x154 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [27]:
ailments_nn_model2.kneighbors(new2.todense())

(array([[0.72857198, 0.72857198, 0.99004848, 0.99461667]]),
 array([[ 15,  84, 162,  45]]))

In [32]:
df['Strain'][162]

'mochi'

In [33]:
df['ailments'][162]

"['add-adhd'  'arthritis'  'muscular-dystrophy']"

In [38]:
import json
def recommend(user_input):
    temp_df = ailments_nn_model2.kneighbors(tfidf_model2.transform([user_input]).todense())[1]
    

    #print(temp_df)
    
    for i in range(4):
        info = df.loc[temp_df[0][i]]['Strain']
        info_effects = df.loc[temp_df[0][i]]['Effects']
        info_flavor = df.loc[temp_df[0][i]]['Flavor']
        info_description = df.loc[temp_df[0][i]]['Description']
        info_rating = df.loc[temp_df[0][i]]['Rating']
        info_ailments = df.loc[temp_df[0][i]]['ailments']

        
        print(json.dumps(info))
        print(json.dumps(info_ailments))
        print(json.dumps(info_effects))
        print(json.dumps(info_flavor))
        print(json.dumps(info_description))
        print(json.dumps(info_rating))
        

        
        #return json.dumps(info)  #for engineeers, the return does not work in jupyter lab.  Should work in vsCode.
        #return json.dumps(info_ailments)
        #return json.dumps(info_effects)
        #return json.dumps(info_flavor)
        #return json.dump(info_description)
        #return json.dumps(info_rating)
        


In [39]:
recommend('for arthritis')

"medibud"
"Uplifted,Relaxed,Euphoric,Happy,Focused"
"Sweet,Earthy,Skunk"
"Medibud (or Medi Bud) is an uplifting hybrid strain of unknown genetic origins, but many attribute its upbeat, active effects to sativa parentage. Others claim Medibud is a 60/40 indica-dominant cross, so it\u2019s possible that this strain expresses itself in various phenotypes. Typically, you can expect high-energy euphoria and heightened sensory awareness from Medibud, but its indica-leaning phenotypes may induce heavier, more relaxing effects.\u00a0"
4.5
"['arthritis']"
"la-chocolat"
"Relaxed,Happy,Euphoric,Sleepy,Hungry"
"Earthy,Coffee,Sweet"
"LA Chocolate, bred by DNA Genetics, is a 60/40 indica-dominant strain parented by their cherished LA Confidential indica and Chocolope sativa. This power-couple collectively passes on the best of their qualities: thick resin production, heavy yields, and a sweet, earthy aroma of chocolate and coffee. Its soothing physical effects sink through the body, keeping you rela