In [1]:
# Imports
import pandas as pd

## IMPORT STRAIN & SYMPTOM DATA

In [2]:
# Read in the symptoms_updated.csv file
df_data = pd.read_csv("https://dsfiles.dananderson.dev/files/symptoms_updated.csv")

In [3]:
df_data.head()

Unnamed: 0,index,Strain,Type,Rating,Effects,Flavor,symptoms_diseases,Description,text_all,Effects_and_Flavor,tokens,description_formatted
0,0,100-Og,hybrid,4.0,"Creative, Energetic, Tingly, Euphoric, Relaxed","Earthy, Sweet, Citrus","ms, pain, pain, spasticity,",$100 OG is a 50/50 hybrid strain that packs a ...,Creative Energetic Tingly Euphoric Relaxed...,"Creative, Energetic, Tingly, Euphoric, Relaxed...","['creative', 'energetic', 'tingly', 'euphoric'...",$100 og is a 50/50 hybrid strain that packs a ...
1,1,98-White-Widow,hybrid,4.7,"Relaxed, Aroused, Creative, Happy, Energetic","Flowery, Violet, Diesel","spasticity,",The ‚Äò98 Aloha White Widow is an especially p...,Relaxed Aroused Creative Happy Energetic ...,"Relaxed, Aroused, Creative, Happy, Energetic, ...","['relaxed', 'aroused', 'creative', 'happy', 'e...",the ‚Äò98 aloha white widow is an especially p...
2,2,1024,sativa,4.4,"Uplifted, Happy, Relaxed, Energetic, Creative","Spicy/Herbal, Sage, Woody","pain, pain, spasticity,",1024 is a sativa-dominant hybrid bred in Spain...,Uplifted Happy Relaxed Energetic Creative ...,"Uplifted, Happy, Relaxed, Energetic, Creative,...","['uplifted', 'happy', 'relaxed', 'energetic', ...",1024 is a sativa-dominant hybrid bred in spain...
3,3,13-Dawgs,hybrid,4.2,"Tingly, Creative, Hungry, Relaxed, Uplifted","Apricot, Citrus, Grapefruit","appetite, appetite, depression, spasticity,",13 Dawgs is a hybrid of G13 and Chemdawg genet...,Tingly Creative Hungry Relaxed Uplifted A...,"Tingly, Creative, Hungry, Relaxed, Uplifted, A...","['tingly', 'creative', 'hungry', 'relaxed', 'u...",13 dawgs is a hybrid of g13 and chemdawg genet...
4,4,24K-Gold,hybrid,4.6,"Happy, Relaxed, Euphoric, Uplifted, Talkative","Citrus, Earthy, Orange","spasticity,",Also known as Kosher Tangie 24k Gold is a 60%...,Happy Relaxed Euphoric Uplifted Talkative ...,"Happy, Relaxed, Euphoric, Uplifted, Talkative,...","['happy', 'relaxed', 'euphoric', 'uplifted', '...",also known as kosher tangie 24k gold is a 60%...


## SPLIT DESCRIPTIONS OF EFFECTS, FLAVORS, AND SYMPTOMS

In [4]:
# 'split2list' splits a comma delimited string and returns a list of lower case tokens
def split2list(val):
  ret_list = []

  # Is the passed value Nan?
  if pd.isna(val):
    return ['none']

  # Is the value an empty string?
  if val == "":
    return ['none']

  # Lower case the input value
  tmp_lower = val.lower()

  # Is the value "none"?
  if tmp_lower == "none":
    return ['none']

  # Split the string value into tokens delimited by commas
  tmp_list = tmp_lower.split(',')

  # Is tmp_list empty?
  if len(tmp_list) == 0:
    return ['none']

  # Iterate through the temp list and trim values
  for elm in tmp_list:
    # If the list element is all spaces then skip
    if elm.strip() == "":
      continue

    # Otherwise strip the string of leading and trailing spaces and add to the
    #  return list
    ret_list.append(elm.strip())

  # Return the list
  return ret_list


In [5]:
# Create a working copy of the initial dataframe
df_data_upd = df_data.copy()

In [6]:
# Create dataframe columns which consist of lists of tokens for Effects, Flavors, and Symptoms respectively
df_data_upd["Effects_list"]           = df_data_upd["Effects"].apply(split2list)
df_data_upd["Flavor_list"]            = df_data_upd["Flavor"].apply(split2list)
df_data_upd["symptoms_diseases_list"] = df_data_upd["symptoms_diseases"].apply(split2list)


In [7]:
# Sample the 'Effects_list' column
df_data_upd["Effects_list"].sample(10)

573       [hungry, relaxed, tingly, uplifted, creative]
1551    [euphoric, happy, relaxed, creative, energetic]
1649    [happy, uplifted, creative, focused, talkative]
975         [happy, relaxed, euphoric, focused, sleepy]
870        [relaxed, happy, euphoric, uplifted, sleepy]
331       [energetic, relaxed, euphoric, happy, sleepy]
1311     [focused, energetic, uplifted, happy, relaxed]
35      [focused, uplifted, happy, euphoric, talkative]
488      [relaxed, happy, euphoric, uplifted, creative]
2240         [relaxed, happy, euphoric, tingly, hungry]
Name: Effects_list, dtype: object

In [8]:
# Sample the 'Flavor_list' column
df_data_upd["Flavor_list"].sample(10)

1328           [minty, orange, lemon]
1948           [sweet, citrus, berry]
793             [pine, woody, citrus]
561         [diesel, flowery, earthy]
1045        [earthy, pungent, cheese]
1018                           [none]
1955         [citrus, diesel, orange]
1772            [pine, woody, earthy]
1873             [grape, pine, nutty]
2347    [earthy, sweet, spicy/herbal]
Name: Flavor_list, dtype: object

In [9]:
# Sample the 'symptoms_diseases_list' column
df_data_upd["symptoms_diseases_list"].sample(10)

2324                              [ms, spasticity]
114                                   [spasticity]
1555    [headache, nausea, pain, pain, spasticity]
1351                                  [spasticity]
907                                   [spasticity]
2005                        [insomnia, spasticity]
1259                                  [spasticity]
1024                              [ms, spasticity]
2022               [migraines, nausea, spasticity]
1386                             [hiv, spasticity]
Name: symptoms_diseases_list, dtype: object

In [10]:
map_effects = {}
map_flavors = {}
map_symptoms = {}

# gen_unique_values takes an inbound list of strings and adds them as keys to a map
def gen_unique_values(lst, mp):
  # Iterate through the list
  for elm in lst:
    # Is elm an empty string? Error condition
    if elm == "":
      print("error - encountered an empty string")
      return 

    mp[elm] = 0
    return 


In [11]:
# Generate maps that contain unique Effects, Flavors, and Symtoms tokens
df_data_upd["Effects_list"].apply(gen_unique_values, mp=map_effects)
df_data_upd["Flavor_list"].apply(gen_unique_values, mp=map_flavors)
df_data_upd["symptoms_diseases_list"].apply(gen_unique_values, mp=map_symptoms)

0       None
1       None
2       None
3       None
4       None
5       None
6       None
7       None
8       None
9       None
10      None
11      None
12      None
13      None
14      None
15      None
16      None
17      None
18      None
19      None
20      None
21      None
22      None
23      None
24      None
25      None
26      None
27      None
28      None
29      None
        ... 
2320    None
2321    None
2322    None
2323    None
2324    None
2325    None
2326    None
2327    None
2328    None
2329    None
2330    None
2331    None
2332    None
2333    None
2334    None
2335    None
2336    None
2337    None
2338    None
2339    None
2340    None
2341    None
2342    None
2343    None
2344    None
2345    None
2346    None
2347    None
2348    None
2349    None
Name: symptoms_diseases_list, Length: 2350, dtype: object

## CLEAN STRAIN DESCRIPTIONS

In [12]:
# Steps to Clean the Strain Descriptions
import re
import pandas as pd

# Regular expression used to remove non-standard characters
rgxNotStdChars = re.compile(r'[^a-zA-z0-9.,!?/:;\"\'\s]')
rgxMultWhtSpce = re.compile(r'\s{2,}')

# 'retain_std_chars' takes a string and returns that string with non-standard
#    characters removed
def retain_std_chars(val):
  # Is the passed value NaN?
  if pd.isna(val):
    return 'none'

  # Is the passed not a string?
  if type(val) != str:
    return 'none'

  # Is the value an empty string?
  if val == "":
    return 'none'

  # Lower case the input value
  tmp_lower = val.lower()

  # Is the value "none"?
  if tmp_lower == "none":
    return 'none'

  # Remove non-standard characters
  tmp_std = re.sub(rgxNotStdChars, "", tmp_lower)

  # Convert multiple whitespace characters to one whitespace character
  tmp_wht = re.sub(rgxMultWhtSpce, "", tmp_std)

  # Strip leading and trailing whitespace
  tmp_rtn = tmp_wht.strip()
  
  return tmp_rtn

In [13]:
df_data_upd['Description_cleaned'] = df_data_upd['Description'].apply(retain_std_chars)

## GENERATE WORK 'TOKENS' FOR THE STRAIN DESCRIPTIONS & CHARACTERISTICS

In [14]:
!pip install spacy

You should consider upgrading via the '/Users/danoand/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


In [15]:
!python -m spacy download en_core_web_sm

You should consider upgrading via the '/Users/danoand/anaconda3/bin/python -m pip install --upgrade pip' command.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [16]:
# Use the spacy library to generate strain description tokens
import spacy

# Instantiate a spacy object
nlp = spacy.load('en_core_web_sm')

# tnkize_text takes a string and returns a list of tokens generated via the spacy library
def tnkize_text(val):
  tmp_list = []
  tmp_doc = nlp(val)

  # Iterate through the text's tokenized objects
  for tkn in tmp_doc:
    tmp_list.append(tkn.text)

  return tmp_list


In [17]:
# Generate a list of description tokens
df_data_upd['Description_tokens'] =  df_data_upd['Description_cleaned'].apply(tnkize_text)

In [18]:
df_data_upd['Description_tokens'].sample(20)

1528    [orange, dream, is, a, sativadominant, hybrid,...
190     [berry, noir, is, an, indica, strain, that, co...
2310    [white, zombie, is, a, potent, hybrid, strain,...
2205    [twista, from, denvers, green, solution, is, a...
1220    [originating, in, los, angelescaliforniala, ku...
276     [a, descendent, of, the, famous, blue, dream, ...
696     [dr, ., bubbles, is, a, delicious, strain, wit...
950     [green, lantern, is, a, mostly, sativa, strain...
1896    [smooth, operator, is, an, ideal, cut, for, co...
793     [firewalker, og, is, a, sativadominant, cross,...
1477    [northwest, pineapple, is, an, indicadominant,...
440     [reeferman, seeds, created, the, vibrant, camb...
1501    [og, strawberrygrown, on, the, southwest, ridg...
998     [hawaiian, cookies, is, a, sativadominant, hyb...
1336    [this, hybrid, from, the, bank, cannabis, gene...
128     [arjan, 's, strawberry, haze, was, created, by...
1511    [old, mother, sativa, is, an, enormous, plant,...
524     [choco

In [19]:
# remove_one_letter_words removes one character strings from a list of strings
def remove_one_letter_words(lst):
  ret_list = []

  # Iterate through the passed list
  for elm in lst:
    # Is the element not a string?
    if type(elm) != str:
      # Element not a string, skip
      continue

    # Does the element have a length of 0 or 1? Skip
    if elm == "" or len(elm) == 1:
      continue

    ret_list.append(elm)

  # Return the updated list
  return ret_list


# gen_all_tokens creates a column consisting of description, effects, 
#    flavors, and symptoms tokens ("ALL_TOKENTS")
def gen_all_tokens(DF):
  DF_WRK = DF.copy()

  DF_WRK['ALL_TOKENS'] = DF_WRK['Description_tokens'] \
    + DF_WRK['Effects_list'] \
    + DF_WRK['Flavor_list'] \
    + DF_WRK['symptoms_diseases_list']

  return DF_WRK

In [20]:
# Create a column ('ALL_TOKENS') of strain tokens that
#   consist of the description, effects, flavors, and symptom tokens
df_data_upd = gen_all_tokens(df_data_upd)

# Remove all one character word tokens from the 'ALL_TOKENS' column
df_data_upd['ALL_TOKENS'] = df_data_upd['ALL_TOKENS'].apply(remove_one_letter_words)

In [21]:
df_data_upd.to_csv('df.csv')

## VECTORIZE EACH STRAIN'S DESCRIPTION & CHARACTERISTICS

In [22]:
# Use TFIDF and vectorize the strain tokens ('ALL_TOKENS')
from sklearn.feature_extraction.text import TfidfVectorizer

def dummy_func(doc):
    return doc

tfidf = TfidfVectorizer(
    analyzer='word',
    stop_words = 'english',
    tokenizer=dummy_func,
    preprocessor=dummy_func,
    token_pattern=None) 

In [25]:
# Create a vocabulary and tf-idf score per document
dtm = tfidf.fit_transform(df_data_upd['ALL_TOKENS'])

# Get feature names to use as dataframe column headers
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

# View Feature Matrix as DataFrame
print(dtm.shape)
dtm.head()

(2350, 16383)


Unnamed: 0,'d,'s,...,0.14the,0.18,0.23,0.3,0.36,0.38,0.47,....1,zion,zipping,zkittlez,zombie,zombiewith,zone,zonethe,zoning,zoom,zs
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## CONSTRUCT A NEAREST NEIGHBORS MODEL TO GENERATE RECOMMENDATIONS

In [26]:
# Define a Nearest Neighbors model on which to compare incoming text
from sklearn.neighbors import NearestNeighbors

# Fit on the nearest neighbors model TF-IDF feature matrix created above 
nn = NearestNeighbors(n_neighbors=8, algorithm='kd_tree')
nn.fit(dtm)

NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=8, p=2,
                 radius=1.0)

In [27]:
nn.kneighbors([dtm.iloc[10].values])

(array([[0.        , 1.06450694, 1.19133723, 1.26735898, 1.27620083,
         1.29822153, 1.30003998, 1.30162712]]),
 array([[  10, 1929,  741,  649, 1372,  543, 1938, 1589]]))

## GENERATE A SAMPLE RECOMMENDATION

In [28]:
# Score a new document and return it's nearest neighbors
new_doc_score = tfidf.transform(["I want to feel super relaxed, yet energetic and creative"])

# Execute the nearest neighbors model using the newly scored document
nn.kneighbors(new_doc_score.todense())

(array([[1., 1., 1., 1., 1., 1., 1., 1.]]),
 array([[2199,  160,  776,  602, 2044,  852,  826, 1626]]))

In [29]:
df_data_upd.iloc[84]

index                                                                    84
Strain                                                     Alien-Technology
Type                                                                 indica
Rating                                                                  4.5
Effects                         Happy, Relaxed, Uplifted, Euphoric, Focused
Flavor                                          Earthy, Spicy/Herbal, Woody
symptoms_diseases                                              spasticity, 
Description               Very little is known about Alien Technology ot...
text_all                  Happy  Relaxed  Uplifted  Euphoric  Focused  E...
Effects_and_Flavor        Happy, Relaxed, Uplifted, Euphoric, Focused, E...
tokens                    ['happy', 'relaxed', 'uplifted', 'euphoric', '...
description_formatted     very little is known about alien technology ot...
Effects_list                  [happy, relaxed, uplifted, euphoric, focused]
Flavor_list 

In [30]:
import pickle

# Open a file and write the Nearest Neighbors model disk (pickling)
pkl_file = open("nn_model.pkl", 'wb')
pickle.dump(nn, pkl_file)
pkl_file.close()

In [31]:
# Open a file and write the TfidfVectorizer model to disk
pkl_tfidf_file = open("tfidf_model.pkl", 'wb')
pickle.dump(tfidf, pkl_tfidf_file)
pkl_tfidf_file.close()

In [32]:
# Open a file for reading (unpickling)
pkl_file = open("nn_model.pkl", 'rb')
nn_prime = pickle.load(pkl_file)
pkl_file.close()

In [33]:
nn_prime.n_neighbors

8

In [34]:
nn.kneighbors(new_doc_score.todense())[1][0]

array([2199,  160,  776,  602, 2044,  852,  826, 1626])