In [5]:
# Imports
import pandas as pd

## IMPORT STRAIN & SYMPTOM DATA

In [6]:
# Read in the symptoms_updated.csv file
df_data = pd.read_csv("https://dsfiles.dananderson.dev/files/symptoms_updated.csv")

In [7]:
df_data.head()

Unnamed: 0,index,Strain,Type,Rating,Effects,Flavor,symptoms_diseases,Description,text_all,Effects_and_Flavor,tokens,description_formatted
0,0,100-Og,hybrid,4.0,"Creative, Energetic, Tingly, Euphoric, Relaxed","Earthy, Sweet, Citrus","ms, pain, pain, spasticity,",$100 OG is a 50/50 hybrid strain that packs a ...,Creative Energetic Tingly Euphoric Relaxed...,"Creative, Energetic, Tingly, Euphoric, Relaxed...","['creative', 'energetic', 'tingly', 'euphoric'...",$100 og is a 50/50 hybrid strain that packs a ...
1,1,98-White-Widow,hybrid,4.7,"Relaxed, Aroused, Creative, Happy, Energetic","Flowery, Violet, Diesel","spasticity,",The ‚Äò98 Aloha White Widow is an especially p...,Relaxed Aroused Creative Happy Energetic ...,"Relaxed, Aroused, Creative, Happy, Energetic, ...","['relaxed', 'aroused', 'creative', 'happy', 'e...",the ‚Äò98 aloha white widow is an especially p...
2,2,1024,sativa,4.4,"Uplifted, Happy, Relaxed, Energetic, Creative","Spicy/Herbal, Sage, Woody","pain, pain, spasticity,",1024 is a sativa-dominant hybrid bred in Spain...,Uplifted Happy Relaxed Energetic Creative ...,"Uplifted, Happy, Relaxed, Energetic, Creative,...","['uplifted', 'happy', 'relaxed', 'energetic', ...",1024 is a sativa-dominant hybrid bred in spain...
3,3,13-Dawgs,hybrid,4.2,"Tingly, Creative, Hungry, Relaxed, Uplifted","Apricot, Citrus, Grapefruit","appetite, appetite, depression, spasticity,",13 Dawgs is a hybrid of G13 and Chemdawg genet...,Tingly Creative Hungry Relaxed Uplifted A...,"Tingly, Creative, Hungry, Relaxed, Uplifted, A...","['tingly', 'creative', 'hungry', 'relaxed', 'u...",13 dawgs is a hybrid of g13 and chemdawg genet...
4,4,24K-Gold,hybrid,4.6,"Happy, Relaxed, Euphoric, Uplifted, Talkative","Citrus, Earthy, Orange","spasticity,",Also known as Kosher Tangie 24k Gold is a 60%...,Happy Relaxed Euphoric Uplifted Talkative ...,"Happy, Relaxed, Euphoric, Uplifted, Talkative,...","['happy', 'relaxed', 'euphoric', 'uplifted', '...",also known as kosher tangie 24k gold is a 60%...


## SPLIT DESCRIPTIONS OF EFFECTS, FLAVORS, AND SYMPTOMS

In [8]:
# 'split2list' splits a comma delimited string and returns a list of lower case tokens
def split2list(val):
  ret_list = []

  # Is the passed value Nan?
  if pd.isna(val):
    return ['none']

  # Is the value an empty string?
  if val == "":
    return ['none']

  # Lower case the input value
  tmp_lower = val.lower()

  # Is the value "none"?
  if tmp_lower == "none":
    return ['none']

  # Split the string value into tokens delimited by commas
  tmp_list = tmp_lower.split(',')

  # Is tmp_list empty?
  if len(tmp_list) == 0:
    return ['none']

  # Iterate through the temp list and trim values
  for elm in tmp_list:
    # If the list element is all spaces then skip
    if elm.strip() == "":
      continue

    # Otherwise strip the string of leading and trailing spaces and add to the
    #  return list
    ret_list.append(elm.strip())

  # Return the list
  return ret_list


In [9]:
# Create a working copy of the initial dataframe
df_data_upd = df_data.copy()

In [10]:
# Create dataframe columns which consist of lists of tokens for Effects, Flavors, and Symptoms respectively
df_data_upd["Effects_list"]           = df_data_upd["Effects"].apply(split2list)
df_data_upd["Flavor_list"]            = df_data_upd["Flavor"].apply(split2list)
df_data_upd["symptoms_diseases_list"] = df_data_upd["symptoms_diseases"].apply(split2list)


In [11]:
# Sample the 'Effects_list' column
df_data_upd["Effects_list"].sample(10)

933      [happy, uplifted, relaxed, focused, creative]
2191       [relaxed, focused, sleepy, euphoric, happy]
2095    [aroused, talkative, euphoric, happy, relaxed]
756        [sleepy, happy, euphoric, focused, aroused]
185        [happy, sleepy, giggly, euphoric, uplifted]
875         [euphoric, relaxed, happy, sleepy, tingly]
413                                             [none]
41        [hungry, relaxed, aroused, sleepy, euphoric]
1583        [relaxed, euphoric, giggly, happy, tingly]
955         [relaxed, happy, euphoric, sleepy, hungry]
Name: Effects_list, dtype: object

In [12]:
# Sample the 'Flavor_list' column
df_data_upd["Flavor_list"].sample(10)

1825         [mango, earthy, pine]
1211        [pungent, pine, sweet]
645       [earthy, pungent, sweet]
1638        [sweet, grape, citrus]
1604    [pineapple, sweet, earthy]
772        [sweet, earthy, butter]
816         [earthy, sweet, skunk]
405          [earthy, woody, pine]
1439                        [none]
1100      [sweet, citrus, pungent]
Name: Flavor_list, dtype: object

In [13]:
# Sample the 'symptoms_diseases_list' column
df_data_upd["symptoms_diseases_list"].sample(10)

892               [depression, spasticity]
1831                          [spasticity]
615     [insomnia, pain, pain, spasticity]
1788    [headache, pain, pain, spasticity]
720                           [spasticity]
1221                          [spasticity]
1248              [pain, pain, spasticity]
1245                          [spasticity]
1813                          [spasticity]
1801                      [ms, spasticity]
Name: symptoms_diseases_list, dtype: object

In [14]:
map_effects = {}
map_flavors = {}
map_symptoms = {}

# gen_unique_values takes an inbound list of strings and adds them as keys to a map
def gen_unique_values(lst, mp):
  # Iterate through the list
  for elm in lst:
    # Is elm an empty string? Error condition
    if elm == "":
      print("error - encountered an empty string")
      return 

    mp[elm] = 0
    return 


In [15]:
# Generate maps that contain unique Effects, Flavors, and Symtoms tokens
df_data_upd["Effects_list"].apply(gen_unique_values, mp=map_effects)
df_data_upd["Flavor_list"].apply(gen_unique_values, mp=map_flavors)
df_data_upd["symptoms_diseases_list"].apply(gen_unique_values, mp=map_symptoms)

0       None
1       None
2       None
3       None
4       None
        ... 
2345    None
2346    None
2347    None
2348    None
2349    None
Name: symptoms_diseases_list, Length: 2350, dtype: object

## CLEAN STRAIN DESCRIPTIONS

In [16]:
# Steps to Clean the Strain Descriptions
import re
import pandas as pd

# Regular expression used to remove non-standard characters
rgxNotStdChars = re.compile(r'[^a-zA-z0-9.,!?/:;\"\'\s]')
rgxMultWhtSpce = re.compile(r'\s{2,}')

# 'retain_std_chars' takes a string and returns that string with non-standard
#    characters removed
def retain_std_chars(val):
  # Is the passed value NaN?
  if pd.isna(val):
    return 'none'

  # Is the passed not a string?
  if type(val) != str:
    return 'none'

  # Is the value an empty string?
  if val == "":
    return 'none'

  # Lower case the input value
  tmp_lower = val.lower()

  # Is the value "none"?
  if tmp_lower == "none":
    return 'none'

  # Remove non-standard characters
  tmp_std = re.sub(rgxNotStdChars, "", tmp_lower)

  # Convert multiple whitespace characters to one whitespace character
  tmp_wht = re.sub(rgxMultWhtSpce, "", tmp_std)

  # Strip leading and trailing whitespace
  tmp_rtn = tmp_wht.strip()
  
  return tmp_rtn

In [17]:
df_data_upd['Description_cleaned'] = df_data_upd['Description'].apply(retain_std_chars)

## GENERATE WORK 'TOKENS' FOR THE STRAIN DESCRIPTIONS & CHARACTERISTICS

In [20]:
!pip install spacy

Collecting spacy
  Downloading spacy-2.3.0-cp37-cp37m-macosx_10_9_x86_64.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 1.0 MB/s eta 0:00:01
[?25hCollecting murmurhash<1.1.0,>=0.28.0
  Using cached murmurhash-1.0.2-cp37-cp37m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (34 kB)
Collecting preshed<3.1.0,>=3.0.2
  Using cached preshed-3.0.2-cp37-cp37m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (211 kB)
Collecting catalogue<1.1.0,>=0.0.7
  Using cached catalogue-1.0.0-py2.py3-none-any.whl (7.7 kB)
Collecting wasabi<1.1.0,>=0.4.0
  Downloading wasabi-0.7.0.tar.gz (22 kB)
Collecting blis<0.5.0,>=0.4.0
  Using cached blis-0.4.1-cp37-cp37m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (4.0 MB)
Collecting srsly<1.1.0,>=1.0.2
  Using cached srsly-1.0.2-cp37-cp37m-macosx_10_9_x86_64.whl (182 kB)
Collecting plac<1.2.0,>=

In [22]:
!python -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.0/en_core_web_sm-2.3.0.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 1.3 MB/s eta 0:00:01    |██████████████                  | 5.3 MB 1.3 MB/s eta 0:00:06
Building wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py) ... [?25ldone
[?25h  Created wheel for en-core-web-sm: filename=en_core_web_sm-2.3.0-py3-none-any.whl size=12048607 sha256=687a043981eb787d368c6383d7441d5b9daaffa817fa8f59a626c75cdd642cb9
  Stored in directory: /private/var/folders/0s/fkfxqrmx6sx5q2_70shx93lm0000gn/T/pip-ephem-wheel-cache-gn4vxgs0/wheels/71/4a/56/e48f8ad9359a6780edd8cdd42955519b1a21d6365ad15628a2
Successfully built en-core-web-sm
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-2.3.0
You should consider upgrading via the '/Users/danoand/anaconda3/bin/python -m pip install --upgrade pi

In [26]:
# Use the spacy library to generate strain description tokens
import spacy

# Instantiate a spacy object
nlp = spacy.load("en-core-web-sm-2.3.0")

# tnkize_text takes a string and returns a list of tokens generated via the spacy library
def tnkize_text(val):
  tmp_list = []
  tmp_doc = nlp(val)

  # Iterate through the text's tokenized objects
  for tkn in tmp_doc:
    tmp_list.append(tkn.text)

  return tmp_list


OSError: [E050] Can't find model 'en-core-web-sm-2.3.0'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

In [15]:
# Generate a list of description tokens
df_data_upd['Description_tokens'] =  df_data_upd['Description_cleaned'].apply(tnkize_text)

In [16]:
df_data_upd['Description_tokens'].sample(20)

892     [golden, pineapple, is, a, hybrid, cross, betw...
700     [dr, ., quantum, is, an, indicadominant, cross...
431     [this, indicadominant, california, strain, wil...
311     [blue, og, from, g13, labs, is, a, flavorful, ...
1881    [skunk, dawg, is, a, sativadominantcross, betw...
1776    [rene, mist, is, a, highenergy, hybrid, cross,...
1730    [purple, wreck, is, the, offspring, of, two, v...
347     [blueberry, silvertip, first, took, root, in, ...
2334    [also, known, as, simply, ythe, 80, sativa, y,...
1960    [south, american, refers, tothe, indigenous, v...
1002    [fire, is, a, slang, term, for, really, good, ...
888     [golden, goat, was, created, by, accident, in,...
1328    [magic, jordan, is, another, mysterious, hawai...
1159    [another, banger, from, snow, high, seedskalei...
1600    [bred, by, mtg, seedspineapple, purple, skunk,...
1289    [lime, purple, mist, is, an, indicadominant, s...
1042    [do, nt, worryhippie, crippler, is, nt, some, ...
304     [blue,

In [18]:
# remove_one_letter_words removes one character strings from a list of strings
def remove_one_letter_words(lst):
  ret_list = []

  # Iterate through the passed list
  for elm in lst:
    # Is the element not a string?
    if type(elm) != str:
      # Element not a string, skip
      continue

    # Does the element have a length of 0 or 1? Skip
    if elm == "" or len(elm) == 1:
      continue

    ret_list.append(elm)

  # Return the updated list
  return ret_list


# gen_all_tokens creates a column consisting of description, effects, 
#    flavors, and symptoms tokens ("ALL_TOKENTS")
def gen_all_tokens(DF):
  DF_WRK = DF.copy()

  DF_WRK['ALL_TOKENS'] = DF_WRK['Description_tokens'] \
    + DF_WRK['Effects_list'] \
    + DF_WRK['Flavor_list'] \
    + DF_WRK['symptoms_diseases_list']

  return DF_WRK

In [19]:
# Create a column ('ALL_TOKENS') of strain tokens that
#   consist of the description, effects, flavors, and symptom tokens
df_data_upd = gen_all_tokens(df_data_upd)

# Remove all one character word tokens from the 'ALL_TOKENS' column
df_data_upd['ALL_TOKENS'] = df_data_upd['ALL_TOKENS'].apply(remove_one_letter_words)

In [20]:
from google.colab import files
df_data_upd.to_csv('df.csv')
files.download('df.csv')

## VECTORIZE EACH STRAIN'S DESCRIPTION & CHARACTERISTICS

In [21]:
# Use TFIDF and vectorize the strain tokens ('ALL_TOKENS')
from sklearn.feature_extraction.text import TfidfVectorizer

def dummy_func(doc):
    return doc

tfidf = TfidfVectorizer(
    analyzer='word',
    stop_words = 'english',
    tokenizer=dummy_func,
    preprocessor=dummy_func,
    token_pattern=None) 

In [24]:
# Create a vocabulary and tf-idf score per document
dtm = tfidf.fit_transform(df_data_upd['ALL_TOKENS'])

# Get feature names to use as dataframe column headers
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

# View Feature Matrix as DataFrame
print(dtm.shape)
dtm.head()

(2350, 16383)


Unnamed: 0,'d,'s,...,0.14the,0.18,0.23,0.3,0.36,0.38,0.47,0.5,0.5cherry,0.86.with,00,09,1.07,1.1neville,1.2,1.3,1.4the,1.the,1/4,10,100,1011,1012,1013,1015,1024,10as,10dynasty,10jenni,10of,10th,10week,11,1113,115,11bred,11meaning,....1,ythe,yum,yumboldt,yummy,yunnan,yunnanaceseeds,yunnanorient,yunnans,zacatecascolombian,zamal,zamaldelica,zambeza,zappas,zealand,zealandand,zealandmt,zealands,zealously,zellys,zen,zens,zero,zest,zestful,zesty,zestycitrusyand,zestyfloral,zeta,zeus,zingerslemon,zion,zipping,zkittlez,zombie,zombiewith,zone,zonethe,zoning,zoom,zs
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.209457,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.222376,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## CONSTRUCT A NEAREST NEIGHBORS MODEL TO GENERATE RECOMMENDATIONS

In [25]:
# Define a Nearest Neighbors model on which to compare incoming text
from sklearn.neighbors import NearestNeighbors

# Fit on the nearest neighbors model TF-IDF feature matrix created above 
nn = NearestNeighbors(n_neighbors=8, algorithm='kd_tree')
nn.fit(dtm)

NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=8, p=2,
                 radius=1.0)

In [26]:
nn.kneighbors([dtm.iloc[10].values])

(array([[0.        , 1.06450694, 1.19133723, 1.26735898, 1.27620083,
         1.29822153, 1.30003998, 1.30162712]]),
 array([[  10, 1929,  741,  649, 1372,  543, 1938, 1589]]))

## GENERATE A SAMPLE RECOMMENDATION

In [27]:
# Score a new document and return it's nearest neighbors
new_doc_score = tfidf.transform(["I want to feel super relaxed, yet energetic and creative"])

# Execute the nearest neighbors model using the newly scored document
nn.kneighbors(new_doc_score.todense())

(array([[1., 1., 1., 1., 1., 1., 1., 1.]]),
 array([[2199,  160,  776,  602, 2044,  852,  826, 1626]]))

In [28]:
df_data_upd.iloc[84]

index                                                                    84
Strain                                                     Alien-Technology
Type                                                                 indica
Rating                                                                  4.5
Effects                         Happy, Relaxed, Uplifted, Euphoric, Focused
Flavor                                          Earthy, Spicy/Herbal, Woody
symptoms_diseases                                              spasticity, 
Description               Very little is known about Alien Technology ot...
text_all                  Happy  Relaxed  Uplifted  Euphoric  Focused  E...
Effects_and_Flavor        Happy, Relaxed, Uplifted, Euphoric, Focused, E...
tokens                    ['happy', 'relaxed', 'uplifted', 'euphoric', '...
description_formatted     very little is known about alien technology ot...
Effects_list                  [happy, relaxed, uplifted, euphoric, focused]
Flavor_list 

In [29]:
import pickle

# Open a file and write the Nearest Neighbors model disk (pickling)
pkl_file = open("nn_model.pkl", 'wb')
pickle.dump(nn, pkl_file)
pkl_file.close()

In [30]:
# Open a file and write the TfidfVectorizer model to disk
pkl_tfidf_file = open("tfidf_model.pkl", 'wb')
pickle.dump(tfidf, pkl_tfidf_file)
pkl_tfidf_file.close()

In [31]:
# Open a file for reading (unpickling)
pkl_file = open("nn_model.pkl", 'rb')
nn_prime = pickle.load(pkl_file)
pkl_file.close()

In [32]:
nn_prime.n_neighbors

8

In [33]:
nn.kneighbors(new_doc_score.todense())[1][0]

array([2199,  160,  776,  602, 2044,  852,  826, 1626])