In [None]:
# Imports
import pandas as pd

## IMPORT STRAIN & SYMPTOM DATA

In [None]:
# Read in the symptoms_updated.xls file directly from its remote URL
# (this cell needs network access every time it is run).
df_data = pd.read_excel("https://dsfiles.dananderson.dev/files/symptoms_updated.xls")

In [None]:
df_data.head()

Unnamed: 0,index,Strain,Type,Rating,Effects,Flavor,symptoms_diseases,Description,text_all,Effects_and_Flavor,tokens,description_formatted
0,0,100-Og,hybrid,4.0,"Creative, Energetic, Tingly, Euphoric, Relaxed","Earthy, Sweet, Citrus","ms, pain, pain, spasticity,",$100 OG is a 50/50 hybrid strain that packs a ...,Creative Energetic Tingly Euphoric Relaxed...,"Creative, Energetic, Tingly, Euphoric, Relaxed...","['creative', 'energetic', 'tingly', 'euphoric'...",$100 og is a 50/50 hybrid strain that packs a ...
1,1,98-White-Widow,hybrid,4.7,"Relaxed, Aroused, Creative, Happy, Energetic","Flowery, Violet, Diesel","spasticity,",The ‚Äò98 Aloha White Widow is an especially p...,Relaxed Aroused Creative Happy Energetic ...,"Relaxed, Aroused, Creative, Happy, Energetic, ...","['relaxed', 'aroused', 'creative', 'happy', 'e...",the ‚Äò98 aloha white widow is an especially p...
2,2,1024,sativa,4.4,"Uplifted, Happy, Relaxed, Energetic, Creative","Spicy/Herbal, Sage, Woody","pain, pain, spasticity,",1024 is a sativa-dominant hybrid bred in Spain...,Uplifted Happy Relaxed Energetic Creative ...,"Uplifted, Happy, Relaxed, Energetic, Creative,...","['uplifted', 'happy', 'relaxed', 'energetic', ...",1024 is a sativa-dominant hybrid bred in spain...
3,3,13-Dawgs,hybrid,4.2,"Tingly, Creative, Hungry, Relaxed, Uplifted","Apricot, Citrus, Grapefruit","appetite, appetite, depression, spasticity,",13 Dawgs is a hybrid of G13 and Chemdawg genet...,Tingly Creative Hungry Relaxed Uplifted A...,"Tingly, Creative, Hungry, Relaxed, Uplifted, A...","['tingly', 'creative', 'hungry', 'relaxed', 'u...",13 dawgs is a hybrid of g13 and chemdawg genet...
4,4,24K-Gold,hybrid,4.6,"Happy, Relaxed, Euphoric, Uplifted, Talkative","Citrus, Earthy, Orange","spasticity,",Also known as Kosher Tangie 24k Gold is a 60%...,Happy Relaxed Euphoric Uplifted Talkative ...,"Happy, Relaxed, Euphoric, Uplifted, Talkative,...","['happy', 'relaxed', 'euphoric', 'uplifted', '...",also known as kosher tangie 24k gold is a 60%...


## SPLIT DESCRIPTIONS OF EFFECTS, FLAVORS, AND SYMPTOMS

In [None]:
# 'split2list' splits a comma delimited string and returns a list of lower case tokens
def split2list(val):
  """Split a comma-delimited string into trimmed, lower-case tokens.

  Parameters
  ----------
  val : str or any other value
      The comma-delimited text to split. Missing values (NaN/None) and any
      other non-string value are treated as "no data".

  Returns
  -------
  list of str
      The non-empty tokens in their original order (duplicates are kept).
      Returns ['none'] when there is nothing to tokenize: a missing or
      non-string value, an empty string, the literal "none", or a string
      made up only of commas and whitespace.
  """
  # NaN, None and other non-string values have no tokens to offer
  if not isinstance(val, str):
    return ['none']

  # Lower case, split on commas, trim each piece and drop the blank ones
  # (a trailing comma such as "pain, spasticity, " yields a blank piece)
  ret_list = [elm.strip() for elm in val.lower().split(',')]
  ret_list = [elm for elm in ret_list if elm != ""]

  # Nothing usable survived the split: fall back to the sentinel so callers
  # never receive an empty list
  if not ret_list:
    return ['none']

  # Return the list
  return ret_list


In [None]:
# Create a working copy of the initial dataframe
df_data_upd = df_data.copy()

In [None]:
# Build one "<column>_list" column of tokens for each of Effects, Flavor, and
# symptoms_diseases by running the raw text through split2list
for src_col in ("Effects", "Flavor", "symptoms_diseases"):
  df_data_upd[src_col + "_list"] = df_data_upd[src_col].apply(split2list)


In [None]:
# Sample the 'Effects_list' column
df_data_upd["Effects_list"].sample(10)

In [None]:
# Sample the 'Flavor_list' column
df_data_upd["Flavor_list"].sample(10)

In [None]:
# Sample the 'symptoms_diseases_list' column
df_data_upd["symptoms_diseases_list"].sample(10)

In [None]:
map_effects = {}
map_flavors = {}
map_symptoms = {}

# gen_unique_values takes an inbound list of strings and adds them as keys to a map
def gen_unique_values(lst, mp):
  """Record every string in `lst` as a key of `mp`.

  Parameters
  ----------
  lst : iterable of str
      The tokens to record.
  mp : dict
      Dictionary updated in place; each token becomes a key with value 0.

  Returns
  -------
  None
      The result is delivered through the mutation of `mp`.

  Notes
  -----
  An empty string is treated as an error condition: a message is printed and
  processing of `lst` stops at that point. Tokens seen before it remain in `mp`.
  """
  # Iterate through the list
  for elm in lst:
    # Is elm an empty string? Error condition
    if elm == "":
      print("error - encountered an empty string")
      return

    mp[elm] = 0

  # The return sits after the loop so that every element is recorded,
  # not just the first one
  return


In [None]:
# Generate maps that contain unique Effects, Flavors, and Symptoms tokens.
# 'apply' is used here only for its side effect of filling each map; the
# Series it returns is not stored.
df_data_upd["Effects_list"].apply(gen_unique_values, mp=map_effects)
df_data_upd["Flavor_list"].apply(gen_unique_values, mp=map_flavors)
df_data_upd["symptoms_diseases_list"].apply(gen_unique_values, mp=map_symptoms)

## CLEAN STRAIN DESCRIPTIONS

In [None]:
# Steps to Clean the Strain Descriptions
import re
import pandas as pd

# Regular expression used to remove non-standard characters.
# The upper-case range must be 'A-Z': 'A-z' also spans the ASCII punctuation
# that sits between 'Z' and 'a' ([ \ ] ^ _ `) and would let it through.
rgxNotStdChars = re.compile(r'[^a-zA-Z0-9.,!?/:;\"\'\s]')
# Regular expression matching a run of two or more whitespace characters
rgxMultWhtSpce = re.compile(r'\s{2,}')

# 'retain_std_chars' takes a string and returns that string with non-standard
#    characters removed
def retain_std_chars(val):
  """Lower-case a string and strip it down to standard characters.

  Parameters
  ----------
  val : str or any other value
      The text to clean. Missing values (NaN/None) and any other non-string
      value are treated as "no data".

  Returns
  -------
  str
      The lower-cased text containing only letters, digits, whitespace and
      the punctuation . , ! ? / : ; " ' with each run of two or more
      whitespace characters collapsed to a single space and the ends trimmed.
      Returns 'none' when there is nothing to clean: a missing or non-string
      value, an empty string, the literal "none", or text that is empty once
      cleaned.

  Notes
  -----
  Removed characters are deleted rather than replaced, so a hyphenated word
  such as "sativa-dominant" becomes "sativadominant".
  """
  # NaN, None and other non-string values have no text to clean
  if not isinstance(val, str):
    return 'none'

  # Lower case the input value
  tmp_lower = val.lower()

  # Is the value empty or the literal "none"?
  if tmp_lower == "" or tmp_lower == "none":
    return 'none'

  # Remove non-standard characters
  tmp_std = rgxNotStdChars.sub("", tmp_lower)

  # Convert multiple whitespace characters to one space. Replacing with ""
  # would fuse the neighbouring words together (e.g. "0.14  the" -> "0.14the")
  tmp_wht = rgxMultWhtSpce.sub(" ", tmp_std)

  # Strip leading and trailing whitespace
  tmp_rtn = tmp_wht.strip()

  # Text made up solely of removed characters has nothing left to return
  if tmp_rtn == "":
    return 'none'

  return tmp_rtn

In [None]:
df_data_upd['Description_cleaned'] = df_data_upd['Description'].apply(retain_std_chars)

## GENERATE WORD 'TOKENS' FOR THE STRAIN DESCRIPTIONS & CHARACTERISTICS

In [None]:
# Use the spacy library to generate strain description tokens
import spacy

# Instantiate a spacy object
nlp = spacy.load("en_core_web_sm")

# tnkize_text takes a string and returns a list of tokens generated via the spacy library
def tnkize_text(val):
  """Tokenize `val` with the module-level spaCy pipeline `nlp`.

  Returns a list holding the `.text` of each token, in document order.
  """
  return [token.text for token in nlp(val)]


In [None]:
# Generate a list of description tokens
df_data_upd['Description_tokens'] =  df_data_upd['Description_cleaned'].apply(tnkize_text)

In [None]:
df_data_upd['Description_tokens'].sample(20)

1407    [originating, in, amsterdam, and, currently, b...
315     [blue, rhino, is, a, potent, cross, of, bluebe...
8       [3d, cbd, from, snoop, doggs, branded, line, o...
1498    [og, sharka, rare, strain, found, primarily, i...
40      [this, popular, classic, strain, was, original...
132     [ash, is, an, indicadominanthybrid, cross, bet...
356     [bluniversealso, called, blue, universeis, a, ...
1536    [orange, skunk, is, a, clearheaded, hybrid, cr...
2119    [tardisor, the, tardisis, a, sativadominant, s...
1176    [khola, is, a, sociable, cross, between, brazi...
2001    [strawberry, blondie, by, los, angeles, kush, ...
661     [diamond, valley, kush, is, an, indicadominant...
597     [critical, 47, is, hybrid, in, genetics, and, ...
444     [cannatsu, is, a, hybrid, cannabis, strain, th...
1769    [red, widow, is, a, hybrid, cross, of, red, dr...
1251    [lem, chem, is, a, potent, hybrid, that, is, b...
1859                                               [none]
1046    [holla

In [None]:
# gen_all_tokens creates a column ('ALL_TOKENS') consisting of description,
#    effects, flavors, and symptoms tokens
def gen_all_tokens(DF):
  """Return a copy of `DF` with an added 'ALL_TOKENS' column.

  'ALL_TOKENS' is the row-wise concatenation, in this order, of the
  'Description_tokens', 'Effects_list', 'Flavor_list' and
  'symptoms_diseases_list' columns. The frame passed in is left unmodified.
  """
  token_columns = [
    'Description_tokens',
    'Effects_list',
    'Flavor_list',
    'symptoms_diseases_list',
  ]

  DF_WRK = DF.copy()

  # Accumulate left to right so the token order matches the column order
  combined = DF_WRK[token_columns[0]]
  for col_name in token_columns[1:]:
    combined = combined + DF_WRK[col_name]

  DF_WRK['ALL_TOKENS'] = combined
  return DF_WRK

In [None]:
# Create a column ('ALL_TOKENS') of strain tokens that
#   consist of the description, effects, flavors, and symptom tokens
df_data_upd = gen_all_tokens(df_data_upd)

## VECTORIZE EACH STRAIN'S DESCRIPTION & CHARACTERISTICS

In [None]:
# Use TFIDF and vectorize the strain tokens ('ALL_TOKENS')
from sklearn.feature_extraction.text import TfidfVectorizer

def dummy_fun(doc):
    """Identity function: return `doc` unchanged."""
    return doc

# Both the tokenizer and the preprocessor are the identity function, so each
# document reaches the vectorizer exactly as it was supplied.
# NOTE(review): this presumably expects every document to already be a list
# of tokens (as in 'ALL_TOKENS'); a raw string would be iterated character by
# character -- confirm that callers tokenize before fit/transform.
tfidf = TfidfVectorizer(
    analyzer='word',
    stop_words = 'english',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None) 

In [None]:
# Create a vocabulary and tf-idf score per document (sparse matrix)
dtm_sparse = tfidf.fit_transform(df_data_upd['ALL_TOKENS'])

# Get feature names to use as dataframe column headers.
# 'get_feature_names' is not available in newer scikit-learn releases, which
# provide 'get_feature_names_out' instead; use whichever is installed.
if hasattr(tfidf, 'get_feature_names_out'):
  feature_names = tfidf.get_feature_names_out()
else:
  feature_names = tfidf.get_feature_names()

# '.toarray()' gives a plain ndarray instead of the numpy matrix that
# '.todense()' returns
dtm = pd.DataFrame(dtm_sparse.toarray(), columns=feature_names)

# View Feature Matrix as DataFrame
print(dtm.shape)
dtm.head()

(2350, 16412)


Unnamed: 0,!,','d,'s,.,...,/,0.14the,0.18,0.23,0.3,0.36,0.38,0.47,0.5,0.5cherry,0.86.with,00,09,1,1.07,1.1neville,1.2,1.3,1.4the,1.the,1/4,10,100,1011,1012,1013,1015,1024,10as,10dynasty,10jenni,10of,10th,10week,....1,ythe,yum,yumboldt,yummy,yunnan,yunnanaceseeds,yunnanorient,yunnans,zacatecascolombian,zamal,zamaldelica,zambeza,zappas,zealand,zealandand,zealandmt,zealands,zealously,zellys,zen,zens,zero,zest,zestful,zesty,zestycitrusyand,zestyfloral,zeta,zeus,zingerslemon,zion,zipping,zkittlez,zombie,zombiewith,zone,zonethe,zoning,zoom,zs
0,0.0,0.0,0.0,0.0,0.123948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.207842,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.101194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.083726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.221596,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.097243,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.130083,0.0,0.0,0.0,0.071542,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## CONSTRUCT A NEAREST NEIGHBORS MODEL TO GENERATE RECOMMENDATIONS

In [None]:
# Define a Nearest Neighbors model on which to compare incoming text
from sklearn.neighbors import NearestNeighbors

# Fit the nearest neighbors model on the TF-IDF feature matrix created above.
# Each query will return the 8 closest rows of 'dtm'.
# NOTE(review): 'dtm' has 16412 columns per the shape printed above; tree-based
# indexes such as 'kd_tree' are generally not efficient at that dimensionality.
# Consider benchmarking algorithm='brute' -- TODO confirm.
nn = NearestNeighbors(n_neighbors=8, algorithm='kd_tree')
nn.fit(dtm)

NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=8, p=2,
                 radius=1.0)

In [None]:
nn.kneighbors([dtm.iloc[10].values])

(array([[0.        , 1.06383587, 1.18811097, 1.26611709, 1.27393395,
         1.29394996, 1.29730651, 1.29739092]]),
 array([[  10, 1929,  741,  649, 1372,  543, 1938, 1589]]))

## GENERATE A SAMPLE RECOMMENDATION

In [None]:
# Score a new document and return its nearest neighbors.
# The vectorizer was fit on pre-tokenized documents (lists of tokens) and its
# tokenizer passes input through untouched, so the query has to go through the
# same cleaning and tokenizing steps first. A raw string would be iterated
# character by character and scored on single characters instead of words.
query_text = "I want to feel super relaxed, yet energetic and creative"
query_tokens = tnkize_text(retain_std_chars(query_text))
new_doc_score = tfidf.transform([query_tokens])

# Execute the nearest neighbors model using the newly scored document
# ('.toarray()' gives a plain ndarray instead of a numpy matrix)
nn.kneighbors(new_doc_score.toarray())

(array([[1.2672429 , 1.30677659, 1.30944854, 1.35269312, 1.35478938,
         1.36501368, 1.36675039, 1.37157906]]),
 array([[1178, 1817,   84, 1140,  103,  628, 1354, 2334]]))

In [None]:
df_data_upd.iloc[84]

index                                                                    84
Strain                                                     Alien-Technology
Type                                                                 indica
Rating                                                                  4.5
Effects                         Happy, Relaxed, Uplifted, Euphoric, Focused
Flavor                                          Earthy, Spicy/Herbal, Woody
symptoms_diseases                                              spasticity, 
Description               Very little is known about Alien Technology ot...
text_all                  Happy  Relaxed  Uplifted  Euphoric  Focused  E...
Effects_and_Flavor        Happy, Relaxed, Uplifted, Euphoric, Focused, E...
tokens                    ['happy', 'relaxed', 'uplifted', 'euphoric', '...
description_formatted     very little is known about alien technology ot...
Effects_list                  [happy, relaxed, uplifted, euphoric, focused]
Flavor_list 