# Cannabis recommender for med_cabinet_4
## Author: JAE Finger
### Updated: 06/20/2020

## Import packages and dependencies

In [1]:
#Import Packages
# Data analysis
import pandas as pd
import numpy as np
import json
import urllib.request

# Data cleaning
import re

# Tokenizing words
import spacy
from sklearn.feature_extraction import text 

# TFIDF / Word Embeddings
from sklearn.feature_extraction.text import TfidfVectorizer

# Similarity
from sklearn.metrics.pairwise import cosine_similarity

# Deployment
import pickle

## Import strain data from github

In [2]:
# Import csv obtained from Kaggle
df = pd.read_csv("https://raw.githubusercontent.com/jae-finger/med_cabinet_4/master/cannabis.csv")

# Drop rows with missing values (the text cleaning below expects strings)
df = df.dropna()

# Text columns that get cleaned
df_features = ['Strain', 'Type', 'Effects', "Flavor", 'Description']

# Lowercase each text column and replace every character that is not a
# letter, digit or space with a space
for each in df_features:
  df[each] = df[each].str.lower().str.replace('[^a-zA-Z 0-9]', ' ', regex=True)

# Create the combined text. Every field is separated by a single space so the
# last flavor word and the first description word stay separate tokens.
df['combined_text'] = df['Type'] + ' ' + df['Effects'] + ' ' + df['Flavor'] + ' ' + df['Description']

# # Summarize
# print(df.shape)
# print(list(df.columns))
# df.head()

# Mock Model Usage

Accept a JSON object that contains the keys `Strain`, `Type`, `Effects`, `Flavor`, and `Description`.

In [3]:
# Second test JSON (unique info): URL of a sample user-strain JSON hosted in
# the project repo. It is passed to load_json() further down.
JSON_URL = "https://raw.githubusercontent.com/jae-finger/med_cabinet_4/master/jons_strain.json"

In [4]:
# Fetch a strain JSON from a URL and shape it into a one-row DataFrame
def load_json(link):
  """
  Download the JSON document at `link` and return it as a DataFrame.

  The response body is decoded and parsed as JSON, then loaded into a frame
  with index [0] and the columns Strain, Type, Effects, Flavor and
  Description. The text is returned untouched; cleaning it is the job of the
  preprocessing step that follows.
  """
  strain_columns = ['Strain', 'Type', 'Effects', 'Flavor', 'Description']

  with urllib.request.urlopen(link) as response:
    payload = response.read().decode()

  strain_record = json.loads(payload)
  return pd.DataFrame.from_records(strain_record, index=[0], columns=strain_columns)

Test: load the custom strain JSON from `JSON_URL` with `load_json(link)` and store the result as `seed_strain`.

In [5]:
# Fetch the sample JSON as a DataFrame; the bare name on the last line
# displays it
seed_strain = load_json(JSON_URL)
seed_strain

Unnamed: 0,Strain,Type,Effects,Flavor,Description
0,User_Strain,Hybrid,"Creative, Relaxed, Sleepy","Sweet, Lemon, Gas",Looking for a strain that gives pain relief wi...


### Preprocess Text

In [6]:
# Create a function to preprocess the strain DataFrame loaded in the previous step
def preprocess_strain(strain):
  """
  Clean the text of a strain DataFrame (as built by load_json()) for the model.

  Each of the Strain, Type, Effects, Flavor and Description columns is
  lowercased and every character other than a letter, digit or space is
  replaced with a space. A `combined_text` column is then added that joins
  Type, Effects, Flavor and Description with single spaces.

  Parameters
  ----------
  strain : pandas.DataFrame
      Frame holding the string columns Strain, Type, Effects, Flavor and
      Description.

  Returns
  -------
  pandas.DataFrame
      A cleaned copy of `strain` with the extra `combined_text` column. The
      frame passed in is left unmodified.
  """
  text_columns = ['Strain', 'Type', 'Effects', "Flavor", 'Description']

  # Work on a copy so the caller's raw frame is not overwritten
  cleaned = strain.copy()

  # Lowercase and replace symbols with spaces
  for column in text_columns:
    cleaned[column] = cleaned[column].str.lower().str.replace('[^a-zA-Z 0-9]', ' ', regex=True)

  # Combine the text. A space separates every field so the last flavor word
  # and the first description word remain distinct tokens.
  cleaned['combined_text'] = (
      cleaned['Type'] + ' ' + cleaned['Effects'] + ' ' + cleaned['Flavor'] + ' ' + cleaned['Description']
  )
  return cleaned

Test by preprocessing the `seed_strain` created above with `preprocess_strain(strain)` to produce `processed_strain`.

In [7]:
# Clean the seed strain's text columns and add the combined_text column
processed_strain = preprocess_strain(seed_strain)
processed_strain

Unnamed: 0,Strain,Type,Effects,Flavor,Description,combined_text
0,user strain,hybrid,creative relaxed sleepy,sweet lemon gas,looking for a strain that gives pain relief wi...,hybrid creative relaxed sleepy sweet lemon ...


#### Find Recommendations (Transform, Cosine Similarity)

Calculate the top 5 most similar strains to our `processed_strain` by using `find_rec_strains(p_strain)`.

In [8]:
# Create a function that identifies the 5 most similar strains
def find_rec_strains(p_strain, n_recs=5, model_dir='/content', output_path='predicted_recs.json'):
  """
  Write the strains most similar to a preprocessed user strain to a JSON file.

  The `combined_text` of `p_strain` is vectorized with the pickled model,
  compared against the pickled document-term matrix with cosine similarity,
  and the `n_recs` best-scoring rows of the pickled strain table are saved
  as JSON records.

  Parameters
  ----------
  p_strain : pandas.DataFrame
      Output of preprocess_strain(); must contain a `combined_text` column.
      If it holds several rows, the last row is used as the query.
  n_recs : int, default 5
      Number of recommendations to write.
  model_dir : str, default '/content'
      Directory holding pickled_model.pkl, pickled_df.pkl and pickled_dtm.pkl.
  output_path : str, default 'predicted_recs.json'
      Where the JSON records are written.

  Returns
  -------
  None
      The result is written to `output_path` and a confirmation is printed.

  Raises
  ------
  ValueError
      If `p_strain` has no rows, `n_recs` is negative, or the model's feature
      names do not match the columns of the pickled document-term matrix.
  """
  if len(p_strain) == 0:
    raise ValueError('p_strain must contain at least one row')
  if n_recs < 0:
    raise ValueError('n_recs must be non-negative')

  # SECURITY: unpickling can execute arbitrary code. Only point model_dir at
  # artifacts that were produced by a trusted source.
  with open(f'{model_dir}/pickled_model.pkl', 'rb') as model_file:
    pickled_model = pickle.load(model_file)
  strain_list = pd.read_pickle(f'{model_dir}/pickled_df.pkl')
  with open(f'{model_dir}/pickled_dtm.pkl', 'rb') as dtm_file:
    # NOTE(review): assumed to be a DataFrame with one row per row of
    # strain_list, in the same order, and feature names as columns - confirm.
    pickled_dtm = pickle.load(dtm_file)

  # get_feature_names() was removed from newer scikit-learn releases; use the
  # replacement when the unpickled model offers it.
  feature_names = getattr(pickled_model, 'get_feature_names_out', None) or pickled_model.get_feature_names

  # Transform the preprocessed strain into the model's feature space
  input_dtm = pd.DataFrame(
      pickled_model.transform(p_strain['combined_text']).toarray(),
      columns=feature_names(),
  )

  # Align the query columns with the stored matrix by name; fail loudly on a
  # mismatch instead of comparing unrelated features.
  if set(input_dtm.columns) != set(pickled_dtm.columns):
    raise ValueError('model features do not match the columns of pickled_dtm')
  input_dtm = input_dtm.reindex(columns=pickled_dtm.columns)

  # Only the similarity between the query and each stored strain is needed,
  # so compare one row against the matrix instead of building the full
  # all-pairs similarity matrix.
  query_vector = input_dtm.iloc[[-1]].to_numpy()
  scores = cosine_similarity(query_vector, pickled_dtm.to_numpy())[0]

  # Positions of the best-scoring strains (stable sort keeps ties in row order)
  top_positions = pd.Series(scores).sort_values(ascending=False, kind='mergesort').index[:n_recs]

  recs = strain_list.iloc[top_positions]
  recs.to_json(output_path, orient='records')
  print(f'Created {output_path}')

Test the recommender function, then open the JSON file it created.

In [9]:
# Writes the recommendations to predicted_recs.json and prints a confirmation
find_rec_strains(processed_strain)

Created predicted_recs.json


In [10]:
# Load the recommendations JSON and display it. The file is read from the
# same relative path find_rec_strains() writes to, so this cell also works
# when the working directory is not /content.
with open('predicted_recs.json') as f:
  rec_records = json.load(f)
final_test = pd.DataFrame.from_records(rec_records)
final_test

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,Richie-Rich,hybrid,4.3,"Happy,Euphoric,Relaxed,Hungry,Creative","Sweet,Lemon,Tropical",Richie Rich cannabis is a Northern Lights #5 c...
1,Charlie-Sheen,hybrid,4.6,"Happy,Euphoric,Relaxed,Sleepy,Focused","Pine,Sweet,Lemon","Charlie Sheen is an indica-dominant hybrid, pa..."
2,A-Train,hybrid,4.1,"Creative,Euphoric,Relaxed,Happy,Hungry","Earthy,Woody,Citrus",A-Train is a hybrid between Mazar I Sharif and...
3,Skunky-Diesel,hybrid,4.2,"Happy,Uplifted,Energetic,Relaxed,Sleepy","Skunk,Diesel,Earthy",Skunky Diesel is a nice indica-dominant strain...
4,White-Knight,hybrid,4.6,"Creative,Relaxed,Happy,Sleepy,Energetic","Earthy,Pungent,Sweet",White Knight is a “Chong Certified” hybrid str...
