In [1]:
import pickle

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Load the raw strain records, give strains without an alias the placeholder
# "unknown" so that a missing `aka` alone does not get the row dropped, then
# discard every row that still has a missing value in any other column.
leafly = (
    pd.read_json('../data/leafly.json')
    .assign(aka=lambda frame: frame['aka'].fillna("unknown"))
    .dropna()
)

# Display (rows, columns) of the cleaned frame.
leafly.shape

(1044, 37)

## Data Wrangling

In [2]:
# Each entry is (new column, source columns joined left-to-right with a single
# space). The last entry reads the two columns created by the entries before
# it, so the order of this list matters.
text_column_specs = [
    ('feelings', ['feeling_1', 'feeling_2', 'feeling_3', 'feeling_4', 'feeling_5']),
    ('helps', ['helps_1', 'helps_2', 'helps_3', 'helps_4', 'helps_5']),
    ('feelings_helps_description', ['feelings', 'helps', 'description']),
]

for target_column, source_columns in text_column_specs:
    joined_text = leafly[source_columns[0]]
    for source_column in source_columns[1:]:
        joined_text = joined_text + ' ' + leafly[source_column]
    leafly[target_column] = joined_text

# Display the combined text column that is vectorised further down.
leafly['feelings_helps_description']

0       Happy Euphoric Creative Relaxed Uplifted Stres...
5       Happy Energetic Uplifted Creative Focused Stre...
7       Euphoric Relaxed Happy Uplifted Creative Pain ...
9       Happy Relaxed Uplifted Euphoric Hungry Stress ...
11      Relaxed Happy Euphoric Sleepy Uplifted Stress ...
                              ...                        
3423    Euphoric Uplifted Happy Relaxed Energetic Stre...
3425    Relaxed Euphoric Happy Sleepy Giggly Pain Stre...
3432    Relaxed Sleepy Euphoric Happy Hungry Insomnia ...
3434    Happy Energetic Relaxed Focused Uplifted Stres...
3437    Happy Creative Aroused Energetic Focused Depre...
Name: feelings_helps_description, Length: 1044, dtype: object

In [3]:
# Spot-check the combined text of the row whose index label is 45.
leafly.loc[45, 'feelings_helps_description']

'Relaxed Happy Euphoric Hungry Focused Stress Anxiety Nausea Pain Depression Blurple, also known as Blue Dream Purple, is a balanced hybrid cross between Blue Dream and Mendocino Purps. Its effects position themselves cerebrally at first, with gentle body relaxation that keeps you feeling light and free of tension. Drawing from both sides of the family, Blurple carries a sweet, dessert-like berry and grape aroma that comes to life on the inhale. This strain earns its name as bluish purple hues swirl throughout Blurple’s green buds.'

## Build preprocessor and model

In [4]:
# Vectorise the combined text: English stop words removed, unigrams and
# bigrams, vocabulary capped at the 2000 highest-ranked terms.
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=2000)
dtm_sparse = tfidf.fit_transform(leafly['feelings_helps_description'])

# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()` (added in 1.0). Prefer the new accessor and fall
# back to the old one so the cell also runs on the 0.23.x release this
# notebook was originally executed with.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# `.toarray()` yields a plain ndarray; `.todense()` returns the discouraged
# `np.matrix` type. The resulting DataFrame is the same either way.
dtm = pd.DataFrame(dtm_sparse.toarray(), columns=feature_names)

# Index the document-term matrix for 4-nearest-neighbour lookups.
# NOTE(review): tree-based search is documented to degrade as dimensionality
# grows; with up to 2000 features, confirm 'kd_tree' is still the right choice.
nn = NearestNeighbors(n_neighbors=4, algorithm='kd_tree')
nn.fit(dtm)

NearestNeighbors(algorithm='kd_tree', n_neighbors=4)

## Pickle preprocessor and model

In [5]:
# Show the installed scikit-learn version in the notebook output, right before
# the fitted objects are pickled.
# NOTE(review): presumably recorded so the pickles written below can be loaded
# with a matching scikit-learn version — confirm and pin it in requirements.
import sklearn
sklearn.__version__

'0.23.2'

In [6]:
# Serialise the fitted nearest-neighbour index and the fitted vectorizer to
# their own pickle files, in that order.
pickle_targets = (
    ('../pickles/nn_model.pkl', nn),
    ('../pickles/tfidf.pkl', tfidf),
)

for pickle_path, fitted_object in pickle_targets:
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(fitted_object, pickle_file)

## Pickle minified data


In [7]:
# Columns kept in the minified dataset that is pickled below.
min_data_columns = [
    'strain',
    'type',
    'feeling_1',
    'feeling_2',
    'feeling_3',
    'feeling_4',
    'feeling_5',
    'helps_1',
    'helps_2',
    'helps_3',
    'helps_4',
    'helps_5',
    'description',
]

# Convert the selected columns to a NumPy record array; `to_records()` with
# its default arguments also stores the DataFrame index as the first field.
min_data = leafly[min_data_columns].to_records()

with open('../pickles/min_data.pkl', 'wb') as min_data_file:
    pickle.dump(min_data, min_data_file)