# About this notebook:
## Steps:
#### 1. Ingest user selected filters
#### 2. Filter plants
#### 3. Recommendation using item-simialrity

### Import library

In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
pd.set_option('display.max_columns', None)

import copy
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans, DBSCAN, SpectralClustering
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns', None)
from sklearn.metrics import silhouette_samples, silhouette_score

  from pandas.core.computation.check import NUMEXPR_INSTALLED


### Data

In [2]:
!ls '../../plant_master_v4.csv'

../../plant_master_v4.csv


In [3]:
# filter json path
pref_path = 'filters.json'
# filter_url = 'http://20.127.87.137/recommendations/data'

# plant data path
# plant_path = 'plant_master_v4.csv'
plant_path ='../../plant_master_v4.csv'

# environment data path
envr_path = 'envr.csv'

# User preference ingestion

TODO: change UI filters to UI filters to the following set:

type: {'0': 'edible', '1': 'flowering', '2': 'greens', '3': 'hybrid'}

height: range(min, max)

width: range(min, max)

smell:(1|0)

showy :(1|0)

sun: {'0': 'full sun',
              '2': 'full shade',
              '4': 'part sun'}
              
maintenance:  
             {'0': 'medium', 
              '1': 'low', 
              '2': 'high'}
month: 

In [4]:
# Opening JSON file
# with open(pref_path) as json_file:
#     user = json.load(json_file)

In [5]:
## current filters that work for the master_v3
test = {
    'type': 
             {'1': 'hybrid'},  #, , '2': 'greens', '3': 'hybrid'
    'height': 
             {'max': 10, 
              'min': 0}, 
    'width':
             {'max': 30, 
              'min': 0},
    'smell': 
             {'min': 0}, 
    'showy': 
             {'max': 0},
    'maintenance': 
             {'1': 'high'},
    'zipcode': 
             {'ip': '67.170.250.166',
              'zip': '83703'},
    'month': {'0': 6}
}


In [6]:
# envr = pd.read_csv(envr_path, index_col=0, dtype={'zip_code': 'string'})
# envr.zip_code.sample(5)

# 45507     20850
# 24323     11385
# 188493    83703
# 23008     10970
# 59375     27560

TODO: adapt ingest_filters to new filter json format

In [7]:
# variables
def ingest_filters(j):
    params = []
    envr = []
    for _, typ in j['type'].items():
        type_col = typ
    for _, sm in j['smell'].items():
        smell = sm
    for _, sw in j['showy'].items():
        showy = sw
#     for _, su in j['sun'].items():
#         sun = su
    for _, mt in j['maintenance'].items():
        if mt == 'high':
            maintenance = ['low', 'medium', 'high', 'none']
        elif mt == 'medium':
            maintenance = ['low', 'medium', 'none']
        else: 
            maintenance = ['low', 'none']
        
            
    for _, mn in j['month'].items():
        month = mn

    height_max = j['height']['max']
    height_min = j['height']['min']
    width_max = j['width']['max']
    width_min = j['width']['min']
    zipcode = j['zipcode']['zip']
    
    params = [type_col, smell, showy, maintenance, height_max, height_min, width_max, width_min]
    envr = [zipcode, month]
    
    return envr, params

In [8]:
envr_filters, plant_filters = ingest_filters(test)

In [9]:
# envr = pd.read_csv(envr_path, index_col=0, dtype={'zip_code': 'string'})
# envr

# Use user's zipcode, month to find temperature/sun/zone


Full sun - 6 or more hours of direct sun per day
Part sun - 4 to 6 hours of direct sun per day, including some afternoon sun
Part shade - 4 to 6 hours of direct sun per day, mostly before midday
Full shade - less than 4 hours of direct sun per day

In [10]:
def envr_features(envr_path, envr_filters):
    # unpack filters
    zipcode, month = envr_filters
    
    def temp_bucket(x):
        x =round(x)
        if x <= 55:
            o = 'low'
        elif x in range(56, 66):
            o = 'cool'
        elif x in range(66, 75):
            o = 'mild'
        elif x in range(75, 85):
            o = 'warm'
        else:
            o = 'hot'
        return o

    def sun_bucket(x):
        '''
        Full sun - 6 or more hours of direct sun per day
        Part sun - 4 to 6 hours of direct sun per day, including some afternoon sun
        Part shade - 4 to 6 hours of direct sun per day, mostly before midday
        Full shade - less than 4 hours of direct sun per day
        '''
        x = round(x)
        if x <= 4:
            out = 'full shade'
        elif x in range(4, 7):
            out = 'part sun'
        elif x in range(6, 26):
            out = 'full sun'
        return out

    envr = pd.read_csv(envr_path, index_col=0, dtype={'zip_code': 'string'})
    loc_info = envr[(envr['zip_code'] == zipcode) & (envr['Month'] == month)].to_dict()
    temp_cat = temp_bucket(list(loc_info['tmin'].values())[0])
    zone = list(loc_info['zone'].values())[0]
    sun_cat = sun_bucket(list(loc_info['GHI_per_day'].values())[0])
    
    return zone, sun_cat, temp_cat

In [11]:
zone, sun_cat, temp_cat = envr_features(envr_path, envr_filters)

# Filter plant data

In [12]:
# read plant data

def subset_plant(plant_path, plant_filters):
    
    """ input: 
            plant_path: path to plant master data
            filter_params: filters ingested from front-end json and environment features
        output: 
            subset of plants
    """
    # read plant data
    plant_df = pd.read_csv(plant_path, index_col=0)
    
    # unpack filters
    type_col, smell, showy, maintenance, height_max, height_min, width_max, width_min = plant_filters
    
    sub = plant_df[(plant_df[type_col] == 1) 
                   & (plant_df['smell']== smell) 
                   & (plant_df['showy']== showy) 
                   & (plant_df['sun']== sun_cat) 
                   & (plant_df['maintenance'].isin(maintenance))
                   & ((plant_df['height']<=height_max) & (plant_df['height']>=height_min)) 
                   & ((plant_df['width']<=width_max) & (plant_df['height']>=width_min)) 
                   & (plant_df['zones'].isin([zone-1, zone, zone+1]))
                   & (plant_df['temp_bucket']== temp_cat)]

    # remove the filter columns
    cols_to_drop = ['smell', 'showy','sun', 'maintenance', 'height', 'width', 'zones', 'temp_bucket', type_col]
    sub.drop(columns=cols_to_drop, inplace=True)
    
    return sub

In [13]:
df = subset_plant(plant_path, plant_filters)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub.drop(columns=cols_to_drop, inplace=True)


In [14]:
df.columns

Index(['attracts', 'common', 'water', 'special_feature', 'propagation',
       'problem_solvers', 'flowering', 'annual', 'perennial', 'biennial',
       'drought', 'air_pollution', 'dry_soil', 'wet_soil', 'clay_soil',
       'rain_garden', 'good_for_containers', 'water_plant', 'm_type', 'edible',
       'bloom_season', 'greens'],
      dtype='object')

# Randomly select samples

In [15]:
# randomely select n samples
N_REC = 1
RAM_PLANT = df.common.sample(N_REC).values[0]

In [16]:
def pca(df):
    """ function: one-hot encoded catgorical features concatonated with binary feature
        input: subset of plant
        output: dimentionality reduced pca data
    """
    
    cat_cols = ['attracts', 'water', 'special_feature', 'propagation',
           'problem_solvers', 'm_type', 'bloom_season']

    categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
    encoded_columns = categorical_preprocessor.fit_transform(df[cat_cols])
    processed_data = encoded_columns.todense()

    while True:
        try:
            pca = PCA(n_components=processed_data.shape[1],random_state=42)
            pca_model = pca.fit(processed_data)

        except ValueError: 
            pca_data = processed_data
            break

        else:
            ratio = pca_model.explained_variance_ratio_
            cum_sum_eigenvalues = np.cumsum(ratio)

            # select 95% cutoff
            cutoff = np.where(cum_sum_eigenvalues>0.95)[0][0]
            pca = PCA(n_components=cutoff,random_state=42)
            pca_data = pca.fit_transform(processed_data)
            break

    bin_cols = [col for col in df.columns if col not in cat_cols or not 'common']
    target = df.pop('common').tolist()
    bin_cols.remove('common')
    bin_train = np.array(df[bin_cols])

    features = np.concatenate((bin_train, pca_data),axis=1) 
    
    return features, target

In [17]:
features, target = pca(df)

In [18]:
# mock user selection

USER_PREF_PLANT = RAM_PLANT
NUM_SIMILAR = 5

In [19]:
DOT = 'dot'
COSINE = 'cosine'

def compute_scores(query_embedding, item_embeddings, measure=COSINE):
  """Computes the scores of the candidates given a query.
  Args:
    query_embedding: a vector of shape [k], representing the query embedding.
    item_embeddings: a matrix of shape [N, k], such that row i is the embedding
      of item i.
    measure: a string specifying the similarity measure to be used. Can be
      either DOT or COSINE.
  Returns:
    scores: a vector of shape [N], such that scores[i] is the score of item i.
  """
  u = query_embedding
  V = item_embeddings
  if measure == COSINE:
    V = V / np.linalg.norm(V, axis=1, keepdims=True)
    u = u / np.linalg.norm(u)
  scores = u.dot(V.T)
  return scores


def recommendation(features, target, plant, df, NUM_SIMILAR):
    ''' 
    funtion: recommendation engine
    input:
        features: features for cosine similarity calc.
        target: names of plants
        plant: randomly selected item, used to find similarlity for
        df: subset of plant dataframe
        NUM_SIMILAR: num of recommendation to return
    output:
        df of top n similar items
    '''
    
    item_idx = target.index(plant)
    query_vec = features[item_idx][:]

    scores = compute_scores(query_vec, features, measure=COSINE)
    df['cosine'] = scores

    df['target']= target
    score_df = pd.DataFrame(df.groupby('target')['cosine'].mean().sort_values(ascending=False))
    sim_plants = score_df[:NUM_SIMILAR]

    return sim_plants

In [20]:
sim_items = recommendation(features, target, USER_PREF_PLANT, df, NUM_SIMILAR)

In [21]:
27560

27560

In [22]:
USER_PREF_PLANT

'kale'

In [23]:
sim_items

Unnamed: 0_level_0,cosine
target,Unnamed: 1_level_1
kohlrabi,0.861492
cabbage,0.735594
kale,0.696734
broccoli,0.616344
lettuce,0.570836
