In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mlp
from scipy import stats
import time
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

Build a content-based recommendation system using cosine similarity

Pros: can recommend new items (no customer ratings before) and recommend for users with unique tastes

Cons:
The front end needs to collect data and build a user profile for new users.
It will not be able to recommend items outside the user's content profile, and it cannot exploit the quality judgements of other users.

In [2]:
# Load the cleaned profile export; the rest of the notebook reads from this frame.
# NOTE(review): the column listing further down shows an 'Unnamed: 0' column, which
# looks like a previously saved index -- consider index_col=0 after confirming.
raw_trimmed = pd.read_csv(
    filepath_or_buffer='Jan2022_Cannabis-Profiles_Data Science_cleaned.csv',
)

In [3]:
# Dimensions of the loaded frame as (rows, columns).
raw_trimmed.shape

(4000, 86)

# Prepare feature space

High dimensionality drives the similarity score to 1

To reduce dimensionality, we need to divide the features into several groups.

Cannabis volume/perc

Terpenes volume/perc





In [4]:
# Show the column labels from position 80 to the end of the frame.
raw_trimmed.columns[80:]

Index(['CBC_Pct_Pct_Total', 'CBDV_Pct_Pct_Total', 'THCV_Pct_Pct_Total',
       'CBL_Pct_Pct_Total', 'D8_Pct_Pct_Total', 'Total_Terpene_Volume_pct'],
      dtype='object')

In [5]:
# cannabis_vol_column =    ['THC_vol', 'CBD_vol', 'CBG_vol', 'CBN_vol', 'CBC_vol', 'CBDV_vol',
#        'THCV_vol', 'CBL_vol']

# for col in cannabis_vol_column:
#     raw_trimmed[f'{col}_% of Total'] = raw_trimmed[col]/raw_trimmed['Total_Cannabinoids_mg_g']

In [6]:
# List every column label so the features can be split into groups.
raw_trimmed.columns

Index(['Unnamed: 0', 'Strain_Name', 'Batch_ID', 'A_Pinene', 'Distributor',
       'Strain', 'MetricID', 'Type', 'Distributor_Address',
       'Distributor_City_State_Zip', 'B_Caryophyllene', 'Producer', 'Brand',
       'Date_Recieved', 'Matrix', 'Producer_City_State_Zip',
       'Producer_Lic_Number', 'terpinolene', 'Producer_Address', 'Sample_ID',
       'Lab_ID', 'Total_Terpene_Volume', 'A_Humulene', 'Date_Harvested',
       'B_Myrcene', 'dataOrigin', 'Ocimene', 'Date_Collected', 'D_Limonene',
       'brandPK', 'B_Pinene', 'Distributor_Lic#', 'Linalool',
       'Date_Test_Completed', 'PrimaryKey', 'THC_Pct', 'CBD_Pct', 'CBG_Pct',
       'CBN_Pct', 'CBC_Pct', 'CBDV_Pct', 'THCV_Pct', 'CBL_Pct', 'D8_Pct',
       'Total_Cannabinoid_Pct', 'Nerolidol', 'A-Bisabalol', 'Other',
       'B_Myrcene_Pct_Total', 'B_Caryophyllene_Pct_Total',
       'D_Limonene_Pct_Total', 'A_Pinene_Pct_Total', 'B_Pinene_Pct_Total',
       'A_Humulene_Pct_Total', 'terpinolene_Pct_Total', 'Linalool_Pct_Total',
     

In [11]:
# Put the two totals side by side: raw terpene volume vs. cannabinoid percentage.
raw_trimmed.loc[:, ['Total_Terpene_Volume', 'Total_Cannabinoid_Pct']]

Unnamed: 0,Total_Terpene_Volume,Total_Cannabinoid_Pct
0,35.47,0.00000
1,19.62,0.19413
2,20.78,0.00000
3,19.20,0.00000
4,34.83,0.00000
...,...,...
3995,8.80,0.23849
3996,12.04,0.30792
3997,9.43,0.24287
3998,6.35,0.29174


In [13]:
# Scale the total terpene volume down by a factor of 1000 and store it as a new column.
# Presumably this puts it on the same scale as the *_Pct cannabinoid columns
# (a later cell notes "THC % = THC volume/1000") -- TODO confirm the units.
# NOTE(review): the frame already carries a 'Total_Terpene_Volume_pct' column
# (lower-case "pct"); verify this new column is not a duplicate of it.
raw_trimmed['Total_Terpene_Volume_Pct'] = raw_trimmed['Total_Terpene_Volume'].div(1000)

In [14]:
# Re-list the column labels after adding 'Total_Terpene_Volume_Pct'.
raw_trimmed.columns

Index(['Unnamed: 0', 'Strain_Name', 'Batch_ID', 'A_Pinene', 'Distributor',
       'Strain', 'MetricID', 'Type', 'Distributor_Address',
       'Distributor_City_State_Zip', 'B_Caryophyllene', 'Producer', 'Brand',
       'Date_Recieved', 'Matrix', 'Producer_City_State_Zip',
       'Producer_Lic_Number', 'terpinolene', 'Producer_Address', 'Sample_ID',
       'Lab_ID', 'Total_Terpene_Volume', 'A_Humulene', 'Date_Harvested',
       'B_Myrcene', 'dataOrigin', 'Ocimene', 'Date_Collected', 'D_Limonene',
       'brandPK', 'B_Pinene', 'Distributor_Lic#', 'Linalool',
       'Date_Test_Completed', 'PrimaryKey', 'THC_Pct', 'CBD_Pct', 'CBG_Pct',
       'CBN_Pct', 'CBC_Pct', 'CBDV_Pct', 'THCV_Pct', 'CBL_Pct', 'D8_Pct',
       'Total_Cannabinoid_Pct', 'Nerolidol', 'A-Bisabalol', 'Other',
       'B_Myrcene_Pct_Total', 'B_Caryophyllene_Pct_Total',
       'D_Limonene_Pct_Total', 'A_Pinene_Pct_Total', 'B_Pinene_Pct_Total',
       'A_Humulene_Pct_Total', 'terpinolene_Pct_Total', 'Linalool_Pct_Total',
     

In [15]:

# Split the feature space into groups so each group gets its own distance matrix.
# Author's note on units: THC % = THC volume / 1000
#                               = Total Cannabinoid %

# Cannabinoid amounts per compound plus their total; the author treats these
# *_Pct columns as the "volume" group.
cannabis_vol = raw_trimmed.loc[:, [
    'THC_Pct', 'CBD_Pct', 'CBG_Pct', 'CBN_Pct', 'CBC_Pct',
    'CBDV_Pct', 'THCV_Pct', 'CBL_Pct', 'D8_Pct',
    'Total_Cannabinoid_Pct',
]]

# Each terpene's *_Pct_Total column.
terpenes_perc = raw_trimmed.loc[:, [
    'B_Myrcene_Pct_Total', 'B_Caryophyllene_Pct_Total', 'D_Limonene_Pct_Total',
    'A_Pinene_Pct_Total', 'B_Pinene_Pct_Total', 'A_Humulene_Pct_Total',
    'terpinolene_Pct_Total', 'Linalool_Pct_Total', 'Ocimene_Pct_Total',
    'Nerolidol_Pct_Total', 'A_Bisabalol_Pct_Total', 'Other_Pct_Total',
]]

# Each cannabinoid's *_Pct_Pct_Total column.
cannabis_perc = raw_trimmed.loc[:, [
    'THC_Pct_Pct_Total', 'CBD_Pct_Pct_Total', 'CBG_Pct_Pct_Total',
    'CBN_Pct_Pct_Total', 'CBC_Pct_Pct_Total', 'CBDV_Pct_Pct_Total',
    'THCV_Pct_Pct_Total', 'CBL_Pct_Pct_Total', 'D8_Pct_Pct_Total',
]]

# Raw terpene columns per compound plus the total terpene volume.
terpenes_vol = raw_trimmed.loc[:, [
    'B_Myrcene', 'B_Caryophyllene', 'D_Limonene', 'A_Pinene', 'B_Pinene',
    'A_Humulene', 'terpinolene', 'Linalool', 'Ocimene', 'Nerolidol',
    'A-Bisabalol', 'Other',
    'Total_Terpene_Volume',
]]

# Two-feature experiment: only the overall terpene and cannabinoid totals.
test_vol = raw_trimmed.loc[:, ['Total_Terpene_Volume_Pct', 'Total_Cannabinoid_Pct']]


In [17]:
# Replace missing values with 0 in EVERY feature group that is later passed to
# cal_cos_similarity: scikit-learn's cosine_distances validates its input and
# rejects NaN, so an unfilled group would abort the distance computation.
# The terpene groups were previously left unfilled; filling them is a no-op
# when they contain no NaN.
cannabis_vol = cannabis_vol.fillna(0)
cannabis_perc = cannabis_perc.fillna(0)
terpenes_vol = terpenes_vol.fillna(0)
terpenes_perc = terpenes_perc.fillna(0)
test_vol = test_vol.fillna(0)

In [18]:
# data = raw_trimmed[[ 'THC %', 'CBD %', 'CBG %', 'CBN %', 'CBC %', 'CBDV %', 'THCV %',
#        'CBL %','B_Myrcene_% of Total',
#        'B_Caryophyllene_% of Total', 'D_Limonene_% of Total',
#        'A_Pinene_% of Total', 'B_Pinene_% of Total', 'A_Humulene_% of Total',
#        'terpinolene_% of Total', 'Linalool_% of Total', 'Ocimene_% of Total',
#        'Nerolidol_% of Total', 'A-Bisabalol_% of Total', 'Other_% of Total','Total Terpene Volume']]

In [19]:
## encode Matrix, Brand, Strain, Type

In [20]:
#profile = raw_trimmed[['Matrix']]

In [21]:
#profile_binary = pd.get_dummies(profile,prefix = ['Matrix'])

In [22]:
#data_w_profile = data.merge(profile_binary, left_index=True,right_index=True)

In [23]:
# data_w_profile = data
# data_w_profile

In [24]:
# test = list(similarity_score.head().index
# data_w_profile[data_w_profile['id2'].isin(test)]

# Cosine Similarity

## similarity in groups of features 

In [25]:
# Batch_ID of every row, in row order; these become the row and column labels
# of the pairwise distance matrices.
# NOTE(review): label-based lookups on those matrices assume Batch_ID is unique -- confirm.
names = raw_trimmed['Batch_ID'].to_numpy().tolist()
len(names)

4000

In [26]:
from sklearn.metrics.pairwise import cosine_distances

# Default labels for the distance matrices: one Batch_ID per row of raw_trimmed.
names = raw_trimmed['Batch_ID'].values.tolist()


def cal_cos_similarity(data_subset, labels=None):
    """Return the labelled pairwise cosine *distance* matrix of ``data_subset``.

    Despite the function name, the values are cosine distances
    (1 - cosine similarity): 0 means two rows point in the same direction,
    and larger values mean the rows are less alike. Sort ascending to get the
    most similar items first.

    Parameters
    ----------
    data_subset : array-like of shape (n_samples, n_features)
        Feature rows to compare with each other. scikit-learn validates this
        input, so it should be numeric and free of NaN.
    labels : sequence, optional
        Labels used for both the index and the columns of the result, one per
        row of ``data_subset``. Defaults to the module-level ``names`` list,
        which keeps existing single-argument calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        Square (n_samples x n_samples) frame of cosine distances.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of rows compared.
    """
    if labels is None:
        # Fall back to the notebook-wide Batch_ID list (original behaviour).
        labels = names

    distances = cosine_distances(data_subset)

    # Fail with a readable message instead of pandas' generic shape error when
    # the labels and the data have drifted apart (e.g. rows were filtered).
    n_rows = distances.shape[0]
    if len(labels) != n_rows:
        raise ValueError(
            f"Got {len(labels)} labels for {n_rows} rows; "
            "labels must match the rows of data_subset one-to-one."
        )

    return pd.DataFrame(distances, index=labels, columns=labels)

In [27]:
# from scipy.spatial.distance import euclidean
# names = raw_trimmed['Batch_ID'].values.tolist()
# def cal_euc_similarity(data_subset):
#     distances = euclidean(data_subset)
#     distance_df = pd.DataFrame(distances,columns = names, index = names)
#     return distance_df

In [28]:
# test_vol['Batch_ID']  = raw_trimmed['Batch_ID']

# test_vol.columns

In [29]:
# %%time
# from scipy.spatial.distance import euclidean
# names = raw_trimmed['Batch_ID'].values.tolist()
# distance_df = pd.DataFrame( columns = names, index = names)

# ## Euclidean distance 
# for i in tqdm(range(len(test_vol))):
#     for j in range(len(test_vol)):
                
#         key_i =list(test_vol.loc[i,'Batch_ID'])[0]
#         key_j = list(test_vol.loc[j,'Batch_ID'])[0]
#         #print(key_i,key_j)
#         point1 = np.array(test_vol[test_vol.index == i][['Total Terpene Volume %', 'Total Cannabinoid %']])
#         point2 = np.array(test_vol[test_vol.index == j][['Total Terpene Volume %', 'Total Cannabinoid %']])
# #             print(point1)
# #             print(point2)
#         distance = euclidean(point1,point2)

#         distance_df.loc[ key_i,key_j] = distance
                



In [30]:
#  distance_df.loc[ 'GHFGG40930','GHFSD0903AA']

In [31]:
# len(cannabis_perc)

In [32]:
# One cosine-distance matrix per feature group, computed in the same order as before:
# cannabinoid shares, terpene shares, cannabinoid amounts, terpene amounts.
cannabis_perc_dist, terpenes_perc_dist, cannabis_vol_dist, terpenes_vol_dist = (
    cal_cos_similarity(feature_group)
    for feature_group in (cannabis_perc, terpenes_perc, cannabis_vol, terpenes_vol)
)

In [33]:
# Distance matrix for the two-feature experiment (total terpene vs. total cannabinoid).
# NOTE(review): cosine distance ignores magnitude, so with only two features it
# reflects just the ratio between them -- every row whose cannabinoid total is 0
# ends up at distance 0 from every other such row. Verify this is the intent.
test_vol_dist = cal_cos_similarity(data_subset=test_vol)

In [34]:
# Display the labelled distance matrix (pandas abbreviates large frames with "...").
test_vol_dist

Unnamed: 0,IND1_134,S062819-3,IND1_107,IND1_181,IND1_66,IND1_112,IND1_193,IND1_230,IND1_127,IND1_179,...,GHFPOG0930,GHFMAC0930,GHFGG40930,GHFPPL0930,GHFLILI0930,GHFWIFI0928MP,GHFGCK0928MP,GHFWIFI0928PR,GHFHF0924PR,GHFGCK0928PR
IND1_134,0.000000,0.899446,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.916231,0.934816,0.937616,0.944710,0.929228,0.963126,9.609288e-01,9.612019e-01,0.978239,0.965233
S062819-3,0.899446,0.000000,0.899446,0.899446,0.899446,0.899446,0.899446,0.899446,0.899446,0.899446,...,0.000142,0.000630,0.000733,0.001031,0.000447,0.002037,1.899336e-03,1.916209e-03,0.003116,0.002174
IND1_107,0.000000,0.899446,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.916231,0.934816,0.937616,0.944710,0.929228,0.963126,9.609288e-01,9.612019e-01,0.978239,0.965233
IND1_181,0.000000,0.899446,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.916231,0.934816,0.937616,0.944710,0.929228,0.963126,9.609288e-01,9.612019e-01,0.978239,0.965233
IND1_66,0.000000,0.899446,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.916231,0.934816,0.937616,0.944710,0.929228,0.963126,9.609288e-01,9.612019e-01,0.978239,0.965233
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GHFWIFI0928MP,0.963126,0.002037,0.963126,0.963126,0.963126,0.963126,0.963126,0.963126,0.963126,0.963126,...,0.001104,0.000402,0.000326,0.000170,0.000576,0.000000,2.417947e-06,1.854300e-06,0.000114,0.000002
GHFGCK0928MP,0.960929,0.001899,0.960929,0.960929,0.960929,0.960929,0.960929,0.960929,0.960929,0.960929,...,0.001003,0.000342,0.000272,0.000132,0.000504,0.000002,0.000000e+00,3.734483e-08,0.000150,0.000009
GHFWIFI0928PR,0.961202,0.001916,0.961202,0.961202,0.961202,0.961202,0.961202,0.961202,0.961202,0.961202,...,0.001015,0.000349,0.000279,0.000136,0.000513,0.000002,3.734483e-08,0.000000e+00,0.000145,0.000008
GHFHF0924PR,0.978239,0.003116,0.978239,0.978239,0.978239,0.978239,0.978239,0.978239,0.978239,0.978239,...,0.001928,0.000945,0.000827,0.000563,0.001204,0.000114,1.499638e-04,1.452683e-04,0.000000,0.000085


In [35]:
%%time
# Persist each labelled distance matrix as CSV in the working directory.
# NOTE(review): the recorded run spent about 52 s of wall time on these writes.

# Per-group matrices
cannabis_perc_dist.to_csv(path_or_buf='cannabis_perc_product_cosine_distance_df.csv')
terpenes_perc_dist.to_csv(path_or_buf='terpenes_perc_product_cosine_distance_df.csv')
cannabis_vol_dist.to_csv(path_or_buf='cannabis_vol_product_cosine_distance_df.csv')
terpenes_vol_dist.to_csv(path_or_buf='terpenes_vol_product_cosine_distance_df.csv')

# Two-feature (total terpene vs. total cannabinoid) matrix
test_vol_dist.to_csv(path_or_buf='terpenes_cannabi_vol_product_cosine_distance_df.csv')

CPU times: user 51.5 s, sys: 541 ms, total: 52 s
Wall time: 52.2 s


In [36]:
# terpenes_vol_dist.shape

In [37]:
# terpenes_vol_test

In [38]:
# terpenes_vol_test = cal_cos_similarity(terpenes_vol)

In [39]:
# terpenes_vol_test.to_csv('test_terpenes_vol_product_cosine_distance_df.csv')

## all the feature at once

In [40]:

# from sklearn.metrics.pairwise import cosine_distances
# distances = cosine_distances(data_w_profile)
# distances

In [41]:
#raw_trimmed['id2'] = raw_trimmed['Batch_ID'].astype('str') + "_" +raw_trimmed['Strain_integrated'].astype('str')

In [42]:
#data_w_profile['id2'] = raw_trimmed['Batch_ID'].astype('str') + "_" +raw_trimmed['Strain_integrated'].astype('str')

In [43]:
#data_w_profile['id2'] 

In [44]:
#names = data_w_profile['id2'] .values.tolist()
# names = raw_trimmed['Batch_ID'] .values.tolist()
# PairwiseDistances = []

# for i in range(0, len(distances), 1):

#       PairwiseDistances = PairwiseDistances + list(zip( [names[i]]*len(names) ,names ,distances[i]  ))
#   # Pairwise Distance is calculated cosine distance
# PairwiseDistances

In [45]:
# distance_df = pd.DataFrame(distances,columns = names, index = names)

In [46]:
# distance_df

In [47]:
## using all the features at once. 
# distance_df.to_csv('product_cosine_distance_df.csv')
# distance_df.iloc[:,0].sort_values(ascending = True)

## Put distance into pairwise dictionary

In [48]:
# rec_dict = dict()
# for i in distance_df.columns:
#     rec_dict[i] = distance_df.loc[:,i].sort_values(ascending = True)[:10]

In [49]:
# rec_dict

In [50]:
# rec_dict_full = dict()
# for i in distance_df.columns:
#     rec_dict_full[i] = distance_df.loc[:,i].sort_values(ascending = True)

In [51]:
# test = list(rec_dict['BDRP-6715_Blueberry x Town Jewels'].index)

In [52]:
# raw_trimmed[raw_trimmed['id2'].isin(test)]

# Cluster

In [53]:
# clustered = pd.read_csv('concentrates_clustered_31_Aug.csv')

In [54]:
# clustered['id2'] = clustered['Batch_ID'].astype('str') + "_" +clustered['Strain_integrated'].astype('str')

In [55]:
# clustered.columns

In [56]:
# clustered[clustered['id2'].isin(test)]['individual_cluster_adjust_label_2']

In [57]:
# clustered[clustered['id2']=='BDRP-6715_Blueberry x Town Jewels']['individual_cluster_adjust_label_2']

In [58]:
# rec_by_cluster = clustered[clustered['individual_cluster_adjust_label_2']==152254054]['id2'].values

In [59]:
# rec_by_distance = test

In [60]:
# def intersection(lst1, lst2):
#     lst3 = [value for value in lst1 if value in lst2]
#     return lst3

In [61]:
# intersection(rec_by_cluster,rec_by_distance)

In [62]:
# data.columns

Only two intersections with the top 10 similar products

In [63]:
## the similarity score of products in the same cluster
# similarity_score =pd.DataFrame(rec_dict_full['BDRP-6715_Blueberry x Town Jewels'])
# similarity_score

In [64]:
# similarity_score['Rank'] = list(range(2952))

In [65]:
# rec_by_cluster

In [66]:
# similarity_score[similarity_score.index.isin( list(rec_by_cluster))]

Although rec_by_cluster has only 2 items in common with the top-10 rec_by_distance,
the similarity scores of its items are still very high.  23/2900



# Some ideas to combine cosine and cluster

1. Same cluster, rank by similarity
2. Rank by similarity, take the top 30, select the ones in similar clusters
3. Vote
4. 

In [67]:
## match customer experience
## find cluster distance to expand recommendation

