In [None]:
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive; the dataset
# CSV is read from under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the strain dataset from the mounted Drive, then display its
# (rows, columns) shape as the cell output.
CSV_PATH = '/content/drive/My Drive/cannabis.csv'
data = pd.read_csv(filepath_or_buffer=CSV_PATH)
data.shape

(2351, 6)

In [None]:
# Preview the first rows to see the columns and typical values.
data.head()

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


In [None]:
# Count the missing values in each column.
data.isnull().sum()

Strain          0
Type            0
Rating          0
Effects         0
Flavor         46
Description    33
dtype: int64

In [None]:
# Replace missing text fields with placeholder sentences so that these
# columns contain only strings from here on.
missing_text = {
    'Flavor': 'Flavor information now is unavailable. ',
    'Description': 'Description information is unknown.',
}
for column, placeholder in missing_text.items():
  data[column] = data[column].fillna(placeholder)

In [None]:
# Distribution of how many comma-separated effects each strain lists.
effect_lists = data['Effects'].str.lower().str.split(',')
effect_lists.str.len().value_counts()

5    2180
1      97
4      32
3      27
2      15
Name: Effects, dtype: int64

In [None]:
# Collect the vocabulary of distinct lower-cased effect tokens across all
# strains, then display its size alongside the tokens themselves.
effect_tokens = data['Effects'].str.lower().str.split(',')
data_effects = set().union(*effect_tokens)
len(data_effects), data_effects

(16,
 {'aroused',
  'creative',
  'dry',
  'energetic',
  'euphoric',
  'focused',
  'giggly',
  'happy',
  'hungry',
  'mouth',
  'none',
  'relaxed',
  'sleepy',
  'talkative',
  'tingly',
  'uplifted'})

In [None]:
# Collect the vocabulary of distinct lower-cased flavor tokens across all
# strains, then display its size alongside the tokens themselves.
data_flavor = {
    token
    for tokens in data['Flavor'].str.lower().str.split(',')
    for token in tokens
}
len(data_flavor), data_flavor

(51,
 {'ammonia',
  'apple',
  'apricot',
  'berry',
  'blue',
  'blueberry',
  'butter',
  'cheese',
  'chemical',
  'chestnut',
  'citrus',
  'coffee',
  'diesel',
  'earthy',
  'flavor information now is unavailable. ',
  'flowery',
  'fruit',
  'grape',
  'grapefruit',
  'honey',
  'lavender',
  'lemon',
  'lime',
  'mango',
  'menthol',
  'mint',
  'minty',
  'none',
  'nutty',
  'orange',
  'peach',
  'pear',
  'pepper',
  'pine',
  'pineapple',
  'plum',
  'pungent',
  'rose',
  'sage',
  'skunk',
  'spicy/herbal',
  'strawberry',
  'sweet',
  'tar',
  'tea',
  'tobacco',
  'tree',
  'tropical',
  'vanilla',
  'violet',
  'woody'})

In [None]:
# Number of distinct strain names (a missing value, if any, counts as one).
data['Strain'].nunique(dropna=False)

2350

In [None]:
# Count how many strains fall under each value of the Type column.
data['Type'].value_counts()

hybrid    1212
indica     699
sativa     440
Name: Type, dtype: int64

In [None]:
# Show the dtype of every column.
data.dtypes

Strain          object
Type            object
Rating         float64
Effects         object
Flavor          object
Description     object
dtype: object

In [None]:
# Create a master profile feature: one comma-separated string per strain made
# of its type, effects, flavors and rating.
# The rating must be converted element-wise with .astype(str). Calling
# str(data['Rating']) stringifies the *whole* Series, which would append the
# same multi-line Series repr to every row instead of that row's own rating.
data['Profile'] = (
    data['Type']
    + ',' + data['Effects']
    + ',' + data['Flavor']
    + ',' + data['Rating'].astype(str)
)

In [None]:
# Spot-check the generated profile string for the row labelled 1.
data.loc[1, 'Profile']

'hybrid,Relaxed,Aroused,Creative,Happy,Energetic,Flowery,Violet,Diesel,0       4.0\n1       4.7\n2       4.4\n3       4.2\n4       4.6\n       ... \n2346    4.7\n2347    4.6\n2348    5.0\n2349    4.4\n2350    4.6\nName: Rating, Length: 2351, dtype: float64'

# KNN Model

In [None]:
# Download the small English spaCy model into the current environment.
# Importing the `spacy.cli` submodule also binds the top-level name `spacy`.
import spacy.cli
spacy.cli.download('en_core_web_sm')

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [None]:
# Load the downloaded pipeline; `nlp(text).vector` is used below to embed text.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Embed every profile string as one vector per strain.
# nlp.pipe streams the texts through the pipeline in batches, which is the
# recommended way to process many documents and avoids the per-call overhead
# of invoking nlp() once per row. astype(str) keeps the original str() coercion.
X = [doc.vector for doc in nlp.pipe(data['Profile'].astype(str))]

In [None]:
# Stack the per-strain vectors into a DataFrame (one row per strain, one
# column per vector component) and display its shape.
df = pd.DataFrame(X)
df.shape

(2351, 96)

In [None]:
# Preview the first rows of the embedding matrix.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95
0,0.354726,-1.214044,0.558231,0.460933,1.354464,0.865115,0.471298,0.133282,0.595543,-0.008739,1.376619,1.489451,-1.543706,-1.354528,-2.155865,-1.212803,0.260505,1.153557,0.483621,-0.235416,1.717711,-1.259839,-0.690264,-0.088017,-0.892222,0.067624,-1.1559,-1.08034,1.578337,-0.911599,-0.107353,-0.726251,0.020976,-0.201912,-0.431743,-0.698779,1.322874,-1.176686,1.148563,0.211596,...,-0.735736,0.597935,1.448869,-0.501088,0.490752,0.067765,1.379793,-0.449045,1.283808,-0.522273,2.22904,-1.13766,-1.820875,1.180262,1.686524,-1.34593,0.680885,0.610567,-0.238974,-0.024083,0.042082,2.269241,0.639109,-0.553829,1.026207,-0.004959,0.068601,-0.239944,2.753455,1.383925,-0.001384,-0.564256,-0.725815,-0.746969,0.141355,0.264706,-0.017476,-0.725212,0.292572,0.334034
1,0.345807,-1.163711,0.50354,0.537787,1.211014,0.898536,0.416772,0.137764,0.652658,0.188688,1.429866,1.335036,-1.67619,-1.355819,-2.182853,-1.199458,0.262901,1.281935,0.462808,-0.311308,1.6443,-1.220142,-0.688234,-0.078578,-0.940707,-0.075589,-1.341006,-1.10993,1.652313,-0.84572,-0.002826,-0.907333,0.133378,-0.185603,-0.421436,-0.610048,1.33994,-1.111626,1.224102,0.329836,...,-0.75967,0.571425,1.476756,-0.435935,0.559484,0.017474,1.321906,-0.429221,1.052382,-0.569634,2.240742,-1.12742,-1.964644,1.140884,1.722873,-1.19482,0.594219,0.497937,-0.28133,0.075993,-0.097674,2.341012,0.644269,-0.772472,0.990916,-0.031269,0.090199,-0.280487,2.859144,1.408176,0.039063,-0.660159,-0.696367,-0.731349,0.02108,0.133559,0.013984,-0.6062,0.427626,0.363346
2,0.402839,-1.026178,0.376286,0.454549,1.351765,0.767989,0.37393,0.083797,0.660569,-0.148323,1.417799,1.513975,-1.501284,-1.312195,-2.03271,-1.041674,0.321217,1.298158,0.551362,-0.430059,1.750123,-1.099843,-0.685772,-0.005128,-0.975476,-0.057679,-1.11706,-1.200906,1.68393,-0.810279,-0.128796,-0.691641,-0.035449,-0.28191,-0.526771,-0.70809,1.433776,-1.222712,1.163826,0.184522,...,-0.637169,0.422562,1.275893,-0.38621,0.638397,0.215625,1.286513,-0.261293,1.142172,-0.723952,2.16952,-0.916088,-1.940472,1.114567,1.538393,-1.322058,0.553656,0.63785,-0.352636,-0.039542,-0.008172,2.166485,0.69878,-0.701138,1.105407,-0.000108,0.178269,-0.19697,2.641396,1.312866,0.149788,-0.611777,-0.721009,-0.762531,0.180147,0.248035,-0.088283,-0.693737,0.324018,0.345734
3,0.191648,-1.099868,0.44767,0.437231,1.212628,0.879447,0.388952,0.111454,0.648283,0.121072,1.309956,1.401361,-1.641491,-1.21037,-2.146176,-1.110383,0.351649,1.278637,0.542277,-0.287113,1.457264,-1.275871,-0.702265,-0.099895,-0.930977,-0.05522,-1.344052,-1.09005,1.641188,-0.770712,-0.099107,-0.782415,-0.067619,-0.143828,-0.271191,-0.514618,1.360468,-1.22169,1.182598,0.300818,...,-0.739594,0.558454,1.380761,-0.457168,0.536404,0.228523,1.326899,-0.404802,1.168838,-0.388515,2.215576,-1.248806,-1.872734,1.094773,1.690426,-1.186881,0.648525,0.576559,-0.273659,-0.013629,-0.039928,2.335309,0.557571,-0.742393,0.986348,-0.004865,0.112854,-0.241546,2.890308,1.421865,0.105366,-0.561885,-0.811475,-0.705413,0.052072,0.204617,-0.003423,-0.492892,0.254657,0.326938
4,0.43372,-1.1274,0.528763,0.512377,1.372391,0.913417,0.476348,0.159518,0.607584,-0.048946,1.47995,1.553993,-1.725303,-1.447234,-2.120424,-1.093538,0.308891,1.339141,0.558512,-0.397709,1.779436,-1.288073,-0.654829,-0.022857,-0.898127,-0.032927,-1.294309,-1.143616,1.661815,-0.84178,-0.102796,-0.789406,0.044329,-0.204503,-0.421557,-0.652648,1.404581,-1.259315,1.179111,0.126498,...,-0.789057,0.589757,1.412727,-0.400946,0.467955,0.258935,1.312889,-0.398318,1.149196,-0.582268,2.143822,-1.119448,-1.865888,1.263614,1.712827,-1.318272,0.618975,0.708785,-0.267535,-0.034706,-0.101632,2.297103,0.6959,-0.654464,0.999718,0.064705,0.130263,-0.294807,2.80082,1.395579,0.068859,-0.65109,-0.683539,-0.782758,0.121693,0.264377,-0.038408,-0.624086,0.262207,0.245549


In [None]:
from sklearn.neighbors import NearestNeighbors

In [None]:
# Build a ball-tree nearest-neighbour index over the profile embeddings;
# each query will return the 3 closest rows. The fitted estimator is the
# cell's displayed value.
nn = NearestNeighbors(algorithm='ball_tree', n_neighbors=3)
nn.fit(X=df)

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                 radius=1.0)

In [None]:
import pickle

In [None]:
# Persist the fitted neighbour index to disk. The context manager ensures the
# file handle is flushed and closed even if pickling raises; the previous
# bare open() left the handle open.
with open('model.pkl', 'wb') as model_file:
  pickle.dump(nn, model_file)

In [None]:
# Test model: embed a free-text query and look up its nearest strains.
test = 'hybrid relaxed spicy'
test_vect = nlp(test).vector
# kneighbors takes a 2-D array of queries, so add a leading axis to the
# single query vector.
vect = test_vect[np.newaxis, :]
neighbor_distances, neighbor_rows = nn.kneighbors(vect)
result = (neighbor_distances, neighbor_rows)

In [None]:
# Raw kneighbors output: a (distances, row indices) pair of arrays.
result

(array([[13.79570682, 13.79914151, 13.80494399]]), array([[ 743,  791, 1872]]))

In [None]:
# The second element holds the neighbour row indices (one row per query).
result[1]

array([[ 743,  791, 1872]])

In [None]:
# Neighbour row indices for the single query that was submitted.
result[1][0]

array([ 743,  791, 1872])

In [None]:
# Translate the neighbour row positions into strain names with one
# positional lookup.
names = data['Strain'].iloc[result[1][0]].tolist()

In [None]:
# Display the recommended strain names.
names

['Ebola-7', 'Fire-Alien-Strawberry', 'Silverfalls-Kush']

In [None]:
# Print each recommended strain name followed by a blank line.
strain_column = data['Strain']
for row_position in result[1][0]:
  print(strain_column.iloc[row_position], end='\n\n')

Ebola-7

Fire-Alien-Strawberry

Silverfalls-Kush

