### Import Dependencies

In [1]:
import json
import pandas as pd, numpy as np

### Convert JSON to Pandas DataFrame

In [2]:
# The input is read as newline-delimited JSON: each line of the file is parsed
# as its own JSON document, and every parsed document becomes one row.
with open('Customer_Search_Sessions_Text_Data.json', 'r', encoding='UTF-8') as session_file:
    session_records = [json.loads(raw_line) for raw_line in session_file]
json_df = pd.DataFrame(session_records)
# Show the first rows as a quick check of the parsed columns.
json_df.head()

Unnamed: 0,query_arr,atc_item_arr,timestamp_arr,sessionid
0,[water bottle],[20005571_C],[1631528381],5ef0c4fd-31ae-4b60-adc3-ee0cf645c812
1,"[eau de source, nutella, huile d’olive, chou f...","[20005571_C, 20436105_EA;20574189_EA, 20729461...","[1631756371, 1631756422, 1631756501, 1631756536]",50534a81-5abe-4b66-be56-c767ee28acfc
2,"[coffee crisp, coffee, kinder, beuno, salad, b...","[20019964_C, 20050931_EA, NO PRODUCT, NO PRODU...","[1631542818, 1631542869, 1631545955, 163154598...",0f10758d-f4c6-4ec4-9a86-08e353a9c221
3,"[water, canned drinks, soda, hot dog buns, hot...","[20022126_C, 20375155_C, 20306687003_C, 207799...","[1631721943, 1631721964, 1631722025, 163172207...",e60e1c97-a7fc-46d3-af58-cc2f3de4b2f9
4,"[eau, jus de canneberge]","[20022126_C, NO PRODUCT]","[1631736981, 1631736999]",bb672d6d-fddc-4a9e-a898-6bd3f4bc8e78


## Problem Interpretations (w/ Assumptions)

### 1. Each row in the dataset corresponds to the searches a user made within one session
### 2. The user made multiple queries (reformulated one after another) to search for the desired product; the query_arr column stores the list of queries made consecutively within each session (each whole query is treated as a single token, analogous to a word)
### 3a. One interpretation is that the final query (the last entry in query_arr) is the product the user was actually looking for, i.e. the last query results in an add-to-cart event.
### 3b. Under this assumption, we calculate the similarity between each query in query_arr and the last query of the same session, as an attempt to quantify the inter-query relationship

### OR

### 4. The other interpretation is that we calculate the similarity between consecutive queries within each query_arr, per user session


## ML Solution using Word Embedding -  To measure text similarity 
##### Use the Word2Vec implementation from the gensim Python library, trained on my own vocabulary
##### (Limitation: only words present in that vocabulary can be analyzed; a word outside it will result in an error)

In [3]:
# Optional one-time setup, left commented out: installs/upgrades gensim,
# the package the following cells import Word2Vec from.
#! pip install --user --upgrade gensim

In [4]:
# 3 a&b. Comparing each query with last query for **Session 2, Row 3**
from gensim.models import Word2Vec

# Query list of the row at position 2 (first column of the frame). Each whole
# query string, e.g. 'coffee crisp', is used as one token.
vocab = json_df.iloc[2, 0]  # creating vocab

# The model is trained on this single session only, so its vocabulary is
# exactly the queries of this session.
# NOTE(review): the recorded outputs show the same scores recurring for
# unrelated sessions (e.g. -0.010839173 for the second-to-last query), so
# these numbers may not reflect query meaning - confirm before relying on them.
model = Word2Vec([vocab], min_count=1)  # training model

# Similarity of every query to the session's final query; the last entry is
# the final query compared with itself.
last_query = vocab[-1]
score = [model.wv.similarity(query, last_query) for query in vocab]  # analyzing model

print(score)
print(vocab)

[-0.059876293, -0.027750367, -0.111670576, -0.052346755, -0.010839173, 1.0]
['coffee crisp', 'coffee', 'kinder', 'beuno', 'salad', 'bacon']


In [5]:
# 4. Consecutive queries compared and similarity score produced for **Session 2, Row 3**
from gensim.models import Word2Vec

# Query list of the row at position 2 (first column of the frame). Each whole
# query string is used as one token.
vocab = json_df.iloc[2, 0]    # creating vocab

# Trained on this single session only, so only this session's queries are
# known to the model.
model = Word2Vec([vocab], min_count=1)   # training model

# Pair each query with the one that follows it: a session with n queries
# yields n - 1 scores.
score = [
    model.wv.similarity(current_query, next_query)  # analyzing model
    for current_query, next_query in zip(vocab, vocab[1:])
]

print(score)
print(vocab)

[0.13887985, -0.04461707, -0.013514948, -0.023671674, -0.010839173]
['coffee crisp', 'coffee', 'kinder', 'beuno', 'salad', 'bacon']


In [6]:
# 3 a&b. Comparing each query with last query for *** all Sessions in dataset ***
from gensim.models import Word2Vec

# Iterate the first column (the per-session query lists) directly instead of
# indexing json_df.values[i] on every pass, which can materialise the whole
# frame as an array each time it is accessed.
for vocab in json_df.iloc[:, 0]:  # creating vocab
    if len(vocab) == 0:
        # An empty session has no last query and gives the model nothing to
        # train on; report it and continue rather than aborting the loop.
        print([])
        print(vocab)
        continue

    # A fresh model per session: its vocabulary is that session's queries only.
    model = Word2Vec([vocab], min_count=1)  # training model

    # Similarity of every query to the session's final query; the last entry
    # is the final query compared with itself.
    last_query = vocab[-1]
    score = [model.wv.similarity(query, last_query) for query in vocab]  # analyzing model

    print(score)
    print(vocab)

[1.0]
['water bottle']
[-0.111670576, -0.052346755, -0.010839173, 1.0]
['eau de source', 'nutella', 'huile d’olive', 'chou fleur']
[-0.059876293, -0.027750367, -0.111670576, -0.052346755, -0.010839173, 1.0]
['coffee crisp', 'coffee', 'kinder', 'beuno', 'salad', 'bacon']
[0.16918248, 0.08388674, 0.0024377787, -0.014110296, -0.11393685, 0.2188389, 0.054231673, -0.03758423, -0.04111742, 0.09312754, 0.079643264, 0.062814005, 0.21646474, 0.027162561, 0.09294609, 0.016276052, -0.059891753, -0.027792247, -0.11168494, -0.05228118, 0.06444459, -0.050787263, -0.13255474, 0.1416529, -0.01699349, -0.0011241268, -0.03421791, -0.07293519, -0.09941384, -0.01624264, 0.01690807, 0.096444026, 0.059555467, -0.001957709, 0.10862734, -0.14681098, 0.15155162, 0.04643498, 0.19564298, 0.099447004, -0.13832569, -0.095202714, -0.010680665, 1.0]
['water', 'canned drinks', 'soda', 'hot dog buns', 'hot dogs', 'milk', 'yogurt', 'cheddar cheese', 'nacho cheese', 'mozzarella', 'margarine', 'cucumbers', 'broccoli, cab

[0.21883667, -0.052346755, -0.11167108, -0.027750367, -0.059876293, 0.016134718, 0.09291723, -0.010839173, 0.027066018, 0.0628508, 0.07970877, 0.09314463, -0.041253414, -0.037719578, 0.054333676, 0.21614, 1.0]
['natural spring water case', 'blueberries', 'caesar dressing', 'healthy skin anti wrinkle cream', 'healthy skin face lotion', 'soft & thick toilet paper, 12 rolls= 24', 'two scoops raisin bran cereal', 'ziploc', 'lettuce', 'granulated white sugar', 'package of coconut', 'unsweetened coconut', '14 grains whole wheat bread', 'tide laundry detergent', 'coffee pods', 'chicken broth', 'european wieners']
[-0.037713584, -0.041253414, 0.093107, 0.07963486, 0.06287837, 0.21617144, 0.027057461, 0.09291723, 0.016169284, -0.059876293, -0.027739404, -0.11166717, -0.052346755, -0.010839173, 1.0]
['water', 'banana', 'lime yogurt', 'strawberry', 'pc strawberries', 'bagels', 'silver hills', 'chocolate soy', 'bread', 'cheese', 'cheese slices', 'hummus', 'carrots', 'cucumber', 'yogurt greek']
[0.

[0.19537023, -0.052302137, -0.1108251, -0.02772966, -0.059861604, 0.01633822, 0.09324668, 0.027701491, 0.21613, 0.06284952, 0.079663746, 0.09369284, -0.040945526, -0.010441422, -0.037528288, 0.21919294, -0.11391056, -0.013988867, 0.0024561626, 0.08440935, 0.06451626, -0.050271325, -0.13252832, -0.001503757, -0.09545748, -0.13840753, 0.09946191, 0.054321274, 1.0]
['eau', 'oeufs', 'oeuf', 'brocolis', 'choux fleur', 'poulet', 'ficello', 'oasis', 'nutri grains', 'ritz', 'cristie', 'minces aux légumes', 'poivrons', 'choux bruxelle', 'salade de poulet', 'raisins verts', 'betty crocker', 'betty crocker purée', 'pommes de terre en purée', 'cottage', 'café', 'café en grains', 'café le choix du président', 'café van houtte', 'crème glacée', 'crème glacée caramel', 'melon', 'framboises', 'haricots']
[-0.111670576, -0.052346755, -0.010839173, 1.0]
['water', 'eggs', 'slice cheese', 'toilet paper']
[0.21617144, 0.0272738, 0.092917226, 0.01613473, -0.059876285, -0.027750367, -0.111670576, -0.05234675

[0.0024360362, -0.052346755, -0.11165246, -0.027750367, -0.059813786, 0.016171485, 0.09291723, 0.027047977, 0.21617801, 0.0628508, 0.07963486, 0.09310115, -0.041253414, -0.037699, 0.054330662, 0.21884276, -0.11398096, -0.0143946875, -0.010839173, 1.0]
['water case', 'almond milk', 'milk', 'dill pickles', 'hot peppers', 'bell peppers', 'yop case', 'yop', 'yop yogurt', 'yops', 'fruit cups', 'juice boxes', 'raspberries', 'blueberries', 'bananas', 'ceasar dressing', 'ketchup', 'potatoes', 'chips', 'bread']
[-0.03423771, -0.014336314, -0.113800794, 0.21878365, 0.05435562, -0.03770299, -0.041206516, 0.09331084, 0.07955845, 0.0628188, 0.2162662, 0.027297238, 0.09292492, 0.016160317, -0.059827145, -0.027611896, -0.11157284, -0.052225765, 0.0023566375, -0.010504705, 0.084122196, -0.05061937, -0.07281609, -0.09943468, -0.01626545, 0.016980924, 0.096194275, 0.14178456, 0.05946036, 0.10855298, -0.14740536, 0.1519192, 0.046487592, 0.19555926, 0.09977648, -0.1381227, -0.095433794, -0.0018573268, -0.

[0.054400526, -0.03770622, -0.040908147, 0.093054116, 0.07960558, 0.0628729, 0.21611355, 0.027098363, 0.09322798, 0.016132005, -0.059853017, -0.027568284, -0.111667946, -0.0523743, -0.010798492, 1.0]
['g2', 'bread', 'bagels', 'cream cheese', 'turkey bacon', 'eggs', 'no yolk egg noodles', 'ground chicken', 'fruit', 'yogurt drinks', 'chip dip', 'detergent', 'paper towel', 'lunch mate', 'crackers', 'made good']
[0.0643779, -0.052346755, -0.111661434, -0.027750367, -0.05984619, 0.016197931, 0.09291723, 0.027058955, 0.21619494, 0.0628508, 0.07963486, 0.09310115, -0.041253414, -0.03769949, 0.054339662, 0.21884768, -0.11398096, -0.014373494, 0.0024360362, 0.08404717, -0.010832924, 1.0]
['gatorade', 'perogie', 'skyr', 'strawberrie', 'strawberries', 'glasses', 'eyeglass', 'eye glass', 'vision', 'apples', 'corn flakes', 'milk 1', 'mandarin oranges', 'hot dog bun', 'english muffin', 'no name muffin', 'extra crisp', 'english', "president's choice english", 'risotto', 'doritos', 'ms vickie']
[-0.05

[0.09291723, 0.016134718, -0.059876293, -0.027750367, -0.111670576, -0.052346755, -0.010839173, 1.0]
['sprite', 'spriz up', 'cat litter', 'chicken wings', 'sweet chili chicken wings', 'oragel', 'cabbage', 'saurkraut']
[-0.059876293, -0.027750367, -0.111670576, -0.052346755, -0.010839173, 1.0]
['zero sprite', 'waffles & pancakes', 'eggo', '7up', 'drumstick', 'nestea zero']
[0.0628508, 0.2161742, 0.027057461, 0.09291723, 0.016134718, -0.059876293, -0.027750367, -0.111670576, -0.052346755, -0.010839172, 1.0]
['sprite', 'salad dressing', 'bread', 'oatmeal', 'nescafe', 'juice', 'golden temple', 'jam', 'cheese', 'chips & snacks', 'chicolate']
[-0.050813857, -0.05230651, -0.1114851, -0.027741695, -0.0596291, 0.016301202, 0.092931725, 0.027287314, 0.2161828, 0.06284849, -0.010739859, 0.0796357, -0.041259706, -0.037470493, 0.054340623, 0.21899799, -0.1139749, -0.0141835315, 0.0024410586, 0.08407959, 0.06438949, 0.09305835, 1.0]
['michelob ultra', 'gain', 'gain feuilles assouplissantes', 'hot do

[-0.059876293, -0.027750367, -0.111670576, -0.052346755, -0.010839173, 1.0]
['catfood', 'squash', 'parsnips', 'roast beef', 'condensed cream of mushroom soup', 'carrots']
[-0.017186686, 0.002401825, -0.014198015, -0.11384527, 0.21881407, 0.054245796, -0.037678998, -0.041096568, 0.093378454, 0.0631264, -0.010695398, 0.21615507, 0.02743936, 0.09315028, 0.016221955, -0.059536956, -0.027661206, -0.1114874, -0.052100997, 0.084109515, 0.07975927, 0.06441605, 0.05964397, -0.00086861826, -0.03429901, -0.07298183, -0.09935205, -0.016186412, 0.017278137, 0.09634123, 0.14184307, 0.10877167, -0.13239355, -0.14761172, 0.1518584, 0.0465226, 0.1953567, 0.099444635, -0.13827698, -0.09521142, -0.0019437924, -0.050462894, 1.0]
['cat litter', 'apples', 'garlic', 'tylenol', 'advil', 'ibuprofen extra strength 400mg', 'ibuprofen regular strength 200mg', 'neocitrin', 'tylenol sinus', 'popcorn', 'coke', 'chips', 'bits and bites', 'tomato', 'blueberries', 'bakery', 'popcorn sesasoning', 'salt and vinegar chips

[0.09291723, 0.016134718, -0.059876293, -0.027750367, -0.111670576, -0.052346755, -0.010839173, 1.0]
['fire logs', 'prunes', 'eggs', 'salsa', 'quinoa', 'souvlaki marinade', 'frozen raspberries', 'frozen blueberries']
[1.0]
['jus']
[1.0]
['jus']
[0.09308152, 0.07963486, 0.06276734, 0.21617144, 0.027044848, 0.09291723, 0.016134718, -0.059876285, -0.027759008, -0.111670576, -0.052332718, -0.010830897, 1.0]
['oasis', 'bud light', 'pommes naturellement imparfaites', 'pommes', 'riz dainty', 'dainty', 'riz', 'knorr', "p'tit quebec", 'mini concombres', 'tomates raisins', 'olymel', 'bologne']
[0.01704056, 0.21886875, 0.05441068, -0.037645828, -0.041214436, 0.09315131, 0.07977135, 0.06291239, 0.21615075, 0.027090693, 0.092952356, 0.016137177, -0.059848916, -0.027598845, -0.11167051, -0.0523368, -0.11362614, -0.010552619, -0.01429983, 0.08412342, 0.09622608, 0.14178611, 0.059568096, 0.108455345, -0.14746918, 0.15164441, 0.046251774, 0.19539359, 0.0995426, -0.1382363, -0.095387086, -0.0019291438, 

[-0.010839173, 1.0]
['bubly', 'pie crust']
[0.09615226, 0.2189724, 0.05463264, -0.037551884, -0.041055147, 0.09318464, 0.07975213, 0.06283731, 0.21616936, 0.026937563, 0.09304516, 0.016156537, -0.05986567, -0.027561095, -0.11172456, -0.05239581, -0.11395695, -0.014078164, 0.0024001938, 0.19558436, 0.14181691, 0.059358105, 0.10847554, -0.14745164, 0.15171607, 0.046283185, 0.099452324, 0.08397186, -0.13809538, -0.095436834, -0.0019929397, -0.13212888, -0.050637946, 0.064398944, -0.010566773, 1.0]
['orange buble', 'lime buble', 'lime bubly', 'parmesan cheese', 'graham cracker crumbs', 'custard powder', 'pumpkin', 'cheese', 'lactose plain yogurt', 'blueberries', 'raspberries', 'banana', 'royal gala', 'apples', 'mango', 'cucumbers', 'lettuce', 'spinach', 'ham', 'pasta sauce', 'hunts', 'tomato soup', 'tomato paste', "dove men's gel", 'dove men', 'dove gel', 'hair gel', 'lindt', 'oranges', 'tropicana', 'tropicana essential', 'chicken', 'omega 3 eggs', '1 mil', '1 milk', 'homo milk']
[0.108595

['zevia']
[0.027057461, 0.09291723, 0.016134718, -0.059876293, -0.027750367, -0.111670576, -0.052346755, -0.010839173, 1.0]
['zevia', 'berries', 'flour', 'kosher salt', 'peppers', 'square tissue', 'tissue', 'english muffins', 'deli meat']
[0.027057461, 0.09291723, 0.016134718, -0.059876293, -0.027750367, -0.111670576, -0.052346755, -0.010839173, 1.0]
['lacroix', 'mint tea', 'pumpkin spice', 'crate 61', 'morning rounds', 'bagel mini', 'mini', 'veggie straw', 'chocolate pudding']
[-0.111670576, -0.052346755, -0.010839173, 1.0]
['zevia', 'water', 'coconut water', 'infuser tea']
[0.016134718, -0.059876293, -0.027750367, -0.111670576, -0.052346755, -0.010839173, 1.0]
['zevia', "nature's bakery", 'butter', "annie's", 'little big', 'dandies', 'neal brothers foods']
[0.21883667, -0.052346755, -0.11167108, -0.027750367, -0.059876293, 0.016134718, 0.09291723, -0.010839173, 0.027066018, 0.0628508, 0.07970877, 0.09314463, -0.041253414, -0.037719578, 0.054333676, 0.21614, 1.0]
['zevia', 'organic', 

[-0.010839173, 1.0]
['spiced cherry', 'agave']
[1.0]
['arizona']
[0.016134718, -0.059876293, -0.027750367, -0.111670576, -0.052346755, -0.010839173, 1.0]
['airzona', 'water', '12 pack', 'monster engry', 'red bull', 'peace tea', 'pringles']
[1.0]
['coke zero']
[-0.052346755, -0.010839173, 1.0]
['ginger ale', 'schweppes', 'ice cream']
[0.0628508, 0.2161742, 0.027057461, 0.09291723, 0.016134718, -0.059876293, -0.027750367, -0.111670576, -0.052346755, -0.010839172, 1.0]
['ginger ale', 'tide free', 'chipets', 'chocolate chips', 'celery', 'popables', 'sour cream', 'aquafina', 'listerine', 'gauze pads, large', 'gauze pads, medium']
[-0.014417639, -0.05225071, -0.11148954, -0.027773924, -0.059638318, 0.01631432, 0.09289809, 0.027302168, -0.010728652, 0.21614625, 0.079655915, 0.09308225, -0.041265883, -0.03765665, 0.054369073, 0.21890432, -0.11396976, 0.063003704, 1.0]
['ginger ale', 'root beer', 'iams', 'whiskas', 'chicken cat food', 'cream', 'white bread', 'sliced cheese', 'chips', 'sour crea

[-0.059876293, -0.027750367, -0.111670576, -0.052346755, -0.010839173, 1.0]
['arizona watermelon', 'mini chocolate bars', '90 chocolate bars', '90 count chocolate bars', 'halloween chocolate 90 count', 'halloween']
[-0.07290792, -0.014446174, -0.11391242, 0.21888506, 0.05431868, -0.03768794, -0.041233804, 0.093067676, 0.07963454, 0.0628508, 0.21617144, 0.027128711, 0.09291781, 0.016101835, -0.059876293, -0.027719192, -0.111670576, -0.052297793, 0.0024212603, 0.08408297, 0.06439768, -0.14740081, -0.099382274, -0.016293412, 0.017037649, 0.09623282, 0.14176105, 0.05943599, 0.1084637, 0.15158813, -0.050695986, 0.046365805, 0.19548403, 0.09942282, -0.13833946, -0.09518429, -0.0019454656, -0.13270105, -0.01073257, 1.0]
['cat food', 'bread', 'cat litter', 'milk', 'rice pudding cups', 'laundry soap', 'laundry detergent', 'alymer', 'butter', 'spinach', 'vitamin k2', 'vitamin d3', 'vitamin potaasium', 'vitamin potassium', 'potassium 50mg', 'milk bones', 'granola bars', 'tim horton granola bars',

[-0.010839173, 1.0]
['peach tea', "president's choice chocolate almonds"]
[-0.027750367, -0.111670576, -0.052346755, -0.010839173, 1.0]
['diet tonic', 'advil kids', 'advil', 'green grapes', 'skittles']
[0.027057461, 0.09291723, 0.016134718, -0.059876293, -0.027750367, -0.111670576, -0.052346755, -0.010839173, 1.0]
['president choice tonic water', 'toilet paper', 'canned tomatoes', 'basil', 'red onions', 'zuchini', 'bacel', 'becel', 'crackers']
[-0.052346755, -0.010839173, 1.0]
['ginger beer', 'lemondade', 'ice']
[-0.010839173, 1.0]
['lacroix', 'cabbage']
[-0.0019470645, -0.052346755, -0.11165592, -0.027750367, -0.059873726, 0.016148163, 0.09293643, 0.027022365, 0.21617144, 0.0628508, 0.079584315, -0.010857365, 0.0930981, -0.037738685, 0.054333676, 0.21883953, -0.11398096, -0.014455645, 0.0024152058, 0.08401157, 0.06441657, -0.05081332, -0.13275692, -0.041222636, 1.0]
['cream soda', 'coca cola president', 'sparkling water', 'baby bars', 'baby food', 'baby oat bar', 'toddler oat bar', 's

[-0.059876293, -0.027750367, -0.111670576, -0.052346755, -0.010839173, 1.0]
['remedy kombucha', 'turkey burger', 'turkey peperoni', 'peperoni', 'johnsonville sausage', 'gluten free pretzel crisps']
[-0.0019470645, -0.052346755, -0.11165592, -0.027750367, -0.059873726, 0.016148163, 0.09293643, 0.027022365, 0.21617144, 0.0628508, 0.079584315, -0.010857365, 0.0930981, -0.037738685, 0.054333676, 0.21883953, -0.11398096, -0.014455645, 0.0024152058, 0.08401157, 0.06441657, -0.05081332, -0.13275692, -0.041222636, 1.0]
['kombucha', 'jasmine rice organic', 'toilet paper', '2% chocolate milk', '1% chocolate milk', '2% milk', 'organic tea', 'organic earl grey tea', 'earl grey tea', 'vitamin c 1000mg', 'organic vitamin c 1000mg', 'sisu vitamin c', 'sisu vitamins', 'creamy peanut butter', 'bread', 'ace baguette bread', 'potatoes', 'cheese fish crackers', 'crackers', 'melon', 'cantaloupe', 'whistler bottled water', 'whistler glacier spring water', 'whistler glacial bottled water', 'bottled water']
[

[1.0]
['gatorade']
[-0.052346755, -0.010839173, 1.0]
['pepsi zero', 'hygrade', 'tofu']
[1.0]
['aha']
[0.0628508, 0.2161742, 0.027057461, 0.09291723, 0.016134718, -0.059876293, -0.027750367, -0.111670576, -0.052346755, -0.010839172, 1.0]
['aha', 'doritos', 'ham', 'milk', 'bread', 'habitant', 'avocado', 'raisins', 'chicken pad thai', 'rice noodles', 'bean sprouts']
[0.054400526, -0.03770622, -0.040908147, 0.093054116, 0.07960558, 0.0628729, 0.21611355, 0.027098363, 0.09322798, 0.016132005, -0.059853017, -0.027568284, -0.111667946, -0.0523743, -0.010798492, 1.0]
['aha', 'bubly', 'silver hills bread', 'gluten free bagel', 'ground beef', 'free from', 'chipotle spice', 'rice crackers', 'eggs', 'avocado', 'peppers', 'baby carrots', 'cucumber', 'soup', 'happy planet soup', 'kefir']
[1.0]
['bottled water']
[-0.010839173, 1.0]
['water', 'tassimo coffee']
[0.15161024, -0.052346755, -0.11163338, -0.02774064, -0.059876293, 0.016140083, 0.0929023, 0.027065666, 0.21617144, 0.06285644, 0.07963486, 0.0

[1.0]
['spring roll wrappers']
[1.0]
['boneless rib end pork chops, club pack']
[1.0]
['san']
[1.0]
['ricotta cheese']
[1.0]
['lunch box']
[1.0]
['coffee pods']
[1.0]
['tea']
[1.0]
['harvest snaps']
[1.0]
['jamieson soft chews calcium']
[1.0]
['eau']
[1.0]
['pantene']
[1.0]
['newtons']
[1.0]
['potatoes']
[1.0]
['green weighted']
[1.0]
['pasta']
[1.0]
['disposable cups']
[1.0]
['halloumi cheese']
[1.0]
['avocat']
[1.0]
['coke zero']
[1.0]
['jolly jumper']
[1.0]
['greenfield natural meat co.']
[1.0]
['triscuit']
[1.0]
['juice boxes']
[1.0]
['goldfish']
[1.0]
['diced tomatoes no salt']
[1.0]
['bread']
[1.0]
['westlab']
[1.0]
['marine collagen']
[1.0]
['pencil crayons']
[1.0]
['bobo']
[1.0]
['kraft']
[1.0]
['deoderant']
[1.0]
['hemp hearts']
[1.0]
['bananas']
[1.0]
['plum']
[1.0]
['schweppes']
[1.0]
['nature clean soap']
[1.0]
['field roast']
[1.0]
['sugar']
[1.0]
['pampers wipes']
[1.0]
['cranberry juice']
[1.0]
['dryer sheets']
[1.0]
['jamieson']
[1.0]
['yogurt']
[1.0]
['coverlet']
[1.0]

KeyboardInterrupt: 

In [7]:
# 4. Consecutive queries compared and similarity score produced for *** all Sessions in dataset ***
from gensim.models import Word2Vec

# Iterate the first column (the per-session query lists) directly instead of
# indexing json_df.values[i] on every pass, which can materialise the whole
# frame as an array each time it is accessed.
for vocab in json_df.iloc[:, 0]:    # creating vocab
    if len(vocab) < 2:
        # No consecutive pair exists, so the result is an empty list either
        # way; skip training a model that would never be queried.
        score = []
    else:
        # A fresh model per session: its vocabulary is that session's queries only.
        model = Word2Vec([vocab], min_count=1)   # training model
        # Pair each query with the one that follows it (n queries -> n - 1 scores).
        score = [
            model.wv.similarity(current_query, next_query)  # analyzing model
            for current_query, next_query in zip(vocab, vocab[1:])
        ]

    print(score)
    print(vocab)

[]
['water bottle']
[-0.013514948, -0.023671674, -0.010839173]
['eau de source', 'nutella', 'huile d’olive', 'chou fleur']
[0.13887985, -0.04461707, -0.013514948, -0.023671674, -0.010839173]
['coffee crisp', 'coffee', 'kinder', 'beuno', 'salad', 'bacon']
[0.0038111245, -0.0009846962, 0.0003889287, 0.11937966, -0.14902948, 0.06548712, -0.09824341, 0.027215723, -0.0755177, -0.030191496, 0.10945788, 0.014984888, 0.044884916, -0.14444871, 0.0053134076, 0.01925817, 0.13885167, -0.044224247, -0.013236468, -0.0074412865, 0.067342535, 0.107436225, 0.029060697, -0.09362363, -0.05141988, 0.110187575, 0.002829252, 0.00734705, -0.23736587, 0.025477832, -0.032352634, 0.050734136, 0.0011917986, 0.063331686, -0.14725435, -0.102968745, -0.008044168, -0.05984464, 0.064070195, -0.06714499, 0.11907527, -0.06721938, -0.010680665]
['water', 'canned drinks', 'soda', 'hot dog buns', 'hot dogs', 'milk', 'yogurt', 'cheddar cheese', 'nacho cheese', 'mozzarella', 'margarine', 'cucumbers', 'broccoli, cabbage & ca

[0.019152299, 0.13887985, -0.04461707, -0.013514948, -0.023671674, -0.010839173]
['water', 'egg whites', 'original smokies', 'gillette', 'almonds', 'halibut', 'dishwashing']
[0.14862445, -0.013544901, -0.044613734, 0.13889168, 0.019688107, 0.0048411507, -0.14455226, 0.04490443, -0.09576736, -0.13429946, 0.12813479, -0.07639, 0.02703633, -0.098247066, 0.06543887, -0.14900543, 0.1193726, 0.00047561486, 0.09731776, 0.07963361]
['water', 'chicken', 'cheddar', 'gluten free bagels', 'gluten free pizza', 'tofu', 'bok choy', 'rice', 'broccoli', 'peppers', 'vegetables', 'stir fry', 'julienne carrots', 'egg noodle', 'chicken thigh', 'andalou', 'frozen peaches', 'frozen strawberries', 'milk', 'orang', 'orange juice']
[0.039555587, 0.063418895, -0.067303605, 0.118774265, 0.0028439173, 0.073826455, 0.10750773, 0.067360856, -0.075961284, -0.00091365026, 0.00064578117, 0.11979719, -0.14856936, 0.066122286, -0.09790076, 0.027137853, -0.0762895, -0.029222138, 0.10953073, 0.015211765, 0.044689186, -0.14

[-0.04461707, -0.013514948, -0.023671674, -0.010839173]
['water', 'mortadella', 'provolone cheese', 'crackers', 'milk']
[-0.010839173]
['bottled water', 'bleach']
[0.014987758, 0.044699874, -0.14454566, 0.0048425104, 0.019152299, 0.13887985, -0.04461707, -0.013514948, -0.023671672, -0.010839172]
['water', 'coke', 'grapes', 'gogo', 'gogo apple sauce', 'cookies', 'carrots', 'pasta', 'pasta sauce', 'meat ball', 'meatballs']
[-0.22729617, -0.013550967, -0.044588435, 0.13887683, 0.01965471, 0.0047989488, -0.14453398, 0.045199446, 0.015039983, -0.13429186, 0.008322228, -0.16288689, 0.026978975, -0.098226994, 0.06558993, -0.1490341, 0.1193161, 0.0005026174, -0.00091365026, -0.075938635, 0.1489742, 0.09305835]
['water', 'caffeine free diet coke', 'diet sprite', 'sprite', 'diet coke', 'cplus', 'c plus', 'c plus orange soda', 'pop', 'pre popped', 'c plus pop', 'orange soda', 'diet pop', 'diet soda', 'soda', 'sliced ham', 'country harvest bread', 'dempsters bread', 'no name peanut butter', 'lysol

[-0.04461707, -0.013514948, -0.023671674, -0.010839173]
['water', 'lunchable', 'koba', 'meat', 'hamburger']
[-0.14454566, 0.0048425104, 0.019152299, 0.13887985, -0.04461707, -0.013514948, -0.023671674, -0.010839173]
['yop', 'craquelins et biscuits', 'craquelins', 'jus', 'patte', 'poulet', 'poisson', 'eponge', 'mio']
[-0.013514948, -0.023671674, -0.010839173]
['yop', 'épice bagel', 'bagel', 'fromage a la crème']
[-0.04461707, -0.013514948, -0.023671674, -0.010839173]
['yoplait', 'wraps', 'peas, beans & corn', 'canned corn', 'eggs']
[-0.0074324184, -0.01348428, -0.044633664, 0.13887261, 0.01949299, 0.0048850477, -0.14455229, 0.04486737, 0.01504347, 0.10941853, -0.030302342, -0.07639, 0.026978148, -0.09823241, 0.0654905, -0.14902768, 0.11934508, 0.0004743798, -0.00091365026, -0.01855421, -0.010832924]
['yoplait', 'nescafe rich', 'peppers', 'green grapes', 'grapes', 'bleberries', 'blueberries', 'cream cheese', 'cream cheese strawberry', 'flour wraps', 'butter', 'gatorade', 'chicken strips'

[0.053464323, -0.013323129, -0.04460168, 0.13887827, 0.019320933, 0.004838962, -0.14455259, 0.044690877, 0.014989821, 0.109607644, -0.030355357, -0.0036990698, 0.13752559, -0.14475119, 0.06531604, -0.14913276, 0.11945813, 0.0006633735, -0.0008641458, -0.07581164, 0.0673568, 0.10763977, 0.07358911, 0.0027833823, 0.03691316, -0.037719578]
['tzao', 'tazo', 'tazo mint', 'stash mint', 'mint tea', 'chapman cone', 'fruite', 'bread', 'allens', 'allens vinegar', 'clorox', 'cheese string', 'celery', 'brocolli', 'blueberries', 'asian noodle', 'haiku', 'sliced water chestnuts', 'young baby corn in brine', 'carrott', 'cream cheese', 'crunchie 4pk', 'icing sugar', 'gatorade', '1 cup', '1 cup storage', 'ziploc']
[0.053464323, -0.013323129, -0.04460168, 0.13887827, 0.019320933, 0.004838962, -0.14455259, 0.044690877, 0.014989821, 0.109607644, -0.030355357, -0.0036990698, 0.13752559, -0.14475119, 0.06531604, -0.14913276, 0.11945813, 0.0006633735, -0.0008641458, -0.07581164, 0.0673568, 0.10763977, 0.0735

['monster', 'salmi', 'salami', 'blueberries', 'raspberries', 'bananas', 'yogurt', 'milk', 'orange juice', 'dare', 'cadbury', 'old dutch', 'sour cream', 'raisin bran', 'pretzels', 'bread', 'old mill white bread', 'spinach', 'vitamin d', 'salted roasted sun flower seeds', 'sun flower seeds', 'sunflower seeds', 'roasted sunflower seeds', 'eggs']
[-0.023671674, -0.010839173]
['monster', 'oven roast', 'nature valley']
[-0.023671674, -0.010839173]
['monster energy', 'sandwiches', 'buns']
[-0.015400868, -0.14897878, 0.06562199, -0.09796166, 0.027385948, -0.07628029, -0.030056387, -0.074091285, -0.09573394, 0.033640575, -0.14445952, 0.0052620815, 0.019610245, 0.13868572, -0.04465711, -0.013392786, 0.15290418, -0.04984915, 0.08714183, -0.17791246, 0.036355022, 0.025996305, -0.032242294, 0.09103775, -0.0074439603, 0.14895177, -0.14878938, 0.07542985, 0.06689513, 0.051426604, 0.06357683, -0.067188375, 0.11899664, 0.003266518, 0.07453106, 0.10769199, 0.046336144, 0.084337085]
['coffee no name', 'c

[0.044735733, -0.1445247, 0.0048425104, 0.019152299, 0.13887985, -0.04461707, -0.013514948, -0.023671674, -0.010839181]
['sparkling water', 'chicken breast', 'chicken thighs', 'maple syrup', 'lemon', 'potstickers', 'vanilla extract', 'thai basil', 'basil', 'garlic']
[0.109524556, 0.015015273, 0.044671305, -0.14454001, 0.004852889, 0.019152299, 0.13887985, -0.044132903, -0.013432669, -0.02365179, -0.01088622]
['cream cheese', 'garbage bags', 'sun chips', 'chips', 'coconut milk', 'cottage', 'cheese', 'milk', 'eggs', 'free run eggs', 'humus', 'hummus']
[]
['beer']
[-0.010839173]
['nestle water', 'nestle water delivery']
[-0.09825402, 0.027019612, -0.07638238, -0.030302342, 0.10941853, 0.015025198, 0.044689193, -0.14448667, 0.0048370697, 0.019140474, 0.13878547, -0.04463267, -0.013514948, -0.023671674, -0.010798492]
['splash', 'pepperoni', 'lasagne', 'lasagna', 'raisin', 'grape', 'blueberries', 'cesar', 'goodnites', 'homogenized milk 3.25%', 'fruits', 'chicken', 'curds', 'st albert', 'muff

[0.109524556, 0.015015273, 0.044671305, -0.14454001, 0.004852889, 0.019152299, 0.13887985, -0.044132903, -0.013432669, -0.02365179, -0.01088622]
['kerrig coffee', 'cheese', 'jalapenos', 'salsa', 'romaine', 'bleach', 'tassimo', 'ziggy’s', 'fuji apples', 'pam', 'peanut butter', 'lemons']
[-0.013514948, -0.023671674, -0.010839173]
['coffee', 'lays', 'fancy feast', 'apples']
[-0.023671674, -0.010839173]
['dog food', 'frozen casserole', 'frozen meals']
[-0.030296177, 0.10960891, 0.014882498, 0.044643596, -0.14455914, 0.0048425104, 0.019152312, 0.13927871, -0.044562627, -0.013485333, -0.02367525, -0.010830897]
['beneful', 'laundry detergent', 'water', 's bars', 'z bars', 'cliff bars', 'cliff kids', 'becel', 'nacho chips', 'creamed corn', 'lemon', 'potatoes', 'tortilla & wraps']
[0.014987758, 0.044699874, -0.14454566, 0.0048425104, 0.019152299, 0.13887985, -0.04461707, -0.013514948, -0.023671672, -0.010839172]
['larabar', 'silk', 'organic celery', 'flatbread pizza', 'danone', 'olives', 'zucch

[-0.07351302, 0.119554125, -0.14905609, 0.065360636, -0.098216854, 0.027246438, -0.076000795, -0.029924713, 0.10940445, 0.015025198, 0.044637177, -0.144444, 0.005325264, 0.018977545, 0.13870133, -0.044749867, -0.013525081, -0.034594998, -0.0005103005, -0.07543612, 0.08482594, 0.09464057, -0.23670685, 0.02539488, -0.032325048, 0.09093178, -0.007259731, 0.1489539, -0.060706936, -0.022379527, 0.13681912, -0.059715547, 0.06355268, -0.06706006, 0.11956125, 0.0028026681, 0.073631614, 0.12803832, -0.01073257]
['bubly', 'batteries', 'batteries c', 'c battery', 'aa battery', 'almonds', 'paper towel', 'buns', 'garlic bread', 'crouton', 'green beans', 'beans', 'flax seed', 'greenhouse juice', 'greenhouse', 'genius', 'the good', 'apple juice box', 'spinach', 'peaches', 'garlic', 'fresh garlic', 'fresh garlic bulb', 'kitchen garbage bag', 'chocolate chips', 'aluminum foil', 'parmesan', 'terriyaki', 'terriyaki sauce', 'coconut aminos', 'cream cheese', 'french bean', 'pear', 'grapes', 'feta', 'pita',

[0.019152299, 0.13887985, -0.04461707, -0.013514948, -0.023671674, -0.010839173]
['gatorade', 'chocolate bars', 'nestle smarties candy coated milk chocolate', 'chocolate, candy & gum', 'candy & chocolate', 'chips', 'lunchmate']
[0.044735733, -0.1445247, 0.0048425104, 0.019152299, 0.13887985, -0.04461707, -0.013514948, -0.023671674, -0.010839181]
['gatorade', 'pizza', 'triscuit', 'dips', 'miracle whip', 'pudding', 'summer sausage', 'ham', 'havarti', 'baby food']
[0.02701495, -0.07638936, -0.030299965, 0.10947381, 0.01500214, 0.044689193, -0.14454566, 0.00507989, 0.01911949, 0.13887727, -0.04461958, -0.013513082, -0.023671674, -0.010839173]
['gatorade', 'pop', 'pepsi', 'sprite', 'cream soda', 'orange', 'orange pop', 'orange soda', 'frozen chicken', 'tums', 'bitter', 'butter', 'becel', 'hot dog', 'chicken balogne']
[-0.14454566, 0.0048425104, 0.019152299, 0.13887985, -0.04461707, -0.013514948, -0.023671674, -0.010839173]
['gatorade', 'mini wheats', 'harvest crunch original granola', 'tana

[]
['rice']
[-0.15274855, -0.1489943, 0.06556135, -0.09826169, 0.026968414, -0.07612399, -0.029418845, -0.07423447, -0.09579184, 0.033638023, -0.14455742, 0.0048909653, 0.019325765, 0.1390259, -0.04461707, -0.013519355, 0.15371346, 0.0011749467, -0.000735193, -0.075718924, -0.024480535, -0.03249931, 0.091061346, -0.0072204713, 0.14911218, -0.1488433, -0.1026793, -0.007835369, -0.060168985, 0.063239366, -0.06677694, 0.119570136, 0.0031684134, 0.07400925, 0.1078884, 0.09936342, 0.06293594]
['diced tomatoes', '2 go coffee cups', 'coffee cups', 'throw away cups', '2 go cups', 'paper cups with lids', 'paper cups', 'no name cups', 'disposable cups', 'disposable coffee cups', 'disposable cups with lids', 'coffee thermos', 'decaf coffee', 'nabob coffee', 'nabob decaf coffee', 'dried mango', 'dried fruit', 'crackers', 'parchment paper', 'no name paper', 'paper', 'lotion', 'tortillas', 'sour cream', 'chips', 'art book', 'sketch book', 'frozen fruit', 'nuts', 'avocado', 'bananas', 'diapers', 'fin

[0.02701495, -0.07638936, -0.030299965, 0.10947381, 0.01500214, 0.044689193, -0.14454566, 0.00507989, 0.01911949, 0.13887727, -0.04461958, -0.013513082, -0.023671674, -0.010839173]
['gatorade zero', 'horseradish', 'toilet pap', 'toilet paper', 'snacks', 'chocolate', 'hickory sticks', 'mujffins', 'yogurt', 'potatoes', 'spinach', 'celery', 'peppers', 'cauliflower', 'noodles']
[-0.03457117, -0.013463832, -0.044617373, 0.13887985, 0.01914507, 0.0050102943, -0.14388406, 0.044725604, 0.015025198, 0.10941853, -0.030266779, -0.07633359, 0.1373406, 0.16068412, 0.049073577, -0.14896625, 0.11936081, 0.00048351462, -0.00089882594, -0.07588928, 0.06730738, 0.10756424, 0.073713616, 0.0027895262, 0.11872665, -0.06716446, 0.0190394, 0.054321274]
['gatorade', 'dijon mustard', 'french’s dijon mustard', 'peppermint and mint extract', 'peppermint extract', 'mint extract', 'ruffles', 'baguette harvest multigrain', 'baguette multigrain', 'baguette', 'fatso peanut butter crunchy salted caramel', 'trutaste 1%

[]
['almond milk']
[-0.010839173]
['unsweetened almond milk', 'eggs']
[-0.084851645, -0.013486388, -0.04458769, 0.13884038, 0.01922207, 0.0048807813, -0.14422294, 0.044487573, 0.015025198, 0.10941213, 0.008777883, -0.0034773592, 0.09232711, -0.09825735, 0.06548556, -0.14902948, 0.11940686, 0.00057956774, -0.0005874871, -0.07556473, 0.06775203, 0.10791105, 0.14737074, -0.041222636]
['almond milk ,unsweetened', 'peanut butter', 'oatmeal', 'gluten free oatmeal', 'oats', 'oranges', 'oranges blood', 'oranges red', 'grapes', 'kiwi', 'peppers', 'celery', 'tomatoes', 'broccoli, cabbage & cauliflower', 'brussel sprouts', 'eggs', 'chicken', 'tofu', 'nutritional yeast', 'tuna', 'pickles', 'salsa', 'hummus', 'asparagus', 'spinach']
[-0.013514948, -0.023671674, -0.010839173]
['almond milk', 'milk', 'onions', 'rice']
[0.13887985, -0.04461707, -0.013514948, -0.023671674, -0.010839173]
['almond milk', 'dates', 'bananas', 'apples', 'frozen strawberries', 'frozen raspberries']
[-0.11228521, -0.013413804

[0.0038111245, -0.0009846962, 0.0003889287, 0.11937966, -0.14902948, 0.06548712, -0.09824341, 0.027215723, -0.0755177, -0.030191496, 0.10945788, 0.014984888, 0.044884916, -0.14444871, 0.0053134076, 0.01925817, 0.13885167, -0.044224247, -0.013236468, -0.0074412865, 0.067342535, 0.107436225, 0.029060697, -0.09362363, -0.05141988, 0.110187575, 0.002829252, 0.00734705, -0.23736587, 0.025477832, -0.032352634, 0.050734136, 0.0011917986, 0.063331686, -0.14725435, -0.102968745, -0.008044168, -0.05984464, 0.064070195, -0.06714499, 0.11907527, -0.06721938, -0.010680665]
['bubly', 'mini tortilla', 'puffins', 'sweet potato fries', 'yogurt drink', 'hydrafruit', 'greenfield', 'papet towel', 'sundried', 'eggs', 'engllish muffin', 'english muffin', 'tuna', 'earth balance', 'vegan becel', 'fig bar', 'bounce', 'tide', 'tide honey', 'tide lav', 'packin tape', 'packing tape', 'scoth', 'scotch', 'almond milk', 'garlic', 'mixed green', 'shredded cheese', 'salsa', 'lemon', 'slaw', 'cat food', 'cream', 'cerea

[]
['assaisonnement viande']
[]
['halloween candy']
[]
['lemons']
[]
['birthday cake']
[]
['cupcake']
[]
['chocolate']
[]
['gay lea spreadables']
[]
['beef rib']
[]
['cake']
[]
['tournedoes']
[]
['downy']
[]
['pumpkin']
[]
['skittles']
[]
['life at home taper mug']
[]
['milk']
[]
['pizza pocket']
[]
['grill mat']
[]
['cheese shredded']
[]
['vh stir fry sauce']
[]
['oral b']
[]
['pc chips']
[]
['kids toothpaste']
[]
['cholupa']
[]
['bookcase']
[]
['sugar']
[]
['raw pet food']
[]
['distilled water']
[]
['coconut water']
[]
['gatorade']
[]
['cottonelle flushable wipes']
[]
['shea moisture shampoo and conditioner']
[]
['jars']
[]
['lemons']
[]
['turkey']
[]
['hot dogs']
[]
['toddler']
[]
['all natural peanut butter']
[]
['cake']
[]
['chocolate chip cookies']
[]
['curel']
[]
['green peppers']
[]
['mozzarella cheese']
[]
['canned salmon']
[]
['ground beef']
[]
['dress']
[]
['crema']
[]
['lasagne noodles']
[]
['milk chocolate bar']
[]
['purex']
[]
['becel']
[]
['macaroni au fromage']
[]
['sta

[]
['dove deodorant']
[]
['coffee maker']
[]
['parchment muffin cups']
[]
['v8 cocktail']
[]
['parathas']
[]
['unsalted butter']
[]
['hardbite']
[]
['maggi sauce']
[]
['stevia']
[]
['oranges']
[]
['laxaday']
[]
['utensil holder']
[]
['hair spray']
[]
['prime rib roast']
[]
['water']
[]
['halloween candy']
[]
['nicoderm']
[]
['hangar']
[]
['canned tuna']
[]
['huile coco']
[]
['mozzarella']
[]
['tikka']
[]
['m&ms']
[]
['table consol']
[]
['vitimin d']
[]
['sidekicks']
[]
['facial tissue']
[]
['peach, plum & nectarines']
[]
['dulcolax']
[]
['cereal']
[]
['corn squares']
[]
['crispy oninos']
[]
['marbled cheese']
[]
['salmon']
[]
['honey bunches of oats almond with 25% more almonds cereal']
[]
['salsa']
[]
['beets']
[]
['broad beans']
[]
['gluten free pizza']
[]
['all bran']
[]
['ham slices']
[]
['kraft parmesan cheese']
[]
['tortillas']
[]
['bras']
[]
['macaroni']
[]
['floor pillow']
[]
['advent']
[]
['organic face wash']
[]
['ensure']
[]
['nutrasea']
[]
['floor cushion']
[]
['andalou']
[

[]
['eau gazeuse']
[]
['black garlic']
[]
['sugar']
[]
['porc']
[]
['toy trucks']
[]
['ravioli']
[]
['untortilla']
[]
['tomato']
[]
['smoked salmon']
[]
['davids killer bread']
[]
['cake']
[]
['pastrami']
[]
['nata de coco']
[]
['fancy molasses']
[]
['provigo']
[]
['brita']
[]
['fruit cups']
[]
['veggie burgers']
[]
['fluoride free toothpaste']
[]
['roasted red pepper']
[]
['flushable wipes']
[]
['gatorade']
[]
['bio oil']
[]
['keto stream']
[]
['frozen dessert']
[]
['liberte']
[]
['chicken breast']
[]
['tissue']
[]
['dates']
[]
['nutpods']
[]
['starbucks']
[]
['simply orange']
[]
['brown sugar']
[]
['rubbing alcholol']
[]
['yogurt']
[]
['casa']
[]
['advil']
[]
['herring']
[]
['dinnerware']
[]
['rice']
[]
['pc lasagna']
[]
['maple candy']
[]
['ladies pajamas']
[]
['monster energy']
[]
['cat litter']
[]
['fondue cheese']
[]
['splenda stevia sweetener']
[]
['chicken burgers']
[]
['oatmeal']
[]
['miss vickies']
[]
['ground pork']
[]
['pepper']
[]
['mozzarella']
[]
['deli turkey']
[]
['cho

KeyboardInterrupt: 

## Update dataframe with Similarity Score and Classification (similar/not similar)
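### I used a threshold of 0.5 on the absolute cosine similarity between consecutive queries: a pair is labelled 'similar' when |similarity| >= 0.5 and 'not similar' otherwise, irrespective of direction (vectors pointing the same way or diametrically opposite both count as similar; near-orthogonal vectors do not)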

In [8]:
# Build the working frame from a copy of json_df (the source frame is left
# untouched) and add the two result columns, each pre-filled with a
# single-space placeholder string.
df = json_df.copy().assign(
    **{
        "Similarity Score": " ",
        "Classification Comments": " ",
    }
)
df.head()

Unnamed: 0,query_arr,atc_item_arr,timestamp_arr,sessionid,Similarity Score,Classification Comments
0,[water bottle],[20005571_C],[1631528381],5ef0c4fd-31ae-4b60-adc3-ee0cf645c812,,
1,"[eau de source, nutella, huile d’olive, chou f...","[20005571_C, 20436105_EA;20574189_EA, 20729461...","[1631756371, 1631756422, 1631756501, 1631756536]",50534a81-5abe-4b66-be56-c767ee28acfc,,
2,"[coffee crisp, coffee, kinder, beuno, salad, b...","[20019964_C, 20050931_EA, NO PRODUCT, NO PRODU...","[1631542818, 1631542869, 1631545955, 163154598...",0f10758d-f4c6-4ec4-9a86-08e353a9c221,,
3,"[water, canned drinks, soda, hot dog buns, hot...","[20022126_C, 20375155_C, 20306687003_C, 207799...","[1631721943, 1631721964, 1631722025, 163172207...",e60e1c97-a7fc-46d3-af58-cc2f3de4b2f9,,
4,"[eau, jus de canneberge]","[20022126_C, NO PRODUCT]","[1631736981, 1631736999]",bb672d6d-fddc-4a9e-a898-6bd3f4bc8e78,,


In [9]:
# 4. Consecutive queries compared and similarity score produced for ** first 10 sessions ***
#
# For each of the first N_SESSIONS rows of df, train a throwaway Word2Vec model
# on that session's query list, score every adjacent pair of queries, label each
# pair against SIMILARITY_THRESHOLD, and store both lists back on the row.
from gensim.models import Word2Vec

N_SESSIONS = 10             # number of leading sessions to score
SIMILARITY_THRESHOLD = 0.5  # |similarity| below this is labelled 'not similar'

# Slicing the index (instead of range(10)) keeps this safe when df has fewer
# than N_SESSIONS rows.
for row_label in df.index[:N_SESSIONS]:
    vocab = df.at[row_label, "query_arr"]
    if len(vocab) < 2:
        # A single query has no neighbour to compare with; skip training.
        score = []
    else:
        model = Word2Vec([vocab], min_count=1)   # min_count=1 keeps every query in the vocabulary
        score = [
            model.wv.similarity(current_query, next_query)
            for current_query, next_query in zip(vocab, vocab[1:])
        ]
    # The sign is ignored: strongly negative similarity is also labelled 'similar'.
    sim = [
        'not similar' if abs(pair_score) < SIMILARITY_THRESHOLD else 'similar'
        for pair_score in score
    ]
    # Use .at for a single-cell write. The chained form df[col][i] = value
    # raises SettingWithCopyWarning and does not update df under pandas
    # copy-on-write. Assumes both target columns are object dtype so a cell
    # can hold a list.
    df.at[row_label, "Similarity Score"] = score
    df.at[row_label, "Classification Comments"] = sim
df.head(10)

Unnamed: 0,query_arr,atc_item_arr,timestamp_arr,sessionid,Similarity Score,Classification Comments
0,[water bottle],[20005571_C],[1631528381],5ef0c4fd-31ae-4b60-adc3-ee0cf645c812,[],[]
1,"[eau de source, nutella, huile d’olive, chou f...","[20005571_C, 20436105_EA;20574189_EA, 20729461...","[1631756371, 1631756422, 1631756501, 1631756536]",50534a81-5abe-4b66-be56-c767ee28acfc,"[-0.013514948, -0.023671674, -0.010839173]","[not similar, not similar, not similar]"
2,"[coffee crisp, coffee, kinder, beuno, salad, b...","[20019964_C, 20050931_EA, NO PRODUCT, NO PRODU...","[1631542818, 1631542869, 1631545955, 163154598...",0f10758d-f4c6-4ec4-9a86-08e353a9c221,"[0.13887985, -0.04461707, -0.013514948, -0.023...","[not similar, not similar, not similar, not si..."
3,"[water, canned drinks, soda, hot dog buns, hot...","[20022126_C, 20375155_C, 20306687003_C, 207799...","[1631721943, 1631721964, 1631722025, 163172207...",e60e1c97-a7fc-46d3-af58-cc2f3de4b2f9,"[0.0038111245, -0.0009846962, 0.0003889287, 0....","[not similar, not similar, not similar, not si..."
4,"[eau, jus de canneberge]","[20022126_C, NO PRODUCT]","[1631736981, 1631736999]",bb672d6d-fddc-4a9e-a898-6bd3f4bc8e78,[-0.010839173],[not similar]
5,"[case of water, grapefruit, grapefruit juice, ...","[20022126_C, NO PRODUCT, 21209809_EA, NO PRODU...","[1631714854, 1631714871, 1631714878, 163171494...",efb2afbf-3389-4054-9e9e-fc68495144c2,"[-0.030296177, 0.10960891, 0.014882498, 0.0446...","[not similar, not similar, not similar, not si..."
6,"[water, butter, fruit packs]","[20022126_C, 20118993_EA, 20075199001_EA]","[1631568918, 1631569330, 1631569367]",8b392055-e827-4726-b538-f9e6dd57591e,"[-0.023671674, -0.010839173]","[not similar, not similar]"
7,"[water, tide, garbage bags, pantene, polident,...","[20022126_C, 20751974_EA, 20052479_EA, 2090667...","[1631568745, 1631568779, 1631568814, 163156886...",a44568e2-0755-46a7-b294-e1f106d5389c,"[0.019152299, 0.13887985, -0.04461707, -0.0135...","[not similar, not similar, not similar, not si..."
8,"[water, orange juice, apple juice, pepperoni, ...","[20022126_C, 20883056_EA, 20501952_EA, 2018984...","[1631750390, 1631750446, 1631750474, 163175054...",9d29cbcf-67e3-4652-91c7-17680459e33e,"[-0.11228521, -0.013413804, -0.04451552, 0.138...","[not similar, not similar, not similar, not si..."
9,"[water, epsom salts, garlic, cotton balls, cot...","[20022126_C, 21256700_EA, 21004355001_EA, NO P...","[1631748364, 1631748375, 1631748387, 163174840...",e22aa53d-e1d9-4375-b89d-c69446d88eb7,"[0.109524556, 0.015015273, 0.044671305, -0.144...","[not similar, not similar, not similar, not si..."


## Model Evaluation Ideas

### One way to evaluate the word2vec model is to come up with ground truth labels (e.g. Use AWS Amazon SageMaker Ground Truth to Label Data). 
### Ground truth labels will be words that should be closest together in vector space with largest cosine similarity.
### Apply the K-Means algorithm to the features generated by the Word2Vec model, assuming we have already created our own manual labels (ground truth) for the instances/records. 
### We can calculate the accuracy of the model by comparing the clustered result tags generated with the ground truth labels.
### E.g. our K-Means clusters => Cluster 0 - Positive words; Cluster 1 - Negative words
### We can compare the tags/labels generated by the clusters with the ground truth values of the instances/sentences in the clusters and calculate the accuracy.