In [53]:
import json

import pandas as pd
import ydata_profiling

# json_normalize has lived in the top-level pandas namespace since pandas 1.0;
# the pandas.io.json path is deprecated and is kept only as a fallback for older installs.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

## Products data set 

In [54]:
# Read the gzipped JSON product dump into a DataFrame.
df_products = pd.read_json(path_or_buf="data/products.json.gz")

In [55]:
# Preview the first three raw product rows.
df_products.head(n=3)

Unnamed: 0,sku,name,type,price,upc,category,shipping,description,manufacturer,model,url,image
0,43900,Duracell - AAA Batteries (4-Pack),HardGood,5.49,41333424019,"[{'id': 'pcmcat312300050015', 'name': 'Connect...",5.49,Compatible with select electronic devices; AAA...,Duracell,MN2400B4Z,http://www.bestbuy.com/site/duracell-aaa-batte...,http://img.bbystatic.com/BestBuy_US/images/pro...
1,48530,Duracell - AA 1.5V CopperTop Batteries (4-Pack),HardGood,5.49,41333415017,"[{'id': 'pcmcat312300050015', 'name': 'Connect...",5.49,Long-lasting energy; DURALOCK Power Preserve t...,Duracell,MN1500B4Z,http://www.bestbuy.com/site/duracell-aa-1-5v-c...,http://img.bbystatic.com/BestBuy_US/images/pro...
2,127687,Duracell - AA Batteries (8-Pack),HardGood,7.49,41333825014,"[{'id': 'pcmcat312300050015', 'name': 'Connect...",5.49,Compatible with select electronic devices; AA ...,Duracell,MN1500B8Z,http://www.bestbuy.com/site/duracell-aa-batter...,http://img.bbystatic.com/BestBuy_US/images/pro...


##### Links/references: 
* https://bestbuyapis.github.io/api-documentation/#detail
* https://bestbuyapis.github.io/bby-query-builder/#/productSearch

#### Processing "category" column
* The **category** field holds a list of nested JSON objects (one per level of the category hierarchy), so we expand it into one column per level.

In [56]:
# Expand the nested "category" column into one column per level of the category hierarchy.
# Each entry of "category" is a list of {"id": ..., "name": ...} dicts; only the names are kept
# and the n-th entry of the list becomes column "Category_F<n>".
def expand_category_levels(products, column="category", prefix="Category_F"):
    """Return ``products`` joined with one extra column per category level.

    Parameters
    ----------
    products : pandas.DataFrame
        Frame whose ``column`` holds, per row, a list of dicts with a "name" key.
    column : str
        Name of the column holding the nested category lists.
    prefix : str
        Prefix of the generated columns; levels are numbered starting at 1.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``<prefix>1`` ... ``<prefix>N`` appended, where N is
        the length of the longest category list. Rows with fewer levels (or with an
        empty / missing list) hold NaN in the unused columns.
    """
    # One dict per row, keyed by the 0-based depth of the level. Building every record first
    # and constructing a single DataFrame avoids creating and concatenating one frame per row.
    level_records = [
        {
            depth: level["name"]
            # Anything that is not a list (e.g. a missing value) counts as "no categories".
            for depth, level in enumerate(levels if isinstance(levels, list) else [])
        }
        for levels in products[column]
    ]
    category_levels = (
        pd.DataFrame(level_records, index=products.index)
        .sort_index(axis=1)  # keep the levels in hierarchy order
        .rename(columns=lambda depth: f"{prefix}{depth + 1}")
    )
    return products.join(category_levels)


# The guard keeps the cell safe to re-run: once "category" has been expanded and dropped
# there is nothing left to do. Row labels stay the integer index produced by read_json.
if "category" in df_products.columns:
    df_products = expand_category_levels(df_products).drop(
        # Columns that don't provide substantial information for the analysis
        columns=["sku", "upc", "url", "image", "category"]
    )

  """


In [57]:
# Preview the first three rows after the category expansion.
df_products.head(n=3)

Unnamed: 0,name,type,price,shipping,description,manufacturer,model,Category_F1,Category_F2,Category_F3,Category_F4,Category_F5,Category_F6,Category_F7
0,Duracell - AAA Batteries (4-Pack),HardGood,5.49,5.49,Compatible with select electronic devices; AAA...,Duracell,MN2400B4Z,Connected Home & Housewares,Housewares,Household Batteries,Alkaline Batteries,,,
1,Duracell - AA 1.5V CopperTop Batteries (4-Pack),HardGood,5.49,5.49,Long-lasting energy; DURALOCK Power Preserve t...,Duracell,MN1500B4Z,Connected Home & Housewares,Housewares,Household Batteries,Alkaline Batteries,,,
2,Duracell - AA Batteries (8-Pack),HardGood,7.49,5.49,Compatible with select electronic devices; AA ...,Duracell,MN1500B8Z,Connected Home & Housewares,Housewares,Household Batteries,Alkaline Batteries,,,


In [95]:
# Inspect five randomly drawn products as a list of plain dicts (one dict per row)
df_products.sample(5).to_dict("records")

[{'name': 'Imagine Artist - PRE-OWNED - Nintendo DS',
  'type': 'Game',
  'price': 9.99,
  'shipping': 3.99,
  'description': 'Creativity is a thing of beauty',
  'manufacturer': 'Ubisoft',
  'model': None,
  'Category_F1': 'Video Games',
  'Category_F2': 'Pre-Owned Games',
  'Category_F3': nan,
  'Category_F4': nan,
  'Category_F5': nan,
  'Category_F6': nan,
  'Category_F7': nan},
 {'name': 'Manhasset - Symphony Music Stand - Black',
  'type': 'HardGood',
  'price': 54.99,
  'shipping': 0,
  'description': 'MANHASSET Symphony Music Stand: ABS plastic desk construction; heavy-gauge steel welded base; greaseless chrome inner shaft; automatic friction-tilt neck; 26" - 48" adjustable height; textured finish',
  'manufacturer': 'Manhasset',
  'model': 'ST0084',
  'Category_F1': 'Musical Instruments',
  'Category_F2': 'Recording Equipment',
  'Category_F3': 'Recording Furniture & Stands',
  'Category_F4': 'Recording & Music Stands',
  'Category_F5': nan,
  'Category_F6': nan,
  'Category_F

In [65]:
# Distinct values of the top level of the category hierarchy
pd.unique(df_products["Category_F1"])

array(['Connected Home & Housewares', 'Carfi Instore Only',
       'Car Electronics & GPS', 'In-Store Only', 'Musical Instruments',
       'Toys, Games & Drones', 'Video Games', 'Cameras & Camcorders',
       'Computers & Tablets', 'Appliances', 'Audio', 'TV & Home Theater',
       'Health, Fitness & Beauty', 'Name Brands', 'Cell Phones',
       'Movies & Music', 'Exclusives', 'Gadgets', 'Magnolia Home Theater',
       'Geek Squad', 'Outlet Store', 'XBox_Buy2Get1_20130616',
       'Accessories', 'MP Pending', 'Best Buy Gift Cards',
       'H/VG_X360/Games/B2G1_20130602', 'Other Product Categories',
       'Aluminum Tripods', 'Beverage & Wine Coolers', 'Furniture',
       'MP Exclusives', 'AV Surge Protection', 'Wearable Technology',
       'Gift Ideas', 'Poker Tables', 'Custom Parts', 'Remote Controls',
       'Geek Squad Services', 'Video Switchers',
       'Wireless Systems & Cameras', 'Bunk Beds & Mattresses',
       'Karaoke Software', 'Freestanding Gas Ranges',
       'Analog Audi

In [64]:
# Disabled: uncomment one of the lines below to list the distinct values of the
# second or third level of the category hierarchy.
# df_products["Category_F2"].unique()
# df_products["Category_F3"].unique()

In [63]:
# Peek at five random products whose top-level category is "Health, Fitness & Beauty",
# returned as one dict per record
(
    df_products
    .loc[df_products["Category_F1"].eq('Health, Fitness & Beauty')]
    .sample(5)
    .to_dict("records")
)

[{'name': 'Jawbone - UP3 Activity Tracker + Heart Rate - Silver',
  'type': 'HardGood',
  'price': 179.99,
  'shipping': 0,
  'description': 'Fits most wrist sizesMeasures activity vs. inactivity, calories burned, distance traveled, resting heart rate, pace, sleep quality, speed and steps takenHeart rate monitorWorks with iOS and Android devices',
  'manufacturer': 'Jawbone',
  'model': 'JBUP3-SILVERCROSS',
  'Category_F1': 'Health, Fitness & Beauty',
  'Category_F2': 'Activity Trackers & Pedometers',
  'Category_F3': nan,
  'Category_F4': nan,
  'Category_F5': nan,
  'Category_F6': nan,
  'Category_F7': nan},
 {'name': 'Remington - 4-in-1 Vacuum Hair Trimmer - Black/Blue',
  'type': 'HardGood',
  'price': 49.99,
  'shipping': 0,
  'description': 'Nose/ear trimmer; linear head attachment; foil shaver attachment; easy washout feature; includes travel pouch; water-resistant',
  'manufacturer': 'Remington',
  'model': 'VPG6530A',
  'Category_F1': 'Health, Fitness & Beauty',
  'Category_F2

## [Optional] Analysis of data
* The tool used for this analysis is ydata-profiling (formerly pandas-profiling)
    * Link: https://github.com/ydataai/ydata-profiling
* Some features of ydata-profiling:
    * Generates a neat html report of a dataframe
    * User can interactively explore attributes

In [5]:
# Optional: uncomment the line below to build a ydata-profiling report of df_products
# for an overview of the data. Left disabled by default.
# ydata_profiling.ProfileReport(df_products)

## Use the raw data to create text embeddings, which can be used for:

* Semantic search: Search text ranked by semantic similarity.
* Recommendation: Return items with text attributes similar to the given text.
* Classification: Return the class of items whose text attributes are similar to the given text.
* Clustering: Cluster items whose text attributes are similar to the given text.
* Outlier Detection: Return items where text attributes are least related to the given text.

Learn more about creating embeddings in the [PaLM API introduction notebook](https://github.com/GoogleCloudPlatform/generative-ai/blob/main/language/intro_palm_api.ipynb).


In [66]:
import seaborn as sns
from IPython.display import Markdown, display
from sklearn.metrics.pairwise import cosine_similarity
from vertexai.preview.language_models import (ChatModel, InputOutputTextPair,
                                              TextEmbeddingModel,
                                              TextGenerationModel)


In [67]:
# Load the pretrained text-embedding model named "textembedding-gecko@001"; it is used
# below to embed the product descriptions.
# NOTE(review): presumably needs Vertex AI project/credentials to be configured first — confirm.
embedding_model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")


In [68]:
# Draw a small sample of "Health, Fitness & Beauty" products to embed.
# random_state pins the draw so the embedding cells below work on the same five rows on
# every run; .copy() yields an independent frame, so adding the "embeddings" column later
# does not write into a slice of df_products.
df_health_fitness_beauty = (
    df_products.loc[df_products["Category_F1"] == 'Health, Fitness & Beauty']
    .sample(n=5, random_state=42)
    .copy()
)

In [72]:
# Display the sampled "Health, Fitness & Beauty" products.
df_health_fitness_beauty

Unnamed: 0,name,type,price,shipping,description,manufacturer,model,Category_F1,Category_F2,Category_F3,Category_F4,Category_F5,Category_F6,Category_F7
8497,FHI Heat - RUNWAY IQ Perfect Session Roller Sy...,HardGood,199.99,0.0,FHI HEAT RUNWAY IQ Perfect Session Roller Syst...,FHI Heat,RW5008,"Health, Fitness & Beauty",Personal Care & Beauty,Hair Care,Curlers,,,
40404,Garmin - Forerunner 15 GPS Watch (Large) - Red...,HardGood,139.99,0.0,"Fits wrists from 5.75"" to 9""Measures activity ...",Garmin,010-01241-01,"Health, Fitness & Beauty",Fitness & GPS Watches,,,,,
25112,Fitbit - Blaze Smart Fitness Watch (Large) - B...,HardGood,199.95,0.0,"Fits wrists from 6.7"" to 8.1""Measures steps ta...",Fitbit,FB502SBKL,"Health, Fitness & Beauty",Activity Trackers & Pedometers,,,,,
46393,Philips Norelco - PowerTouch Wet and Dry Elect...,HardGood,89.99,0.0,"AquaTec seal; washable, flexing DualPrecision ...",Philips Norelco,AT814/41,"Health, Fitness & Beauty",Personal Care & Beauty,Shavers & Trimmers,Trimmers,,,
10333,Vidal Sassoon - 1875W Tourmaline Compact Trave...,HardGood,22.99,5.99,VIDAL SASSOON 1875W Tourmaline Compact Travel ...,Vidal Sassoon,VS784,"Health, Fitness & Beauty",Personal Care & Beauty,Hair Care,Hair Dryers,,,


In [71]:
# Full (untruncated) description texts of the sampled products
df_health_fitness_beauty["description"].values

array(['FHI HEAT RUNWAY IQ Perfect Session Roller System Set: 8-second heating; 284-degree maximum temperature; compact unit design; includes 4 small, 4 medium and 4 large rollers and 12 roller grip clips',
       'Fits wrists from 5.75" to 9"Measures activity vs. inactivity, calories burned, distance, sleep quality, speed and steps takenGPS enabledWaterproof design',
       'Fits wrists from 6.7" to 8.1"Measures steps taken, distance traveled, and stairs climbedSmart notificationsCompatible with Android, iOS and Windows PhoneHi-res color LCD touch screenElastomer band',
       'AquaTec seal; washable, flexing DualPrecision heads; Super Lift & Cut technology; SkinGlide system; pop-up trimmer; cordless design; slip-resistant grip',
       'VIDAL SASSOON 1875W Tourmaline Compact Travel Hair Dryer: 1875W of power; Tourmaline ceramic technology; 2 heat/speed settings; cold-shot button; folding handle; dual-voltage design; airflow concentrator'],
      dtype=object)

In [73]:
# Embed every sampled description in a single model call and store each returned
# embedding's vector (its `.values`) in a new "embeddings" column, row for row.
df_health_fitness_beauty["embeddings"] = list(
    map(
        lambda embedding: embedding.values,
        embedding_model.get_embeddings(df_health_fitness_beauty["description"].values),
    )
)


In [76]:
# Show each description next to its embedding vector
df_health_fitness_beauty.loc[:, ["description", "embeddings"]]

Unnamed: 0,description,embeddings
8497,FHI HEAT RUNWAY IQ Perfect Session Roller Syst...,"[-0.022047631442546844, -0.0170704647898674, 0..."
40404,"Fits wrists from 5.75"" to 9""Measures activity ...","[0.033669061958789825, 0.007567955646663904, 0..."
25112,"Fits wrists from 6.7"" to 8.1""Measures steps ta...","[0.004225388169288635, 0.006953661795705557, 0..."
46393,"AquaTec seal; washable, flexing DualPrecision ...","[0.00800141878426075, 0.018349215388298035, 0...."
10333,VIDAL SASSOON 1875W Tourmaline Compact Travel ...,"[-0.014383732341229916, 0.013423227705061436, ..."


## Comparing similarity of text examples using cosine similarity

In [86]:
# Description texts of the sampled products, shown again for the similarity comparison
df_health_fitness_beauty["description"].values

array(['FHI HEAT RUNWAY IQ Perfect Session Roller System Set: 8-second heating; 284-degree maximum temperature; compact unit design; includes 4 small, 4 medium and 4 large rollers and 12 roller grip clips',
       'Fits wrists from 5.75" to 9"Measures activity vs. inactivity, calories burned, distance, sleep quality, speed and steps takenGPS enabledWaterproof design',
       'Fits wrists from 6.7" to 8.1"Measures steps taken, distance traveled, and stairs climbedSmart notificationsCompatible with Android, iOS and Windows PhoneHi-res color LCD touch screenElastomer band',
       'AquaTec seal; washable, flexing DualPrecision heads; Super Lift & Cut technology; SkinGlide system; pop-up trimmer; cordless design; slip-resistant grip',
       'VIDAL SASSOON 1875W Tourmaline Compact Travel Hair Dryer: 1875W of power; Tourmaline ceramic technology; 2 heat/speed settings; cold-shot button; folding handle; dual-voltage design; airflow concentrator'],
      dtype=object)

In [98]:
# Disabled: pairwise cosine similarity between the embeddings of the sampled products.
# Uncomment to compute it; it reads the "embeddings" column of df_health_fitness_beauty.
# cos_sim_array = cosine_similarity(list(df_health_fitness_beauty.embeddings.values))

In [96]:
# Disabled: wrap the similarity matrix in a DataFrame whose rows and columns are labelled
# with the product descriptions. Needs cos_sim_array from the (also disabled) cell above.
# # display as DataFrame
# df = pd.DataFrame(cos_sim_array, index=df_health_fitness_beauty.description.values, columns=df_health_fitness_beauty.description.values)
# df

In [97]:
# Disabled: annotated heatmap of the similarity matrix, with the descriptions as x tick
# labels moved to the top and rotated 90 degrees. Needs df from the (also disabled) cell above.
# ax = sns.heatmap(df, annot=True, cmap="crest")
# ax.xaxis.tick_top()
# ax.set_xticklabels(df_health_fitness_beauty.description.values, rotation=90)