In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import pyarrow.parquet as pq
import pyarrow as pa
import os

np.random.seed(42)

In [2]:
# Loading new query dataset:

queries_amazon_review_sample_good = pd.read_json('../other_datasets/queries_amazon_review_sample_good.json', lines=True)
print('Number of items in queries_amazon_review_sample_good', queries_amazon_review_sample_good['item_id'].nunique())
queries_amazon_review_sample_good.head()

Number of items in queries_amazon_review_sample_good 256658


Unnamed: 0,query,item_id,proba,entropy,predicted_label
0,party to go mobile in store,B003ZRA3EI,"[0.04296875, 0.0252685546875, 0.93359375]",0.292969,2
1,double mijwiz in store,B00CB5BMHW,"[0.0108642578125, 0.0189208984375, 0.968750000...",0.155273,2
2,restaurant gift card,B00PG23W84,"[0.001014709472656, 0.00457763671875, 0.99609375]",0.034912,2
3,free calling,B00JFOFJSK,"[0.0103759765625, 0.051025390625, 0.9375]",0.259766,2
4,gift cards to purchase,B01GKWEY9O,"[0.0018615722656250002, 0.004608154296875, 0.9...",0.039062,2


In [3]:
# Loading ESCI dataset:

df_examples = pd.read_parquet('../other_datasets/shopping_queries_dataset_examples.parquet')
esci_dataset_all_product_ids = set(df_examples['product_id'])
print(len(esci_dataset_all_product_ids))
df_examples.head()

1802772


Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split
0,0,revent 80 cfm,0,B000MOO21W,us,I,0,1,train
1,1,revent 80 cfm,0,B07X3Y6B1V,us,E,0,1,train
2,2,revent 80 cfm,0,B07WDM7MQQ,us,E,0,1,train
3,3,revent 80 cfm,0,B07RH6Z8KW,us,E,0,1,train
4,4,revent 80 cfm,0,B07QJ7WYFQ,us,E,0,1,train


In [4]:
# Removing items from new query dataset which occur in ESCI: 

query_asins = set(queries_amazon_review_sample_good['item_id'].unique())
print('Number of items in queries_amazon_review_sample_good', len(query_asins))

query_asins_minus_esci_dataset = query_asins.difference(esci_dataset_all_product_ids)

print("Number of items in query_asins_minus_esci_dataset:",  len(query_asins_minus_esci_dataset))

Number of items in queries_amazon_review_sample_good 256658
Number of items in query_asins_minus_esci_dataset: 251998


In [None]:
# combining all category parquet files into one: (uncomment the following lines)

csv_files = [file for file in os.listdir('../downloaded_amazon_meta_datasets_csv') if file.endswith('.csv')]
print(len(csv_files))

parquet_writer = None

for csv_file in tqdm(csv_files):
    print(csv_file)
    for chunk in pd.read_csv('../downloaded_amazon_meta_datasets_csv/'+csv_file, chunksize=10000):
        chunk = chunk.astype(str)
        table = pa.Table.from_pandas(chunk)
        if parquet_writer is None:
            print('table schema', table.schema)
            parquet_writer = pq.ParquetWriter('../downloaded_amazon_meta_datasets_csv/amazon_reviews_23_all_metadata_appended.parquet', table.schema)
        parquet_writer.write_table(table)
        del table
        del chunk

if parquet_writer is not None:
    parquet_writer.close()

31


  0%|          | 0/31 [00:00<?, ?it/s]

raw_meta_Kindle_Store.csv
table schema main_category: string
title: string
average_rating: string
rating_number: string
features: string
description: string
images: string
videos: string
details: string
parent_asin: string
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 1451


  3%|▎         | 1/31 [01:47<53:50, 107.68s/it]

raw_meta_Health_and_Personal_Care.csv


  6%|▋         | 2/31 [01:48<21:37, 44.75s/it] 

raw_meta_Clothing_Shoes_and_Jewelry.csv


In [None]:
# Loading all_meta dataset:

all_metadata = pd.read_parquet('../downloaded_amazon_meta_datasets_csv/amazon_reviews_23_all_metadata_appended.parquet')

print(len(all_metadata))
all_metadata.head()

In [None]:
# checking number of products per category:
print(all_metadata.groupby('main_category').size())

In [None]:
all_metadata['main_category'] = all_metadata['main_category'].replace('nan', np.nan)
all_metadata.dropna(subset=['main_category'], inplace=True)
print(all_metadata.groupby('main_category').size())

In [None]:
# Removing esci sins from meta_datset

print(len(set(esci_dataset_all_product_ids)))

print(len(all_metadata), len(set(all_metadata['parent_asin'])))

all_meta_minus_esci = all_metadata[~all_metadata['parent_asin'].isin(esci_dataset_all_product_ids)]

print(len(all_meta_minus_esci), len(set(all_meta_minus_esci['parent_asin'])))

In [None]:
# Refining meta_dataset

print('original', len(all_meta_minus_esci))

all_meta_minus_esci = all_meta_minus_esci[all_meta_minus_esci['title'].str.len() >= 5]
print('removing title < 5', len(all_meta_minus_esci))

all_meta_minus_esci = all_meta_minus_esci[all_meta_minus_esci['features'] != '[]']
print('removing empty features', len(all_meta_minus_esci))

all_meta_minus_esci = all_meta_minus_esci[all_meta_minus_esci['description'] != '[]']
print('removing empty description', len(all_meta_minus_esci))

all_meta_minus_esci = all_meta_minus_esci[all_meta_minus_esci['details'] != '{}']
print('removing empty details', len(all_meta_minus_esci))

all_meta_minus_esci = all_meta_minus_esci.dropna()
print('removing rows with nan', len(all_meta_minus_esci))



In [None]:
# randomly sampling 130,000 products from each category

sampled_meta_dataset = all_meta_minus_esci.groupby('main_category').apply(lambda x: x.sample(min(130000, len(x)))).reset_index(drop=True)
print(len(sampled_meta_dataset))
sampled_meta_dataset.head()

In [None]:
# removing rows with query asins

print(len(query_asins_minus_esci_dataset))
print(sampled_meta_dataset['parent_asin'].nunique())

sampled_meta_dataset_minus_query_asins = sampled_meta_dataset[~sampled_meta_dataset['parent_asin'].isin(query_asins_minus_esci_dataset)]

print(len(sampled_meta_dataset_minus_query_asins))
print(sampled_meta_dataset_minus_query_asins['parent_asin'].nunique())

In [None]:
plt.figure(figsize=(30, 10))
sns.countplot(x='main_category', data=sampled_meta_dataset_minus_query_asins)
plt.title('Count of Products by Main Category')
plt.xticks(rotation=75)
plt.show()

In [None]:
# create sub_meta_dataset for query asins

print(len(query_asins_minus_esci_dataset))
print(all_metadata['parent_asin'].nunique())

query_asin_metadata = all_metadata[all_metadata['parent_asin'].isin(query_asins_minus_esci_dataset)]

print(len(query_asin_metadata), query_asin_metadata['parent_asin'].nunique())

In [None]:
plt.figure(figsize=(30, 10))
sns.countplot(x='main_category', data=query_asin_metadata)
plt.title('Count of Products by Main Category')
plt.xticks(rotation=75)
plt.show()

In [None]:
# query_asin_metadata['from_query_dataset'] = 1
# sampled_meta_dataset_minus_query_asins['from_query_dataset'] = 0

print('length of query_asin_metadata', len(query_asin_metadata))
print('length of sampled_meta_dataset_minus_query_asins', len(sampled_meta_dataset_minus_query_asins))

combined_df = pd.concat([query_asin_metadata, sampled_meta_dataset_minus_query_asins], ignore_index=True)
print('length of combined_df', len(combined_df))
combined_df = combined_df.sample(frac=1).reset_index(drop=True)

combined_df.head()

In [None]:
print(len(combined_df))
plt.figure(figsize=(30, 10))
sns.countplot(x='main_category', data=combined_df)
plt.title('Count of Products by Main Category')
plt.xticks(rotation=75)
plt.show()

In [None]:
combined_df.to_parquet('../zero_shot_retrieval_task_dataset.parquet', index=False)

# Reading the final dataset:

In [None]:
import pandas as pd
final_dataset = pd.read_parquet('../zero_shot_retrieval_task_dataset.parquet')
print(len(final_dataset))
final_dataset.head()

In [None]:
final_dataset.tail()

In [None]:
final_dataset['images'][0]

In [None]:
final_dataset['images'][3289115]

# Uploading to HF:

In [6]:
from datasets import Dataset
hf_dataset = Dataset.from_pandas(final_dataset)

In [12]:
hf_dataset.push_to_hub("AdhirajSingh1206/TREC-Zero-Shot-Product-Search", split="full")

Uploading the dataset shards:   0%|          | 0/21 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/157 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/157 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/157 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/157 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/157 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/157 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/157 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/157 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/157 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/157 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/157 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/157 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/157 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/157 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/157 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/157 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/157 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/157 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/157 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/157 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/157 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/AdhirajSingh1206/TREC-Zero-Shot-Product-Search/commit/732a7c2c810da329248cca6ba42bbffe251bb4f1', commit_message='Upload dataset', commit_description='', oid='732a7c2c810da329248cca6ba42bbffe251bb4f1', pr_url=None, pr_revision=None, pr_num=None)

In [13]:
from datasets import load_dataset
dataset = load_dataset("AdhirajSingh1206/TREC-Zero-Shot-Product-Search", split="full")
dataset = dataset.to_pandas()
print(len(dataset))
dataset.head()

Downloading readme:   0%|          | 0.00/636 [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/21 [00:00<?, ?files/s]

Generating full split:   0%|          | 0/3289116 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/21 [00:00<?, ?it/s]

3289116


Unnamed: 0,main_category,title,average_rating,rating_number,features,description,images,videos,details,parent_asin
0,All Beauty,The Body Shop Deep Sleep Peaceful Body Moistur...,3.0,2.0,"['Calming, skin-softening moisturizer'\n 'Cont...",['Best if you want to: Smooth on a dreamy mois...,"{'hi_res': array([None], dtype=object), 'large...","{'title': array([], dtype=object), 'url': arra...","{""Package Dimensions"": ""3.6 x 3.6 x 1.9 inches...",B001AWJJN6
1,"Arts, Crafts & Sewing",Copic Markers VRF-R56V Various Ink - 25CC - Cu...,5.0,1.0,"['Angled, dropper style tip for easy refilling...",['Markers can be refilled using the booster by...,{'hi_res': array(['https://m.media-amazon.com/...,"{'title': array([], dtype=object), 'url': arra...","{""Product Dimensions"": ""4.75 x 1 x 0.63 inches...",B008001U7O
2,Books,El niño que vivía en las estrellas (Serie Azul...,4.8,7.0,['Al despacho del doctor Rojas llega un extrañ...,['Book Description'\n '¿En qué clase de mundo ...,"{'hi_res': array([], dtype=object), 'large': a...","{'title': array([], dtype=object), 'url': arra...","{""Publisher"": ""Santillana Educaci\u00f3n, S.L....",8420464910
3,Appstore for Android,Metube: Player for YouTube,3.4,58.0,['search youtube video' 'simple music player' ...,"[""SIMPLE!! FREE!! Youtube music and videos pla...","{'hi_res': array([None, None, None, None, None...","{'title': array([''], dtype=object), 'url': ar...","{""Release Date"": ""2014"", ""Date first listed on...",B00JYW2U1M
4,Computers,"Dynatron K199 Intel Sandy Bridge Xeon, Core i3...",4.2,50.0,"['Dynatron K199 Intel Sandy Bridge Xeon, Core ...","['Recommend for Intel® Socket LGA 1150, LGA 11...",{'hi_res': array(['https://m.media-amazon.com/...,"{'title': array([], dtype=object), 'url': arra...","{""Brand"": ""Dynatron"", ""Series"": ""K199"", ""Item ...",B005OYEU7G
