In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

## Amazon Shoes Dataset
* [Source](https://data.world/crawlfeeds/amazon-uk-shoes-dataset)

### Preparing the data

In [120]:
# Load the raw crawl export (one JSON document) into memory.
# The encoding is pinned because the records contain non-ASCII text (the "£"
# sign, bidi marks) and the platform default encoding is not UTF-8 everywhere.
# Assumes the export itself is UTF-8 encoded -- TODO confirm.
path = "data/amazon_uk_shoes_dataset.json"
with open(path, "r", encoding="utf-8") as f:
    data = json.load(f)

In [62]:
def get_product_details(item):
    """Extract selected fields from an item's free-text ``product_details``.

    The text is cleaned of bidi marks and stray newlines so that every
    "field: value" pair sits on one line, then lower-cased and parsed.

    Args:
        item: Raw product record (dict). ``product_details`` may be missing,
            ``None`` or empty, in which case every field is reported as "NA".

    Returns:
        dict with the keys "department", "manufacturer" and
        "product dimensions". Fields that were not found are "NA". The
        department has possessive suffixes removed (e.g. "women's" -> "women").
    """
    fields_to_keep = ["department", "manufacturer", "product dimensions"]

    # Applied in insertion order; later patterns rely on the earlier ones
    # having already removed the bidi marks and blank lines.
    REPLACE_DICT = {
        "\u200f": "",
        "\u200e": "",
        "\n\n": "",
        "\n:\n": ":",
        ":\n": ": ",
    }
    # A missing or None value is treated as "no details" instead of raising.
    details = item.get("product_details") or ""
    for k, v in REPLACE_DICT.items():
        details = details.replace(k, v)

    details = details.lower()

    detail_dict = {field: "NA" for field in fields_to_keep}
    for line in details.split("\n"):
        if ":" in line:
            # Split on the first colon only: values may contain colons.
            field, value = line.split(":", 1)
            if field in fields_to_keep:
                detail_dict[field] = value.strip()

    # Possessive markers (straight and curly apostrophes) to strip.
    department_map = {
        "'s": "",
        "’s": "",
        "s’": "",
        "s'": "",
    }

    department = detail_dict["department"]
    for k, v in department_map.items():
        department = department.replace(k, v)

    detail_dict["department"] = department

    return detail_dict

In [63]:
# Count how often each feature name occurs across all items. Only the first
# key of each feature dict is read; names are normalised to lower snake_case.
feature_count = {}
for item in data:
    for feature in item["features"]:
        key = list(feature.keys())[0].strip().lower().replace(" ", "_")
        feature_count[key] = feature_count.get(key, 0) + 1

feature_count = pd.Series(feature_count)
feature_count.sort_values(ascending=False).head(10)

closure                 10767
shoe_width              10485
sole                     9962
outer_material           8726
heel_type                6226
inner_material           5935
heel_height              2939
material_composition     1545
fit_type                  342
material                  185
dtype: int64

In [64]:
def get_features(item):
    """Return the retained shoe features of ``item`` as a flat dict.

    Each entry of ``item["features"]`` is a dict of which only the first
    key/value pair is read. Names are lower-cased, stripped and have spaces
    replaced by underscores; retained values are lower-cased and stripped.
    Features that never appear are reported as "NA".
    """
    wanted = ("closure", "shoe_width", "sole", "outer_material")
    result = dict.fromkeys(wanted, "NA")

    for entry in item["features"]:
        raw_name, raw_value = list(entry.items())[0]
        name = raw_name.lower().strip().replace(" ", "_")
        if name not in wanted:
            continue
        result[name] = raw_value.lower().strip()

    return result

In [65]:
def get_brand(item):
    """Return the brand name of ``item``, or "NA" when none is available.

    Storefront links of the form "Visit the <brand> Store" are reduced to
    "<brand>". Any other non-empty text is returned unchanged, including text
    that starts with "Visit the" but does not match the storefront pattern.
    A missing, ``None`` or empty brand yields "NA", mirroring how
    ``get_category`` treats empty breadcrumbs.
    """
    brand_info = item.get("brand")
    if not brand_info:
        return "NA"

    m = re.search(r"Visit the (.+?) Store", brand_info)
    # The pattern may not match ("Visit the X page"); fall back to the raw
    # text rather than calling .group() on None.
    return m.group(1) if m else brand_info

In [66]:
# Brand of every item, then the ten most frequent brands.
all_brands = [get_brand(item) for item in data]

brand_count = pd.Series(all_brands).value_counts()
brand_count.head(10)

adidas         685
Skechers       500
PUMA           473
New Balance    405
Reebok         296
ASICS          280
Geox           255
Clarks         211
NIKE           164
ECCO           144
dtype: int64

In [67]:
def get_category(item):
    """Return the item's breadcrumb trail, or "NA" if it is missing or empty."""
    return item.get("breadcrumbs") or "NA"

In [68]:
# Breadcrumb category of every item, then the ten most common categories.
item_classes = pd.Series([get_category(item) for item in data])
item_classes.value_counts().head(10)

Shoes/Women's Shoes/Fashion & Athletic Trainers/Fashion Trainers                                           2823
Shoes/Men's Shoes/Fashion & Athletic Trainers/Fashion Trainers                                             1030
NA                                                                                                          584
Shoes/Boys' Shoes/Fashion & Athletic Trainers/Fashion Trainers                                              567
Shoes/Women's Shoes/Fashion & Athletic Trainers/Sports & Outdoor Shoes/Running Shoes/Road Running Shoes     415
Shoes/Men's Shoes/Boots                                                                                     394
Shoes/Men's Shoes/Fashion & Athletic Trainers/Sports & Outdoor Shoes/Running Shoes/Road Running Shoes       379
Shoes/Men's Shoes/Lace-ups                                                                                  307
Shoes/Women's Shoes/Fashion & Athletic Trainers/Sports & Outdoor Shoes/Dance Shoes                      

In [69]:
def get_price(item):
    """Parse the item's price text into a single float (pounds).

    Args:
        item: Raw product record (dict). ``price`` may be absent.

    Returns:
        * ``np.nan`` when the price is missing, ``None``, a known placeholder
          ("Check website", "NA", "") or cannot be parsed as a number;
        * the midpoint of the first two values for a range "£a - £b";
        * otherwise the single price as a float.
    """
    # .get() keeps this consistent with get_item_info_doc, which already
    # treats "price" as an optional key.
    price = item.get("price")

    if price in ["Check website", "NA", "", None]:
        return np.nan

    # Already numeric: nothing to parse.
    if isinstance(price, (int, float)):
        return float(price)

    def _to_number(text):
        # Drop surrounding whitespace, the pound sign and thousands separators.
        return float(text.strip().strip("£").replace(",", ""))

    try:
        if "-" in price:
            low, high = price.split("-")[:2]
            return (_to_number(low) + _to_number(high)) / 2
        return _to_number(price)
    except ValueError:
        # Unrecognised free text: the numeric price is best-effort, and the
        # original text is preserved separately as ``price_text``.
        return np.nan

In [118]:
def get_item_info_doc(item):
    """Flatten one raw product record into the parsed-document schema.

    The result holds, in order: id, url, model_name, category, brand, the
    retained features, the retained product details, the original price text
    ("Check website for price" when absent or None) and the numeric price.
    """
    price_text = item.get("price", "Check website for price")
    if price_text is None:
        price_text = "Check website for price"

    return {
        "id": item["asin"],
        "url": item.get("url", "NA"),
        "model_name": item.get("title", "NA"),
        "category": get_category(item),
        "brand": get_brand(item),
        **get_features(item),
        **get_product_details(item),
        "price_text": price_text,
        "price": get_price(item),
    }

In [121]:
# Parse every raw record into its flat document form.
all_item_data = [get_item_info_doc(item) for item in data]

In [123]:
# Persist the parsed records as JSON Lines: one JSON object per line.
save_path = "data/parsed_amazon_shoe_data.jsonl"
with open(save_path, "w") as f:
    f.writelines(json.dumps(record) + "\n" for record in all_item_data)

In [73]:
# Distribution of the parsed "department" field, to spot values that still
# need normalising.
departments = pd.Series([record["department"] for record in all_item_data])
departments.value_counts()

women                  3919
men                    3258
girl                   1217
unisex kids             917
NA                      817
unisex                  651
boy                     645
unisex baby              45
baby girls               44
baby boys                41
unisex adult              9
unisex kid                8
unisex-child              8
men;women                 4
synthetic/suede           4
unisex-adult              3
baby girl                 3
adults                    2
baby boy                  2
sandals                   1
boys                      1
adults and children       1
kids girls                1
womens                    1
unisex – adults           1
patent/nubuck             1
junior riding boot        1
dtype: int64

### Finetuning templates

In [124]:
# Reload the parsed records (one JSON object per line) and tabulate them.
path = "data/parsed_amazon_shoe_data.jsonl"
with open(path, "r") as f:
    data = [json.loads(line) for line in f]

df = pd.DataFrame(data)

In [125]:
# Ten most frequent brands in the reloaded, parsed data.
df["brand"].value_counts().head(10)

adidas         685
Skechers       500
PUMA           473
New Balance    405
Reebok         296
ASICS          280
Geox           255
Clarks         211
NIKE           164
ECCO           144
Name: brand, dtype: int64

In [126]:
def get_general_recommendations(brand_name, data):
    """Build one random (question, answer) pair about a brand's shoes.

    Args:
        brand_name: Brand as it should appear in the generated text; it is
            compared case-insensitively with the ``brand`` column.
        data: DataFrame with ``brand``, ``model_name`` and ``price_text``.

    Returns:
        ``(question, answer)``. With no matching rows the answer is a random
        rejection sentence; otherwise it is a random intro followed by one
        "- model (price)" bullet for each of up to 7 sampled rows.
    """
    question_templates = [
        f"Can you recommend some shoes from {brand_name}?",
        f"What shoes do you have from {brand_name}?",
        f"Can you show me some shoes from {brand_name}?",
        f"Can you recommend some {brand_name} shoes?",
        f"What are the best {brand_name} shoes?",
        f"Can you show me some {brand_name} shoes?",
    ]

    no_match_replies = [
        "We currently do not have any shoes from this brand",
        f"We currently do not have any shoes from {brand_name}.",
        f"No shoes found for {brand_name}.",
        "Unfortunately, we do not have any shoes from this brand at the moment.",
        f"Unfortunately, we do not have any shoes from {brand_name} at the moment.",
        f"Our collection does not include any shoes from {brand_name}.",
        f"Sorry, we do not have any shoes from {brand_name}.",
        f"The brand {brand_name} is not available in our collection.",
        "Sorry, we do not have any shoes from this brand.",
    ]

    intro_templates = [
        f"Here are some shoes from {brand_name}:",
        f"Here are some {brand_name} shoes:",
        f"Here are some shoes from {brand_name} that you might like:",
        f"Here are some {brand_name} shoes that you might like:",
        f"Here are some {brand_name} shoes that you might find interesting:",
        f"Here are some shoes from {brand_name} that you might find interesting:",
    ]

    matches = data[data["brand"].str.lower() == brand_name.lower()]

    # The question is drawn before the answer.
    question = np.random.choice(question_templates)

    if matches.empty:
        return question, np.random.choice(no_match_replies)

    requested = np.random.randint(2, 8)  # 2..7 items
    picked = matches.sample(min(requested, len(matches)))
    intro = np.random.choice(intro_templates)
    bullets = "".join(
        f"\n- {row['model_name']} ({row['price_text']})"
        for _, row in picked.iterrows()
    )
    return question, intro + bullets

In [160]:
def get_recommendations_by_price(brand_name, price_range, data):
    """Build one random (question, answer) pair for a brand within a price range.

    Args:
        brand_name: Brand to ask about; compared case-insensitively with the
            ``brand`` column.
        price_range: ``(min_price, max_price)``. A ``None`` lower bound
            becomes 0 and a ``None`` upper bound becomes ``np.inf``.
        data: DataFrame with ``brand``, ``price`` (numeric), ``model_name``
            and ``price_text`` columns.

    Returns:
        ``(question, answer)``. The answer is a random rejection sentence
        when no row matches; otherwise a random intro followed by one
        "- model (price)" bullet for each of up to 7 sampled rows.
    """
    min_price, max_price = price_range
    if min_price is None:
        min_price = 0

    if max_price is None:
        # NOTE: an infinite bound is rendered as "inf" in the templates below.
        max_price = np.inf

    questions = [
        f"Recommend me some shoes from {brand_name} between £{min_price:.0f} and £{max_price:.0f}",
        # Fixed format spec: ":0f" printed six decimals ("50.000000").
        f"Can you show me some shoes from {brand_name} between {min_price:.0f} and {max_price:.0f} pounds?",
        f"What are some {brand_name} shoes between £{min_price:.0f} and £{max_price:.0f}?",
        f"Can you recommend some {brand_name} shoes between £{min_price:.0f} and £{max_price:.0f}?",
        f"What are the best {brand_name} shoes between £{min_price:.0f} and £{max_price:.0f}?",
        f"I want to buy a shoe between £{min_price:.0f} and £{max_price:.0f} from {brand_name}. Can you help me?",
        # NOTE: this wording omits the lower bound, but the filter below
        # still applies it.
        f"I want a shoe from {brand_name} that costs less than £{max_price:.0f}.",
        f'Which {brand_name} shoes are available between £{min_price:.0f} and £{max_price:.0f}?',
        f"{brand_name} shoes between £{min_price:.0f} and £{max_price:.0f}?",
    ]

    rejection_answers = [
        f"Unfortunately, we do not have any {brand_name} shoes between £{min_price:.0f} and £{max_price:.0f}.",
        f"Sorry, we do not have any {brand_name} shoes between £{min_price:.0f} and £{max_price:.0f}.",
        f"No {brand_name} shoes found between £{min_price:.0f} and £{max_price:.0f}.",
        f"Currently, we do not have any {brand_name} shoes between £{min_price:.0f} and £{max_price:.0f}.",
        f"Sorry, it seems like we do not have any {brand_name} shoes between £{min_price:.0f} and £{max_price:.0f} at the moment.",
        "Currently there are no shoes that meet your criteria. Do you want to try a different price range?",
    ]

    answers = [
        f"Here are some shoes from {brand_name} within the range £{min_price:.0f} and £{max_price:.0f}:",
        f"Here are some {brand_name} shoes within the range £{min_price:.0f} and £{max_price:.0f}:",
        f"Here are some {brand_name} shoes that you might like within the range £{min_price:.0f} and £{max_price:.0f}:",
        f"Here are some shoes from {brand_name} that you might like within the range £{min_price:.0f} and £{max_price:.0f}:",
        f"Here are some {brand_name} shoes that you might find interesting within the range £{min_price:.0f} and £{max_price:.0f}:",
        f"Here are some shoes from {brand_name} that you might find interesting within the range £{min_price:.0f} and £{max_price:.0f}:",
    ]

    # Series.between is inclusive on both ends; NaN prices never match.
    cond = (data["brand"].str.lower() == brand_name.lower()) & (data["price"].between(min_price, max_price))
    brand_df = data[cond]

    question = np.random.choice(questions)

    if brand_df.empty:
        answer = np.random.choice(rejection_answers)
    else:
        n = np.random.randint(2, 8)  # 2..7 items
        sample_size = min(n, len(brand_df))
        brand_df = brand_df.sample(sample_size)
        answer = np.random.choice(answers)
        for _, row in brand_df.iterrows():
            model_name = row["model_name"]
            price = row["price_text"]
            answer += f"\n- {model_name} ({price})"

    return question, answer

In [161]:
def get_recommendations_by_category(category, data):
    """Build one random (question, answer) pair for a department.

    ``category`` (men, women, kids, ...) is compared case-insensitively with
    the ``department`` column of ``data``. The answer is a random rejection
    sentence when nothing matches; otherwise a random intro followed by one
    "- model (price)" bullet for each of up to 7 sampled rows.
    """
    # Some templates appear twice on purpose-preserving grounds: the
    # duplicates change how often each wording is drawn.
    question_templates = [
        f"Can you recommend some {category} shoes?",
        f"What are the best {category} shoes?",
        f"Can you show me some {category} shoes?",
        f"Recommend me some {category} shoes.",
        f"What {category} shoes do you have?",
        f"Can you show me some {category} shoes?",
        f"{category} shoes",
    ]

    no_match_replies = [
        f"Unfortunately, we do not have any {category} shoes at the moment.",
        f"Sorry, we do not have any {category} shoes.",
        f"No {category} shoes found.",
        f"Currently, we do not have any {category} shoes.",
        f"Sorry, we do not have any {category} shoes at the moment.",
        f"Unfortunately, we do not have any {category} shoes.",
        f"Currently there are no {category} shoes available.",
    ]

    intro_templates = [
        f"Here are some {category} shoes:",
        f"Here are some {category} shoes that you might like:",
        f"Here are some {category} shoes that you might find interesting:",
        f"Here are some {category} shoes that you might find appealing:",
        f"Here are some {category} shoes that you might like:",
    ]

    matches = data[data["department"].str.lower() == category.lower()]

    # The question is drawn before the answer.
    question = np.random.choice(question_templates)

    if matches.empty:
        return question, np.random.choice(no_match_replies)

    requested = np.random.randint(2, 8)  # 2..7 items
    picked = matches.sample(min(requested, len(matches)))
    intro = np.random.choice(intro_templates)
    bullets = "".join(
        f"\n- {row['model_name']} ({row['price_text']})"
        for _, row in picked.iterrows()
    )
    return question, intro + bullets

In [162]:
def get_recommendations_by_category_brand(category, brand_name, data):
    """Build one random (question, answer) pair for a department and brand.

    Rows are selected where ``department`` equals ``category`` and ``brand``
    equals ``brand_name``, both compared case-insensitively. The answer is a
    random rejection sentence when nothing matches; otherwise a random intro
    followed by one "- model (price)" bullet for each of up to 7 sampled rows.
    """
    question_templates = [
        f"Can you recommend some {category} shoes from {brand_name}?",
        f"What are the best {category} shoes from {brand_name}?",
        f"Can you show me some {category} shoes from {brand_name}?",
        f"Recommend me some {category} shoes from {brand_name}.",
        f"What {category} shoes do you have from {brand_name}?",
        f"Show me some {category} shoes from {brand_name}?",
        f"{category} shoes from {brand_name}",
    ]

    # One intro is listed twice; the duplicate changes how often it is drawn.
    intro_templates = [
        f"Here are some {category} shoes from {brand_name}:",
        f"Here are some {category} shoes from {brand_name} that you might like:",
        f"Here are some {category} shoes from {brand_name} that you might find interesting:",
        f"Here are some {category} shoes from {brand_name} that you might find appealing:",
        f"Here are some {category} shoes from {brand_name} that you might like:",
    ]

    no_match_replies = [
        f"Unfortunately, we do not have any {category} shoes from {brand_name} at the moment.",
        f"Sorry, we do not have any {category} shoes from {brand_name}.",
        f"No {category} shoes found for {brand_name}.",
        f"Currently, we do not have any {category} shoes from {brand_name}.",
        f"Sorry, we do not have any {category} shoes from {brand_name} at the moment.",
        f"Unfortunately, we do not have any {category} shoes from {brand_name}.",
        f"Currently there are no {category} shoes available from {brand_name}.",
    ]

    in_department = data["department"].str.lower() == category.lower()
    is_brand = data["brand"].str.lower() == brand_name.lower()
    matches = data[in_department & is_brand]

    # The question is drawn before the answer.
    question = np.random.choice(question_templates)

    if matches.empty:
        return question, np.random.choice(no_match_replies)

    requested = np.random.randint(2, 8)  # 2..7 items
    picked = matches.sample(min(requested, len(matches)))
    intro = np.random.choice(intro_templates)
    bullets = "".join(
        f"\n- {row['model_name']} ({row['price_text']})"
        for _, row in picked.iterrows()
    )
    return question, intro + bullets

In [135]:
# Smoke test: generate one question/answer pair and print it. The other
# generators can be tried by swapping in one of the commented calls.
brand_name = "Reebok"
min_price, max_price = 50, 100
category = "men"
# q, a = get_general_recommendations(brand_name, df)

# q, a = get_recommendations_by_price(brand_name, (min_price, max_price), df)

# q, a = get_recommendations_by_category(category, df)

q, a = get_recommendations_by_category_brand(category, brand_name, df)
print(q, a, sep="\n")

Can you show me some men shoes from Reebok?
Here are some men shoes from Reebok that you might like:
- Reebok Men's Sole Fury Adapt Cross Trainer (£64.10 - £163.88)
- Reebok Women's Twistform Blaze 3.0 MTM Running Shoe, 0 (£96.99 - £136.71)


In [142]:
# Labels of the ten most frequent departments.
df["department"].value_counts().head(10).index

Index(['women', 'men', 'girl', 'unisex kids', 'NA', 'unisex', 'boy',
       'unisex baby', 'baby girls', 'baby boys'],
      dtype='object')

In [187]:
# Seed NumPy's global RNG so the generated dataset is reproducible:
# np.random.* and DataFrame.sample (without random_state) both draw from it.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Templates are generated for the 20 most frequent brands only.
all_brand_names = df["brand"].value_counts().head(20).index
# Department labels to sample from ("NA" is deliberately left out).
all_categories = [
    'women', 'men', 'girl', 'unisex kids', 'unisex', 'boy', 'unisex baby', 'baby girls', 'baby boys'
    ]

# Brand + department samples.
num_samples = 1500
sample_1 = []
for sample_id in range(num_samples):
    brand_name = np.random.choice(all_brand_names)
    cond = df["brand"] == brand_name
    brand_df = df[cond]
    categories = brand_df["department"].unique()
    categories = [c for c in categories if c in all_categories]
    if not categories:
        # np.random.choice raises ValueError on an empty sequence; a brand
        # with no usable department simply yields no sample this round.
        continue
    category = np.random.choice(categories)
    q, a = get_recommendations_by_category_brand(category, brand_name, df)
    sample_1.append({"question": q, "answer": a})


# Brand + price-range samples.
num_samples = 1500
sample_2 = []
for sample_id in range(num_samples):
    brand_name = np.random.choice(all_brand_names)
    cond = df["brand"] == brand_name
    brand_df = df[cond]

    # Drop NaN prices up front: a brand with no numeric price would
    # otherwise pass NaN to randint, which raises.
    brand_prices = brand_df["price"].dropna()
    if brand_prices.empty or brand_prices.min() == brand_prices.max():
        continue

    # randint needs integer bounds; truncate the highest price explicitly
    # instead of handing it a float.
    price_ceiling = int(brand_prices.max())
    if price_ceiling < 1:
        continue

    # Lower bound in [0, ceiling - 1]; upper bound strictly above it so the
    # question never asks for a zero-width range such as "£37 and £37".
    min_price = np.random.randint(0, price_ceiling)
    max_price = np.random.randint(min_price + 1, price_ceiling + 1)

    q, a = get_recommendations_by_price(brand_name, (min_price, max_price), df)
    sample_2.append({"question": q, "answer": a})


# Department-only samples; each distinct question text is kept once.
num_samples = 500
sample_3 = []
existing_questions = set()
for sample_id in range(num_samples):
    category = np.random.choice(all_categories)
    q, a = get_recommendations_by_category(category, df)
    if q not in existing_questions:
        existing_questions.add(q)
        sample_3.append({"question": q, "answer": a})

# Merge the three sample sets, drop repeated questions and then repeated
# answers, number the survivors from 1, and convert to a list of records.
all_samples = sample_1 + sample_2 + sample_3

samples = (
    pd.DataFrame(all_samples)
    .drop_duplicates(subset="question")
    .drop_duplicates(subset="answer")
)
samples.insert(0, "id", range(1, 1 + len(samples)))

samples = samples.to_dict(orient="records")


In [188]:
# The previous cell already converts `samples` to a list of records, so a
# second unconditional .to_dict() fails on a top-to-bottom run
# (a list has no .to_dict). Convert only if a DataFrame is still present.
if isinstance(samples, pd.DataFrame):
    samples = samples.to_dict(orient="records")

In [189]:
# Inspect the first generated record.
samples[0]

{'id': 1,
 'question': 'What are the best women shoes from Geox?',
 'answer': "Here are some women shoes from Geox that you might like:\n- Geox Women's D Smeraldo a Low-Top Sneakers (£47.28 - £144.69)\n- Geox New Club C Women Loafers (£51.25 - £79.62)\n- Geox Women's D Nhenbus C Low-Top Sneakers (£61.28 - £100.51)\n- Geox Women's D Nebula C Sneaker (£54.04 - £103.43)"}

In [191]:
# Write one JSON object per line. NOTE: the content is JSON Lines even though
# the path ends in ".json"; the name is left unchanged here.
save_path = "data/finetuning_dataset.json"
with open(save_path, "w") as f:
    f.writelines(json.dumps(sample) + "\n" for sample in samples)

## Database documents

In [2]:
# Reload the parsed records (one JSON object per line) for the database step.
path = "data/parsed_amazon_shoe_data.jsonl"
with open(path, "r") as f:
    data = [json.loads(line) for line in f]

df = pd.DataFrame(data)

In [3]:
# Inspect the first parsed record to confirm the available keys.
data[0]

{'id': 'B08BLP231K',
 'url': 'https://www.amazon.co.uk/dp/B08BLP231K',
 'model_name': 'Geox Jr Sandal Strada B Fisherman, Brown Red, 2.5 UK Child',
 'category': "Shoes/Boys' Shoes/Sandals",
 'brand': 'Geox',
 'closure': 'strap',
 'shoe_width': 'medium',
 'sole': 'rubber',
 'outer_material': 'synthetic',
 'department': 'boy',
 'manufacturer': 'geox',
 'product dimensions': 'NA',
 'price_text': '£50.00',
 'price': 50.0}

In [16]:
def prepare_database_documents(item):
    """Convert one parsed record into a database document.

    The document copies id, url, closure, price, brand and department from
    ``item`` and adds ``text``: one "Label: value" line per attribute, each
    terminated by a newline.
    """
    # (label shown in the text, key in the parsed record), in output order.
    labelled_fields = [
        ("Model", "model_name"),
        ("Brand", "brand"),
        ("Manufacturer", "manufacturer"),
        ("Category", "category"),
        ("Department", "department"),
        ("Price", "price_text"),
        ("Shoe Width", "shoe_width"),
        ("Outer Material", "outer_material"),
        ("Sole", "sole"),
        ("Closure", "closure"),
        ("Product Dimensions", "product dimensions"),
    ]
    text = "".join(f"{label}: {item[key]}\n" for label, key in labelled_fields)

    metadata_keys = ("id", "url", "closure", "price", "brand", "department")
    doc = {key: item[key] for key in metadata_keys}
    doc["text"] = text
    return doc

In [18]:
# Build one database document per parsed record.
docs = [prepare_database_documents(item) for item in data]

In [21]:
# Show the rendered text of one document (tenth from the end) as a spot check.
print(docs[-10]["text"])

Model: GIOSEPPO Women's Gerpinnes Sneaker
Brand: GIOSEPPO
Manufacturer: gioseppo
Category: Shoes/Women's Shoes/Fashion & Athletic Trainers/Fashion Trainers
Department: women
Price: £78.26 - £78.67
Shoe Width: medium
Outer Material: 30% polyester, 25% pu, 45% vaccine
Sole: 100% tpr
Closure: lace-up
Product Dimensions: NA



In [23]:
# Persist the database documents as JSON Lines: one JSON object per line.
save_path = "data/amazon_shoe_database.jsonl"
with open(save_path, "w") as f:
    f.writelines(json.dumps(doc) + "\n" for doc in docs)

In [24]:
# Number of documents written to the database file.
len(docs)

11605