# Nodify Test

In [18]:
import json
from pathlib import Path
import pandas as pd

In [48]:
from dotenv import load_dotenv

Create a `.env` file that holds your OpenAI API token, for example:

In [None]:
# %%writefile ../.env
# OPENAI_API_KEY="sk-xxxxxxxxxxxxxxxxx"

In [49]:
# Load environment variables (e.g. OPENAI_API_KEY) from the .env file one directory up.
load_dotenv("../.env")

True

## Category data

In [28]:
# Read the category list from disk. The encoding is passed explicitly so that
# decoding does not depend on the platform's default locale encoding.
categories = json.loads(Path("../data/category.json").read_text(encoding="utf-8"))

In [29]:
# Walk the flat category list in order and tag every row with the name of the
# most recently seen "H4" row. Rows that appear before the first "H4" row get "".
l1 = ""

for row in categories:
    l1 = row["name"] if row["parent_type"] == "H4" else l1
    row["level1"] = l1

In [43]:
# Preview the first five rows, now carrying the added "level1" key.
categories[:5]

[{'name': 'Antiques',
  'url': 'https://www.ebay.com/b/Antiquities/37903/bn_1865503',
  'parent_type': 'H4',
  'level1': 'Antiques'},
 {'name': 'Architectural & Garden',
  'url': 'https://www.ebay.com/b/Architectural-Garden-Antiques/4707/bn_1865433',
  'parent_type': 'LI',
  'level1': 'Antiques'},
 {'name': 'Asian Antiques',
  'url': 'https://www.ebay.com/b/Asian-Antiques/20082/bn_1865025',
  'parent_type': 'LI',
  'level1': 'Antiques'},
 {'name': 'Books & Manuscripts',
  'url': 'https://www.ebay.com/b/Antiquarian-Collectible-Books/29223/bn_1865565',
  'parent_type': 'LI',
  'level1': 'Antiques'},
 {'name': 'Decorative Arts',
  'url': 'https://www.ebay.com/b/Antique-Decorative-Arts/20086/bn_1849288',
  'parent_type': 'LI',
  'level1': 'Antiques'}]

## Sub category mapping

In [42]:
# Build the mapping table: one row per category, keeping only its own name and
# the name of the level-1 category it belongs to.
categories_df = pd.DataFrame(categories).loc[:, ["name", "level1"]]
categories_df

Unnamed: 0,name,level1
0,Antiques,Antiques
1,Architectural & Garden,Antiques
2,Asian Antiques,Antiques
3,Books & Manuscripts,Antiques
4,Decorative Arts,Antiques
...,...,...
489,Personal Security,Everything Else
490,Religious Products & Supplies,Everything Else
491,Reward Points & Incentives,Everything Else
492,Weird Stuff,Everything Else


### Category stats

In [45]:
# Number of rows per level-1 category, as a one-column frame indexed by the
# category name (value_counts orders it from most to fewest rows).
l1_stats = categories_df["level1"].value_counts().to_frame()
l1_stats

Unnamed: 0,level1
Collectibles,46
Toys & Hobbies,25
Antiques,22
Health & Beauty,22
Home & Garden,22
Computers & Tablets,21
Business & Industrial,19
Cameras & Photo,18
Fashion,18
Baby,17


In [40]:
def get_sub_category(categories_df, level1):
    """Return the names of the categories listed under a level-1 category.

    Args:
        categories_df: DataFrame with at least a ``name`` and a ``level1`` column.
        level1: Level-1 category name to look up; matched by exact equality.

    Returns:
        list: The ``name`` values of the rows whose ``level1`` equals ``level1``,
        in frame order, excluding any entry equal to ``level1`` itself. Empty
        when no row matches.
    """
    # A boolean mask is used instead of DataFrame.query(f"level1=='{level1}'"):
    # interpolating the value into a query string breaks on names containing a
    # single quote and would evaluate arbitrary text (e.g. raw model output) as
    # part of the expression.
    is_under_level1 = categories_df["level1"] == level1
    names = categories_df.loc[is_under_level1, "name"]
    return [name for name in names if name != level1]

In [41]:
# Spot-check the helper on one level-1 category.
get_sub_category(categories_df, "Health & Beauty")

['Bath & Body',
 'Dietary Supplements, Nutrition',
 'Fragrances',
 'Hair Care & Styling',
 'Health Care',
 'Makeup',
 'Massage',
 'Medical, Mobility & Disability',
 'Nail Care, Manicure & Pedicure',
 'Natural & Homeopathic Remedies',
 'Oral Care',
 'Over-the-Counter Medicine',
 'Salon & Spa Equipment',
 'Shaving & Hair Removal',
 'Skin Care',
 'Sun Protection & Tanning',
 'Tattoos & Body Art',
 'Vision Care',
 'Weight Management',
 'Wholesale Lots',
 'Other Health & Beauty']

In [46]:
# Level-1 category names in the order of the stats table. They are read from the
# frame's index directly rather than via l1_stats["level1"]: since pandas 2.0
# value_counts() names its result "count", so a "level1" column no longer exists
# and that lookup raises KeyError. The index holds the names in every version.
level1_list = list(l1_stats.index)
level1_list

['Collectibles',
 'Toys & Hobbies',
 'Antiques',
 'Health & Beauty',
 'Home & Garden',
 'Computers & Tablets',
 'Business & Industrial',
 'Cameras & Photo',
 'Fashion',
 'Baby',
 'Stamps',
 'Musical Instruments & Gear',
 'Jewelry & Watches',
 'Sporting Goods',
 'Everything Else',
 'Crafts',
 'Coins & Paper Money',
 'Travel',
 'Books',
 'Electronics',
 'Pet Supplies',
 'Specialty Services',
 'Video Games & Consoles',
 'Cell Phones & Smartphones',
 'DVDs & Movies',
 'Entertainment Memorabilia',
 'Sports Mem, Cards & Fan Shop',
 'Tickets & Experiences',
 'eBay Motors',
 'Real Estate',
 'Music',
 'Dolls & Bears',
 'Gift Cards & Coupons',
 'Art',
 'Pottery & Glass']

## Data from a single page

In [None]:
import requests
from bs4 import BeautifulSoup

# URL of the eBay product page to scrape
url = "https://www.ebay.com/itm/125480010416?_trkparms=5079%3A0&_trksid=p2509164.m5277"

# Send a GET request to the URL. requests applies no timeout by default, so one
# is set explicitly to keep the cell from hanging on a stalled connection.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error status instead of parsing an error page below.
response.raise_for_status()
html_content = response.content

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

In [57]:
# The listing title is read from <h1 class="x-item-title__mainTitle">. find()
# returns None when no such element exists in the fetched page, so check for
# that explicitly and report a clear error instead of an AttributeError on None.
title_tag = soup.find('h1', {'class': 'x-item-title__mainTitle'})
if title_tag is None:
    raise ValueError(
        "Could not find the product title (h1.x-item-title__mainTitle) in the fetched page"
    )
product_title = title_tag.text.strip()
product_title

'ECCO GOLF BIOM H4 GORE TEX LEATHER WATERPROOF SPIKELESS MENS GOLF SHOES'

## Language model

In [47]:
from langchain.schema import HumanMessage, SystemMessage
from langchain.chat_models import ChatOpenAI

In [50]:
# Requires the OpenAI API token to be available in the environment (see the
# .env loading step); if it is set up correctly this step will be fine.
# temperature=0 makes the model pick its most likely answer, so re-running the
# notebook gives stable category predictions instead of sampled ones.
chat = ChatOpenAI(temperature=0)

## Layer 1

In [78]:
# Category names themselves contain commas (e.g. 'Sports Mem, Cards & Fan Shop'),
# so the candidates are listed one per line rather than comma-joined; otherwise
# the model cannot tell where one name ends and the next begins.
level1_list_join = "\n".join(f"- {name}" for name in level1_list)
messages = [
    # The system and human messages ask for the same output format (a bare
    # category name); the answer is later used for an exact-match lookup.
    SystemMessage(content="You are a product categorization assistant. You always answer with exactly one category name and nothing else."),
    HumanMessage(content=(
        f"Product description: {product_title.lower()}\n\n"
        "Which one of the following categories does it fit into?\n"
        f"{level1_list_join}\n\n"
        "Answer with the exact category name only: no quotes, no backticks, no explanation."
    )),
]

In [79]:
# Send the layer-1 prompt to the chat model and keep the text of its reply.
result_text = chat(messages).content

In [84]:
# Show the model's layer-1 answer with surrounding whitespace removed.
result_text.strip()

'Sporting Goods'

## Layer 2

In [83]:
# The model may wrap its answer in backticks or quotes; remove them (and any
# whitespace) before the exact-match lookup below.
level1_answer = result_text.strip().strip("`'\"").strip()
# Fail loudly on an answer that is not one of the offered categories, instead of
# silently continuing with an empty sub-category list.
if level1_answer not in level1_list:
    raise ValueError(
        f"Model answered {result_text!r}, which is not a known level-1 category"
    )
sub_cates_list = get_sub_category(categories_df, level1_answer)
sub_cates_list

['Boxing, Martial Arts & MMA',
 'Cycling',
 'Fishing',
 'Fitness, Running & Yoga',
 'Golf',
 'Hunting',
 'Indoor Games',
 'Outdoor Sports',
 'Team Sports',
 'Tennis & Racquet Sports',
 'Water Sports',
 'Winter Sports',
 'Other Sporting Goods',
 'Wholesale Lots']

In [85]:
# Sub-category names contain commas (e.g. 'Boxing, Martial Arts & MMA'), so the
# candidates are listed one per line rather than comma-joined; otherwise the
# model cannot tell where one name ends and the next begins.
sub_cates_list_join = "\n".join(f"- {name}" for name in sub_cates_list)
messages = [
    # The system and human messages ask for the same output format: a bare
    # category name with nothing around it.
    SystemMessage(content="You are a product categorization assistant. You always answer with exactly one category name and nothing else."),
    HumanMessage(content=(
        f"Product description: {product_title.lower()}\n\n"
        "Which one of the following categories does it fit into?\n"
        f"{sub_cates_list_join}\n\n"
        "Answer with the exact category name only: no quotes, no backticks, no explanation."
    )),
]

In [86]:
# Send the layer-2 prompt to the chat model, keep the text of its reply, and
# show it with surrounding whitespace removed.
final_result = chat(messages).content
final_result.strip()

'Golf'

In [87]:
# Re-display the scraped title so it can be compared with the predicted category.
product_title

'ECCO GOLF BIOM H4 GORE TEX LEATHER WATERPROOF SPIKELESS MENS GOLF SHOES'