In [26]:
import os
import sqlite3
import json
from re import sub

# Presumably the field delimiter for a flat-file export; it is not referenced
# anywhere in the visible cells.
columnSeparator = "|"

# English three-letter month abbreviation -> zero-padded month number.
MONTHS = {
    abbrev: f"{number:02d}"
    for number, abbrev in enumerate(
        ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
         "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"),
        start=1,
    )
}

def transformMonth(mon):
    """Translate a month abbreviation such as 'Jan' into its two-digit number.

    Any value that is not a key of MONTHS is handed back unchanged.
    """
    try:
        return MONTHS[mon]
    except KeyError:
        return mon

def transformDttm(dttm):
    """Convert a 'Mon-DD-YY HH:MM:SS' timestamp to 'YYYY-MM-DD HH:MM:SS'.

    Args:
        dttm: Timestamp text such as 'Dec-03-01 18:10:40', or a falsy value.

    Returns:
        The reformatted timestamp string, or None when ``dttm`` is falsy or
        consists only of whitespace.

    Raises:
        IndexError: if the date part has fewer than three '-'-separated
            fields, or the time part is missing.
    """
    if not dttm:
        return None
    # split() without an argument discards leading/trailing whitespace and
    # treats any run of blanks or tabs as one separator, so a doubled space
    # between date and time does not produce an empty "time" field.
    parts = dttm.split()
    if not parts:
        return None
    dt = parts[0].split('-')
    # The '20' prefix is hard-coded: this assumes a two-digit year in the 2000s.
    date = '20' + dt[2] + '-' + transformMonth(dt[0]) + '-' + dt[1]
    return date + ' ' + parts[1]


def transformDollar(money):
    """Parse a money string such as '$1,234.50' into a float.

    Returns None when ``money`` is None or has length zero.  Every character
    other than a digit or '.' is removed before the conversion.
    """
    if money is None:
        return None
    if len(money) == 0:
        return None
    numeric_text = sub(r'[^\d.]', '', money)
    return float(numeric_text)

# Staging sets that parse_item() adds row tuples to.  Being sets, they keep
# only one copy of tuples that are equal in every field.
USERS, ITEMS, CATEGORIES, BIDS = (set() for _ in range(4))

def parse_item(item):
    """Flatten one auction record into the USERS, ITEMS, CATEGORIES and BIDS sets.

    Args:
        item: dict describing a single auction.  The keys read here are
            "Seller", "ItemID", "Name", "Currently", "First_Bid",
            "Number_of_Bids", "Location", "Country", "Started", "Ends",
            "Description", "Category" and "Bids".

    Returns:
        None.  Row tuples are added to the module-level sets as a side effect:
        one ITEMS row per call (even when "ItemID" is absent), one CATEGORIES
        row per category, one USERS row for the seller and for each bidder
        that has a UserID, and one BIDS row per bid that has both an item id
        and a bidder id.

    The nested containers ("Seller", "Category", "Bids", "Bid", "Bidder") are
    treated as empty both when the key is missing and when its value is null,
    so a record carrying e.g. ``"Category": null`` does not raise.
    """
    # dict.get(key, default) only covers a missing key; `or {}` / `or []`
    # additionally covers a key that is present but null.
    seller = item.get("Seller") or {}
    seller_user_id = seller.get("UserID")
    seller_rating = int(seller.get("Rating")) if seller.get("Rating") else None
    # The seller's location and country are read from the item itself, not
    # from the "Seller" object.
    seller_location = item.get("Location")
    seller_country = item.get("Country")

    if seller_user_id:
        USERS.add((seller_user_id, seller_rating, seller_location, seller_country))

    item_id = item.get("ItemID")
    item_name = item.get("Name")
    item_currently = transformDollar(item.get("Currently"))
    item_first_bid = transformDollar(item.get("First_Bid"))
    # A missing or empty bid count is stored as 0 rather than NULL.
    item_number_of_bids = int(item.get("Number_of_Bids")) if item.get("Number_of_Bids") else 0
    item_location = item.get("Location")
    item_country = item.get("Country")
    item_started = transformDttm(item.get("Started"))
    item_ends = transformDttm(item.get("Ends"))
    item_description = item.get("Description")

    ITEMS.add((
        item_id, item_name, item_currently, item_first_bid,
        item_number_of_bids, item_location, item_country,
        item_started, item_ends, seller_user_id, item_description
    ))

    for cat in item.get("Category") or []:
        CATEGORIES.add((item_id, cat))

    for bid in item.get("Bids") or []:
        # Each list element wraps the actual bid under a "Bid" key.
        bid = bid.get("Bid") or {}
        bidder = bid.get("Bidder") or {}
        bidder_user_id = bidder.get("UserID")
        bidder_rating = int(bidder.get("Rating")) if bidder.get("Rating") else None
        bidder_location = bidder.get("Location")
        bidder_country = bidder.get("Country")

        if bidder_user_id:
            USERS.add((bidder_user_id, bidder_rating, bidder_location, bidder_country))

        bid_time = transformDttm(bid.get("Time"))
        bid_amount = transformDollar(bid.get("Amount"))
        # A bid row is only meaningful when both of its foreign keys exist.
        if bidder_user_id and item_id:
            BIDS.add((item_id, bidder_user_id, bid_time, bid_amount))

def load_items_from_json(json_file):
    """Read one JSON dump and return its list of item records.

    Args:
        json_file: Path of a JSON file whose top-level object may carry an
            "Items" array.

    Returns:
        The value stored under "Items", or an empty list when that key is
        missing or null.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # Without an explicit encoding open() falls back to the platform's locale
    # encoding, which can mis-decode non-ASCII text on some systems.
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # `or []` also covers '"Items": null', so callers can always take len().
    return data.get("Items") or []
    
# Try the numbered dump files items-0.json .. items-39.json; each one that
# exists is loaded and every record in it is passed to parse_item().
for i in range(40):
    test_file = f"ebay_data/items-{i}.json"
    if not os.path.exists(test_file):
        print(f"File {test_file} not found")
        continue
    items = load_items_from_json(test_file)
    print(f"Loaded {len(items)} items from {test_file}")
    for item in items:
        parse_item(item)

Loaded 500 items from ebay_data/items-0.json
Loaded 500 items from ebay_data/items-1.json
Loaded 500 items from ebay_data/items-2.json
Loaded 500 items from ebay_data/items-3.json
Loaded 500 items from ebay_data/items-4.json
Loaded 500 items from ebay_data/items-5.json
Loaded 500 items from ebay_data/items-6.json
Loaded 500 items from ebay_data/items-7.json
Loaded 500 items from ebay_data/items-8.json
Loaded 500 items from ebay_data/items-9.json
Loaded 500 items from ebay_data/items-10.json
Loaded 500 items from ebay_data/items-11.json
Loaded 500 items from ebay_data/items-12.json
Loaded 500 items from ebay_data/items-13.json
Loaded 500 items from ebay_data/items-14.json
Loaded 500 items from ebay_data/items-15.json
Loaded 500 items from ebay_data/items-16.json
Loaded 500 items from ebay_data/items-17.json
Loaded 500 items from ebay_data/items-18.json
Loaded 500 items from ebay_data/items-19.json
Loaded 500 items from ebay_data/items-20.json
Loaded 500 items from ebay_data/items-21.json

In [27]:
def check_duplicates():
    """Print the size of each staging set and flag primary-key collisions.

    USERS, ITEMS, CATEGORIES and BIDS are sets, so tuples that are equal in
    every field have already been merged; their lengths are the unique-row
    counts.  In addition, a warning line is printed when several USERS or
    ITEMS tuples share the same first field (user_id / item_id).

    Returns:
        None.  All results are printed.
    """
    print("\n--- Duplicate Check ---")
    print(f"USERS: {len(USERS)} unique entries")
    print(f"ITEMS: {len(ITEMS)} unique entries")
    print(f"CATEGORIES: {len(CATEGORIES)} unique entries")
    print(f"BIDS: {len(BIDS)} unique entries")

    # A set only merges tuples that match in every field.  Two tuples can
    # still share their first field, which the USERS and ITEM tables declare
    # as PRIMARY KEY, so count the rows that would collide on insert.
    for label, rows in (("USERS", USERS), ("ITEMS", ITEMS)):
        surplus = len(rows) - len({row[0] for row in rows})
        if surplus:
            print(f"WARNING: {label} has {surplus} extra row(s) sharing a key with another row")
check_duplicates()



--- Duplicate Check ---
USERS: 13422 unique entries
ITEMS: 19532 unique entries
CATEGORIES: 90269 unique entries
BIDS: 9874 unique entries


In [31]:

conn = sqlite3.connect("ebay.db")
try:
    cur = conn.cursor()

    # Rebuild the schema from scratch so the cell can be re-run; tables that
    # hold foreign keys are dropped before the tables they reference.
    cur.executescript("""
DROP TABLE IF EXISTS BIDS;
DROP TABLE IF EXISTS ITEM_CATEGORIES;
DROP TABLE IF EXISTS ITEM;
DROP TABLE IF EXISTS USERS;

CREATE TABLE USERS (
    user_id TEXT PRIMARY KEY,
    rating INTEGER,
    location TEXT,
    country TEXT
);

CREATE TABLE ITEM (
    item_id TEXT PRIMARY KEY,
    name TEXT,
    currently REAL,
    first_bid REAL,
    number_of_bids INTEGER,
    location TEXT,
    country TEXT,
    started TEXT,
    ends TEXT,
    seller_id TEXT,
    description TEXT,
    FOREIGN KEY(seller_id) REFERENCES USERS(user_id)
);

CREATE TABLE ITEM_CATEGORIES (
    item_id TEXT,
    category TEXT,
    FOREIGN KEY(item_id) REFERENCES ITEM(item_id)
);

CREATE TABLE BIDS (
    bid_id INTEGER PRIMARY KEY AUTOINCREMENT,
    item_id TEXT,
    user_id TEXT,
    time TEXT,
    amount REAL,
    FOREIGN KEY(item_id) REFERENCES ITEM(item_id),
    FOREIGN KEY(user_id) REFERENCES USERS(user_id)
);
""")

    # `with conn` commits the four bulk inserts together on success and rolls
    # them back if any of them raises, so no separate commit() is needed.
    # executemany() accepts any iterable of row tuples, so the sets are passed
    # directly instead of being copied into lists first.
    with conn:
        cur.executemany("INSERT INTO USERS(user_id, rating, location, country) VALUES (?,?,?,?)", USERS)
        cur.executemany("""INSERT INTO ITEM(
            item_id, name, currently, first_bid, number_of_bids, location, country,
            started, ends, seller_id, description
        ) VALUES (?,?,?,?,?,?,?,?,?,?,?)""", ITEMS)
        cur.executemany("INSERT INTO ITEM_CATEGORIES(item_id, category) VALUES (?,?)", CATEGORIES)
        cur.executemany("INSERT INTO BIDS(item_id, user_id, time, amount) VALUES (?,?,?,?)", BIDS)
finally:
    # The connection's context manager only ends the transaction; the
    # connection itself must be closed explicitly, including on failure.
    conn.close()

In [32]:
# Open a fresh connection to the ebay.db file for the verification queries below.
conn = sqlite3.connect("ebay.db")

def check_db_duplicates(conn):
    """Print duplicate-key groups and the row count of every loaded table.

    For each of USERS, ITEM, ITEM_CATEGORIES and BIDS this prints how many
    groups of rows share the columns that should identify a row, up to ten
    example groups when there are any, and the table's total row count.

    Args:
        conn: An open sqlite3 connection to a database containing the four
            tables.

    Returns:
        None.  All results are printed.
    """
    # (label used in the output, table name, columns that should be unique).
    # BIDS is checked on its data columns because bid_id is AUTOINCREMENT and
    # therefore never repeats even when the same bid was inserted twice.
    checks = (
        ("USERS", "USERS", "user_id"),
        ("ITEMS", "ITEM", "item_id"),
        ("CATEGORIES", "ITEM_CATEGORIES", "item_id, category"),
        ("BIDS", "BIDS", "item_id, user_id, time, amount"),
    )

    cur = conn.cursor()
    print("\n--- Duplicate Check in SQLite ---")

    for label, table, key_cols in checks:
        # Table and column names come from the constant tuple above, never
        # from user input; identifiers cannot be bound as `?` parameters.
        cur.execute(f"""
            SELECT {key_cols}, COUNT(*)
            FROM {table}
            GROUP BY {key_cols}
            HAVING COUNT(*) > 1;
        """)
        duplicates = cur.fetchall()
        print(f"{label} duplicates: {len(duplicates)}")
        if duplicates:
            print(duplicates[:10])  # a few examples are enough to diagnose

        cur.execute(f"SELECT COUNT(*) FROM {table};")
        # Each count carries its own table's label (previously every count
        # line was printed as "USERS COUNT").
        print(f"{label} COUNT: {cur.fetchall()}")

# Release the connection once the checks have run, even if one of the
# queries raises.
try:
    check_db_duplicates(conn)
finally:
    conn.close()


--- Duplicate Check in SQLite ---
USERS duplicates: 0
USERS COUNT: [(13422,)]
ITEMS duplicates: 0
USERS COUNT: [(19532,)]
CATEGORIES duplicates: 0
USERS COUNT: [(90269,)]
BIDS duplicates: 0
USERS COUNT: [(9874,)]


# TASK E

In [33]:
# Load the `sql` IPython extension and connect it to the ebay.db SQLite file,
# so that the following cells can run queries through %sql / %%sql.
%load_ext sql
%sql sqlite:///ebay.db

1)

In [None]:
# Total number of rows in the USERS table.
%sql SELECT COUNT(*) FROM USERS;

COUNT(*)
13422


2)

In [39]:
%%sql
-- Number of USERS rows whose location is exactly 'New York'.
select COUNT(*) from users u 
where u.location = 'New York'

COUNT(*)
80


3)

In [50]:
%%sql 
-- Number of items that have exactly four non-null category rows in ITEM_CATEGORIES.
SELECT COUNT(*) 
FROM (
    SELECT i.item_id
    FROM item i
    JOIN item_categories c ON i.item_id = c.item_id
    GROUP BY i.item_id
    HAVING COUNT(c.category) = 4
) AS sub;


COUNT(*)
8365


4)

In [54]:
%%sql
-- The ten ITEM rows with the highest `currently` value, highest first.
SELECT item_id, currently as 'Currently $' FROM item
ORDER BY currently desc
LIMIT 10;

item_id,Currently $
1046871451,18000.0
1677348181,12999.0
1046709352,7888.0
1675361271,4800.0
1678331432,4795.0
1678334088,3950.0
1678330330,3150.0
1310051115,3000.0
1046740686,3000.0
1047627192,2900.0


5)

In [66]:
%%sql
-- Number of users with rating above 1000 who appear as seller_id on at least one ITEM row.
SELECT COUNT(DISTINCT user_id) AS high_rating_sellers
FROM USERS
WHERE rating > 1000
  AND user_id IN (SELECT DISTINCT seller_id FROM ITEM);


high_rating_sellers
3130


6)

In [67]:
%%sql
-- Number of users who appear both as a seller_id in ITEM and as a user_id in BIDS.
SELECT COUNT(DISTINCT u.user_id) AS sellers_and_bidders
FROM USERS u
WHERE u.user_id IN (SELECT seller_id FROM ITEM)
  AND u.user_id IN (SELECT user_id FROM BIDS);


sellers_and_bidders
6717
