### Title handling: extract the product name from each title (later)

In [1]:
import os
import json
import sys
import os

# Add the parent directory to sys.path to allow imports from src
sys.path.append(os.path.abspath("../"))

# Now import the function
from src.cleaner import extract_price
from src.cleaner import clean_location
from src.cleaner import categorize_delivery
from src.cleaner import extract_feedback_count,update_item

In [2]:
# Work from the raw collected-data folder so the cells below can open files by bare name.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere as-is.
os.chdir(r'D:\BDIA\semestre 4\Analyse du web\final project\data\collected_data')
os.getcwd()

'D:\\BDIA\\semestre 4\\Analyse du web\\final project\\data\\collected_data'

In [3]:
# Show which files are available in the current working directory.
os.listdir()

['500_results.json',
 'ebay_clothing_data.json',
 'market_product.json',
 'men_results.json',
 'women_results.json']

In [4]:
import spacy

# Load the spaCy pipeline once at module level; extract_product_names reuses it on every call.
nlp = spacy.load("en_core_web_sm")  # small English model

def extract_product_names(title):
    """Return the product-like named entities spaCy finds in ``title``.

    Entities labelled ``PRODUCT`` or ``ORG`` are collected (product names are
    sometimes tagged as ORG). When none is found, the whole title is returned
    as a single-element list, so the result is never empty.
    """
    wanted_labels = ("PRODUCT", "ORG")
    matches = []
    for entity in nlp(title).ents:
        if entity.label_ in wanted_labels:
            matches.append(entity.text)
    if not matches:
        # Nothing recognised: fall back to the full title.
        return [title]
    return matches

# Example: run the extractor on one sample title.
# In the recorded run the whole title came back, which suggests no PRODUCT/ORG
# entity was found and the full-title fallback was used.
title = "Spider Punk T-shirt - Funny Web-Slinging Style! Unisex Printed Design"
print(extract_product_names(title))


['Spider Punk T-shirt - Funny Web-Slinging Style! Unisex Printed Design']


#### Price normalization: express every price in a single unit

In [5]:
# Read the raw listings from the current working directory into `data`.
with open('market_product.json', mode='r', encoding='utf-8') as raw_file:
    data = json.loads(raw_file.read())

In [6]:
# Run the project's extract_price helper over every listing's raw 'price' field.
prices = [extract_price(listing['price']) for listing in data]

In [7]:
# Rebuild each record with its 'price' replaced by the extracted value;
# new dicts are created, so the originals in `data` are left untouched.
data_with_price_handled = [
    {**listing, 'price': price}
    for listing, price in zip(data, prices)
]

### Location handling

In [8]:
# Copy every price-handled record, replacing 'location' with the output of
# the project's clean_location helper.
data_l_p_h = [
    {**record, 'location': clean_location(record['location'])}
    for record in data_with_price_handled
]

### Delivery handling

In [11]:
from collections import defaultdict

# Tally how many listings fall into each delivery category.
counts = defaultdict(int)
for listing in data:
    category = categorize_delivery(listing['delivery_price'])
    # The label is also written onto the raw record itself.
    listing['delivery_category'] = category
    counts[category] += 1

# A plain dict prints without the defaultdict wrapper.
counts = dict(counts)
print(counts)


{'Paid Delivery': 938, 'Free Delivery': 53, 'Other Delivery': 9}


In [13]:
# Add a 'delivery_category' label to every record of the cleaned data.
data_l_p_d = [
    {**record, 'delivery_category': categorize_delivery(record['delivery_price'])}
    for record in data_l_p_h
]

In [18]:
# Now convert the raw delivery price text into a numeric value.
import re

# A number such as "12", "1,234.56" or ".50": digits with optional thousands
# commas and an optional decimal part. Requiring a digit keeps stray
# punctuation (e.g. the "." in "Est.") from being mistaken for the price.
_DELIVERY_AMOUNT_RE = re.compile(r'\d[\d,]*(?:\.\d+)?|\.\d+')


def clean_delivery_price(delivery_price, delivery_category):
    """Convert a raw delivery price string into a number.

    Args:
        delivery_price: Raw delivery text, e.g. ``"+$12.50 delivery"``.
            ``None`` is treated as "no price available".
        delivery_category: Category label for the listing; the values
            ``"Free Delivery"`` and ``"Paid Delivery"`` are handled specially.

    Returns:
        ``0`` for ``"Free Delivery"``; for ``"Paid Delivery"`` the first
        number found in ``delivery_price`` as a ``float`` (thousands commas
        removed), or ``None`` when the text holds no number; ``None`` for any
        other category.
    """
    if delivery_category == "Free Delivery":
        return 0
    if delivery_category != "Paid Delivery":
        # Any other (or unknown) category has no usable price.
        return None
    if delivery_price is None:
        return None

    match = _DELIVERY_AMOUNT_RE.search(delivery_price)
    if match is None:
        return None
    # The pattern only yields digits, commas and at most one decimal point,
    # so once the commas are stripped the text is always a valid float.
    return float(match.group().replace(',', ''))

In [19]:
# Attach the numeric delivery price to every record (records are modified in place).
for listing in data_l_p_d:
    listing['clean_delivery_price'] = clean_delivery_price(
        delivery_price=listing['delivery_price'],
        delivery_category=listing['delivery_category'],
    )


In [23]:
# Collect the distinct 'delivery_type' values: the field carries no extra
# information, so we cannot rely on it.
unique_categories = {
    listing['delivery_type']
    for listing in data_l_p_d
    if 'delivery_type' in listing
}
print(unique_categories)

{'Buy It Now'}


In [24]:
# Remove the uninformative 'delivery_type' key from every record, in place.
for listing in data_l_p_d:
    if 'delivery_type' in listing:
        del listing['delivery_type']

In [27]:
# Spot-check one record: .get falls back to 0 when 'delivery_type' is absent.
data_l_p_d[199].get('delivery_type',0)

0

In [28]:
# Same treatment for 'date': each comment has its own date, so there is no
# need to keep an empty record-level 'date' in the data.
for listing in data_l_p_d:
    listing.pop('date', None)

In [29]:
# Spot-check one record: the fallback string is returned when 'date' is absent.
data_l_p_d[199].get('date','no date key')

'no date key'

In [33]:
# Persist the cleaned listings as version 1 of the processed dataset.
# ensure_ascii=False keeps non-ASCII characters readable in the UTF-8 file
# instead of \uXXXX escapes, matching how the v4 exports later in this
# notebook are written.
# NOTE(review): absolute, machine-specific path.
with open(r'D:\BDIA\semestre 4\Analyse du web\final project\data\processed_data\market_data_processed_v1.json','w',encoding='utf-8') as f:
    json.dump(data_l_p_d, f, ensure_ascii=False, indent=4)

In [24]:
# Switch to the processed-data folder so the cells below can open its files by bare name.
# NOTE(review): absolute, machine-specific path.
os.chdir(r'D:\BDIA\semestre 4\Analyse du web\final project\data\processed_data')
os.getcwd()

'D:\\BDIA\\semestre 4\\Analyse du web\\final project\\data\\processed_data'

In [25]:
# Show which files are available in the current working directory.
os.listdir()

['file.txt',
 'market_data_processed_v1.json',
 'market_data_processed_v2.json',
 'market_data_processed_v3.json']

In [34]:
# Reload version 3 of the processed dataset from the current directory;
# this rebinds `data`.
with open(r'market_data_processed_v3.json', mode='r', encoding='utf-8') as processed_file:
    data = json.loads(processed_file.read())

In [28]:
# Apply the project's update_item helper to every record.
new_data = [update_item(record) for record in data]

In [29]:
# Sanity check: number of records after the update step.
len(new_data)

1000

In [35]:
# Inspect a single record to eyeball its fields.
data[601]

{'title': "Adidas Samba OG White Black | Men's Shoes IT | New | Shipping Included",
 'price': 90.26,
 'delivery_price': 'Free International Shipping',
 'location': 'Netherlands',
 'product_link': 'https://www.ebay.com/itm/286570480562?_skw=women&hash=item42b8ee83b2:g:YDAAAOSwLoRoJcxp&itmprp=enc%3AAQAKAAAA4FkggFvd1GGDu0w3yXCmi1coLWAbTAB1xC%2F1mSzwP4MbbO2WfNuVMvo0y8E%2Fm8ALM0%2BFDLSwmoRcAe0MYwyEI67Lc8IwEccAVu%2FeKnWPJZpc6Uqe0eGZaRMfc%2FHAhMlKt7wkQa0C8Kzie2lkJjDubUGnrXdP6e%2BB7HryZPA6lj0rKLDSwPVMOprZH0uMEDu6bGvNk1AZ9TjU1tnbexutkzIhMwIv7rNZyX%2BNIapN94UEA9RsKXz6S6RxCTuhs%2Ff0k8wFlm4dx4rmZ7tA1qamUUkHvcY7k6XLhD5SyB0DligL%7Ctkp%3ABk9SR4Dj-9reZQ',
 'image_link': 'https://i.ebayimg.com/images/g/YDAAAOSwLoRoJcxp/s-l140.jpg',
 'sex': 'F',
 'event_breath1': 'People are checking this out. 3 have added this to their watchlist.',
 'event_breath2': 'Popular item. 5 have already sold.',
 'feedback': [],
 'delivery_category': 'Free Delivery',
 'clean_delivery_price': 0,
 'feedback_count': 0,
 'category'

In [37]:
# Return to the collected-data folder so the 'Item Categorization*.json' files can be opened by bare name.
# NOTE(review): absolute, machine-specific path.
os.chdir(r'D:\BDIA\semestre 4\Analyse du web\final project\data\collected_data')
os.getcwd()

'D:\\BDIA\\semestre 4\\Analyse du web\\final project\\data\\collected_data'

In [55]:
# Reload the processed v3 records.
# NOTE(review): absolute, machine-specific path.
with open(r'D:\BDIA\semestre 4\Analyse du web\final project\data\processed_data\market_data_processed_v3.json', mode='r', encoding='utf-8') as processed_file:
    data = json.loads(processed_file.read())

# Load the four categorization parts from the current directory, in order.
category_parts = []
for part_name in (
    'Item Categorization1.json',
    'Item Categorization2.json',
    'Item Categorization3.json',
    'Item Categorization4.json',
):
    with open(part_name, mode='r', encoding='utf-8') as part_file:
        category_parts.append(json.loads(part_file.read()))

c1, c2, c3, c4 = category_parts

In [56]:
# Sanity check: the four categorization parts together should cover every record in `data`.
len(c1),len(c2),len(c3),len(c4),len(data)

(250, 250, 250, 250, 1000)

In [59]:
# Concatenate the four categorization parts in order; this rebinds `new_data`.
new_data = [*c1, *c2, *c3, *c4]
len(new_data)

1000

In [60]:
from collections import Counter

# Count how often each category label occurs in the combined categorization.
count = Counter(entry['category'] for entry in new_data)
count

Counter({'T-shirt': 355,
         'T-shirts': 212,
         'Sunglasses (suggested)': 152,
         'Shoes': 71,
         'Cap': 27,
         'Socks': 25,
         'Scarf (suggested)': 24,
         'Undergarments': 15,
         'Jacket': 12,
         'Underwear (suggested)': 10,
         'Wallet (suggested)': 10,
         'Pants (suggested)': 10,
         'Belt (suggested)': 9,
         'Shirt (suggested)': 8,
         'Hoodie': 8,
         'Backpack': 6,
         'Poncho (suggested)': 6,
         'Neckties': 5,
         'Bag (suggested)': 5,
         'Sandals': 3,
         'Jeans (suggested)': 3,
         'Miscellaneous (suggested)': 3,
         'Necktie (suggested)': 2,
         'Bathrobe (suggested)': 2,
         'Tank Top (suggested)': 2,
         'Swimsuit (suggested)': 2,
         'Compression Sleeves (suggested)': 2,
         'Sweatpants': 2,
         'Sweatshirt (suggested)': 2,
         'Shorts': 1,
         'Jeans': 1,
         'Caps': 1,
         'Thermostat (suggested)': 1,

In [62]:
# Switch back to the processed-data folder, where the v4 files are read and written below.
# NOTE(review): absolute, machine-specific path.
os.chdir(r'D:\BDIA\semestre 4\Analyse du web\final project\data\processed_data')
os.getcwd()

'D:\\BDIA\\semestre 4\\Analyse du web\\final project\\data\\processed_data'

In [63]:
# Load both v4 variants of the processed dataset from the current directory.
with open('market_data_processed_v4_0.json', mode='r', encoding='utf-8') as v4_0_file:
    data40 = json.loads(v4_0_file.read())

with open('market_data_processed_v4_1.json', mode='r', encoding='utf-8') as v4_1_file:
    data41 = json.loads(v4_1_file.read())

In [64]:
# Copy the category label from new_data onto the v4_0 record at the same position.
# Records are updated in place; data40m0 collects those same dict objects.
data40m0 = []
for position, record in enumerate(data40):
    record["category"] = new_data[position]["category"]
    data40m0.append(record)

In [68]:
# Copy the category label from new_data onto the v4_1 record at the same position.
# NOTE(review): despite its name, data40m1 holds the records of data41.
data40m1 = []
for position, record in enumerate(data41):
    record["category"] = new_data[position]["category"]
    data40m1.append(record)

In [70]:
# Write both labelled datasets to the current directory, keeping non-ASCII
# characters unescaped in the UTF-8 output.
for out_name, records in (
    ('market_data_processed_v4_0m0.json', data40m0),
    ('market_data_processed_v4_1m0.json', data40m1),
):
    with open(out_name, mode='w', encoding='utf-8') as out_file:
        json.dump(records, out_file, ensure_ascii=False, indent=4)


In [71]:
# Count how often each category label occurs in the labelled v4_0 records.
count = Counter(record['category'] for record in data40m0)
count

Counter({'T-shirt': 355,
         'T-shirts': 212,
         'Sunglasses (suggested)': 152,
         'Shoes': 71,
         'Cap': 27,
         'Socks': 25,
         'Scarf (suggested)': 24,
         'Undergarments': 15,
         'Jacket': 12,
         'Underwear (suggested)': 10,
         'Wallet (suggested)': 10,
         'Pants (suggested)': 10,
         'Belt (suggested)': 9,
         'Shirt (suggested)': 8,
         'Hoodie': 8,
         'Backpack': 6,
         'Poncho (suggested)': 6,
         'Neckties': 5,
         'Bag (suggested)': 5,
         'Sandals': 3,
         'Jeans (suggested)': 3,
         'Miscellaneous (suggested)': 3,
         'Necktie (suggested)': 2,
         'Bathrobe (suggested)': 2,
         'Tank Top (suggested)': 2,
         'Swimsuit (suggested)': 2,
         'Compression Sleeves (suggested)': 2,
         'Sweatpants': 2,
         'Sweatshirt (suggested)': 2,
         'Shorts': 1,
         'Jeans': 1,
         'Caps': 1,
         'Thermostat (suggested)': 1,

In [72]:
import pandas as pd

In [76]:
# Build a DataFrame from the labelled records and export it, without the index column,
# to both CSV and Excel in the current working directory.
# NOTE(review): to_excel presumably needs an Excel writer engine (e.g. openpyxl) installed — confirm.
df1=pd.DataFrame(data40m1)
df1.to_csv('market_data_processed_v4_1m0.csv',index=False,encoding='utf-8')
df1.to_excel('market_data_processed_v4_1m0.xlsx',index=False)

True