In [1]:
import os
import time

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import numpy as np

In [2]:
# Start the clock so the cost of the load below gets reported.
start = time.time()

# Only these columns are read from the parquet file.
selected_columns = [
    "Pid",
    "Description",
    "Name",
    "Category",
    "Price",
    "PriceCurrency",
    "FinalPrice",
    "Discount",
    "isOnSale",
    "IsInStock",
    "Brand",
    "Manufacturer",
    "Color",
    "Gender",
    "Size",
    "Condition",
]

# Row predicate: keep rows whose PriceCurrency is USD, CAD or GBP.
currency_filter = ds.field("PriceCurrency").isin(["USD", "CAD", "GBP"])

# Open the merged parquet file as a pyarrow dataset.
dataset = ds.dataset("../data/merged_output.parquet", format="parquet")

# Materialise the selected columns of the matching rows, then hand the
# resulting table to pandas.
filtered_table = dataset.to_table(
    columns=selected_columns,
    filter=currency_filter,
)
df = filtered_table.to_pandas()

print("Load time: {:.2f} seconds".format(time.time() - start))

Load time: 233.16 seconds


In [3]:
# Sanity check: (rows, columns) of the frame loaded above.
df.shape

(11025415, 16)

In [4]:
# Build a single MergedBrand column: prefer Brand, fall back to Manufacturer.
#
# Normalised helper columns (nulls -> '', surrounding whitespace removed,
# lowercased). They are used below to detect values that are blank after
# stripping, and are kept on `df` as before.
df["Brand_clean"] = df["Brand"].fillna('').str.strip().str.lower()
df["Manufacturer_clean"] = df["Manufacturer"].fillna('').str.strip().str.lower()

# Treat empty / whitespace-only strings as missing. combine_first only falls
# back on nulls, so without this step a blank Brand would hide a valid
# Manufacturer and end up in MergedBrand as an empty string.
brand_present = df["Brand"].where(df["Brand_clean"] != '')
manufacturer_present = df["Manufacturer"].where(df["Manufacturer_clean"] != '')

# Coalesce: Brand when it carries a real value, otherwise Manufacturer,
# otherwise missing. When both are populated (whether or not they agree after
# normalisation) Brand wins, and its original, un-normalised text is kept.
df["MergedBrand"] = brand_present.combine_first(manufacturer_present)

In [5]:
# Share of rows per merged brand, in percent; missing values are counted too.
df["MergedBrand"].value_counts(normalize=True, dropna=False).mul(100)

MergedBrand
<NA>             44.491586
Temu              4.346703
Lands' End        1.391984
Nike               1.06425
Levi's            0.600812
                   ...    
5052557215173     0.000009
5052557215166     0.000009
ELLISS            0.000009
Ivory Snow        0.000009
Divot Board       0.000009
Name: proportion, Length: 25731, dtype: Float64

In [6]:
# Columns carried into the exported frame. Brand, Manufacturer and the
# *_clean helper columns are left out; MergedBrand is included instead.
columns_to_keep = [
    "Pid",
    "Description",
    "Name",
    "Category",
    "Price",
    "PriceCurrency",
    "FinalPrice",
    "Discount",
    "isOnSale",
    "IsInStock",
    "Color",
    "Gender",
    "Size",
    "Condition",
    "MergedBrand",
]

# Select all rows, restricted to the columns listed above.
df_filtered = df.loc[:, columns_to_keep]

In [7]:
# Persist the trimmed frame as parquet, written through the pyarrow engine.
df_filtered.to_parquet(
    path='../data/filtered_data.parquet',
    engine='pyarrow',
)