In [None]:
!pip install pandas jsonlines

In [None]:
# Mount Google Drive at /content/drive; every data path used by the later
# cells lives under /content/drive/MyDrive. The `google.colab` import means
# this cell only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import zipfile
import os

# Drive folder that holds the zipped pair datasets.
base_path = "/content/drive/MyDrive/cs774/WDC"

# Every archive is unpacked into <base_path>/jaccard/<archive name without ".zip">.
archive_names = ("20pair.zip", "50pair.zip", "80pair.zip")
for archive_name in archive_names:
    target_dir = os.path.join(base_path, 'jaccard', archive_name.replace(".zip", ""))
    print(f"Unzipping {archive_name} to {target_dir}...")
    with zipfile.ZipFile(os.path.join(base_path, archive_name), 'r') as archive:
        archive.extractall(target_dir)
    print(f"Done extracting to: {target_dir}")

# Preview of Pre-processed data

In [None]:
def explore_data_fields(file_path, num_samples=10):
    """Explore the fields present in a gzipped JSON-lines file.

    Reads up to ``num_samples`` records, prints the sorted union of their keys
    and pretty-prints the first record.

    Args:
        file_path: Path to a gzip-compressed file with one JSON value per line.
        num_samples: Maximum number of records to inspect. Blank lines are
            skipped and do not count towards this limit.

    Returns:
        The set of key names seen across the inspected records.

    Raises:
        OSError: If the file cannot be opened or is not valid gzip data.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Imported locally because the notebook's import cell comes *after* this
    # cell; without this the function raises NameError on a fresh kernel.
    import gzip
    import json

    print(f"\nExploring fields in: {file_path}")
    print("-" * 80)

    all_fields = set()
    samples = []

    # Read a few records and collect all field names
    with gzip.open(file_path, 'rt', encoding='utf-8') as reader:
        for line in reader:
            if len(samples) >= num_samples:
                break
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline) instead of
                # letting json.loads fail on them.
                continue

            entry = json.loads(line)
            samples.append(entry)
            # Only JSON objects have keys; other JSON values are still kept
            # as samples but contribute no field names.
            if isinstance(entry, dict):
                all_fields.update(entry.keys())

    # Print all discovered fields
    print(f"Discovered {len(all_fields)} fields:")
    for field in sorted(all_fields):
        print(f"- {field}")

    # Print a complete sample record
    if samples:
        print("\nSample record (complete):")
        print(json.dumps(samples[0], indent=2))

    print("-" * 80)
    return all_fields

# Example usage: explore the first training file of the "20pair" split.
# The folder is built from `base_path` (set in the unzip cell above) instead of
# `base_dir` / `splits`: those are only defined in the conversion cell further
# down, so referencing them here raised NameError when the notebook was run top
# to bottom on a fresh kernel. The resulting path is the same.
sample_folder = os.path.join(base_path, "jaccard", "20pair")
# Sorted so the same file is picked on every run (os.listdir order is arbitrary).
sample_files = sorted(
    f for f in os.listdir(sample_folder) if f.endswith(".json.gz") and "train" in f
)
if sample_files:
    all_fields = explore_data_fields(os.path.join(sample_folder, sample_files[0]))
else:
    print(f"No training .json.gz files found in: {sample_folder}")

In [None]:
import gzip
import json
import jsonlines
import os
import pandas as pd

# Folder that contains one sub-folder per extracted split.
base_dir = "/content/drive/MyDrive/cs774/WDC/jaccard"

# Split folder name -> tag that the conversion loop puts into output file names.
splits = {f"{percent}pair": percent for percent in ("20", "50", "80")}

# Load exchange rates from CSV into a dictionary
def load_exchange_rates(exchange_rates_path=None):
    """Build a ``{currency code: factor}`` table from the first row of a rates CSV.

    Every column except ``Date`` is treated as a currency. Its value in the
    first data row is divided by the USD value of the same row, so ``USD``
    always maps to 1.0. Column names are stripped of surrounding whitespace
    before being used as keys.

    NOTE(review): callers multiply a price by these factors to obtain USD.
    That is only correct if each CSV cell is the value of one unit of the
    currency in a common base (not units of the currency per base unit) --
    confirm against the CSV's source.

    Args:
        exchange_rates_path: Path to the CSV file. Defaults to the
            ``xrate_april_2025.csv`` file on the mounted Drive.

    Returns:
        A dict of currency code to factor. Empty if the CSV has no data rows.
        Cells that are missing, non-numeric, non-finite or not positive are
        left out; if the USD cell itself is unusable only ``{'USD': 1.0}`` is
        returned.
    """
    if exchange_rates_path is None:
        exchange_rates_path = os.path.join(
            "/content/drive/MyDrive/cs774/WDC", "exchange_rates", "xrate_april_2025.csv"
        )
    rates_df = pd.read_csv(exchange_rates_path)
    rates_dict = {}

    if rates_df.empty:
        return rates_dict

    def _as_rate(raw):
        """Return ``raw`` as a positive finite float, or None if it is not one."""
        try:
            value = float(raw)
        except (ValueError, TypeError):
            return None
        # The chained comparison is False for NaN, +/-inf, zero and negatives.
        return value if 0.0 < value < float("inf") else None

    rate_row = rates_df.iloc[0]

    # Find the USD column by its stripped name so both "USD" and " USD"
    # headers work; with no USD column the rates are used as they are.
    usd_column = next((col for col in rates_df.columns if col.strip() == 'USD'), None)
    usd_rate = 1.0 if usd_column is None else _as_rate(rate_row[usd_column])

    if usd_rate is not None:
        for column in rates_df.columns:
            code = column.strip()
            if not code or code == 'Date':
                continue
            currency_rate = _as_rate(rate_row[column])
            if currency_rate is not None:
                rates_dict[code] = currency_rate / usd_rate

    rates_dict['USD'] = 1.0
    return rates_dict

# Currency code -> conversion factor, read from the CSV once when this cell runs.
EXCHANGE_RATES = load_exchange_rates()
# Relative half-width of the price range that serialize_product emits around
# the normalized price.
PRICE_RANGE_TOLERANCE = 0.10  # 10% tolerance

def _price_in_usd(price, currency, exchange_rates):
    """Return ``price`` multiplied by the factor for ``currency``, or None if that is impossible."""
    try:
        price_value = float(price)
    except (ValueError, TypeError):
        return None

    code = str(currency).strip()
    rate = exchange_rates.get(code)
    if rate is None:
        # Accept lower/mixed-case codes such as "eur".
        rate = exchange_rates.get(code.upper())
    if rate is None:
        return None

    normalized = price_value * rate
    # False for NaN and +/-inf, which would otherwise be printed as "nan"/"inf".
    if not (float("-inf") < normalized < float("inf")):
        return None
    return normalized


# Format product data with price normalization and range matching
def serialize_product(entry, side, exchange_rates=None, tolerance=None):
    """Serialize one side of a product pair into a ``" || "``-joined string.

    The raw price and currency come first (only when both are present and not
    the string "null"), followed by the normalized price and its tolerance
    range, then the title, brand and description fields that are present.

    Args:
        entry: Mapping with ``<field>_<side>`` keys (e.g. ``title_left``).
        side: Suffix selecting which product to serialize, e.g. "left".
        exchange_rates: Mapping of currency code to conversion factor.
            Defaults to the module-level ``EXCHANGE_RATES``.
        tolerance: Relative half-width of the emitted price range.
            Defaults to the module-level ``PRICE_RANGE_TOLERANCE``.

    Returns:
        The serialized string; empty if none of the fields are present.
    """
    if exchange_rates is None:
        exchange_rates = EXCHANGE_RATES
    if tolerance is None:
        tolerance = PRICE_RANGE_TOLERANCE

    fields = ['title', 'brand', 'description']
    parts = []

    # Handle price normalization and range matching
    price = entry.get(f"price_{side}")
    currency = entry.get(f"priceCurrency_{side}")

    if price and price != "null" and currency and currency != "null":
        # The raw values are emitted exactly once, whether or not the price
        # can be normalized (previously an unparsable price duplicated them).
        parts.append(f"price: {price}")
        parts.append(f"priceCurrency: {currency}")

        # The derived fields are omitted when the price is not numeric or the
        # currency has no known rate: silently using a rate of 1.0 would label
        # a foreign-currency amount as USD.
        normalized_price = _price_in_usd(price, currency, exchange_rates)
        if normalized_price is not None:
            lower_bound = normalized_price * (1 - tolerance)
            upper_bound = normalized_price * (1 + tolerance)

            parts.append(f"price_usd: {normalized_price:.2f}")
            parts.append(f"price_range: {lower_bound:.2f}-{upper_bound:.2f} USD")

    # Add other product fields
    for field in fields:
        val = entry.get(f"{field}_{side}")
        if val and val != "null":
            parts.append(f"{field}: {val}")

    return " || ".join(parts)

# Main processing loop: in every split folder, each train/valid .json.gz file is
# rewritten as a `<name>_ditto_<tag>.jsonl` file placed next to its source.
for split_folder, split_tag in splits.items():
    folder_path = os.path.join(base_dir, split_folder)
    for fname in os.listdir(folder_path):
        is_train_or_valid = "train" in fname or "valid" in fname
        if not (fname.endswith(".json.gz") and is_train_or_valid):
            continue

        input_path = os.path.join(folder_path, fname)
        output_name = fname.replace(".json.gz", f"_ditto_{split_tag}.jsonl")
        output_path = os.path.join(folder_path, output_name)

        print(f"Converting {fname} → {output_name}...")
        with gzip.open(input_path, 'rt', encoding='utf-8') as source:
            with jsonlines.open(output_path, mode='w') as writer:
                for line in source:
                    raw = json.loads(line)
                    # One output record per input pair: both sides serialized
                    # to text, the label stored as a string.
                    record = {
                        "text_left": serialize_product(raw, "left"),
                        "text_right": serialize_product(raw, "right"),
                        "label": str(raw["label"])
                    }
                    writer.write(record)
        print(f"Saved: {output_path}")

In [None]:
# prompt: Convert .jsonl into dataframe

import pandas as pd
import jsonlines

def jsonl_to_dataframe(file_path):
  """Load a JSONL file into a Pandas DataFrame, one row per JSON object.

  Args:
    file_path: Location of the JSONL file to read.

  Returns:
    A DataFrame built from the objects in the file, or None when any
    exception is raised while reading or converting (the error is printed).
  """
  try:
    with jsonlines.open(file_path) as reader:
      records = list(reader)
    return pd.DataFrame(records)
  except Exception as err:
    print(f"An error occurred: {err}")
    return None

# Example usage: load one converted file into a DataFrame for inspection.
# The conversion loop writes its output next to the inputs under `base_dir`
# (.../WDC/jaccard/<split>), so the path is built from there rather than from
# .../WDC/<split>. Change the split folder / file name to inspect another file.
file_path = os.path.join(
    base_dir,
    "80pair",
    "wdcproducts80cc20rnd000un_train_large_ditto_80.jsonl",
)

df = jsonl_to_dataframe(file_path)

if df is not None:
    print(df.head())
    # DataFrame.info() prints its summary itself and returns None, so wrapping
    # it in print() only added a stray "None" line to the output.
    df.info()


In [None]:
 # Save a five-row sample for manual inspection. jsonl_to_dataframe returns
 # None when loading failed, which would raise AttributeError on .head().
 if df is not None:
     df.head(5).to_csv("sample.csv")
 else:
     print("No DataFrame loaded; sample.csv was not written.")

# Preview of the processed data





In [None]:
def preview_converted_data(file_path, num_samples=5):
    """Print the first ``num_samples`` records of a converted JSONL file.

    For each record the left text, the right text and the label are printed;
    'N/A' is shown for any of those keys that is absent from the record.

    Args:
        file_path: Path to the JSONL file to preview.
        num_samples: Maximum number of records to print.
    """
    separator = "-" * 80
    print(f"\nPreviewing {num_samples} entries from converted data: {file_path}")
    print(separator)

    with jsonlines.open(file_path) as reader:
        for index, entry in enumerate(reader):
            if index >= num_samples:
                break

            print(f"Entry #{index + 1}:")
            print(f"Left Product: {entry.get('text_left', 'N/A')}")
            print(f"Right Product: {entry.get('text_right', 'N/A')}")
            print(f"Match: {entry.get('label', 'N/A')}")
            print("-" * 40)
    print(separator)

In [None]:
# Preview the converted training file of the first configured split.
first_split = list(splits.keys())[0]
converted_suffix = "_ditto_" + splits[first_split] + ".jsonl"
sample_folder = os.path.join(base_dir, first_split)  # Use first split folder
sample_files = [
    name for name in os.listdir(sample_folder)
    if name.endswith(converted_suffix) and "train" in name
]
if sample_files:
    preview_converted_data(os.path.join(sample_folder, sample_files[0]))