Jaccard Similarity, String Similarity, Weighted Ensemble Similarity

In [None]:
# Colab setup: install the third-party packages imported by the cells below.
!pip install pandas jsonlines

In [None]:
# Mount Google Drive at /content/drive; every data path below lives under it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import zipfile
import os

# Extract each split archive into a sibling folder named after the archive.
base_path = "/content/drive/MyDrive/cs774/WDC"
archive_names = ("20pair.zip", "50pair.zip", "80pair.zip")

for zip_name in archive_names:
    extract_dir = os.path.join(base_path, zip_name.replace(".zip", ""))
    zip_path = os.path.join(base_path, zip_name)

    print(f"Unzipping {zip_name} to {extract_dir}...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)
    print(f"Done extracting to: {extract_dir}")

# Preview of Pre-processed data

In [None]:
def explore_data_fields(file_path, num_samples=10):
    """Print and return the field names found in the first records of a file.

    Args:
        file_path: Path to a gzip-compressed file holding one JSON object per line.
        num_samples: Maximum number of records to read from the start of the file.

    Returns:
        The set of keys seen across the sampled records (empty if none were read).
    """
    # Imported locally on purpose: this cell sits above the notebook's
    # gzip/json import cell, so a fresh top-to-bottom run would otherwise
    # fail with a NameError inside this function.
    import gzip
    import json

    print(f"\nExploring fields in: {file_path}")
    print("-" * 80)

    all_fields = set()
    samples = []

    # Read a few records and collect all field names
    with gzip.open(file_path, 'rt', encoding='utf-8') as read:
        for line in read:
            if len(samples) >= num_samples:
                break
            if not line.strip():
                # A blank line is not a record; json.loads would raise on it.
                continue

            entry = json.loads(line)
            samples.append(entry)
            all_fields.update(entry.keys())

    # Print all discovered fields
    print(f"Discovered {len(all_fields)} fields:")
    for field in sorted(all_fields):
        print(f"- {field}")

    # Print a complete sample record
    if samples:
        print("\nSample record (complete):")
        print(json.dumps(samples[0], indent=2))

    print("-" * 80)
    return all_fields

# Preview one training file from the first split. `base_path` comes from the
# unzip cell above; the `base_dir` / `splits` configuration is only defined in
# a later cell, so reading it here breaks a fresh top-to-bottom run.
sample_folder = os.path.join(base_path, "20pair")
sample_files = [f for f in os.listdir(sample_folder) if f.endswith(".json.gz") and "train" in f]
if sample_files:
    all_fields = explore_data_fields(os.path.join(sample_folder, sample_files[0]))


Exploring fields in: /content/drive/MyDrive/cs774/WDC/20pair/wdcproducts20cc80rnd000un_train_small.json.gz
--------------------------------------------------------------------------------
Discovered 17 fields:
- brand_left
- brand_right
- cluster_id_left
- cluster_id_right
- description_left
- description_right
- id_left
- id_right
- is_hard_negative
- label
- pair_id
- priceCurrency_left
- priceCurrency_right
- price_left
- price_right
- title_left
- title_right

Sample record (complete):
{
  "id_left": 70706895,
  "brand_left": null,
  "title_left": "Western Digital Blue SSD 3D 2TB 2.5 (WDS200T2B0A)",
  "description_left": null,
  "price_left": null,
  "priceCurrency_left": null,
  "cluster_id_left": 1455368,
  "id_right": 52586114,
  "brand_right": null,
  "title_right": "Western Digital Blue PC 2.5\\\" 250 GB Serial ATA III",
  "description_right": "250 GB, 2.5\\\", SATA 6Gb/s, 540/500 MB/s",
  "price_right": "62.436001",
  "priceCurrency_right": "EUR",
  "cluster_id_right": 48324

In [None]:
import gzip
import json
import jsonlines
import os
import pandas as pd

# Root folder on Drive that holds the extracted split folders.
base_dir = "/content/drive/MyDrive/cs774/WDC"

# Split folder name -> tag appended to the names of its converted files.
splits = {f"{pct}pair": pct for pct in ("20", "50", "80")}

# Load exchange rates from CSV into a dictionary
def load_exchange_rates():
    """Build a currency -> conversion-factor table from the first row of the rates CSV.

    Reads ``<base_dir>/exchange_rates/xrate_april_2025.csv`` and, for every
    column other than ``Date``, stores ``column_value / usd_value`` under the
    whitespace-stripped column name. Columns whose value is missing or not
    numeric are skipped. ``USD`` is always mapped to 1.0.

    NOTE(review): ``serialize_product`` multiplies a price by this factor to
    get USD. That is only correct if the CSV stores the value of one unit of
    each currency in a common base; if it stores units of currency per base
    unit, the ratio has to be inverted -- confirm against the CSV.

    Returns:
        dict mapping currency code to conversion factor; empty when the CSV
        has no data rows.
    """
    exchange_rates_path = os.path.join(base_dir, "exchange_rates", "xrate_april_2025.csv")
    rates_df = pd.read_csv(exchange_rates_path)
    rates_dict = {}

    if rates_df.empty:
        return rates_dict

    rate_row = rates_df.iloc[0]

    # Locate the USD column by its stripped name so the lookup works whether or
    # not the header cells carry padding, and resolve it once instead of once
    # per currency.
    usd_column = next((col for col in rates_df.columns if str(col).strip() == 'USD'), None)
    try:
        usd_rate = 1.0 if usd_column is None else float(rate_row[usd_column])
    except (ValueError, TypeError):
        usd_rate = float('nan')

    # An unusable USD rate (missing value or zero) cannot normalize anything;
    # in that case only the USD identity entry is returned.
    if not pd.isna(usd_rate) and usd_rate != 0:
        for column in rates_df.columns:
            code = str(column).strip()
            if not code or code == 'Date':
                continue
            try:
                currency_rate = float(rate_row[column])
            except (ValueError, TypeError):
                continue
            if pd.isna(currency_rate):
                # Empty cells would otherwise end up as NaN conversion factors.
                continue
            rates_dict[code] = currency_rate / usd_rate

    rates_dict['USD'] = 1.0
    return rates_dict

# Conversion table built once when this cell runs; serialize_product reads it for every record.
EXCHANGE_RATES = load_exchange_rates()
# Relative half-width of the price_range band emitted around the normalized price.
PRICE_RANGE_TOLERANCE = 0.10  # 10% tolerance

# Format product data with price normalization and range matching
def serialize_product(entry, side):
    """Serialize one side of a product pair into a ``field: value || ...`` string.

    Args:
        entry: Raw pair record (dict) whose keys are suffixed ``_left`` / ``_right``.
        side: ``"left"`` or ``"right"``; selects which suffixed keys are read.

    Returns:
        The serialized fields joined with ``" || "``. Missing, empty and
        ``"null"`` values are omitted.

    The raw ``price`` and ``priceCurrency`` are emitted (once) whenever both
    are present. ``price_usd`` and ``price_range`` are added only when the
    price parses as a number and the currency has an entry in
    ``EXCHANGE_RATES``; an unknown currency is no longer treated as USD.
    """
    # NOTE(review): cluster_id is part of the serialized text. In the converted
    # sample printed in this notebook the two cluster ids are equal exactly on
    # the label-1 rows, so this may leak the label to a matcher -- confirm intent.
    fields = ['title', 'brand', 'description', 'cluster_id']
    parts = []

    # Handle price normalization and range matching
    price = entry.get(f"price_{side}")
    currency = entry.get(f"priceCurrency_{side}")

    if price and price != "null" and currency and currency != "null":
        # Emitted up front so a failed conversion cannot append them a second time.
        parts.append(f"price: {price}")
        parts.append(f"priceCurrency: {currency}")

        try:
            price_value = float(price)
            conversion_rate = EXCHANGE_RATES.get(currency)
        except (ValueError, TypeError):
            price_value = None
            conversion_rate = None

        if price_value is not None and conversion_rate is not None:
            normalized_price = price_value * conversion_rate

            lower_bound = normalized_price * (1 - PRICE_RANGE_TOLERANCE)
            upper_bound = normalized_price * (1 + PRICE_RANGE_TOLERANCE)

            parts.append(f"price_usd: {normalized_price:.2f}")
            parts.append(f"price_range: {lower_bound:.2f}-{upper_bound:.2f} USD")

    # Add other product fields
    for field in fields:
        val = entry.get(f"{field}_{side}")
        if val and val != "null":
            parts.append(f"{field}: {val}")

    return " || ".join(parts)

# Main processing loop: convert files across all data splits
for folder_name, tag in splits.items():
    folder_path = os.path.join(base_dir, folder_name)

    # Only the train / validation archives of a split are converted.
    source_names = [
        name for name in os.listdir(folder_path)
        if name.endswith(".json.gz") and ("train" in name or "valid" in name)
    ]

    for fname in source_names:
        output_name = fname.replace(".json.gz", f"_ditto_{tag}.jsonl")
        input_path = os.path.join(folder_path, fname)
        output_path = os.path.join(folder_path, output_name)

        print(f"Converting {fname} → {output_name}...")
        with gzip.open(input_path, 'rt', encoding='utf-8') as read, jsonlines.open(output_path, mode='w') as writer:
            for raw in map(json.loads, read):
                # One output record per input pair: both serialized sides, the
                # label as a string, and the cluster ids kept as separate columns.
                record = {
                    "text_left": serialize_product(raw, "left"),
                    "text_right": serialize_product(raw, "right"),
                    "label": str(raw["label"]),
                    "cluster_id_left": raw.get("cluster_id_left", ""),
                    "cluster_id_right": raw.get("cluster_id_right", "")
                }
                writer.write(record)
        print(f"Saved: {output_path}")

### Model

In [None]:
# prompt: Convert .jsonl into dataframe

import pandas as pd
import jsonlines

def jsonl_to_dataframe(file_path):
    """Load a JSON-lines file into a pandas DataFrame.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        A DataFrame with one row per JSON object, or None when reading or
        building the frame raises (the error message is printed).
    """
    try:
        with jsonlines.open(file_path) as reader:
            records = [obj for obj in reader]
        frame = pd.DataFrame(records)
    except Exception as e:
        print(f"An error occurred: {e}")
        frame = None
    return frame

# Example usage: load one converted training file. Point `file_path` at any
# *_ditto_*.jsonl file written by the conversion cell.
file_path = "/content/drive/MyDrive/cs774/WDC/80pair/wdcproducts80cc20rnd000un_train_large_ditto_80.jsonl"

df = jsonl_to_dataframe(file_path)

if df is not None:
    print(df.head())
    # DataFrame.info() prints its own report and returns None, so wrapping it
    # in print() appended a stray "None" line to the output.
    df.info()


                                           text_left  \
0  price: 119 || priceCurrency: EUR || price_usd:...   
1  price: 12.48 || priceCurrency: GBP || price_us...   
2  price: 296.39 || priceCurrency: EUR || price_u...   
3  price: 69.90 || priceCurrency: EUR || price_us...   
4  price: 4.0099E2 || priceCurrency: EUR || price...   

                                          text_right label  cluster_id_left  \
0  price: 545.4008 || priceCurrency: RON || price...     0           156996   
1  price: 24 || priceCurrency: EUR || price_usd: ...     1           373003   
2  price: 234.99 || priceCurrency: CAD || price_u...     1             9046   
3  price: 69.00 || priceCurrency: EUR || price_us...     1           643961   
4  price: 25.95 || priceCurrency: EUR || price_us...     0            56526   

   cluster_id_right  
0            549556  
1            373003  
2              9046  
3            643961  
4            435008  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19835 

# Preview of the processed data





In [None]:
def preview_converted_data(file_path, num_samples=5):
    """Print the first ``num_samples`` records of a converted JSONL file.

    For each record the left text, right text and label are shown; a key that
    is absent from a record is printed as ``N/A``.
    """
    divider = "-" * 80

    print(f"\nPreviewing {num_samples} entries from converted data: {file_path}")
    print(divider)

    with jsonlines.open(file_path) as reader:
        for index, entry in enumerate(reader):
            if index >= num_samples:
                break

            print(f"Entry #{index+1}:")
            print(f"Left Product: {entry.get('text_left', 'N/A')}")
            print(f"Right Product: {entry.get('text_right', 'N/A')}")
            print(f"Match: {entry.get('label', 'N/A')}")
            print("-" * 40)

    print(divider)

In [None]:
# Preview a converted training file from the first configured split.
first_folder, first_tag = list(splits.items())[0]
sample_folder = os.path.join(base_dir, first_folder)
converted_suffix = "_ditto_" + first_tag + ".jsonl"

sample_files = [
    f for f in os.listdir(sample_folder)
    if f.endswith(converted_suffix) and "train" in f
]
if sample_files:
    preview_converted_data(os.path.join(sample_folder, sample_files[0]))


Previewing 5 entries from converted data: /content/drive/MyDrive/cs774/WDC/20pair/wdcproducts20cc80rnd000un_train_small_ditto_20.jsonl
--------------------------------------------------------------------------------
Entry #1:
Left Product: title: Western Digital Blue SSD 3D 2TB 2.5 (WDS200T2B0A) || cluster_id: 1455368
Right Product: price: 62.436001 || priceCurrency: EUR || price_usd: 62.44 || price_range: 56.19-68.68 USD || title: Western Digital Blue PC 2.5\" 250 GB Serial ATA III || description: 250 GB, 2.5\", SATA 6Gb/s, 540/500 MB/s || cluster_id: 483248
Match: 0
----------------------------------------
Entry #2:
Left Product: price: 39.99 || priceCurrency: USD || price_usd: 39.99 || price_range: 35.99-43.99 USD || title: Compatible Canon 120 Toner Cartridge || brand: Canon || description: Get your printing right the first time and every time. This eco-friendly cartridge is compatible with the Canon 120. You will get first-rate print quality with crisp vibrant output from the fir

# **Preliminary Rule-Based Matching**

In [None]:
from difflib import SequenceMatcher
import re
from sklearn.metrics import classification_report, confusion_matrix

def string_similarity(a, b):
    """Case-insensitive difflib ratio of two values; 0 when either one is empty or None."""
    if a and b:
        left = str(a).lower()
        right = str(b).lower()
        return SequenceMatcher(None, left, right).ratio()
    return 0

def jaccard_similarity(a, b):
    """Jaccard overlap of the lower-cased word-token sets of two values.

    Returns 0 when either value is empty/None or when neither contains a token.
    """
    if not (a and b):
        return 0

    word_pattern = r'\w+'
    a_tokens = set(re.findall(word_pattern, str(a).lower()))
    b_tokens = set(re.findall(word_pattern, str(b).lower()))

    combined = a_tokens | b_tokens
    if len(combined) > 0:
        return len(a_tokens & b_tokens) / len(combined)
    return 0

def evaluate_similarity_metrics(file_path):
    """Evaluate different similarity metrics against provided labels.

    Reads a gzipped JSON-lines file of labelled product pairs, scores each
    pair on title, brand, description and a weighted combination, then sweeps
    thresholds 0.1-0.9 and keeps, per metric, the threshold with the highest
    F1 for the match class.

    Args:
        file_path: Path to the ``.json.gz`` file of labelled pairs.

    Returns:
        dict mapping metric name to ``{'threshold', 'f1', 'precision',
        'recall'}``. When no threshold yields a positive F1 (or the file has
        no records) all four values are 0.
    """
    # Define weights for combined score
    weights = {
        'title': 0.5,
        'brand': 0.3,
        'desc': 0.2
    }
    thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

    true_labels = []
    title_sims = []
    brand_sims = []
    desc_sims = []
    combined_sims = []  # Weighted combination

    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            raw = json.loads(line)

            true_labels.append(int(raw.get("label", 0)))

            # Calculate similarity scores
            title_sim = string_similarity(raw.get("title_left"), raw.get("title_right"))
            brand_sim = string_similarity(raw.get("brand_left"), raw.get("brand_right"))
            desc_sim = jaccard_similarity(raw.get("description_left"), raw.get("description_right"))

            title_sims.append(title_sim)
            brand_sims.append(brand_sim)
            desc_sims.append(desc_sim)
            combined_sims.append(weights['title'] * title_sim +
                                 weights['brand'] * brand_sim +
                                 weights['desc'] * desc_sim)

    metrics = {
        'Title Similarity': title_sims,
        'Brand Similarity': brand_sims,
        'Description Similarity': desc_sims,
        'Combined Similarity': combined_sims
    }

    results = {}

    for name, scores in metrics.items():
        best_f1 = 0
        best_threshold = 0
        best_report = None

        # With no records there is nothing to score, so the sweep is skipped.
        for threshold in (thresholds if true_labels else ()):
            predictions = [1 if score >= threshold else 0 for score in scores]
            report = classification_report(true_labels, predictions, output_dict=True, zero_division=0)
            # The '1' entry is absent when neither labels nor predictions contain a match.
            f1 = report['1']['f1-score'] if '1' in report else 0

            if f1 > best_f1:
                best_f1 = f1
                best_threshold = threshold
                best_report = report

        # best_report stays None when no threshold produced a positive F1;
        # report zeros in that case instead of failing on None['1'].
        if best_report is not None:
            precision = best_report['1']['precision']
            recall = best_report['1']['recall']
        else:
            precision = 0
            recall = 0

        results[name] = {
            'threshold': best_threshold,
            'f1': best_f1,
            'precision': precision,
            'recall': recall
        }

    return results

# Run evaluation on validation data
for folder_name, tag in splits.items():
    folder_path = os.path.join(base_dir, folder_name)
    validation_files = [
        f for f in os.listdir(folder_path)
        if "valid" in f and f.endswith(".json.gz")
    ]

    # Nothing to report for a split without validation archives.
    if not validation_files:
        continue

    print(f"\nEvaluating similarity metrics on {folder_name} split")
    print("-" * 60)

    for val_file in validation_files:
        file_path = os.path.join(folder_path, val_file)
        print(f"File: {val_file}")

        results = evaluate_similarity_metrics(file_path)

        # Print results table: one fixed-width row per metric.
        print(f"{'Metric':<25} {'Threshold':<10} {'F1':<10} {'Precision':<10} {'Recall':<10}")
        print("-" * 65)
        for metric, stats in results.items():
            print(f"{metric:<25} {stats['threshold']:<10.2f} {stats['f1']:<10.4f} {stats['precision']:<10.4f} {stats['recall']:<10.4f}")

        print()


Evaluating similarity metrics on 20pair split
------------------------------------------------------------
File: wdcproducts20cc80rnd000un_valid_large.json.gz
Metric                    Threshold  F1         Precision  Recall    
-----------------------------------------------------------------
Title Similarity          0.40       0.3133     0.1959     0.7820    
Brand Similarity          0.50       0.1441     0.1850     0.1180    
Description Similarity    0.10       0.2651     0.2388     0.2980    
Combined Similarity       0.20       0.3079     0.1882     0.8460    

File: wdcproducts20cc80rnd000un_valid_medium.json.gz
Metric                    Threshold  F1         Precision  Recall    
-----------------------------------------------------------------
Title Similarity          0.40       0.3752     0.2468     0.7820    
Brand Similarity          0.50       0.1553     0.2269     0.1180    
Description Similarity    0.10       0.2998     0.3016     0.2980    
Combined Similarity     

# Rule-Based Matching Analysis

Based on the similarity metrics evaluation across different data splits, several key patterns emerge. Across all datasets, title similarity and combined similarity consistently outperform brand and description similarity in terms of F1 score. Title similarity achieves F1 scores ranging from 0.28 to 0.50 at thresholds of 0.30-0.40, while the combined similarity approach shows comparable or slightly better performance at a lower threshold of 0.20. This suggests that product titles contain the most discriminative information for matching products.


There's a clear precision-recall tradeoff visible in the results. Combined similarity consistently achieves the highest recall (82-84%) but at the cost of lower precision (17-36%), indicating it captures most matches but includes many false positives. Meanwhile, description similarity often has better precision than title similarity, especially in the small dataset splits, though its recall is significantly lower (26-32%). Interestingly, performance is consistently better on "small" dataset splits compared to "medium" and "large" splits across all metrics, suggesting that smaller datasets may contain more obvious matches or less diverse product descriptions.


Brand similarity shows the weakest overall performance, with low recall (11-12%) indicating that many matching products either have different brand representations or missing brand information. The consistency of optimal thresholds across different splits (0.40 for title, 0.50 for brand, 0.10 for description, and 0.20 for combined similarity) provides a solid foundation for setting rule-based matching parameters.



*   For title similarity (0.40), this moderate threshold balances precision and recall effectively. Titles often contain the core product information but may vary in format, additional descriptors, or word order. A threshold of 0.40 accommodates these variations while still requiring substantial similarity, capturing meaningful matches without being too restrictive. This threshold consistently delivers strong F1 scores across datasets.
* Brand similarity's higher threshold (0.50) reflects that brands should match more precisely when present. Brands are typically shorter strings with standardized representations, so legitimate matches should show higher similarity. The stricter threshold helps avoid false positives from slightly different brand names that actually represent different manufacturers. Despite this higher threshold, brand similarity still shows lower overall performance, suggesting inconsistent brand representation in the dataset.
* Description similarity's low threshold (0.10) acknowledges the inherent variability in product descriptions. Descriptions often contain the same core information but expressed differently, with varying levels of detail, word choices, and formatting. The 0.10 threshold captures cases where descriptions share some key terminology while accommodating this natural variation. Setting a higher threshold would miss too many legitimate matches given how differently the same product can be described.
* The combined similarity threshold (0.20) leverages the weighted combination of all three metrics, allowing strong performance in one area to compensate for weaker matches in others. This explains why it achieves high recall while maintaining reasonable F1 scores - it can identify matches where only some aspects of the products align strongly.




In [None]:
import os
import gzip
import json
import numpy as np
import pandas as pd
from difflib import SequenceMatcher
import re
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import time
from datetime import datetime

# Similarity helpers are redefined here so this cell can run on its own.
# (string_similarity and jaccard_similarity repeat the definitions from the earlier cell.)

def string_similarity(a, b):
    """Return the difflib similarity ratio of two values, compared case-insensitively.

    Either value being empty or None short-circuits to 0.
    """
    either_missing = not a or not b
    if either_missing:
        return 0
    matcher = SequenceMatcher(None, str(a).lower(), str(b).lower())
    return matcher.ratio()

def jaccard_similarity(a, b):
    """Return |A & B| / |A | B| over the lower-cased word tokens of two values.

    Yields 0 when either value is empty/None or when no tokens are found.
    """
    if not a or not b:
        return 0

    token_sets = [set(re.findall(r'\w+', str(value).lower())) for value in (a, b)]
    a_tokens, b_tokens = token_sets

    shared = len(a_tokens & b_tokens)
    total = len(a_tokens | b_tokens)

    if total > 0:
        return shared / total
    return 0

def tokenize(text):
    """Split a value into lower-cased word tokens; an empty or None value gives []."""
    return re.findall(r'\w+', str(text).lower()) if text else []

def cosine_similarity(a, b):
    """Cosine similarity between the word-count vectors of two values.

    Args:
        a, b: Values to compare; each is tokenized with ``tokenize``.

    Returns:
        The cosine of the angle between the two token-count vectors, or 0 when
        either value is empty/None or produces no tokens.
    """
    if not a or not b:
        return 0

    # Count each token once up front; calling list.count() for every
    # vocabulary word rescans the whole token list each time.
    a_counts = Counter(tokenize(a))
    b_counts = Counter(tokenize(b))

    if not a_counts or not b_counts:
        return 0

    vocabulary = set(a_counts) | set(b_counts)

    # A Counter returns 0 for words it has not seen, which gives the zero
    # entries of each vector without extra checks.
    a_array = np.array([a_counts[word] for word in vocabulary])
    b_array = np.array([b_counts[word] for word in vocabulary])

    dot_product = np.dot(a_array, b_array)
    norm_a = np.linalg.norm(a_array)
    norm_b = np.linalg.norm(b_array)

    if norm_a == 0 or norm_b == 0:
        return 0

    return dot_product / (norm_a * norm_b)

def evaluate_similarity_metrics(file_path):
    """Evaluate different similarity metrics against provided labels.

    Reads a gzipped JSON-lines file of labelled product pairs and scores each
    pair with string, Jaccard and cosine similarity on title, brand and
    description, plus two weighted combinations. For every metric it sweeps
    thresholds 0.10-0.95 (step 0.05) and keeps the one with the highest F1 for
    the match class. Class counts and the match rate are printed.

    Args:
        file_path: Path to the ``.json.gz`` file of labelled pairs.

    Returns:
        A tuple ``(results, true_labels, metrics)``:
        results maps metric name to ``{'threshold', 'f1', 'precision',
        'recall', 'confusion_matrix'}`` (all zeros, with a 2x2 zero matrix,
        when no threshold gives a positive F1 or the file has no records);
        true_labels is the list of integer labels in file order;
        metrics maps metric name to the list of per-pair scores.
    """
    true_labels = []
    title_sims = []
    brand_sims = []
    desc_sims = []
    title_cosine_sims = []
    brand_cosine_sims = []
    desc_cosine_sims = []
    combined_sims = []
    cosine_combined_sims = []

    weights = {
        'title': 0.5,
        'brand': 0.3,
        'desc': 0.2
    }

    total_records = 0
    match_count = 0

    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            raw = json.loads(line)
            total_records += 1

            true_label = int(raw.get("label", 0))
            true_labels.append(true_label)

            if true_label == 1:
                match_count += 1

            # Extract field values
            title_left = raw.get("title_left", "")
            title_right = raw.get("title_right", "")
            brand_left = raw.get("brand_left", "")
            brand_right = raw.get("brand_right", "")
            desc_left = raw.get("description_left", "")
            desc_right = raw.get("description_right", "")

            # Calculate similarities
            title_sim = string_similarity(title_left, title_right)
            brand_sim = string_similarity(brand_left, brand_right)
            desc_sim = jaccard_similarity(desc_left, desc_right)

            title_cosine_sim = cosine_similarity(title_left, title_right)
            brand_cosine_sim = cosine_similarity(brand_left, brand_right)
            desc_cosine_sim = cosine_similarity(desc_left, desc_right)

            # Store similarities
            title_sims.append(title_sim)
            brand_sims.append(brand_sim)
            desc_sims.append(desc_sim)
            title_cosine_sims.append(title_cosine_sim)
            brand_cosine_sims.append(brand_cosine_sim)
            desc_cosine_sims.append(desc_cosine_sim)

            # Compute combined scores
            combined_sims.append(weights['title'] * title_sim +
                                 weights['brand'] * brand_sim +
                                 weights['desc'] * desc_sim)
            cosine_combined_sims.append(weights['title'] * title_cosine_sim +
                                        weights['brand'] * brand_cosine_sim +
                                        weights['desc'] * desc_cosine_sim)

    # Define all metrics to evaluate
    metrics = {
        'Title (String Similarity)': title_sims,
        'Title (Cosine Similarity)': title_cosine_sims,
        'Brand (String Similarity)': brand_sims,
        'Brand (Cosine Similarity)': brand_cosine_sims,
        'Description (Jaccard)': desc_sims,
        'Description (Cosine)': desc_cosine_sims,
        'Combined Original': combined_sims,
        'Combined Cosine': cosine_combined_sims
    }

    results = {}

    # Guard the rate so an empty file reports 0% instead of dividing by zero.
    match_rate = match_count / total_records if total_records else 0.0
    print(f"Total records: {total_records}, Matches: {match_count}, Non-matches: {total_records - match_count}")
    print(f"Match rate: {match_rate:.2%}")
    print("-" * 70)

    # np.arange accumulates float error (e.g. 0.30000000000000004), which would
    # misclassify a score of exactly 0.3; rounding restores the intended grid.
    # With no records there is nothing to score, so the sweep is skipped.
    thresholds = np.round(np.arange(0.1, 1.0, 0.05), 2) if true_labels else []

    for name, scores in metrics.items():
        best_f1 = 0
        best_threshold = 0
        best_report = None
        best_predictions = None

        # Try different thresholds
        for threshold in thresholds:
            predictions = [1 if score >= threshold else 0 for score in scores]
            report = classification_report(true_labels, predictions, output_dict=True, zero_division=0)

            # The '1' entry is absent when neither labels nor predictions contain a match.
            f1 = report['1']['f1-score'] if '1' in report else 0

            if f1 > best_f1:
                best_f1 = f1
                best_threshold = threshold
                best_report = report
                best_predictions = predictions

        if best_report is not None:
            results[name] = {
                'threshold': best_threshold,
                'f1': best_f1,
                'precision': best_report['1']['precision'],
                'recall': best_report['1']['recall'],
                # Built only for the winning threshold; labels=[0, 1] keeps the
                # matrix 2x2 even if one class never occurs.
                'confusion_matrix': confusion_matrix(true_labels, best_predictions, labels=[0, 1])
            }
        else:
            results[name] = {
                'threshold': best_threshold,
                'f1': 0,
                'precision': 0,
                'recall': 0,
                'confusion_matrix': np.zeros((2, 2))
            }

    return results, true_labels, metrics

# Add any additional helper functions from the artifacts
# ...

# Main function to execute
def main():
    """Evaluate every validation file of every split and save tables and plots.

    For each split folder under the Drive data directory, every
    ``*valid*.json.gz`` file is passed to ``evaluate_similarity_metrics``. The
    per-file metric table is printed and written to CSV together with an F1
    bar chart. Per-split and cross-split summaries (CSV files and one grouped
    bar chart) are written to a timestamped ``similarity_evaluation_*`` folder.
    """
    # Set base directory from your data processing code
    base_dir = "/content/drive/MyDrive/cs774/WDC"

    # Create output directory
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_base_dir = os.path.join(base_dir, f"similarity_evaluation_{timestamp}")
    os.makedirs(output_base_dir, exist_ok=True)

    # Define splits to evaluate
    splits = {
        "20pair": "20",
        "50pair": "50",
        "80pair": "80"
    }
    archive_suffix = ".json.gz"

    # Store all results for comparison
    all_split_results = {}

    # Process each split
    for folder_name in splits:
        print(f"\n\n{'='*80}")
        print(f"Processing {folder_name} split")
        print(f"{'='*80}")

        folder_path = os.path.join(base_dir, folder_name)
        output_dir = os.path.join(output_base_dir, folder_name)
        os.makedirs(output_dir, exist_ok=True)

        # Find all validation files for this split
        validation_files = [f for f in os.listdir(folder_path)
                           if f.endswith(archive_suffix) and "valid" in f]

        split_results = {}

        for val_file in validation_files:
            file_path = os.path.join(folder_path, val_file)
            # Strip the whole ".json.gz" suffix; os.path.splitext would only
            # drop ".gz" and leave ".json" in the directory name.
            file_output_dir = os.path.join(output_dir, val_file[:-len(archive_suffix)])
            os.makedirs(file_output_dir, exist_ok=True)

            print(f"\nEvaluating file: {val_file}")
            print("-" * 70)

            # Time the evaluation
            start_time = time.time()

            # Run evaluation (only the per-metric results are used here)
            results, _, _ = evaluate_similarity_metrics(file_path)

            # Create results DataFrame
            metrics_df = pd.DataFrame([
                {
                    'Metric': metric,
                    'Threshold': stats['threshold'],
                    'F1': stats['f1'],
                    'Precision': stats['precision'],
                    'Recall': stats['recall']
                }
                for metric, stats in results.items()
            ])

            # Sort by F1 score
            metrics_df = metrics_df.sort_values('F1', ascending=False)

            # Print results
            print(metrics_df.to_string(index=False))

            # Save results to CSV
            metrics_df.to_csv(os.path.join(file_output_dir, 'metrics_comparison.csv'), index=False)

            # Generate plots
            # F1 comparison
            plt.figure(figsize=(12, 6))
            ax = sns.barplot(x='Metric', y='F1', data=metrics_df)
            plt.title(f'F1 Score Comparison - {val_file}')
            plt.xticks(rotation=45, ha='right')
            plt.tight_layout()
            plt.grid(axis='y', linestyle='--', alpha=0.7)

            # Add value labels
            for bar in ax.patches:
                ax.text(
                    bar.get_x() + bar.get_width()/2.,
                    bar.get_height() + 0.01,
                    f'{bar.get_height():.3f}',
                    ha='center',
                    fontsize=9
                )

            plt.savefig(os.path.join(file_output_dir, 'f1_comparison.png'))
            plt.close()

            # Log execution time
            execution_time = time.time() - start_time
            print(f"Evaluation completed in {execution_time:.2f} seconds")

            # Store results for this file
            split_results[val_file] = metrics_df

        # Combine results for this split
        if split_results:
            # Tag each table with its file and concatenate once, rather than
            # growing a frame inside the loop from an empty DataFrame.
            combined_df = pd.concat(
                [df.assign(File=file_name) for file_name, df in split_results.items()]
            )

            # Save combined results for this split
            combined_df.to_csv(os.path.join(output_dir, 'all_files_comparison.csv'), index=False)

            # Create aggregate metrics for this split
            agg_df = combined_df.groupby('Metric').agg({
                'F1': 'mean',
                'Precision': 'mean',
                'Recall': 'mean',
                'Threshold': 'mean'
            }).reset_index()

            agg_df = agg_df.sort_values('F1', ascending=False)
            agg_df.to_csv(os.path.join(output_dir, 'aggregate_metrics.csv'), index=False)

            # Store results for overall comparison
            all_split_results[folder_name] = agg_df

    # Create overall comparison across all splits
    if all_split_results:
        print("\n\nGenerating comparison across all splits...")
        print("=" * 80)

        # Create a comparison table: one row per (split, metric)
        comparison_columns = ['Split', 'Metric', 'F1', 'Precision', 'Recall', 'Threshold']
        comparison_df = pd.concat(
            [df.assign(Split=split_name)[comparison_columns]
             for split_name, df in all_split_results.items()],
            ignore_index=True
        )
        comparison_df.to_csv(os.path.join(output_base_dir, 'all_splits_comparison.csv'), index=False)

        # Create a pivot table for easier comparison
        pivot_df = comparison_df.pivot_table(
            index='Metric',
            columns='Split',
            values=['F1', 'Precision', 'Recall'],
            aggfunc='mean'
        )

        pivot_df.to_csv(os.path.join(output_base_dir, 'metrics_by_split.csv'))

        # Plot comparison of F1 scores across splits.
        # DataFrame.plot opens its own figure, so no plt.figure() call is made
        # first: the extra figure stayed empty and was never closed.
        f1_comparison = comparison_df.pivot(index='Metric', columns='Split', values='F1')

        f1_comparison.plot(kind='bar', figsize=(14, 8))
        plt.title('F1 Score Comparison Across Different Splits')
        plt.xlabel('Metric')
        plt.ylabel('F1 Score')
        plt.xticks(rotation=45, ha='right')
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Split')
        plt.tight_layout()

        plt.savefig(os.path.join(output_base_dir, 'f1_comparison_across_splits.png'))
        plt.close()

        print(f"All evaluation results saved to: {output_base_dir}")

# Entry point: run the full evaluation when the module name is "__main__".
if __name__ == "__main__":
    main()



Processing 20pair split

Evaluating file: wdcproducts20cc80rnd000un_valid_large.json.gz
----------------------------------------------------------------------
Total records: 4500, Matches: 500, Non-matches: 4000
Match rate: 11.11%
----------------------------------------------------------------------
                   Metric  Threshold       F1  Precision  Recall
Title (Cosine Similarity)       0.55 0.424242   0.382637   0.476
          Combined Cosine       0.30 0.400564   0.309368   0.568
        Combined Original       0.25 0.318813   0.209566   0.666
Title (String Similarity)       0.40 0.315491   0.197864   0.778
    Description (Jaccard)       0.10 0.265125   0.238782   0.298
     Description (Cosine)       0.25 0.253943   0.209635   0.322
Brand (String Similarity)       0.45 0.143552   0.183230   0.118
Brand (Cosine Similarity)       0.50 0.136986   0.181518   0.110
Evaluation completed in 7.01 seconds

Evaluating file: wdcproducts20cc80rnd000un_valid_medium.json.gz
---------

<Figure size 1400x800 with 0 Axes>