In [1]:
import logging
import pandas as pd
import numpy as np
from PyDI.io import load_xml, load_csv
from PyDI.fusion.engine import build_record_groups_from_correspondences
from PyDI.fusion import (
    DataFusionStrategy,
    DataFusionEngine,
    DataFusionEvaluator,
    longest_string,
    shortest_string,
    prefer_higher_trust,
    average,
    most_recent,
    union,
    tokenized_match,
    most_complete,
    intersection_k_sources,
)
from pathlib import Path

# Emit INFO-and-above log records, formatted as "[LEVEL] message".
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
# Do not cap the number of columns pandas shows when rendering a DataFrame.
pd.set_option('display.max_columns', None)

### Define file paths and load files

In [2]:
# Directories are relative to the notebook's working directory.
INPUT_DIR = Path("MapForce/Output")        # folder containing the per-source XML files
CORR_DIR = Path("Identity_Resolution")     # folder containing the correspondence CSV files
# One XML file per source dataset.
FBRef_PATH = INPUT_DIR / "FBRef.xml"
TM_PATH = INPUT_DIR / "TM.xml"
FM_PATH = INPUT_DIR / "FM.xml"

In [3]:
# Load the three source datasets.
df_FBRef = load_xml(FBRef_PATH)
df_TM = load_xml(TM_PATH)
df_FM = load_xml(FM_PATH)

# Load the pairwise correspondences; both files pair a source with TM.
df_FBRef_2_TM = load_csv(CORR_DIR / "correspondences_FBRef&TM.csv")
df_FM_2_TM = load_csv(CORR_DIR / "correspondences_FM&TM.csv")

# Gold standard used by the evaluation cells below.
# NOTE(review): this path is a bare relative string, not built from INPUT_DIR/CORR_DIR.
df_test_set = load_xml("fusion_evaluation.xml")

### Group Size Distribution

In [4]:
# Analyse record group sizes created by the correspondences.
# The resulting groups are reused by the consistency computation further down.
record_groups = build_record_groups_from_correspondences(
    [df_FBRef, df_TM, df_FM],
    [df_FBRef_2_TM, df_FM_2_TM],
    id_column='pydi_id',
)

[INFO] Created 223283 record groups from 43052 correspondences
[INFO] Group Size Distribution of 223283 clusters:
[INFO] 	Cluster Size	| Frequency	| Percentage
[INFO] 	──────────────────────────────────────────────────
[INFO] 		2	|	32384	|	14.50%
[INFO] 		3	|	5334	|	2.39%


### Density before Fusion

In [5]:
def compute_density_manual(df):
    """Return the fraction of non-null cells in ``df``.

    Args:
        df: pandas DataFrame to measure.

    Returns:
        float in [0, 1]: number of non-null cells divided by rows * columns.
        A frame with no cells (zero rows or zero columns) has density 0.0;
        previously this case divided 0 by 0 and produced NaN together with
        a numpy RuntimeWarning.
    """
    # Total number of cells
    total = df.shape[0] * df.shape[1]
    if total == 0:
        return 0.0

    # Count non-null values (per column, then across columns)
    non_null = df.notnull().sum().sum()
    return float(non_null / total)


# Report the density of each source dataset, one line per source.
for source_label, source_frame in (
    ("FBRef", df_FBRef),
    ("TM", df_TM),
    ("FM", df_FM),
):
    print(f"{source_label} Density:", compute_density_manual(source_frame))

FBRef Density: 0.9999586037885813
TM Density: 0.9827495443774011
FM Density: 1.0


In [6]:
def compute_combined_density_before_fusion(dfs):
    """Stack several tables under a shared schema and measure the result's density.

    Args:
        dfs: list of pandas DataFrames (e.g. [df_FBRef, df_TM, df_FM]).

    Returns:
        (combined_df, density) where ``combined_df`` holds every input row under
        the sorted union of all input columns (columns absent from a source are
        filled with NaN) and ``density`` is ``compute_density_manual(combined_df)``.
    """
    # Shared schema: every column name seen in any input, in sorted order.
    schema = sorted(set().union(*(frame.columns for frame in dfs)))

    # Bring each input onto the shared schema, then append the rows of all
    # inputs under a fresh 0..n-1 index.
    conformed = []
    for frame in dfs:
        conformed.append(frame.reindex(columns=schema))
    combined_df = pd.concat(conformed, ignore_index=True)

    return combined_df, compute_density_manual(combined_df)

In [7]:
# Stack all three sources under the union of their columns and measure the
# density of that stacked table; the table itself is kept for the per-column
# breakdown in the next cell.
combined_df_before, combined_density_before = compute_combined_density_before_fusion(
    [df_FBRef, df_TM, df_FM]
)

print("Overall combined density BEFORE fusion:", combined_density_before)

Overall combined density BEFORE fusion: 0.5476812188385111


In [8]:
# Per-column density of the stacked table: non-null count divided by total row count.
column_densities_before = combined_df_before.notnull().sum() / len(combined_df_before)
# Print in ascending order so the sparsest columns come first.
print(column_densities_before.sort_values())

penalty_save_percentage_3s       0.028325
assists_3s                       0.028344
goals_per_shot_3s                0.028344
clean_sheet_percentage_3s        0.028344
matches_played_3s                0.028344
minutes_played_3s                0.028344
pass_completion_percentage_3s    0.028344
goals_3s                         0.028344
save_percentage_3s               0.028344
penalty_goals_3s                 0.028344
tackles_won_percentage_3s        0.028344
dribbling                        0.710928
handling_gk                      0.710928
finishing                        0.710928
career_appearances               0.710928
career_goals                     0.710928
tackling                         0.710928
reflexes_gk                      0.710928
strength                         0.710928
pace                             0.710928
penalty                          0.710928
passing                          0.710928
left_right_foot                  0.933884
date_of_birth                    0

### Consistency before Fusion

In [9]:
def normalize_value(v):
    """Return list/tuple/ndarray values as a tuple; return anything else unchanged."""
    return tuple(v) if isinstance(v, (list, tuple, np.ndarray)) else v


def compute_consistency_manual(data_frames, record_groups, id_column="pydi_id"):
    """Share of comparable (group, attribute) pairs whose non-null values all agree.

    Args:
        data_frames: accepted for call-site compatibility; not read by this function.
        record_groups: iterable of objects exposing ``.records``, a list of
            pandas Series (one per member record).
        id_column: attribute excluded from the comparison.

    Returns:
        float: consistent pairs / compared pairs, or 1.0 when nothing was comparable.

    Only groups with at least two records are considered, and only attributes
    present in every record of the group. An attribute is compared when at least
    two records carry a non-null value for it. Sequence values count as null when
    all of their elements are null (this includes empty sequences) and are
    compared as tuples, so element order matters.
    """
    sequence_types = (list, tuple, np.ndarray)

    def _is_missing(value):
        # Sequences are missing only when every element is null.
        if isinstance(value, sequence_types):
            return all(pd.isnull(item) for item in value)
        return pd.isnull(value)

    compared = 0
    agreeing = 0

    for group in record_groups:
        members = group.records
        if len(members) < 2:
            continue

        # Attributes shared by every member, minus the identifier.
        shared = set.intersection(*(set(member.index) for member in members))
        shared -= {id_column}

        for attribute in shared:
            observed = [
                normalize_value(member[attribute])
                for member in members
                if not _is_missing(member[attribute])
            ]

            # A single value (or none) cannot conflict with anything.
            if len(observed) < 2:
                continue

            compared += 1
            reference = observed[0]
            if all(candidate == reference for candidate in observed):
                agreeing += 1

    return agreeing / compared if compared else 1.0

# Consistency across the record groups built from the correspondences.
# Note: compute_consistency_manual never reads its data_frames argument; the
# result depends only on record_groups and id_column.
consistency_before = compute_consistency_manual(
    data_frames=[df_FBRef, df_TM, df_FM],
    record_groups=record_groups,
    id_column="pydi_id"
)

print("Consistency Before Fusion:", consistency_before)

Consistency Before Fusion: 0.11541783148561595


### Baseline Strategy

In [10]:
# Baseline: generic per-type rules (string length, most recent date, union, average).
baseline_strategy = DataFusionStrategy("Baseline_Fusion")

# Trust scores only for id: in this strategy prefer_higher_trust is registered
# for pydi_id alone.
# These assignments mutate the shared source DataFrames in place; later strategy
# cells overwrite them.
# NOTE(review): presumably prefer_higher_trust reads attrs["trust"] — confirm.
df_TM.attrs["trust"] = 3
df_FM.attrs["trust"] = 2
df_FBRef.attrs["trust"] = 1

baseline_strategy.add_attribute_fuser("pydi_id", prefer_higher_trust)

# Text attributes
baseline_strategy.add_attribute_fuser("player_name", shortest_string)
baseline_strategy.add_attribute_fuser("positions_position", longest_string)
baseline_strategy.add_attribute_fuser("nationality", shortest_string)
baseline_strategy.add_attribute_fuser("left_right_foot", longest_string)

# Date attribute
baseline_strategy.add_attribute_fuser("date_of_birth", most_recent)

# Clubs
baseline_strategy.add_attribute_fuser("clubs_club", union)

# Numeric attributes
numeric_attrs = [
    "height_cm", "transfer_value",
    "career_appearances", "career_goals",
    "pace", "finishing", "passing", "dribbling",
    "tackling", "strength",
    "reflexes_gk", "handling_gk", "penalty",
    "matches_played_3s", "minutes_played_3s",
    "goals_3s", "assists_3s",
    "penalty_goals_3s", "tackles_won_percentage_3s",
    "goals_per_shot_3s", "pass_completion_percentage_3s",
    "save_percentage_3s", "clean_sheet_percentage_3s",
    "penalty_save_percentage_3s",
]

# Every numeric attribute is fused with the same rule.
for attr in numeric_attrs:
    baseline_strategy.add_attribute_fuser(attr, average) 


[INFO] Registered fuser for attribute 'pydi_id' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'player_name' using rule 'shortest_string'
[INFO] Registered fuser for attribute 'positions_position' using rule 'longest_string'
[INFO] Registered fuser for attribute 'nationality' using rule 'shortest_string'
[INFO] Registered fuser for attribute 'left_right_foot' using rule 'longest_string'
[INFO] Registered fuser for attribute 'date_of_birth' using rule 'most_recent'
[INFO] Registered fuser for attribute 'clubs_club' using rule 'union'
[INFO] Registered fuser for attribute 'height_cm' using rule 'average'
[INFO] Registered fuser for attribute 'transfer_value' using rule 'average'
[INFO] Registered fuser for attribute 'career_appearances' using rule 'average'
[INFO] Registered fuser for attribute 'career_goals' using rule 'average'
[INFO] Registered fuser for attribute 'pace' using rule 'average'
[INFO] Registered fuser for attribute 'finishing' using rule 'average'

In [11]:
# Run the baseline strategy; debug=True with debug_file set to
# fusion_debug_baseline.jsonl, the same file name the evaluator cell below is
# given as fusion_debug_logs.
baseline_engine = DataFusionEngine(
    strategy = baseline_strategy,
    debug=True,
    debug_file="fusion_debug_baseline.jsonl"
)

# Same datasets and correspondences that were used to build record_groups above.
fused_baseline = baseline_engine.run(
    datasets=[df_FBRef, df_TM, df_FM],
    correspondences=[df_FBRef_2_TM, df_FM_2_TM],
    id_column="pydi_id",
)

[INFO] Fusion debug logging enabled; refer to fusion_debug_baseline.jsonl for detailed traces.
[INFO] Starting data fusion with strategy 'Baseline_Fusion'
[INFO] *    Loading correspondences    *
[INFO] Correspondence ID coverage: matched 80770 of 80770 unique IDs
[INFO] Created 223283 record groups from 43052 correspondences
[INFO] Group Size Distribution of 223283 clusters:
[INFO] 	Cluster Size	| Frequency	| Percentage
[INFO] 	──────────────────────────────────────────────────
[INFO] 		2	|	32384	|	14.50%
[INFO] 		3	|	5334	|	2.39%
[INFO] Attribute Consistencies:
[INFO]     _id: 0.00
[INFO]     assists_3s: 1.00
[INFO]     career_appearances: 1.00
[INFO]     career_goals: 1.00
[INFO]     clean_sheet_percentage_3s: 1.00
[INFO]     clubs_club: 0.02
[INFO]     date_of_birth: 0.97
[INFO]     dribbling: 1.00
[INFO]     finishing: 1.00
[INFO]     goals_3s: 1.00
[INFO]     goals_per_shot_3s: 1.00
[INFO]     handling_gk: 1.00
[INFO]     height_cm: 0.02
[INFO]     left_right_foot: 0.10
[INFO]   

In [12]:
# Evaluate only the attributes present in the gold standard.
eval_columns = df_test_set.columns.tolist()

# Select into a new name instead of overwriting fused_baseline, so the full
# fused table survives this cell (same pattern as fused_final_eval in the
# final evaluation cell).
fused_baseline_eval = fused_baseline[eval_columns]

evaluator = DataFusionEvaluator(baseline_strategy, debug=True, fusion_debug_logs="fusion_debug_baseline.jsonl")
baseline_metrics = evaluator.evaluate(
    fused_df=fused_baseline_eval, fused_id_column="pydi_id", gold_df=df_test_set, gold_id_column="pydi_id"
)
print("\n Baseline Fusion Strategy Metrics:")
print(f"Overall Accuracy: {baseline_metrics['overall_accuracy']:.4f}")


[INFO] Fusion evaluation debug logging enabled; refer to fusion_evaluation_debug.jsonl for mismatch details.
[INFO] Starting fusion evaluation
[INFO] Evaluation complete: 0.344 overall accuracy (62/180)
[INFO] Evaluation mismatches by attribute (debug): 118 total
[INFO] 	Attribute                        |  Errors | Percentage
[INFO] 	───────────────────────────────────────────────────────
[INFO] 	height_cm                        |      30 |     25.42%%
[INFO] 	nationality                      |      30 |     25.42%%
[INFO] 	clubs_club                       |      30 |     25.42%%
[INFO] 	positions_position               |      28 |     23.73%%



 Baseline Fusion Strategy Metrics:
Overall Accuracy: 0.3444


### Strategy 2 

In [13]:
# Strategy 2: replace generic rules with trust-based selection where marked below.
strategy_2 = DataFusionStrategy("Strategy_2")

# Trust order differs from the baseline cell: FBRef (2) now outranks FM (1).
# These assignments mutate the shared source DataFrames in place.
# NOTE(review): presumably prefer_higher_trust reads attrs["trust"] — confirm.
df_TM.attrs["trust"] = 3
df_FM.attrs["trust"] = 1
df_FBRef.attrs["trust"] = 2

strategy_2.add_attribute_fuser("pydi_id", prefer_higher_trust)
# player_name also changes relative to the baseline: shortest_string -> longest_string.
strategy_2.add_attribute_fuser("player_name", longest_string)
strategy_2.add_attribute_fuser("positions_position", prefer_higher_trust)  # switched from longest_string to prefer_higher_trust
strategy_2.add_attribute_fuser("nationality", shortest_string)
strategy_2.add_attribute_fuser("left_right_foot", longest_string)
strategy_2.add_attribute_fuser("date_of_birth", most_recent)
strategy_2.add_attribute_fuser("clubs_club", union)

# Same attribute list as in the baseline cell.
numeric_attrs = [
    "height_cm", "transfer_value",
    "career_appearances", "career_goals",
    "pace", "finishing", "passing", "dribbling",
    "tackling", "strength",
    "reflexes_gk", "handling_gk", "penalty",
    "matches_played_3s", "minutes_played_3s",
    "goals_3s", "assists_3s",
    "penalty_goals_3s", "tackles_won_percentage_3s",
    "goals_per_shot_3s", "pass_completion_percentage_3s",
    "save_percentage_3s", "clean_sheet_percentage_3s",
    "penalty_save_percentage_3s",
]

for attr in numeric_attrs:
    strategy_2.add_attribute_fuser(attr, prefer_higher_trust)  # switched from average to prefer_higher_trust

[INFO] Registered fuser for attribute 'pydi_id' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'player_name' using rule 'longest_string'
[INFO] Registered fuser for attribute 'positions_position' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'nationality' using rule 'shortest_string'
[INFO] Registered fuser for attribute 'left_right_foot' using rule 'longest_string'
[INFO] Registered fuser for attribute 'date_of_birth' using rule 'most_recent'
[INFO] Registered fuser for attribute 'clubs_club' using rule 'union'
[INFO] Registered fuser for attribute 'height_cm' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'transfer_value' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'career_appearances' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'career_goals' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'pace' using rule 'prefer_higher_trust'
[INFO]

In [14]:
# Run Strategy 2; debug=True with debug_file set to fusion_debug_2.jsonl, the
# same file name the evaluator cell below is given as fusion_debug_logs.
engine_2 = DataFusionEngine(
    strategy_2,
    debug=True,
    debug_file="fusion_debug_2.jsonl"
)

# Same inputs as the baseline run; only the strategy (and the trust values set
# in the previous cell) differ.
fused_2 = engine_2.run(
    datasets=[df_FBRef, df_TM, df_FM],
    correspondences=[df_FBRef_2_TM, df_FM_2_TM],
    id_column="pydi_id"
)

[INFO] Fusion debug logging enabled; refer to fusion_debug_2.jsonl for detailed traces.
[INFO] Starting data fusion with strategy 'Strategy_2'
[INFO] *    Loading correspondences    *
[INFO] Correspondence ID coverage: matched 80770 of 80770 unique IDs
[INFO] Created 223283 record groups from 43052 correspondences
[INFO] Group Size Distribution of 223283 clusters:
[INFO] 	Cluster Size	| Frequency	| Percentage
[INFO] 	──────────────────────────────────────────────────
[INFO] 		2	|	32384	|	14.50%
[INFO] 		3	|	5334	|	2.39%
[INFO] Attribute Consistencies:
[INFO]     _id: 0.00
[INFO]     assists_3s: 1.00
[INFO]     career_appearances: 1.00
[INFO]     career_goals: 1.00
[INFO]     clean_sheet_percentage_3s: 1.00
[INFO]     clubs_club: 0.02
[INFO]     date_of_birth: 0.97
[INFO]     dribbling: 1.00
[INFO]     finishing: 1.00
[INFO]     goals_3s: 1.00
[INFO]     goals_per_shot_3s: 1.00
[INFO]     handling_gk: 1.00
[INFO]     height_cm: 0.02
[INFO]     left_right_foot: 0.10
[INFO]     matches_pl

In [15]:
# Evaluate only the attributes present in the gold standard.
eval_columns = df_test_set.columns.tolist()

# Select into a new name instead of overwriting fused_2, so the full fused
# table survives this cell (same pattern as fused_final_eval in the final
# evaluation cell).
fused_2_eval = fused_2[eval_columns]

evaluator = DataFusionEvaluator(strategy_2, debug=True, fusion_debug_logs="fusion_debug_2.jsonl")
# Stored under its own name: the original reused `baseline_metrics`, which
# silently replaced the baseline results with Strategy 2's.
strategy_2_metrics = evaluator.evaluate(
    fused_df=fused_2_eval, fused_id_column="pydi_id", gold_df=df_test_set, gold_id_column="pydi_id"
)
print("\n Fusion Strategy 2 Metrics:")
print(f"Overall Accuracy: {strategy_2_metrics['overall_accuracy']:.4f}")

[INFO] Fusion evaluation debug logging enabled; refer to fusion_evaluation_debug.jsonl for mismatch details.
[INFO] Starting fusion evaluation
[INFO] Evaluation complete: 0.656 overall accuracy (118/180)
[INFO] Evaluation mismatches by attribute (debug): 62 total
[INFO] 	Attribute                        |  Errors | Percentage
[INFO] 	───────────────────────────────────────────────────────
[INFO] 	nationality                      |      30 |     48.39%%
[INFO] 	clubs_club                       |      30 |     48.39%%
[INFO] 	positions_position               |       1 |      1.61%%
[INFO] 	height_cm                        |       1 |      1.61%%



 Fusion Strategy 2 Metrics:
Overall Accuracy: 0.6556


### Strategy 3

In [16]:
# Strategy 3: Strategy 2 plus most_complete for two text attributes and a
# tokenized evaluation rule for clubs_club.
strategy_3 = DataFusionStrategy("Strategy_3")

# Same trust values as in the Strategy 2 cell; re-assigned in place on the
# shared source DataFrames.
df_TM.attrs["trust"] = 3
df_FM.attrs["trust"] = 1
df_FBRef.attrs["trust"] = 2

strategy_3.add_attribute_fuser("pydi_id", prefer_higher_trust)
strategy_3.add_attribute_fuser("player_name", longest_string)
strategy_3.add_attribute_fuser("positions_position", prefer_higher_trust)
strategy_3.add_attribute_fuser("nationality", most_complete)  # switched from shortest_string to most_complete
# left_right_foot also changes relative to Strategy 2: longest_string -> most_complete.
strategy_3.add_attribute_fuser("left_right_foot", most_complete)
strategy_3.add_attribute_fuser("date_of_birth", most_recent)
strategy_3.add_attribute_fuser("clubs_club", union)
# height_cm is registered on its own here instead of via numeric_attrs; the rule is unchanged.
strategy_3.add_attribute_fuser("height_cm", prefer_higher_trust) 

# Same list as in the earlier strategy cells, minus height_cm (registered above).
numeric_attrs = [
    "transfer_value",
    "career_appearances", "career_goals",
    "pace", "finishing", "passing", "dribbling",
    "tackling", "strength",
    "reflexes_gk", "handling_gk", "penalty",
    "matches_played_3s", "minutes_played_3s",
    "goals_3s", "assists_3s",
    "penalty_goals_3s", "tackles_won_percentage_3s",
    "goals_per_shot_3s", "pass_completion_percentage_3s",
    "save_percentage_3s", "clean_sheet_percentage_3s",
    "penalty_save_percentage_3s",
]

for attr in numeric_attrs:
    strategy_3.add_attribute_fuser(attr, prefer_higher_trust) 
    
# Add a threshold-based evaluation rule for clubs_club. This is registered as an
# evaluation function, not as a fuser; the clubs_club fuser is still union.
strategy_3.add_evaluation_function("clubs_club", tokenized_match, threshold=0.7)

[INFO] Registered fuser for attribute 'pydi_id' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'player_name' using rule 'longest_string'
[INFO] Registered fuser for attribute 'positions_position' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'nationality' using rule 'most_complete'
[INFO] Registered fuser for attribute 'left_right_foot' using rule 'most_complete'
[INFO] Registered fuser for attribute 'date_of_birth' using rule 'most_recent'
[INFO] Registered fuser for attribute 'clubs_club' using rule 'union'
[INFO] Registered fuser for attribute 'height_cm' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'transfer_value' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'career_appearances' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'career_goals' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'pace' using rule 'prefer_higher_trust'
[INFO] Re

In [17]:
# Run Strategy 3; debug=True with debug_file set to fusion_debug_3.jsonl, the
# same file name the evaluator cell below is given as fusion_debug_logs.
engine_3 = DataFusionEngine(
    strategy_3,
    debug=True,
    debug_file="fusion_debug_3.jsonl"
)

# Same datasets and correspondences as the previous runs.
fused_3 = engine_3.run(
    datasets=[df_FBRef, df_TM, df_FM],
    correspondences=[df_FBRef_2_TM, df_FM_2_TM],
    id_column="pydi_id"
)

[INFO] Fusion debug logging enabled; refer to fusion_debug_3.jsonl for detailed traces.
[INFO] Starting data fusion with strategy 'Strategy_3'
[INFO] *    Loading correspondences    *
[INFO] Correspondence ID coverage: matched 80770 of 80770 unique IDs
[INFO] Created 223283 record groups from 43052 correspondences
[INFO] Group Size Distribution of 223283 clusters:
[INFO] 	Cluster Size	| Frequency	| Percentage
[INFO] 	──────────────────────────────────────────────────
[INFO] 		2	|	32384	|	14.50%
[INFO] 		3	|	5334	|	2.39%
[INFO] Attribute Consistencies:
[INFO]     _id: 0.00
[INFO]     assists_3s: 1.00
[INFO]     career_appearances: 1.00
[INFO]     career_goals: 1.00
[INFO]     clean_sheet_percentage_3s: 1.00
[INFO]     clubs_club: 0.02
[INFO]     date_of_birth: 0.97
[INFO]     dribbling: 1.00
[INFO]     finishing: 1.00
[INFO]     goals_3s: 1.00
[INFO]     goals_per_shot_3s: 1.00
[INFO]     handling_gk: 1.00
[INFO]     height_cm: 0.02
[INFO]     left_right_foot: 0.10
[INFO]     matches_pl

In [18]:
# Evaluate only the attributes present in the gold standard.
eval_columns = df_test_set.columns.tolist()

# Select into a new name instead of overwriting fused_3, so the full fused
# table survives this cell (same pattern as fused_final_eval in the final
# evaluation cell).
fused_3_eval = fused_3[eval_columns]

evaluator = DataFusionEvaluator(strategy_3, debug=True, fusion_debug_logs="fusion_debug_3.jsonl")
# Stored under its own name: the original reused `baseline_metrics`, which
# silently replaced earlier results with Strategy 3's.
strategy_3_metrics = evaluator.evaluate(
    fused_df=fused_3_eval, fused_id_column="pydi_id", gold_df=df_test_set, gold_id_column="pydi_id"
)
print("\n Fusion Strategy 3 Metrics:")
print(f"Overall Accuracy: {strategy_3_metrics['overall_accuracy']:.4f}")

[INFO] Fusion evaluation debug logging enabled; refer to fusion_evaluation_debug.jsonl for mismatch details.
[INFO] Starting fusion evaluation
[INFO] Evaluation complete: 0.828 overall accuracy (149/180)
[INFO] Evaluation mismatches by attribute (debug): 31 total
[INFO] 	Attribute                        |  Errors | Percentage
[INFO] 	───────────────────────────────────────────────────────
[INFO] 	clubs_club                       |      29 |     93.55%%
[INFO] 	positions_position               |       1 |      3.23%%
[INFO] 	height_cm                        |       1 |      3.23%%



 Fusion Strategy 3 Metrics:
Overall Accuracy: 0.8278


### Strategy 4

In [19]:
# Strategy 4: Strategy 3 with a different clubs_club fuser and a looser
# evaluation threshold.
strategy_4 = DataFusionStrategy("Strategy_4")

# Trust order differs from the Strategy 3 cell: FM (2) outranks FBRef (1) again,
# as in the baseline cell. Assigned in place on the shared source DataFrames.
df_TM.attrs["trust"] = 3
df_FM.attrs["trust"] = 2
df_FBRef.attrs["trust"] = 1

strategy_4.add_attribute_fuser("pydi_id", prefer_higher_trust)
strategy_4.add_attribute_fuser("player_name", longest_string)
strategy_4.add_attribute_fuser("positions_position", prefer_higher_trust)
strategy_4.add_attribute_fuser("nationality", most_complete) 
strategy_4.add_attribute_fuser("left_right_foot", most_complete)
strategy_4.add_attribute_fuser("date_of_birth", most_recent)
# The lambda forwards only `values`; any keyword arguments passed to the fuser
# are accepted and discarded. The rule is logged under the name '<lambda>'.
strategy_4.add_attribute_fuser(
    "clubs_club",
    lambda values, **kw: intersection_k_sources(values, k=1)
)  # switched from union to intersection of at least 1 source to deal with duplicates

strategy_4.add_attribute_fuser("height_cm", prefer_higher_trust)

# Same list as in the Strategy 3 cell (height_cm is registered separately above).
numeric_attrs = [
    "transfer_value",
    "career_appearances", "career_goals",
    "pace", "finishing", "passing", "dribbling",
    "tackling", "strength",
    "reflexes_gk", "handling_gk", "penalty",
    "matches_played_3s", "minutes_played_3s",
    "goals_3s", "assists_3s",
    "penalty_goals_3s", "tackles_won_percentage_3s",
    "goals_per_shot_3s", "pass_completion_percentage_3s",
    "save_percentage_3s", "clean_sheet_percentage_3s",
    "penalty_save_percentage_3s",
]

for attr in numeric_attrs:
    strategy_4.add_attribute_fuser(attr, prefer_higher_trust)
    
# Lowering the evaluation threshold from 0.7 to 0.5.
# NOTE(review): this changes how clubs_club is scored against the gold standard,
# not how it is fused — confirm the looser criterion is intended.
strategy_4.add_evaluation_function("clubs_club", tokenized_match, threshold=0.5)

[INFO] Registered fuser for attribute 'pydi_id' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'player_name' using rule 'longest_string'
[INFO] Registered fuser for attribute 'positions_position' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'nationality' using rule 'most_complete'
[INFO] Registered fuser for attribute 'left_right_foot' using rule 'most_complete'
[INFO] Registered fuser for attribute 'date_of_birth' using rule 'most_recent'
[INFO] Registered fuser for attribute 'clubs_club' using rule '<lambda>'
[INFO] Registered fuser for attribute 'height_cm' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'transfer_value' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'career_appearances' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'career_goals' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'pace' using rule 'prefer_higher_trust'
[INFO]

In [20]:
# Run Strategy 4; debug=True with debug_file set to fusion_debug_4.jsonl, the
# same file name the evaluator cell below is given as fusion_debug_logs.
engine_4 = DataFusionEngine(
    strategy_4,
    debug=True,
    debug_file="fusion_debug_4.jsonl"
)

# Same datasets and correspondences as the previous runs.
fused_4 = engine_4.run(
    datasets=[df_FBRef, df_TM, df_FM],
    correspondences=[df_FBRef_2_TM, df_FM_2_TM],
    id_column="pydi_id"
)

[INFO] Fusion debug logging enabled; refer to fusion_debug_4.jsonl for detailed traces.
[INFO] Starting data fusion with strategy 'Strategy_4'
[INFO] *    Loading correspondences    *
[INFO] Correspondence ID coverage: matched 80770 of 80770 unique IDs
[INFO] Created 223283 record groups from 43052 correspondences
[INFO] Group Size Distribution of 223283 clusters:
[INFO] 	Cluster Size	| Frequency	| Percentage
[INFO] 	──────────────────────────────────────────────────
[INFO] 		2	|	32384	|	14.50%
[INFO] 		3	|	5334	|	2.39%
[INFO] Attribute Consistencies:
[INFO]     _id: 0.00
[INFO]     assists_3s: 1.00
[INFO]     career_appearances: 1.00
[INFO]     career_goals: 1.00
[INFO]     clean_sheet_percentage_3s: 1.00
[INFO]     clubs_club: 0.02
[INFO]     date_of_birth: 0.97
[INFO]     dribbling: 1.00
[INFO]     finishing: 1.00
[INFO]     goals_3s: 1.00
[INFO]     goals_per_shot_3s: 1.00
[INFO]     handling_gk: 1.00
[INFO]     height_cm: 0.02
[INFO]     left_right_foot: 0.10
[INFO]     matches_pl

In [21]:
# Evaluate only the attributes present in the gold standard.
eval_columns = df_test_set.columns.tolist()

# Select into a new name instead of overwriting fused_4, so the full fused
# table survives this cell (same pattern as fused_final_eval in the final
# evaluation cell).
fused_4_eval = fused_4[eval_columns]

evaluator = DataFusionEvaluator(strategy_4, debug=True, fusion_debug_logs="fusion_debug_4.jsonl")
# Stored under its own name: the original reused `baseline_metrics`, which
# silently replaced earlier results with Strategy 4's.
strategy_4_metrics = evaluator.evaluate(
    fused_df=fused_4_eval, fused_id_column="pydi_id", gold_df=df_test_set, gold_id_column="pydi_id"
)
print("\n Fusion Strategy 4 Metrics:")
print(f"Overall Accuracy: {strategy_4_metrics['overall_accuracy']:.4f}")

[INFO] Fusion evaluation debug logging enabled; refer to fusion_evaluation_debug.jsonl for mismatch details.
[INFO] Starting fusion evaluation
[INFO] Evaluation complete: 0.956 overall accuracy (172/180)
[INFO] Evaluation mismatches by attribute (debug): 8 total
[INFO] 	Attribute                        |  Errors | Percentage
[INFO] 	───────────────────────────────────────────────────────
[INFO] 	clubs_club                       |       6 |     75.00%%
[INFO] 	positions_position               |       1 |     12.50%%
[INFO] 	height_cm                        |       1 |     12.50%%



 Fusion Strategy 4 Metrics:
Overall Accuracy: 0.9556


### Final Strategy

In [22]:
# Final strategy: registers the same fusers and trust values as the Strategy 4
# cell; the only difference is the clubs_club evaluation threshold at the bottom.
final_strategy = DataFusionStrategy("final_strategy")

# Assigned in place on the shared source DataFrames.
# NOTE(review): presumably prefer_higher_trust reads attrs["trust"] — confirm.
df_TM.attrs["trust"] = 3
df_FM.attrs["trust"] = 2
df_FBRef.attrs["trust"] = 1

final_strategy.add_attribute_fuser("pydi_id", prefer_higher_trust)
final_strategy.add_attribute_fuser("player_name", longest_string)
final_strategy.add_attribute_fuser("positions_position", prefer_higher_trust)
final_strategy.add_attribute_fuser("nationality", most_complete)
final_strategy.add_attribute_fuser("left_right_foot", most_complete)
final_strategy.add_attribute_fuser("date_of_birth", most_recent)
# The lambda forwards only `values`; any keyword arguments passed to the fuser
# are accepted and discarded. The rule is logged under the name '<lambda>'.
final_strategy.add_attribute_fuser(
    "clubs_club",
    lambda values, **kw: intersection_k_sources(values, k=1)
)

final_strategy.add_attribute_fuser("height_cm", prefer_higher_trust)

# Numeric attributes fused by trust (height_cm is registered separately above).
numeric_attrs = [
    "transfer_value",
    "career_appearances", "career_goals",
    "pace", "finishing", "passing", "dribbling",
    "tackling", "strength",
    "reflexes_gk", "handling_gk", "penalty",
    "matches_played_3s", "minutes_played_3s",
    "goals_3s", "assists_3s",
    "penalty_goals_3s", "tackles_won_percentage_3s",
    "goals_per_shot_3s", "pass_completion_percentage_3s",
    "save_percentage_3s", "clean_sheet_percentage_3s",
    "penalty_save_percentage_3s",
]

for attr in numeric_attrs:
    final_strategy.add_attribute_fuser(attr, prefer_higher_trust)
    
# Lowering the evaluation threshold from 0.5 to 0.3.
# NOTE(review): since the fusers match Strategy 4, any accuracy difference versus
# Strategy 4 comes from this looser scoring rule, not from better fused values —
# confirm this is intended before reporting it as a fusion improvement.
final_strategy.add_evaluation_function("clubs_club", tokenized_match, threshold=0.3)

[INFO] Registered fuser for attribute 'pydi_id' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'player_name' using rule 'longest_string'
[INFO] Registered fuser for attribute 'positions_position' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'nationality' using rule 'most_complete'
[INFO] Registered fuser for attribute 'left_right_foot' using rule 'most_complete'
[INFO] Registered fuser for attribute 'date_of_birth' using rule 'most_recent'
[INFO] Registered fuser for attribute 'clubs_club' using rule '<lambda>'
[INFO] Registered fuser for attribute 'height_cm' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'transfer_value' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'career_appearances' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'career_goals' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'pace' using rule 'prefer_higher_trust'
[INFO]

In [23]:
# Run the final strategy; debug=True with debug_file set to
# fusion_debug_final.jsonl, the same file name the evaluator cell below is
# given as fusion_debug_logs.
engine_final = DataFusionEngine(
    final_strategy,
    debug=True,
    debug_file="fusion_debug_final.jsonl"
)

# fused_final is kept unmodified by the evaluation cell and reused for the
# density-after-fusion computation at the end of the notebook.
fused_final = engine_final.run(
    datasets=[df_FBRef, df_TM, df_FM],
    correspondences=[df_FBRef_2_TM, df_FM_2_TM],
    id_column="pydi_id"
)

[INFO] Fusion debug logging enabled; refer to fusion_debug_final.jsonl for detailed traces.
[INFO] Starting data fusion with strategy 'final_strategy'
[INFO] *    Loading correspondences    *
[INFO] Correspondence ID coverage: matched 80770 of 80770 unique IDs
[INFO] Created 223283 record groups from 43052 correspondences
[INFO] Group Size Distribution of 223283 clusters:
[INFO] 	Cluster Size	| Frequency	| Percentage
[INFO] 	──────────────────────────────────────────────────
[INFO] 		2	|	32384	|	14.50%
[INFO] 		3	|	5334	|	2.39%
[INFO] Attribute Consistencies:
[INFO]     _id: 0.00
[INFO]     assists_3s: 1.00
[INFO]     career_appearances: 1.00
[INFO]     career_goals: 1.00
[INFO]     clean_sheet_percentage_3s: 1.00
[INFO]     clubs_club: 0.02
[INFO]     date_of_birth: 0.97
[INFO]     dribbling: 1.00
[INFO]     finishing: 1.00
[INFO]     goals_3s: 1.00
[INFO]     goals_per_shot_3s: 1.00
[INFO]     handling_gk: 1.00
[INFO]     height_cm: 0.02
[INFO]     left_right_foot: 0.10
[INFO]     ma

In [24]:
# Evaluate only the attributes present in the gold standard.
eval_columns = df_test_set.columns.tolist()

# fused_final itself stays untouched: the density computation below needs the
# full fused table.
fused_final_eval = fused_final[eval_columns]

evaluator = DataFusionEvaluator(final_strategy, debug=True, fusion_debug_logs="fusion_debug_final.jsonl")
# Stored under its own name: the original reused `baseline_metrics`, which
# mislabelled the final strategy's results as the baseline's.
final_metrics = evaluator.evaluate(
    fused_df=fused_final_eval, fused_id_column="pydi_id", gold_df=df_test_set, gold_id_column="pydi_id"
)
print("\n Final Fusion Strategy Metrics:")
print(f"Overall Accuracy: {final_metrics['overall_accuracy']:.4f}")

[INFO] Fusion evaluation debug logging enabled; refer to fusion_evaluation_debug.jsonl for mismatch details.
[INFO] Starting fusion evaluation
[INFO] Evaluation complete: 0.978 overall accuracy (176/180)
[INFO] Evaluation mismatches by attribute (debug): 4 total
[INFO] 	Attribute                        |  Errors | Percentage
[INFO] 	───────────────────────────────────────────────────────
[INFO] 	clubs_club                       |       2 |     50.00%%
[INFO] 	positions_position               |       1 |     25.00%%
[INFO] 	height_cm                        |       1 |     25.00%%



 Final Fusion Strategy Metrics:
Overall Accuracy: 0.9778


In [25]:
# Density of the full fused table (fused_final, not the gold-column subset
# fused_final_eval used for evaluation).
# NOTE(review): if the engine adds columns beyond the source schema, they are
# counted here too — confirm before comparing with the pre-fusion density.
density_after = compute_density_manual(fused_final)
print("Density After Fusion:", density_after)

Density After Fusion: 0.7376142248616929
