In [34]:
import logging
import pandas as pd
import numpy as np
from PyDI.io import load_xml, load_csv
from PyDI.fusion.engine import build_record_groups_from_correspondences
from PyDI.fusion import (
    DataFusionStrategy,
    DataFusionEngine,
    DataFusionEvaluator,
    longest_string,
    shortest_string,
    prefer_higher_trust,
    average,
    most_recent,
    union,
    tokenized_match,
    most_complete,
)
from pathlib import Path

# Notebook-wide setup: INFO-level log lines rendered as "[LEVEL] message",
# and no cap on the number of DataFrame columns shown in rich output.
logging.basicConfig(format="[%(levelname)s] %(message)s", level=logging.INFO)
pd.set_option("display.max_columns", None)

### Define file paths and load files

In [2]:
# Directories holding the source XML files and the correspondence CSVs.
INPUT_DIR = Path("MapForce/Output")
CORR_DIR = Path("Identity_Resolution")

# One XML file per source, all located directly under INPUT_DIR.
FBRef_PATH, TM_PATH, FM_PATH = (
    INPUT_DIR.joinpath(file_name) for file_name in ("FBRef.xml", "TM.xml", "FM.xml")
)

In [23]:
# Source datasets, loaded in the order FBRef, TM, FM.
df_FBRef, df_TM, df_FM = (
    load_xml(xml_path) for xml_path in (FBRef_PATH, TM_PATH, FM_PATH)
)

# Correspondence files; both are expressed against TM.
df_FBRef_2_TM, df_FM_2_TM = (
    load_csv(CORR_DIR / csv_name)
    for csv_name in ("correspondences_FBRef&TM.csv", "correspondences_FM&TM.csv")
)

# Gold standard used later to evaluate the fused output.
df_test_set = load_xml("fusion_evaluation.xml")

### Group Size Distribution

In [24]:
# Analyse record group sizes created by the correspondences.
# Inputs are the three source frames and the two correspondence frames, keyed
# on 'pydi_id'. Per the cell output, the call logs how many groups were created
# and the distribution of group sizes. The result is reused by the
# pre-fusion consistency computation further down.
record_groups = build_record_groups_from_correspondences(
    [df_FBRef, df_TM, df_FM],
    [df_FBRef_2_TM, df_FM_2_TM],
    id_column='pydi_id',
)

[INFO] Created 223283 record groups from 43052 correspondences
[INFO] Group Size Distribution of 223283 clusters:
[INFO] 	Cluster Size	| Frequency	| Percentage
[INFO] 	──────────────────────────────────────────────────
[INFO] 		2	|	32384	|	14.50%
[INFO] 		3	|	5334	|	2.39%


### Density before Fusion

In [5]:
def compute_density_manual(df):
    """Return the fraction of cells in ``df`` that are non-null.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose density is measured.

    Returns
    -------
    float
        ``non_null_cells / total_cells`` in the range [0, 1]. A frame with no
        cells (no rows or no columns) has density ``0.0`` rather than
        triggering a division by zero.
    """
    # Total number of cells (rows * columns).
    total = df.size
    if total == 0:
        return 0.0

    # Count non-null values across all columns.
    non_null = df.notna().sum().sum()
    return float(non_null / total)


# Report the density of each source dataset before fusion.
for density_label, source_df in (
    ("FBRef Density:", df_FBRef),
    ("TM Density:", df_TM),
    ("FM Density:", df_FM),
):
    print(density_label, compute_density_manual(source_df))

FBRef Density: 0.9999586037885813
TM Density: 0.9827495443774011
FM Density: 1.0


### Consistency before Fusion

In [6]:
def _has_value(value):
    """Return True if ``value`` carries data.

    Scalars are checked with ``pd.notnull``. List-like cells (multi-valued
    attributes) count as present when at least one element is non-null;
    calling ``pd.notnull`` on them directly would return an array whose truth
    value is ambiguous.
    """
    if isinstance(value, (np.ndarray, pd.Series)):
        value = value.tolist()
    if isinstance(value, (list, tuple, set, frozenset)):
        return any(_has_value(item) for item in value)
    return bool(pd.notnull(value))


def _comparable(value):
    """Normalise ordered list-likes to tuples so ``==`` yields a single bool."""
    if isinstance(value, (np.ndarray, pd.Series)):
        value = value.tolist()
    if isinstance(value, (list, tuple)):
        return tuple(_comparable(item) for item in value)
    return value


def compute_consistency_manual(data_frames, record_groups, id_column="pydi_id"):
    """Compute the share of comparable attributes whose values agree within a group.

    For every record group with at least two records, each attribute present
    in all records of the group (excluding ``id_column``) is inspected. An
    attribute is *comparable* when two or more records hold a non-null value
    for it, and *consistent* when all those values are equal.

    Parameters
    ----------
    data_frames : list
        Not used by the computation; kept so existing calls keep working.
    record_groups : iterable
        Objects exposing a ``records`` sequence of ``pandas.Series``.
    id_column : str, default "pydi_id"
        Identifier attribute that is excluded from the comparison.

    Returns
    -------
    float
        ``consistent / comparable`` over all groups, or ``1.0`` when nothing
        is comparable.
    """
    total_attributes = 0
    consistent_attributes = 0

    for group in record_groups:
        records = group.records  # list of pandas.Series objects

        if len(records) < 2:
            continue  # singletons have nothing to compare against

        # To avoid KeyErrors, only consider columns that appear in ALL records.
        common_columns = set(records[0].index).intersection(
            *(rec.index for rec in records[1:])
        )
        # The identifier differs by construction, so it is never compared.
        common_columns.discard(id_column)

        for col in common_columns:
            # Collect present values in a form that is safe to compare with ==.
            vals = [_comparable(rec[col]) for rec in records if _has_value(rec[col])]

            if len(vals) <= 1:
                continue  # nothing to compare

            total_attributes += 1

            first = vals[0]
            if all(other == first for other in vals[1:]):
                consistent_attributes += 1

    if total_attributes == 0:
        return 1.0

    return consistent_attributes / total_attributes

# Pre-fusion consistency over the record groups built from the correspondences.
consistency_before = compute_consistency_manual(
    [df_FBRef, df_TM, df_FM],
    record_groups,
    id_column="pydi_id",
)

print("Consistency Before Fusion:", consistency_before)

Consistency Before Fusion: 0.11541783148561595


### Baseline Strategy

In [28]:
# Baseline fusion strategy: one simple conflict-resolution rule per attribute.
baseline_strategy = DataFusionStrategy("Baseline_Fusion")

# Per-source trust scores; in this strategy only pydi_id uses a trust-based fuser.
for source_df, trust_score in ((df_TM, 3), (df_FM, 2), (df_FBRef, 1)):
    source_df.attrs["trust"] = trust_score

# Numeric attributes, all fused by averaging.
numeric_attrs = [
    "height_cm",
    "transfer_value",
    "career_appearances",
    "career_goals",
    "pace",
    "finishing",
    "passing",
    "dribbling",
    "tackling",
    "strength",
    "reflexes_gk",
    "handling_gk",
    "penalty",
    "matches_played_3s",
    "minutes_played_3s",
    "goals_3s",
    "assists_3s",
    "penalty_goals_3s",
    "tackles_won_percentage_3s",
    "goals_per_shot_3s",
    "pass_completion_percentage_3s",
    "save_percentage_3s",
    "clean_sheet_percentage_3s",
    "penalty_save_percentage_3s",
]

# (attribute, fuser) pairs in registration order: id, text, date, clubs, numerics.
baseline_rules = [
    ("pydi_id", prefer_higher_trust),
    ("player_name", shortest_string),
    ("positions_position", longest_string),
    ("nationality", shortest_string),
    ("left_right_foot", longest_string),
    ("date_of_birth", most_recent),
    ("clubs_club", union),
] + [(numeric_attr, average) for numeric_attr in numeric_attrs]

for attr, fuser in baseline_rules:
    baseline_strategy.add_attribute_fuser(attr, fuser)


[INFO] Registered fuser for attribute 'pydi_id' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'player_name' using rule 'shortest_string'
[INFO] Registered fuser for attribute 'positions_position' using rule 'longest_string'
[INFO] Registered fuser for attribute 'nationality' using rule 'shortest_string'
[INFO] Registered fuser for attribute 'left_right_foot' using rule 'longest_string'
[INFO] Registered fuser for attribute 'date_of_birth' using rule 'most_recent'
[INFO] Registered fuser for attribute 'clubs_club' using rule 'union'
[INFO] Registered fuser for attribute 'height_cm' using rule 'average'
[INFO] Registered fuser for attribute 'transfer_value' using rule 'average'
[INFO] Registered fuser for attribute 'career_appearances' using rule 'average'
[INFO] Registered fuser for attribute 'career_goals' using rule 'average'
[INFO] Registered fuser for attribute 'pace' using rule 'average'
[INFO] Registered fuser for attribute 'finishing' using rule 'average'

In [29]:
# Engine for the baseline strategy. With debug enabled, the run logs that
# detailed fusion traces are written to fusion_debug_baseline.jsonl; the same
# file is handed to the evaluator later on.
baseline_engine = DataFusionEngine(
    strategy = baseline_strategy,
    debug=True,
    debug_file="fusion_debug_baseline.jsonl"
)

# Fuse the three sources using both correspondence sets, keyed on pydi_id.
# NOTE(review): df_*.attrs["trust"] is reassigned by the Strategy_2 and
# Strategy_3 cells; presumably trust-based fusers read it at run time, so
# re-running this cell after those cells may give different results - confirm.
fused_baseline = baseline_engine.run(
    datasets=[df_FBRef, df_TM, df_FM],
    correspondences=[df_FBRef_2_TM, df_FM_2_TM],
    id_column="pydi_id",
)

[INFO] Fusion debug logging enabled; refer to fusion_debug_baseline.jsonl for detailed traces.
[INFO] Starting data fusion with strategy 'Baseline_Fusion'
[INFO] *    Loading correspondences    *
[INFO] Correspondence ID coverage: matched 80770 of 80770 unique IDs
[INFO] Created 223283 record groups from 43052 correspondences
[INFO] Group Size Distribution of 223283 clusters:
[INFO] 	Cluster Size	| Frequency	| Percentage
[INFO] 	──────────────────────────────────────────────────
[INFO] 		2	|	32384	|	14.50%
[INFO] 		3	|	5334	|	2.39%
[INFO] Attribute Consistencies:
[INFO]     _id: 0.00
[INFO]     assists_3s: 1.00
[INFO]     career_appearances: 1.00
[INFO]     career_goals: 1.00
[INFO]     clean_sheet_percentage_3s: 1.00
[INFO]     clubs_club: 0.02
[INFO]     date_of_birth: 0.97
[INFO]     dribbling: 1.00
[INFO]     finishing: 1.00
[INFO]     goals_3s: 1.00
[INFO]     goals_per_shot_3s: 1.00
[INFO]     handling_gk: 1.00
[INFO]     height_cm: 0.02
[INFO]     left_right_foot: 0.10
[INFO]   

In [9]:
# Preview the first three fused records, including the _fusion_* bookkeeping columns.
fused_baseline.head(3)

Unnamed: 0,_id,_fusion_sources,_fusion_source_datasets,penalty_goals_3s,save_percentage_3s,assists_3s,pydi_id,goals_per_shot_3s,clean_sheet_percentage_3s,date_of_birth,nationality,clubs_club,player_name,tackles_won_percentage_3s,matches_played_3s,minutes_played_3s,height_cm,positions_position,left_right_foot,penalty_save_percentage_3s,pass_completion_percentage_3s,goals_3s,transfer_value,_fusion_confidence,_fusion_metadata,strength,career_appearances,pace,passing,dribbling,reflexes_gk,finishing,career_goals,handling_gk,tackling,penalty
0,TM_056768,"[TM_056768, FBRef_001645]","[TM, FBRef]",0.0,0.0,1.0,TM_056768,0.0,0.0,2002-09-25,Netherlands United States,"[FC Volendam, Volendam]",Deron Payne,62.5,18.0,974.0,182.0,DF,right,0.0,73.95,0.0,300000,0.947115,"{'penalty_goals_3s_rule': 'longest_string', 'p...",,,,,,,,,,,
1,FM_105482,"[FM_105482, TM_011149]","[FM, TM]",,,,TM_011149,,,1994-03-23,Israel Hungary,"[AEK Larnakas, Hapoel Tel Aviv]",Omri Altman,,,,181.0,"M (C), AM (LC)",Right,,,,1000000,0.876429,"{'strength_rule': 'longest_string', 'strength_...",10.0,204.0,11.0,12.0,12.0,1.0,11.0,36.0,2.0,8.0,15.0
2,FBRef_000996,"[FBRef_000996, TM_006550]","[FBRef, TM]",0.0,0.0,0.0,TM_006550,0.0,0.0,2009-01-12,England Luxembourg,"[FC Metz, Metz]",Brian Madjo,0.0,3.0,138.0,193.0,FW,right,0.0,53.33,0.0,4000000,0.946053,"{'penalty_goals_3s_rule': 'longest_string', 'p...",,,,,,,,,,,


In [10]:
# Preview the gold standard; its columns define which attributes are evaluated.
df_test_set.head(3)

Unnamed: 0,pydi_id,player_name,positions_position,date_of_birth,clubs_club,height_cm,nationality
0,TM_040882,Edu Exposito,MC,1996-08-01,RCD Espanyol Barcelona,178,Spain
1,TM_060716,Lucas Moura,AMR,1992-08-13,"Tottenham, Sao Paulo Futebol Clube",172,Brazil
2,TM_022432,Matthieu Udol,DL,1996-03-20,"RC Lens, FC Metz",174,France Guadeloupe


In [30]:
# Evaluate only the attributes that exist in the gold standard.
eval_columns = list(df_test_set.columns)

fused_baseline = fused_baseline.loc[:, eval_columns]

evaluator = DataFusionEvaluator(
    baseline_strategy,
    debug=True,
    fusion_debug_logs="fusion_debug_baseline.jsonl",
)
baseline_metrics = evaluator.evaluate(
    fused_df=fused_baseline,
    fused_id_column="pydi_id",
    gold_df=df_test_set,
    gold_id_column="pydi_id",
)
print("\n Baseline Fusion Strategy Metrics:")
print(f"Overall Accuracy: {baseline_metrics['overall_accuracy']:.4f}")


[INFO] Fusion evaluation debug logging enabled; refer to fusion_evaluation_debug.jsonl for mismatch details.
[INFO] Starting fusion evaluation
[INFO] Evaluation complete: 0.325 overall accuracy (39/120)
[INFO] Evaluation mismatches by attribute (debug): 81 total
[INFO] 	Attribute                        |  Errors | Percentage
[INFO] 	───────────────────────────────────────────────────────
[INFO] 	height_cm                        |      20 |     24.69%%
[INFO] 	positions_position               |      20 |     24.69%%
[INFO] 	nationality                      |      20 |     24.69%%
[INFO] 	clubs_club                       |      20 |     24.69%%
[INFO] 	player_name                      |       1 |      1.23%%



 Baseline Fusion Strategy Metrics:
Overall Accuracy: 0.3250


In [31]:
# Strategy 2: trust-based resolution for positions, nationality and all numerics.
strategy_2 = DataFusionStrategy("Strategy_2")

# Per-source trust scores for this strategy (TM highest, FM lowest).
for source_df, trust_score in ((df_TM, 3), (df_FM, 1), (df_FBRef, 2)):
    source_df.attrs["trust"] = trust_score

# Numeric attributes, all taken from the most trusted source.
numeric_attrs = [
    "height_cm",
    "transfer_value",
    "career_appearances",
    "career_goals",
    "pace",
    "finishing",
    "passing",
    "dribbling",
    "tackling",
    "strength",
    "reflexes_gk",
    "handling_gk",
    "penalty",
    "matches_played_3s",
    "minutes_played_3s",
    "goals_3s",
    "assists_3s",
    "penalty_goals_3s",
    "tackles_won_percentage_3s",
    "goals_per_shot_3s",
    "pass_completion_percentage_3s",
    "save_percentage_3s",
    "clean_sheet_percentage_3s",
    "penalty_save_percentage_3s",
]

# (attribute, fuser) pairs in registration order.
strategy_2_rules = [
    ("pydi_id", prefer_higher_trust),
    ("player_name", longest_string),
    ("positions_position", prefer_higher_trust),
    ("nationality", prefer_higher_trust),
    ("left_right_foot", longest_string),
    ("date_of_birth", most_recent),
    ("clubs_club", union),
] + [(numeric_attr, prefer_higher_trust) for numeric_attr in numeric_attrs]

for attr, fuser in strategy_2_rules:
    strategy_2.add_attribute_fuser(attr, fuser)

[INFO] Registered fuser for attribute 'pydi_id' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'player_name' using rule 'longest_string'
[INFO] Registered fuser for attribute 'positions_position' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'nationality' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'left_right_foot' using rule 'longest_string'
[INFO] Registered fuser for attribute 'date_of_birth' using rule 'most_recent'
[INFO] Registered fuser for attribute 'clubs_club' using rule 'union'
[INFO] Registered fuser for attribute 'height_cm' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'transfer_value' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'career_appearances' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'career_goals' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'pace' using rule 'prefer_higher_trust'
[I

In [None]:
# Engine for Strategy_2. With debug enabled, the run logs that detailed
# fusion traces are written to fusion_debug_2.jsonl.
engine_2 = DataFusionEngine(
    strategy_2,
    debug=True,
    debug_file="fusion_debug_2.jsonl"
)

# Fuse the same three sources and correspondence sets as the baseline run.
fused_2 = engine_2.run(
    datasets=[df_FBRef, df_TM, df_FM],
    correspondences=[df_FBRef_2_TM, df_FM_2_TM],
    id_column="pydi_id"
)

[INFO] Fusion debug logging enabled; refer to fusion_debug_2.jsonl for detailed traces.
[INFO] Starting data fusion with strategy 'Strategy_2'
[INFO] *    Loading correspondences    *
[INFO] Correspondence ID coverage: matched 80770 of 80770 unique IDs
[INFO] Created 223283 record groups from 43052 correspondences
[INFO] Group Size Distribution of 223283 clusters:
[INFO] 	Cluster Size	| Frequency	| Percentage
[INFO] 	──────────────────────────────────────────────────
[INFO] 		2	|	32384	|	14.50%
[INFO] 		3	|	5334	|	2.39%
[INFO] Attribute Consistencies:
[INFO]     _id: 0.00
[INFO]     assists_3s: 1.00
[INFO]     career_appearances: 1.00
[INFO]     career_goals: 1.00
[INFO]     clean_sheet_percentage_3s: 1.00
[INFO]     clubs_club: 0.02
[INFO]     date_of_birth: 0.97
[INFO]     dribbling: 1.00
[INFO]     finishing: 1.00
[INFO]     goals_3s: 1.00
[INFO]     goals_per_shot_3s: 1.00
[INFO]     handling_gk: 1.00
[INFO]     height_cm: 0.02
[INFO]     left_right_foot: 0.10
[INFO]     matches_pl

In [33]:
# Evaluate only the attributes that exist in the gold standard.
eval_columns = df_test_set.columns.tolist()

fused_2 = fused_2[eval_columns]

evaluator = DataFusionEvaluator(strategy_2, debug=True, fusion_debug_logs="fusion_debug_2.jsonl")
# Store the result under a strategy-specific name: assigning it to
# `baseline_metrics` would overwrite the baseline's metrics and mislabel these.
strategy_2_metrics = evaluator.evaluate(
    fused_df=fused_2, fused_id_column="pydi_id", gold_df=df_test_set, gold_id_column="pydi_id"
)
print("\n Fusion Strategy 2 Metrics:")
print(f"Overall Accuracy: {strategy_2_metrics['overall_accuracy']:.4f}")

[INFO] Fusion evaluation debug logging enabled; refer to fusion_evaluation_debug.jsonl for mismatch details.
[INFO] Starting fusion evaluation
[INFO] Evaluation complete: 0.808 overall accuracy (97/120)
[INFO] Evaluation mismatches by attribute (debug): 23 total
[INFO] 	Attribute                        |  Errors | Percentage
[INFO] 	───────────────────────────────────────────────────────
[INFO] 	clubs_club                       |      20 |     86.96%%
[INFO] 	player_name                      |       1 |      4.35%%
[INFO] 	height_cm                        |       1 |      4.35%%
[INFO] 	positions_position               |       1 |      4.35%%



 Fusion Strategy 2 Metrics:
Overall Accuracy: 0.8083


In [38]:
# Strategy 3: like Strategy 2, but nationality uses the longest string,
# left_right_foot uses most_complete, and name/club matches are evaluated
# with token-based matching.
strategy_3 = DataFusionStrategy("Strategy_3")

# Per-source trust scores for this strategy (TM highest, FM lowest).
for source_df, trust_score in ((df_TM, 3), (df_FM, 1), (df_FBRef, 2)):
    source_df.attrs["trust"] = trust_score

# Remaining numeric attributes (height_cm is registered explicitly below).
numeric_attrs = [
    "transfer_value",
    "career_appearances",
    "career_goals",
    "pace",
    "finishing",
    "passing",
    "dribbling",
    "tackling",
    "strength",
    "reflexes_gk",
    "handling_gk",
    "penalty",
    "matches_played_3s",
    "minutes_played_3s",
    "goals_3s",
    "assists_3s",
    "penalty_goals_3s",
    "tackles_won_percentage_3s",
    "goals_per_shot_3s",
    "pass_completion_percentage_3s",
    "save_percentage_3s",
    "clean_sheet_percentage_3s",
    "penalty_save_percentage_3s",
]

# (attribute, fuser) pairs in registration order.
strategy_3_rules = [
    ("pydi_id", prefer_higher_trust),
    ("player_name", longest_string),
    ("positions_position", prefer_higher_trust),
    ("nationality", longest_string),
    ("left_right_foot", most_complete),
    ("date_of_birth", most_recent),
    ("clubs_club", union),
    ("height_cm", prefer_higher_trust),
] + [(numeric_attr, prefer_higher_trust) for numeric_attr in numeric_attrs]

for attr, fuser in strategy_3_rules:
    strategy_3.add_attribute_fuser(attr, fuser)

# Token-based evaluation with a per-attribute threshold.
for eval_attr, eval_threshold in (("player_name", 0.8), ("clubs_club", 0.5)):
    strategy_3.add_evaluation_function(eval_attr, tokenized_match, threshold=eval_threshold)

[INFO] Registered fuser for attribute 'pydi_id' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'player_name' using rule 'longest_string'
[INFO] Registered fuser for attribute 'positions_position' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'nationality' using rule 'longest_string'
[INFO] Registered fuser for attribute 'left_right_foot' using rule 'most_complete'
[INFO] Registered fuser for attribute 'date_of_birth' using rule 'most_recent'
[INFO] Registered fuser for attribute 'clubs_club' using rule 'union'
[INFO] Registered fuser for attribute 'height_cm' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'transfer_value' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'career_appearances' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'career_goals' using rule 'prefer_higher_trust'
[INFO] Registered fuser for attribute 'pace' using rule 'prefer_higher_trust'
[INFO] R

In [39]:
# Engine for Strategy_3. With debug enabled, the run logs that detailed
# fusion traces are written to fusion_debug_3.jsonl.
engine_3 = DataFusionEngine(
    strategy_3,
    debug=True,
    debug_file="fusion_debug_3.jsonl"
)

# Fuse the same three sources and correspondence sets as the earlier runs.
fused_3 = engine_3.run(
    datasets=[df_FBRef, df_TM, df_FM],
    correspondences=[df_FBRef_2_TM, df_FM_2_TM],
    id_column="pydi_id"
)

[INFO] Fusion debug logging enabled; refer to fusion_debug_3.jsonl for detailed traces.
[INFO] Starting data fusion with strategy 'Strategy_3'
[INFO] *    Loading correspondences    *
[INFO] Correspondence ID coverage: matched 80770 of 80770 unique IDs
[INFO] Created 223283 record groups from 43052 correspondences
[INFO] Group Size Distribution of 223283 clusters:
[INFO] 	Cluster Size	| Frequency	| Percentage
[INFO] 	──────────────────────────────────────────────────
[INFO] 		2	|	32384	|	14.50%
[INFO] 		3	|	5334	|	2.39%
[INFO] Attribute Consistencies:
[INFO]     _id: 0.00
[INFO]     assists_3s: 1.00
[INFO]     career_appearances: 1.00
[INFO]     career_goals: 1.00
[INFO]     clean_sheet_percentage_3s: 1.00
[INFO]     clubs_club: 0.02
[INFO]     date_of_birth: 0.97
[INFO]     dribbling: 1.00
[INFO]     finishing: 1.00
[INFO]     goals_3s: 1.00
[INFO]     goals_per_shot_3s: 1.00
[INFO]     handling_gk: 1.00
[INFO]     height_cm: 0.02
[INFO]     left_right_foot: 0.10
[INFO]     matches_pl

In [40]:
# Evaluate only the attributes that exist in the gold standard.
eval_columns = df_test_set.columns.tolist()

fused_3 = fused_3[eval_columns]

evaluator = DataFusionEvaluator(strategy_3, debug=True, fusion_debug_logs="fusion_debug_3.jsonl")
# Store the result under a strategy-specific name: assigning it to
# `baseline_metrics` would overwrite earlier metrics and mislabel these.
strategy_3_metrics = evaluator.evaluate(
    fused_df=fused_3, fused_id_column="pydi_id", gold_df=df_test_set, gold_id_column="pydi_id"
)
print("\n Fusion Strategy 3 Metrics:")
print(f"Overall Accuracy: {strategy_3_metrics['overall_accuracy']:.4f}")

[INFO] Fusion evaluation debug logging enabled; refer to fusion_evaluation_debug.jsonl for mismatch details.
[INFO] Starting fusion evaluation
[INFO] Evaluation complete: 0.900 overall accuracy (108/120)
[INFO] Evaluation mismatches by attribute (debug): 12 total
[INFO] 	Attribute                        |  Errors | Percentage
[INFO] 	───────────────────────────────────────────────────────
[INFO] 	clubs_club                       |       9 |     75.00%%
[INFO] 	player_name                      |       1 |      8.33%%
[INFO] 	height_cm                        |       1 |      8.33%%
[INFO] 	positions_position               |       1 |      8.33%%



 Fusion Strategy 3 Metrics:
Overall Accuracy: 0.9000
