In [1]:
import recordlinkage as rl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
import time
import os
import joblib

In [2]:
# Lift pandas' display limits so previews show every column and row.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

In [2]:
# Load the train / validation / test splits of the ground truth.
# The pair identifiers are pinned to object (string) dtype, mirroring how
# `id_source_vehicles` is loaded for the mediated schema. Left to inference,
# a split whose ids all look numeric would be read as int64 and would no
# longer match the string ids that end up in the record index.
gt_id_dtypes = {'id_A': 'object', 'id_B': 'object'}
gt_train = pd.read_csv("../datasets/ground_truth/GT_train/train.csv", dtype=gt_id_dtypes)
gt_val = pd.read_csv("../datasets/ground_truth/GT_train/val.csv", dtype=gt_id_dtypes)
gt_test = pd.read_csv("../datasets/ground_truth/GT_train/test.csv", dtype=gt_id_dtypes)

In [3]:
# Load the normalized mediated schema, keeping `id_source_vehicles` as object
# dtype instead of letting pandas infer a type for it.
# NOTE(review): the saved cell output echoes this statement, which is how
# Python prints the source line of a warning - presumably a pandas
# DtypeWarning about mixed-type columns. Confirm, and pin the affected
# column's dtype if so.
mediated_schema_path = "../datasets/mediated_schema/mediated_schema_normalized.csv"
df_unificato = pd.read_csv(
    mediated_schema_path,
    dtype={'id_source_vehicles': 'object'},
)

  df_unificato = pd.read_csv("../datasets/mediated_schema/mediated_schema_normalized.csv", dtype={'id_source_vehicles': 'object'})


In [5]:
# Remove `vin` and `description` in place; none of the comparisons configured
# in this notebook read them.
df_unificato.drop(['vin', 'description'], axis='columns', inplace=True)

In [6]:
# Strip the free-text description columns from every ground-truth split
# (in place, so the three frames keep their names).
for gt_split in (gt_train, gt_val, gt_test):
    gt_split.drop(columns=['description_A', 'description_B'], inplace=True)

In [7]:
# Build one identifier per record and make it the DataFrame index: the pair
# indexes handed to compare.compute() are resolved against this index.
# `id_source_vehicles` is used where present, otherwise `id_source_used_cars`.
# NOTE(review): this relies on every row having at least one of the two ids,
# as the original author states - nothing here verifies it.
df_unificato = (
    df_unificato
    .assign(
        id_unificato=lambda records: records['id_source_vehicles'].fillna(
            records['id_source_used_cars']
        )
    )
    .set_index('id_unificato')
)

In [8]:
# Preview the first five records of the re-indexed mediated schema.
df_unificato.head(n=5)

Unnamed: 0_level_0,id_source_vehicles,id_source_used_cars,location,price,year,manufacturer,model,cylinders,fuel_type,mileage,transmission,traction,body_type,main_color,latitude,longitude,pubblication_date
id_unificato,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
7316814884,7316814884,,auburn,33590.0,2014.0,gmc,sierra 1500 crew cab slt,8 cylinders,gas,57923.0,other,,pickup,white,32.59,-85.48,2021-05-04T12:31:18-0500
7316814758,7316814758,,auburn,22590.0,2010.0,chevrolet,silverado 1500,8 cylinders,gas,71229.0,other,,pickup,blue,32.59,-85.48,2021-05-04T12:31:08-0500
7316814989,7316814989,,auburn,39590.0,2020.0,chevrolet,silverado 1500 crew,8 cylinders,gas,19160.0,other,,pickup,red,32.59,-85.48,2021-05-04T12:31:25-0500
7316743432,7316743432,,auburn,30990.0,2017.0,toyota,tundra double cab sr,8 cylinders,gas,41124.0,other,,pickup,red,32.59,-85.48,2021-05-04T10:41:31-0500
7316356412,7316356412,,auburn,15000.0,2013.0,ford,f150 xlt,6 cylinders,gas,128000.0,automatic,rwd,truck,black,32.592,-85.5189,2021-05-03T14:02:03-0500


In [9]:
# Preview records whose `id_source_vehicles` is missing.
lacks_vehicles_id = df_unificato['id_source_vehicles'].isna()
df_unificato[lacks_vehicles_id].head()

Unnamed: 0_level_0,id_source_vehicles,id_source_used_cars,location,price,year,manufacturer,model,cylinders,fuel_type,mileage,transmission,traction,body_type,main_color,latitude,longitude,pubblication_date
id_unificato,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
S2_0,,S2_0,bayamon,23141.0,2019.0,jeep,renegade,i4,gas,7.0,automatic,fwd,suv crossover,yellow,18.3988,-66.1582,2019-04-06
S2_1,,S2_1,san juan,46500.0,2020.0,land rover,discovery sport,i4,gas,8.0,automatic,awd,suv crossover,black,18.4439,-66.0785,2020-02-15
S2_3,,S2_3,san juan,67430.0,2020.0,land rover,discovery,v6,gas,11.0,automatic,awd,suv crossover,gray,18.4439,-66.0785,2020-02-26
S2_4,,S2_4,san juan,48880.0,2020.0,land rover,discovery sport,i4,gas,7.0,automatic,awd,suv crossover,black,18.4439,-66.0785,2020-04-25
S2_5,,S2_5,san juan,66903.0,2020.0,land rover,range rover velar,i4,gas,12.0,automatic,awd,suv crossover,unknown,18.4439,-66.0785,2020-01-11


In [10]:
# Declare the similarity features computed for every candidate pair.
# The registration order below is the column order of the feature matrix.
compare = rl.Compare()

# --- STRINGS (fuzzy) ---
# Jaro-Winkler tolerates small typos; each field has its own threshold.
# NOTE(review): 0.70 on `cylinders` looks permissive - values such as
# "6 cylinders" and "8 cylinders" differ by one character and may pass it.
# Confirm this is intended.
string_features = (
    ('manufacturer', 0.85),
    ('model', 0.85),
    ('location', 0.85),
    ('cylinders', 0.70),
)
for field, min_similarity in string_features:
    compare.string(field, field, method='jarowinkler', threshold=min_similarity, label=field)

# --- EXACT (categorical) ---
# Score is 1 when the two values are equal and 0 otherwise.
for field in ('year', 'fuel_type', 'traction', 'body_type', 'main_color', 'transmission'):
    compare.exact(field, field, label=field)

# --- NUMERIC (graded) ---
# The 'gauss' method yields a graded score between 0 and 1 rather than a hard
# 0/1, driven by the distance between the two values.
# NOTE(review): presumably differences within `offset` score 1 and the score
# decays on the order of `scale` beyond that - confirm against the
# recordlinkage documentation.
numeric_features = (
    # (column, feature label, offset, scale)
    ('price', 'price', 500, 2000),
    ('mileage', 'mileage', 1000, 10000),
    ('latitude', 'lat', 0.01, 0.1),
    ('longitude', 'lon', 0.01, 0.1),
)
for field, feature_label, offset, scale in numeric_features:
    compare.numeric(field, field, method='gauss', offset=offset, scale=scale, label=feature_label)

# Keep the Compare object as the cell's displayed value.
compare

<Compare>

In [11]:
# Inputs: df_unificato is the normalized mediated dataset (records from both
# sources); gt_train / gt_val are the ground-truth splits.

# Candidate pairs come straight from the (id_A, id_B) columns of each split.
pair_keys = ['id_A', 'id_B']
training_pairs = gt_train.set_index(pair_keys).index
val_pairs = gt_val.set_index(pair_keys).index

# Target vectors (y)
y_train, y_val = gt_train['label'], gt_val['label']

# Similarity matrices (X): compute() takes the pair ids from the ground truth
# and fetches the descriptive attributes of both records from df_unificato.
print("Calcolo Feature Matrix per Training e Validation")
X_train, X_val = (
    compare.compute(pairs, df_unificato, df_unificato)
    for pairs in (training_pairs, val_pairs)
)

Calcolo Feature Matrix per Training e Validation


In [12]:
# Preview the first five labelled pairs of the training split.
gt_train.head(n=5)

Unnamed: 0,id_A,location_A,price_A,year_A,manufacturer_A,model_A,cylinders_A,fuel_type_A,mileage_A,transmission_A,...,fuel_type_B,mileage_B,transmission_B,traction_B,body_type_B,main_color_B,latitude_B,longitude_B,pubblication_date_B,label
0,S2_2432565,mesquite,39215.0,2020.0,ford,f150,v6,gas,3.0,automatic,...,flex fuel vehicle,114363.0,automatic,4wd,suv crossover,black,41.9351,-79.5971,2020-08-05,0
1,S2_1790203,east peoria,37988.0,2016.0,ford,f150,v6,gas,55039.0,automatic,...,gas,184388.0,cvt,,sedan,green,41.9034,-88.0615,2020-08-14,0
2,S2_2689970,el paso,10995.0,2011.0,mini,cooper,i4,gas,68000.0,automatic,...,gas,113000.0,cvt,fwd,sedan,white,35.7062,-81.9413,2020-07-26,0
3,7305782445,albany,24590.0,2017.0,toyota,rav4 hybrid limited sport,,hybrid,70054.0,other,...,hybrid,70054.0,other,,other,blue,43.1,-75.23,2021-04-22T10:41:22-0400,1
4,S2_1042141,new bern,55980.0,2020.0,cadillac,xt6,v6,gas,5520.0,automatic,...,gas,9.0,automatic,awd,suv crossover,white,33.8117,-118.171,2020-02-17,0


In [13]:
# Preview the first five rows of the training feature matrix.
X_train.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,manufacturer,model,location,cylinders,year,fuel_type,traction,body_type,main_color,transmission,price,mileage,lat,lon
id_A,id_B,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
S2_2432565,S2_539849,0.0,0.0,0.0,0.0,0,0,0,0,0,1,6.70886e-40,2.0709600000000002e-39,0.0,0.0
S2_1790203,S2_559180,0.0,0.0,0.0,0.0,0,1,0,0,0,0,1.027658e-88,2.5699609999999996e-50,1.578248e-42,3.21219e-65
S2_2689970,S2_929693,0.0,0.0,0.0,1.0,0,1,1,0,0,0,0.4339178,1.486138e-06,0.0,0.0
7305782445,7310824539,1.0,1.0,0.0,0.0,1,1,0,0,1,1,0.9982686,1.0,2.71616e-06,2.769339e-62
S2_1042141,S2_2805535,0.0,0.0,0.0,0.0,1,1,0,1,0,1,9.197828e-16,0.8684464,7.22177e-48,0.0


In [14]:
# Hyperparameter tuning of the logistic-regression matcher.
# Requires X_train / X_val (feature matrices) and y_train / y_val (0/1 labels)
# to have been computed by the previous cells.

# Candidate values for the regularization strength C
c_values = [0.001, 0.01, 0.1, 1, 10, 100]
# Class-weight options
weights = [None, 'balanced']

# Start below any attainable F1 so the first fitted candidate is always kept.
# With the previous initial value of 0, a run in which every candidate scored
# F1 == 0 left best_model as None and the evaluation cell failed on
# `None.predict`.
best_f1 = -1.0
best_params = {}
best_model = None

print("--- INIZIO TUNING IPERPARAMETRI ---")
print(f"Distribuzione Training: {y_train.value_counts().to_dict()}")

start_time = time.time()

# Grid search over every (C, class_weight) combination
for c in c_values:
    for w in weights:
        # Fixed seed and generous iteration budget for reproducible fits
        model = LogisticRegression(C=c, class_weight=w, max_iter=1000, random_state=42)

        # Fit on the training pairs
        model.fit(X_train, y_train)

        # Predict on the validation pairs
        y_val_pred = model.predict(X_val)

        # Validation metrics; zero_division=0 is passed to all three so that a
        # candidate predicting no positives scores 0 without emitting warnings
        current_f1 = f1_score(y_val, y_val_pred, zero_division=0)
        current_prec = precision_score(y_val, y_val_pred, zero_division=0)
        current_rec = recall_score(y_val, y_val_pred, zero_division=0)

        print(f"C: {c:7} | Weight: {str(w):10} | F1: {current_f1:.4f} (P: {current_prec:.2f}, R: {current_rec:.2f})")

        # Keep the candidate with the highest validation F1 (first one wins ties)
        if current_f1 > best_f1:
            best_f1 = current_f1
            best_model = model
            best_params = {'C': c, 'class_weight': w}

end_time = time.time()
tuning_duration = end_time - start_time

print("\n--- RISULTATI TUNING ---")
print(f"Miglior F1-Score su Validation: {best_f1:.4f}")
print(f"Migliori Parametri: {best_params}")
print(f"Tempo impiegato: {tuning_duration:.2f} secondi")

--- INIZIO TUNING IPERPARAMETRI ---
Distribuzione Training: {0: 5948, 1: 2974}
C:   0.001 | Weight: None       | F1: 0.9760 (P: 1.00, R: 0.96)
C:   0.001 | Weight: balanced   | F1: 0.9859 (P: 0.98, R: 0.99)
C:    0.01 | Weight: None       | F1: 0.9882 (P: 0.99, R: 0.99)
C:    0.01 | Weight: balanced   | F1: 0.9883 (P: 0.98, R: 0.99)
C:     0.1 | Weight: None       | F1: 0.9906 (P: 0.99, R: 0.99)
C:     0.1 | Weight: balanced   | F1: 0.9891 (P: 0.98, R: 0.99)
C:       1 | Weight: None       | F1: 0.9929 (P: 0.99, R: 0.99)
C:       1 | Weight: balanced   | F1: 0.9891 (P: 0.98, R: 1.00)
C:      10 | Weight: None       | F1: 0.9914 (P: 0.99, R: 0.99)
C:      10 | Weight: balanced   | F1: 0.9875 (P: 0.98, R: 1.00)
C:     100 | Weight: None       | F1: 0.9914 (P: 0.99, R: 0.99)
C:     100 | Weight: balanced   | F1: 0.9875 (P: 0.98, R: 1.00)

--- RISULTATI TUNING ---
Miglior F1-Score su Validation: 0.9929
Migliori Parametri: {'C': 1, 'class_weight': None}
Tempo impiegato: 0.92 secondi


In [15]:
# Final evaluation on the test split (deliverable 4.H).
print("\n--- VALUTAZIONE FINALE SUL TEST SET (DATI MAI VISTI) ---")

# Labels and feature matrix for the test pairs
y_test = gt_test['label']
test_pairs = gt_test.set_index(['id_A', 'id_B']).index
X_test = compare.compute(test_pairs, df_unificato, df_unificato)

# Predictions from the model selected during tuning
y_test_pred = best_model.predict(X_test)

# Detailed per-class report required for deliverable 4.H
test_report = classification_report(y_test, y_test_pred)
print(test_report)


--- VALUTAZIONE FINALE SUL TEST SET (DATI MAI VISTI) ---
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1275
           1       0.99      0.99      0.99       638

    accuracy                           0.99      1913
   macro avg       0.99      0.99      0.99      1913
weighted avg       0.99      0.99      0.99      1913



In [16]:
# Persist the selected model to disk.
model_dir = "../models"
model_path = os.path.join(model_dir, "recordlinkage_model.joblib")

# Create the target directory if it does not exist yet, then serialize.
os.makedirs(model_dir, exist_ok=True)
joblib.dump(best_model, model_path)

print(f"Modello salvato con successo in: {model_path}")

Modello salvato con successo in: ../models/recordlinkage_model.joblib
