In [14]:
from valentine import valentine_match, valentine_metrics
from valentine.algorithms import Coma
from valentine.algorithms import Cupid
import os
import multiprocessing as mp
from multiprocessing import Pool
import time
import pandas as pd
from compute_match import *

# Data frame preparation

In [15]:
# Directory that holds the JSON datasets to be matched against each other.
path = "../test_datasets"

# Load every JSON dataset found in `path` into a DataFrame, keyed by file name.
# os.listdir() returns entries in arbitrary order and also lists sub-directories
# and unrelated files (e.g. .ipynb_checkpoints, .DS_Store), so the listing is
# sorted for a reproducible pairing order and restricted to regular *.json files.
data_frames = {}
for file in sorted(os.listdir(path)):
    filepath = os.path.join(path, file)
    if not (os.path.isfile(filepath) and file.lower().endswith(".json")):
        continue  # skip directories and anything pd.read_json cannot parse
    data_frames[file] = pd.read_json(filepath)

In [16]:
# Build every unordered pair of data frames exactly once: a frame is never
# paired with itself and (b, a) is not emitted when (a, b) already was.
# Each entry is (left file name, right file name, left frame, right frame).
keys = list(data_frames.keys())
dfs_len = len(keys)
tuples = [
    (left_key, right_key, data_frames[left_key], data_frames[right_key])
    for left_pos, left_key in enumerate(keys)
    for right_key in keys[left_pos + 1:]
]

# Valentine *-based evaluation

## Coma (schema-based)

In [17]:
# Run calculate_match_coma_schema on every data-frame pair, distributing the
# pairs over a pool of worker processes (Pool() defaults to one per CPU).
with Pool() as workers:
    result = workers.map(calculate_match_coma_schema, tuples)
print("Program finished!")

Program finished!


In [18]:
# Valentine matching evaluation: score the matches of each pair (third element
# of every entry in `result`) against the expected column correspondences.
# NOTE(review): the same ground truth is used for every pair - confirm this
# still holds if more datasets are added.
ground_truth = [('name', 'name'), ('country', 'country')]
metrics = [
    valentine_metrics.all_metrics(pair_result[2], ground_truth)
    for pair_result in result
]
print(metrics)


[{'precision': 0.6666666666666666, 'recall': 1.0, 'f1_score': 0.8, 'precision_at_10_percent': 1.0, 'precision_at_30_percent': 1.0, 'precision_at_50_percent': 1.0, 'precision_at_70_percent': 0.6666666666666666, 'precision_at_90_percent': 0.6666666666666666, 'recall_at_sizeof_ground_truth': 1.0}]


In [19]:
# Dump the raw output of the calculate_match_coma_schema run: one
# (left file name, right file name, matches) entry per data-frame pair.
print(result)

[('DDD-cbinsight.com.json', 'DDD-companiesmarketcap.com.json', {(('table_1', 'country'), ('table_2', 'country')): 0.7887501, (('table_1', 'name'), ('table_2', 'name')): 0.7792963, (('table_1', 'stage'), ('table_2', 'share_price')): 0.28008994})]


## Coma (instance-based)

In [20]:
# Run calculate_match_coma_instance on every data-frame pair, distributing the
# pairs over a pool of worker processes (Pool() defaults to one per CPU).
with Pool() as workers:
    result_inst = workers.map(calculate_match_coma_instance, tuples)
print("Program finished!")

Program finished!


In [21]:
# Valentine matching evaluation: score the matches of each pair (third element
# of every entry in `result_inst`) against the expected column correspondences.
# NOTE(review): the same ground truth is used for every pair - confirm this
# still holds if more datasets are added.
ground_truth = [('name', 'name'), ('country', 'country')]
metrics = [
    valentine_metrics.all_metrics(pair_result[2], ground_truth)
    for pair_result in result_inst
]
print(metrics)


[{'precision': 0, 'recall': 0.0, 'f1_score': 0, 'precision_at_10_percent': 0, 'precision_at_30_percent': 0, 'precision_at_50_percent': 0, 'precision_at_70_percent': 0, 'precision_at_90_percent': 0, 'recall_at_sizeof_ground_truth': 0.0}]


In [22]:
# Dump the raw output of the calculate_match_coma_instance run: one
# (left file name, right file name, matches) entry per data-frame pair.
print(result_inst)

[('DDD-cbinsight.com.json', 'DDD-companiesmarketcap.com.json', {})]


# Cupid

In [23]:
# Run calculate_match_cupid on every data-frame pair, distributing the
# pairs over a pool of worker processes (Pool() defaults to one per CPU).
with Pool() as workers:
    result_cupid = workers.map(calculate_match_cupid, tuples)
print("Program finished!")

Program finished!


In [24]:
# Valentine matching evaluation: score the matches of each pair (third element
# of every entry in `result_cupid`) against the expected column correspondences.
# NOTE(review): the same ground truth is used for every pair - confirm this
# still holds if more datasets are added.
ground_truth = [('name', 'name'), ('country', 'country')]
metrics = [
    valentine_metrics.all_metrics(pair_result[2], ground_truth)
    for pair_result in result_cupid
]
print(metrics)


[{'precision': 1.0, 'recall': 1.0, 'f1_score': 1.0, 'precision_at_10_percent': 1.0, 'precision_at_30_percent': 0.6666666666666666, 'precision_at_50_percent': 0.5, 'precision_at_70_percent': 0.3333333333333333, 'precision_at_90_percent': 0.25, 'recall_at_sizeof_ground_truth': 1.0}]


In [25]:
# Dump the raw output of the calculate_match_cupid run: one
# (left file name, right file name, matches) entry per data-frame pair.
print(result_cupid)

[('DDD-cbinsight.com.json', 'DDD-companiesmarketcap.com.json', {(('table_1', 'name'), ('table_2', 'name')): 1.0, (('table_1', 'country'), ('table_2', 'country')): 1.0, (('table_1', 'city'), ('table_2', 'country')): 0.9000000000000001, (('table_1', 'stage'), ('table_2', 'country')): 0.8857142857142857, (('table_1', 'valuation'), ('table_2', 'share_price')): 0.8441830065359479, (('table_1', 'totalRaised'), ('table_2', 'name')): 0.8162962962962963, (('table_1', 'stage'), ('table_2', 'name')): 0.7333333333333334, (('table_1', 'industry'), ('table_2', 'country')): 0.7})]


# Distribution-based

In [26]:
# Run calculate_match_distribution on every data-frame pair, distributing the
# pairs over a pool of worker processes (Pool() defaults to one per CPU).
with Pool() as workers:
    result_dist = workers.map(calculate_match_distribution, tuples)
print("Program finished!")

Program finished!


In [27]:
# Valentine matching evaluation: score the matches of each pair (third element
# of every entry in `result_dist`) against the expected column correspondences.
# NOTE(review): the same ground truth is used for every pair - confirm this
# still holds if more datasets are added.
ground_truth = [('name', 'name'), ('country', 'country')]
metrics = [
    valentine_metrics.all_metrics(pair_result[2], ground_truth)
    for pair_result in result_dist
]
print(metrics)


[{'precision': 1.0, 'recall': 0.5, 'f1_score': 0.6666666666666666, 'precision_at_10_percent': 1.0, 'precision_at_30_percent': 1.0, 'precision_at_50_percent': 1.0, 'precision_at_70_percent': 1.0, 'precision_at_90_percent': 1.0, 'recall_at_sizeof_ground_truth': 0.5}]


In [28]:
# Dump the raw output of the calculate_match_distribution run: one
# (left file name, right file name, matches) entry per data-frame pair.
print(result_dist)

[('DDD-cbinsight.com.json', 'DDD-companiesmarketcap.com.json', {(('table_1', 'name'), ('table_2', 'name')): 0.9771106293743441})]


# Jaccard

In [29]:
# Run calculate_match_jaccard on every data-frame pair, distributing the
# pairs over a pool of worker processes (Pool() defaults to one per CPU).
with Pool() as workers:
    result_jacc = workers.map(calculate_match_jaccard, tuples)
print("Program finished!")

Program finished!


In [30]:
# Valentine matching evaluation: score the matches of each pair (third element
# of every entry in `result_jacc`) against the expected column correspondences.
# NOTE(review): the same ground truth is used for every pair - confirm this
# still holds if more datasets are added.
ground_truth = [('name', 'name'), ('country', 'country')]
metrics = [
    valentine_metrics.all_metrics(pair_result[2], ground_truth)
    for pair_result in result_jacc
]
print(metrics)


[{'precision': 0.2857142857142857, 'recall': 1.0, 'f1_score': 0.4444444444444445, 'precision_at_10_percent': 0.3333333333333333, 'precision_at_30_percent': 0.2222222222222222, 'precision_at_50_percent': 0.14285714285714285, 'precision_at_70_percent': 0.10526315789473684, 'precision_at_90_percent': 0.08, 'recall_at_sizeof_ground_truth': 0.5}]


In [31]:
# Dump the raw output of the calculate_match_jaccard run: one
# (left file name, right file name, matches) entry per data-frame pair.
print(result_jacc)

[('DDD-cbinsight.com.json', 'DDD-companiesmarketcap.com.json', {(('table_1', 'country'), ('table_2', 'country')): 0.41935483870967744, (('table_1', 'totalRaised'), ('table_2', 'marketcap')): 0.19134253450439148, (('table_1', 'totalRaised'), ('table_2', 'share_price')): 0.16418889374726717, (('table_1', 'valuation'), ('table_2', 'marketcap')): 0.07579022988505747, (('table_1', 'valuation'), ('table_2', 'share_price')): 0.04700162074554295, (('table_1', 'founded'), ('table_2', 'country')): 0.013333333333333334, (('table_1', 'name'), ('table_2', 'name')): 0.013149957775364941, (('table_1', 'stage'), ('table_2', 'country')): 0.006993006993006993, (('table_1', 'founded'), ('table_2', 'share_price')): 0.005351326198231736, (('table_1', 'founded'), ('table_2', 'rank')): 0.0050062578222778474, (('table_1', 'city'), ('table_2', 'country')): 0.003134796238244514, (('table_1', 'city'), ('table_2', 'name')): 0.0022727272727272726, (('table_1', 'founded'), ('table_2', 'change1y')): 0.00132651127534

# Similarity flooding

In [32]:
# Run calculate_match_sim on every data-frame pair, distributing the
# pairs over a pool of worker processes (Pool() defaults to one per CPU).
with Pool() as workers:
    result_sim = workers.map(calculate_match_sim, tuples)
print("Program finished!")

Program finished!


In [33]:
# Valentine matching evaluation: score the matches of each pair (third element
# of every entry in `result_sim`) against the expected column correspondences.
# NOTE(review): the same ground truth is used for every pair - confirm this
# still holds if more datasets are added.
ground_truth = [('name', 'name'), ('country', 'country')]
metrics = [
    valentine_metrics.all_metrics(pair_result[2], ground_truth)
    for pair_result in result_sim
]
print(metrics)


[{'precision': 0.5, 'recall': 1.0, 'f1_score': 0.6666666666666666, 'precision_at_10_percent': 0.25, 'precision_at_30_percent': 0.08333333333333333, 'precision_at_50_percent': 0.05, 'precision_at_70_percent': 0.03571428571428571, 'precision_at_90_percent': 0.027777777777777776, 'recall_at_sizeof_ground_truth': 1.0}]


In [34]:
# Dump the raw output of the calculate_match_sim run: one
# (left file name, right file name, matches) entry per data-frame pair.
print(result_sim)

[('DDD-cbinsight.com.json', 'DDD-companiesmarketcap.com.json', {(('table_1', 'name'), ('table_2', 'name')): 0.2338157481891429, (('table_1', 'country'), ('table_2', 'country')): 0.2338157481891429, (('table_1', 'dateJoined'), ('table_2', 'categories')): 0.1758631739154523, (('table_1', 'city'), ('table_2', 'country')): 0.17142422411225539, (('table_1', 'industry'), ('table_2', 'country')): 0.16976045013687172, (('table_1', 'stage'), ('table_2', 'change1d')): 0.15990578889806076, (('table_1', 'stage'), ('table_2', 'change1y')): 0.15990578889806076, (('table_1', 'stage'), ('table_2', 'name')): 0.15755944098405816, (('table_1', 'dateJoined'), ('table_2', 'change1d')): 0.15451140789802856, (('table_1', 'investors'), ('table_2', 'categories')): 0.15434864910173882, (('table_1', 'founded'), ('table_2', 'country')): 0.1523326562635976, (('table_1', 'country'), ('table_2', 'change1y')): 0.15145893640765137, (('table_1', 'stage'), ('table_2', 'categories')): 0.15145893640765137, (('table_1', 't