In [1]:
import sparker
import pandas as pd

# Generalized supervised meta-blocking
Generalized Supervised Meta-blocking employs the probability provided by a probabilistic classifier to score the edges.

## Load the data

First, load the two sources of a clean dataset together with its ground truth.

In [2]:
# All three JSON files of the dataset live under the same folder.
dataset = "datasets/DblpAcm"
base_dir = '/data2/luca/ER/' + dataset

# First source
profiles1 = sparker.JSONWrapper.load_profiles(
    base_dir + '/dataset1.json',
    real_id_field="realProfileID",
    source_id=1,
)
# Highest profile id of the first source; the second source is numbered right after it.
separator_id = profiles1.map(lambda p: p.profile_id).max()
separator_ids = [separator_id]

# Second source, with ids starting at separator_id + 1
profiles2 = sparker.JSONWrapper.load_profiles(
    base_dir + '/dataset2.json',
    start_id_from=separator_id + 1,
    real_id_field="realProfileID",
    source_id=2,
)
max_profile_id = profiles2.map(lambda p: p.profile_id).max()

# Both sources merged into a single collection of profiles
profiles = profiles1.union(profiles2)

# Ground truth, loaded from the 'id1'/'id2' fields and then converted using the two sources
gt = sparker.JSONWrapper.load_groundtruth(base_dir + '/groundtruth.json', 'id1', 'id2')
new_gt = sparker.Converters.convert_groundtruth(gt, profiles1, profiles2)

## Blocking 

Perform the blocking using standard token blocking.

In [3]:
# Build the blocks over the union of both sources. separator_ids holds the highest
# profile id of the first source (computed in the data-loading cell).
blocks = sparker.Blocking.create_blocks(profiles, separator_ids)

## Block cleaning
Apply block-cleaning techniques (block purging and block filtering) to remove superfluous comparisons.

In [4]:
# Block purging, called with a parameter of 1.025
blocks_purged = sparker.BlockPurging.block_purging(blocks, 1.025)

# Block filtering, called with a parameter of 0.8; the call returns three values
profile_blocks, profile_blocks_filtered, blocks_after_filtering = (
    sparker.BlockFiltering.block_filtering_quick(blocks_purged, 0.8, separator_ids)
)

## Features generation
Generate the feature set for each pair of entity profiles that co-occur in at least one block (i.e., the edges of the meta-blocking graph).

In [5]:
# Build the per-pair feature table from the filtered blocks. The ground truth is passed
# in as well, presumably so that each pair can be labelled (the table has an is_match column).
# NOTE(review): the meaning of the trailing positional False is not visible here;
# confirm against FeatureGenerator.generate_features.
features = sparker.FeatureGenerator.generate_features(profiles, blocks_after_filtering, separator_ids, new_gt, False)

In [6]:
# Preview the first 10 rows of the feature table
features.show(10)

+---+----+---------+------------+------------+---------+---------+-----------+----------+-----------+------------+--------+
| p1|  p2|    cfibf|       raccb|          js|numCompP1|numCompP2|         rs|      aejs|        nrs|         wjs|is_match|
+---+----+---------+------------+------------+---------+---------+-----------+----------+-----------+------------+--------+
|  0|2816|41.659367|0.0034722222|0.0035714286|      143|       90|0.029411765|0.99048966|0.014045067|0.0017414601|       0|
|  0|4167|43.542595|0.0034722222|0.0038610038|      143|       87|0.029411765| 1.1286342|0.013155922|0.0010594048|       0|
|  0|3399| 41.13539|0.0034722222| 0.003021148|      143|      105|0.029411765|0.79306006|0.020184455|0.0025181086|       0|
|  0|3340|41.659367|0.0034722222|0.0041841003|      143|       66|0.029411765| 1.1582178|0.009341111| 7.561535E-4|       0|
|  0|4241| 45.18775|0.0034722222| 0.004672897|      143|       43|0.029411765| 1.4860764|0.017673213| 0.002070604|       0|
|  0|424

## Scoring the edges
By using a probabilistic classifier (logistic regression) we assign to each pair (edge of the meta-blocking graph) the probability of being a match.

In [7]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
import pyspark.sql.functions as f
from pyspark.sql.types import FloatType, BooleanType

### Generation of the feature vector for the classifier

In [8]:
# Features to employ
features_set = ["cfibf", "raccb", "js", "numCompP1", "numCompP2", "rs", "aejs", "nrs", "wjs"]

va = VectorAssembler(inputCols=features_set, outputCol="features")

df = va.transform(features)

### Split the data into train/test
This generates a balanced training set in which the number of positive and negative samples is approximately the same.

In [9]:
# Target number of training samples per class. randomSplit assigns rows
# probabilistically, so the actual size of each training split is only
# approximately n_samples.
n_samples = 20
# Fixed seed so the split (and every score computed from it) is reproducible
# across notebook runs.
split_seed = 42

# Sampling of matching pairs
matches = df.where("is_match == 1")
# Fraction of the class sent to training, clamped to at most 1.0: a class with
# fewer than n_samples rows goes entirely to training instead of producing a
# negative test weight, and an empty class no longer divides by zero.
m_n = min(1.0, n_samples / max(matches.count(), 1))
m_train, m_test = matches.randomSplit([m_n, 1 - m_n], seed=split_seed)

# Sampling of non-matching pairs (same procedure)
non_matches = df.where("is_match == 0")
nm_n = min(1.0, n_samples / max(non_matches.count(), 1))
nm_train, nm_test = non_matches.randomSplit([nm_n, 1 - nm_n], seed=split_seed)

# Train/Test: each set is the union of the per-class splits
train = m_train.union(nm_train)
test = m_test.union(nm_test)

### Train the classifier and get the probabilities

In [10]:
lr = LogisticRegression(
    featuresCol='features',
    labelCol='is_match',
    predictionCol='prediction',
    maxIter=1000,
    probabilityCol='probability',
)
# Training
model = lr.fit(train)
# Score the pairs of the test split
predictions = model.transform(test)

# Keep, for each pair (edge), the probability of being a match:
# element 1 of the classifier's probability vector.
get_p_match = f.udf(lambda v: float(v[1]), FloatType())
edges = (
    predictions
    .withColumn("p_match", get_p_match("probability"))
    .select("p1", "p2", "p_match", "is_match")
)

# Materialize the scored edges once. They feed every pruning step below; without
# caching, each action re-runs the whole lineage (split, scoring, UDF).
# NOTE(review): the KeyError raised inside SupervisedMB.blast on a per-profile lookup
# is consistent with the edge set differing between evaluations; pinning it in the
# cache should give every pass the same edges. Confirm by re-running the blast cell.
edges.cache()
edges.count()

## Perform the pruning
Perform the pruning and get the scores

In [11]:
# WEP pruning of the scored edges, then evaluation against the ground truth
pruned_edges = sparker.SupervisedMB.wep(edges)
pc, pq, f1 = sparker.SupervisedMB.get_stats(pruned_edges, new_gt)
print("Recall", pc)
print("Precision", pq)
print("F1", f1)

Recall 0.9680755395683454
Precision 0.6123435722411832
F1 0.7501742160278746


In [12]:
# BLAST pruning of the scored edges, then evaluation against the ground truth
pruned_edges = sparker.SupervisedMB.blast(edges)
pc, pq, f1 = sparker.SupervisedMB.get_stats(pruned_edges, new_gt)
print("Recall", pc)
print("Precision", pq)
print("F1", f1)

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/data2/luca/spark/spark-3.0.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 605, in main
    process()
  File "/data2/luca/spark/spark-3.0.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 597, in process
    serializer.dump_stream(out_iter, outfile)
  File "/data2/luca/spark/spark-3.0.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 223, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/data2/luca/spark/spark-3.0.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 141, in dump_stream
    for obj in iterator:
  File "/data2/luca/spark/spark-3.0.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 212, in _batched
    for item in iterator:
  File "/data2/luca/spark/spark-3.0.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 450, in mapper
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/data2/luca/spark/spark-3.0.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 450, in <genexpr>
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/data2/luca/spark/spark-3.0.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 90, in <lambda>
    return lambda *a: f(*a)
  File "/data2/luca/spark/spark-3.0.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/util.py", line 107, in wrapper
    return f(*args, **kwargs)
  File "/data2/luca/gnn/features/sparker/supervised_metablocking.py", line 35, in do_pruning
    threshold = 0.35 * (profiles1_max_proba.value[p1] + profiles2_max_proba.value[p2])
KeyError: 753


In [None]:
# RCNP pruning (also needs the profiles and the blocks), then evaluation against the ground truth
pruned_edges = sparker.SupervisedMB.rcnp(edges, profiles, blocks)
pc, pq, f1 = sparker.SupervisedMB.get_stats(pruned_edges, new_gt)
print("Recall", pc)
print("Precision", pq)
print("F1", f1)

In [None]:
# CNP pruning (also needs the profiles and the blocks), then evaluation against the ground truth
pruned_edges = sparker.SupervisedMB.cnp(edges, profiles, blocks)
pc, pq, f1 = sparker.SupervisedMB.get_stats(pruned_edges, new_gt)
print("Recall", pc)
print("Precision", pq)
print("F1", f1)