In [1]:
import sparker
import pandas as pd

# Generalized supervised meta-blocking
Generalized Supervised Meta-blocking employs the probability provided by a probabilistic classifier to score the edges.

## Load the data

First, load a clean dataset together with its groundtruth.

In [2]:
# --- First source -----------------------------------------------------------
profiles1 = sparker.JSONWrapper.load_profiles(
    '../datasets/clean/DblpAcm/dataset1.json',
    real_id_field="realProfileID",
    source_id=1,
)

# Highest profile id assigned to the first source; it marks the boundary
# between the two sources and is passed to the later blocking steps.
separator_id = profiles1.map(lambda p: p.profile_id).max()
separator_ids = [separator_id]

# --- Second source ----------------------------------------------------------
# Ids of the second source start right after the last id of the first one.
profiles2 = sparker.JSONWrapper.load_profiles(
    '../datasets/clean/DblpAcm/dataset2.json',
    start_id_from=separator_id + 1,
    real_id_field="realProfileID",
    source_id=2,
)
max_profile_id = profiles2.map(lambda p: p.profile_id).max()

# Single collection holding the profiles of both sources
profiles = profiles1.union(profiles2)

# --- Groundtruth ------------------------------------------------------------
# Load the groundtruth pairs (fields 'id1' / 'id2') and convert them with
# respect to the loaded profiles.
gt = sparker.JSONWrapper.load_groundtruth(
    '../datasets/clean/DblpAcm/groundtruth.json', 'id1', 'id2'
)
new_gt = sparker.Converters.convert_groundtruth(gt, profiles1, profiles2)

## Blocking 

Perform the blocking using standard token blocking.

In [3]:
# Build the blocks from all profiles; separator_ids tells sparker where the
# first source ends.
blocks = sparker.Blocking.create_blocks(
    profiles,
    separator_ids,
)

## Block cleaning
Apply some block cleaning techniques to remove superfluous comparisons.

In [4]:
# Step 1 - block purging (second argument is the purging parameter, here 1.025)
blocks_purged = sparker.BlockPurging.block_purging(blocks, 1.025)

# Step 2 - block filtering (second argument is the filtering parameter, here 0.8).
# The call returns three values; only blocks_after_filtering is used by the
# feature generation below.
profile_blocks, profile_blocks_filtered, blocks_after_filtering = (
    sparker.BlockFiltering.block_filtering_quick(
        blocks_purged,
        0.8,
        separator_ids,
    )
)

## Features generation
Generate the feature set for each pair of entity profiles that co-occur in at least one block (i.e. the edges of the meta-blocking graph).

In [5]:
# Compute one feature row per candidate pair, labelled with the groundtruth.
# NOTE(review): the meaning of the trailing False flag is not visible in this
# notebook - check FeatureGenerator.generate_features before changing it.
features = sparker.FeatureGenerator.generate_features(
    profiles,
    blocks_after_filtering,
    separator_ids,
    new_gt,
    False,
)

In [6]:
# Preview the first 10 feature rows
features.show(n=10)

+---+----+---------+------------+------------+---------+---------+----------+----------+------------+------------+--------+
| p1|  p2|    cfibf|       raccb|          js|numCompP1|numCompP2|        rs|      aejs|         nrs|         wjs|is_match|
+---+----+---------+------------+------------+---------+---------+----------+----------+------------+------------+--------+
|  0|2880|43.542595|0.0011947431|0.0036764706|      143|       86|0.01724138| 1.0872893| 0.006724811|2.8883503E-4|       0|
|  0|2689|42.852882|0.0011947431|0.0036630037|      143|       84|0.01724138| 1.1100789| 0.011110576|8.1310613E-4|       0|
|  0|4610|42.852882|0.0011947431|0.0031055901|      143|      151|0.01724138| 1.0214063| 0.013291508|0.0010024176|       0|
|  0|3329| 39.37928|0.0011947431|0.0028735632|      143|      151|0.01724138| 0.7247032| 0.004989114|2.2629971E-4|       0|
|  0|2883| 45.18775|0.0011947431|0.0034722222|      143|      112|0.01724138| 1.1792971|   0.0170716| 0.001097521|       0|
|  0|487

## Scoring the edges
By using a probabilistic classifier (logistic regression) we assign to each pair (edge of the meta-blocking graph) the probability of being a match.

In [7]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
import pyspark.sql.functions as f
from pyspark.sql.types import FloatType, BooleanType

### Generation of the feature vector for the classifier

In [8]:
# Features to employ
features_set = ["cfibf", "raccb", "js", "numCompP1", "numCompP2", "rs", "aejs", "nrs", "wjs"]

va = VectorAssembler(inputCols=features_set, outputCol="features")

df = va.transform(features)

### Split the data in train/test
This will generate a balanced training set in which the number of positive and negative samples is approximately the same (Spark's sampling does not guarantee exact counts).

In [9]:
# Target number of training samples per class. Spark's randomSplit is
# probabilistic, so the actual number of sampled rows is only approximately
# this value.
n_samples = 20

# Fixed seed so that the split (and therefore the scores reported below) can be
# reproduced when the notebook is re-run on the same data.
split_seed = 42

# Sampling of matching pairs
matches = df.where("is_match == 1")
n_matches = matches.count()
if n_matches == 0:
    raise ValueError("No matching pairs found: cannot build a training set")
# Clamp the fraction to 1 so that the complementary weight is never negative
# when the class has fewer than n_samples rows.
m_n = min(1.0, n_samples / n_matches)
m_train, m_test = matches.randomSplit([m_n, 1 - m_n], seed=split_seed)

# Sampling of non-matching pairs
non_matches = df.where("is_match == 0")
n_non_matches = non_matches.count()
if n_non_matches == 0:
    raise ValueError("No non-matching pairs found: cannot build a training set")
nm_n = min(1.0, n_samples / n_non_matches)
nm_train, nm_test = non_matches.randomSplit([nm_n, 1 - nm_n], seed=split_seed)

# Train/Test: the training set is the union of the two small samples, the test
# set is everything that was not sampled for training.
train = m_train.union(nm_train)
test = m_test.union(nm_test)

### Training the classifier and get the probabilities

In [10]:
# Logistic regression over the assembled feature vector, with `is_match` as label
lr = LogisticRegression(
    maxIter=1000,
    featuresCol='features',
    labelCol='is_match',
    predictionCol='prediction',
    probabilityCol='probability',
)

# Fit on the training sample, then score the test pairs
model = lr.fit(train)
predictions = model.transform(test)

# Element 1 of the probability vector is taken as the probability of the pair
# (edge) being a match; it is exposed as a plain float column.
get_p_match = f.udf(lambda prob: float(prob[1]), FloatType())
edges = (
    predictions
    .withColumn("p_match", get_p_match("probability"))
    .select("p1", "p2", "p_match", "is_match")
)

# Keep the scored edges cached (they are reused by every pruning cell) and
# trigger the computation by counting them.
edges.cache()
edges.count()

178234

## Perform the pruning
Perform the pruning and get the scores

In [11]:
# Prune the scored edges with the `wep` strategy and evaluate against the groundtruth
pruned_edges = sparker.SupervisedMB.wep(edges)
pc, pq, f1 = sparker.SupervisedMB.get_stats(pruned_edges, new_gt)
for label, value in (("Recall ", pc), ("Precision ", pq), ("F1 ", f1)):
    print(label + str(value))

Recall 0.9806654676258992
Precision 0.5608125482129082
F1 0.7135612628823818


In [12]:
# Prune the scored edges with the `blast` strategy and evaluate against the groundtruth
pruned_edges = sparker.SupervisedMB.blast(edges)
pc, pq, f1 = sparker.SupervisedMB.get_stats(pruned_edges, new_gt)
for label, value in (("Recall ", pc), ("Precision ", pq), ("F1 ", f1)):
    print(label + str(value))

Recall 0.9824640287769785
Precision 0.519990480723465
F1 0.6800497976968566


In [13]:
# Prune the scored edges with the `rcnp` strategy and evaluate against the groundtruth.
# NOTE(review): `blocks` here is the raw blocking output, while the features were
# generated from `blocks_after_filtering` - confirm this is the intended input.
pruned_edges = sparker.SupervisedMB.rcnp(edges, profiles, blocks)
pc, pq, f1 = sparker.SupervisedMB.get_stats(pruned_edges, new_gt)
for label, value in (("Recall ", pc), ("Precision ", pq), ("F1 ", f1)):
    print(label + str(value))

Recall 0.9824640287769785
Precision 0.512670107930549
F1 0.6737588652482269


In [14]:
# Prune the scored edges with the `cnp` strategy and evaluate against the groundtruth.
# NOTE(review): `blocks` here is the raw blocking output, while the features were
# generated from `blocks_after_filtering` - confirm this is the intended input.
pruned_edges = sparker.SupervisedMB.cnp(edges, profiles, blocks)
pc, pq, f1 = sparker.SupervisedMB.get_stats(pruned_edges, new_gt)
for label, value in (("Recall ", pc), ("Precision ", pq), ("F1 ", f1)):
    print(label + str(value))

Recall 0.9824640287769785
Precision 0.5124296435272045
F1 0.6735511713933416
