Skip to content

Commit

Permalink
Added code for r:0.1.5
Browse files Browse the repository at this point in the history
Added
- SCANN/FAISS
- RowColumnClustering
- Schema Matching & Clustering component

Co-Authored-By: Jakub Maciejewski <71031279+JacobMaciejewski@users.noreply.github.com>
  • Loading branch information
Nikoletos-K and JacobMaciejewski committed Jan 15, 2024
1 parent ba566f2 commit 6bd3b25
Show file tree
Hide file tree
Showing 9 changed files with 495 additions and 46 deletions.
2 changes: 1 addition & 1 deletion docs/tutorials/SchemaMatching.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@
"metadata": {},
"outputs": [],
"source": [
"from pyjedai.schema_matching import ValentineMethodBuilder, ValentineSchemaMatching\n",
"from pyjedai.schema.matching import ValentineMethodBuilder, ValentineSchemaMatching\n",
"\n",
"sm = ValentineSchemaMatching(ValentineMethodBuilder.cupid_matcher())\n",
"sm.process(data)\n"
Expand Down
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "pyjedai"
version = "0.1.4"
version = "0.1.5"
description = "An open-source library that builds powerful end-to-end Entity Resolution workflows."
readme = "README.md"
authors = [
Expand Down Expand Up @@ -67,7 +67,8 @@ dependencies = [
"ordered-set >= 4.0",
"plotly >= 5.16.0",
"shapely >= 2.0",
'scann ; platform_system == "Linux"'
'scann ; platform_system == "Linux"',
'falconn ; platform_system == "Linux"'
]

[project.optional-dependencies]
Expand Down
2 changes: 1 addition & 1 deletion src/pyjedai/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.1.4"
__version__ = "0.1.5"
155 changes: 154 additions & 1 deletion src/pyjedai/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from collections import defaultdict
import random
from ordered_set import OrderedSet
import math

RANDOM_SEED = 42

Expand Down Expand Up @@ -320,10 +321,15 @@ def stats(self) -> None:

class AbstractClustering(PYJEDAIFeature):

_method_name: str = "Abstract Clustering"
_method_short_name: str = "AC"
_method_info: str = "Abstract Clustering Method"

def __init__(self) -> None:
super().__init__()
self.data: Data
self.similarity_threshold: float = 0.1
self.execution_time: float = 0.0

def evaluate(self,
prediction,
Expand Down Expand Up @@ -367,6 +373,9 @@ def evaluate(self,
def stats(self) -> None:
pass

def _configuration(self) -> dict:
return {}

def export_to_df(self, prediction: list) -> pd.DataFrame:
"""creates a dataframe for the evaluation report
Expand Down Expand Up @@ -769,6 +778,7 @@ class CorrelationClustering(AbstractClustering):
"In essence, it implements iterative clustering, " + \
"reassigning clusters to randomly chosen entities based on the reassignment's effect on our objective function " + \
"that evaluates the quality of the newly defined clusters."

def __init__(self) -> None:
super().__init__()
self.similarity_threshold: float
Expand All @@ -777,6 +787,7 @@ def __init__(self) -> None:
self.non_similarity_threshold : float
self.move_limit : int
self.lsi_iterations: int

def process(self,
graph: Graph,
data: Data,
Expand Down Expand Up @@ -1214,7 +1225,7 @@ def process(self,
for edge in edges:
man, woman, similarity = edge.left_node, edge.right_node, edge.similarity
new_graph.add_edge(man, woman, weight=similarity)

clusters = list(connected_components(new_graph))
self.execution_time = time() - start_time
return clusters
Expand Down Expand Up @@ -1431,5 +1442,147 @@ def process(self,
self.execution_time = time() - start_time
return clusters

def _configuration(self) -> dict:
return {}


class RowColumnClustering(AbstractClustering):
"""Ιmplements the Row Column Clustering algorithm. For each row and column find their equivalent
column and row respectively corresponding to the smallest similarity. Subsequently, chooses
either rows or columns dependent on which one has the highest out of the lowest similariities
on average.
"""

_method_name: str = "Row Column Clustering"
_method_short_name: str = "RCC"
_method_info: str = "Ιmplements the Row Column Clustering algorithm," + \
"In essence, it is a 3/2-approximation to the Maximum Stable Marriage (MSM) problem."
def __init__(self) -> None:
super().__init__()
self.similarity_threshold : float

def process(self,
graph: Graph,
data: Data,
similarity_threshold: float = 0.5) -> list:

start_time = time()
self.similarity_threshold : float = similarity_threshold
self.data = data
number_of_comparisons : int = len(graph.edges(data=True))
self.similarity = lil_matrix((self.data.num_of_entities_1, self.data.num_of_entities_2), dtype=float)
matched_ids = set()
new_graph : Graph = Graph()
clusters : list = []

if(number_of_comparisons == 0):
return clusters

if self.data.is_dirty_er:
raise ValueError(f"Kiraly MSM Approximate Clustering doesn't support Dirty ER.")

for (v1, v2, data) in graph.edges(data=True):
d1_id, d2_id = self.sorted_indicators(v1, v2)
d1_index, d2_index = (self.id_to_index(d1_id), self.id_to_index(d2_id))
similarity_score = data.get('weight', 0)

if(similarity_score > self.similarity_threshold):
self.similarity[d1_index, d2_index] = similarity_score

self.initialize(self.get_negative(self.similarity))
self.solution_proxy = self.get_solution()

for entry in range(len(self.solution_proxy)):
d1_index = entry
d2_index = self.solution_proxy[entry]
_similarity = self.similarity[d1_index, d2_index]
if(_similarity < self.similarity_threshold):
continue
d2_index += self.data.dataset_limit

if(d1_index in matched_ids or d2_index in matched_ids):
continue

matched_ids.add(d1_index)
matched_ids.add(d2_index)
new_graph.add_edge(d1_index, d2_index, weight=_similarity)


clusters = list(connected_components(new_graph))
self.execution_time = time() - start_time
return clusters

def get_min_row(self, column):
position = -1
minimum = math.inf

for row in range(self.similarity.shape[0]):
if(self.row_covered[row]): continue
if(self.similarity[row, column] < minimum):
position = row
minimum = self.similarity[row, column]

return position

def get_min_column(self, row):
position = -1
minimum = math.inf

for column in range(self.similarity.shape[1]):
if(self.column_covered[column]): continue
if(self.similarity[row, column] < minimum):
position = column
minimum = self.similarity[row, column]

return position

def get_row_assignment(self):
self.row_scan_cost = 0.0

for row in range(self.similarity.shape[0]):
self.selected_column[row] = self.get_min_column(row)
_selected_column = self.selected_column[row]
if(_selected_column == -1): break
self.column_covered[_selected_column] = True
self.row_scan_cost += self.similarity[row, _selected_column]

def get_column_assignment(self):
self.column_scan_cost = 0.0

for column in range(self.similarity.shape[1]):
self.selected_row[column] = self.get_min_row(column)
_selected_row = self.selected_row[column]
if(_selected_row == -1): break
self.columns_from_selected_row[_selected_row] = column
self.row_covered[_selected_row] = True
self.column_scan_cost += self.similarity[_selected_row, column]

def get_solution(self):
self.get_row_assignment()
self.get_column_assignment()

if(self.row_scan_cost < self.column_scan_cost):
return self.selected_column
else:
return self.columns_from_selected_row

def get_negative(self, similarity_matrix) -> np.array:
self.negative_similarity = lil_matrix((self.data.num_of_entities_1, self.data.num_of_entities_2), dtype=float)

for row in range(similarity_matrix.shape[0]):
for column in range(similarity_matrix.shape[1]):
self.negative_similarity[row, column] = 1.0 - similarity_matrix[row, column]

return self.negative_similarity

def initialize(self, similarity_matrix) -> None:
self.similarity = similarity_matrix
self.selected_column = [0] * similarity_matrix.shape[0]
self.selected_row = [0] * similarity_matrix.shape[1]
self.row_covered = [False] * similarity_matrix.shape[0]
self.column_covered = [False] * similarity_matrix.shape[1]

self.columns_from_selected_row = [0] * similarity_matrix.shape[0]

def _configuration(self) -> dict:
return {}
Loading

0 comments on commit 6bd3b25

Please sign in to comment.