Add --swap_source_target and --drop_invalid args for convenience

joernhees · joernhees · commit 9a60ce40a03a · 2019-04-09T16:47:40.000+02:00
diff --git a/gp_learner.py b/gp_learner.py
@@ -1682,6 +1682,8 @@ def main(
         splitting_variant='random',
         train_filename=None,
         test_filename=None,
+        swap_source_target=False,
+        drop_invalid=False,
         init_patterns_filename=None,
         print_train_test_sets=True,
         reset=False,
@@ -1708,14 +1710,18 @@ def main(
     timer_start = datetime.utcnow()
     main_start = timer_start
 
+    gsa = partial(
+        get_semantic_associations,
+        swap_source_target=swap_source_target,
+        drop_invalid=drop_invalid,
+    )
     if not train_filename and not test_filename:
         # get semantic association pairs and split in train and test sets
-        semantic_associations = get_semantic_associations(associations_filename)
+        semantic_associations = gsa(associations_filename)
         assocs_train, assocs_test = split_training_test_set(
             semantic_associations, variant=splitting_variant
         )
     else:
-        gsa = get_semantic_associations
         assocs_train = gsa(train_filename) if train_filename else []
         assocs_test = gsa(test_filename) if test_filename else []
         if predict == 'train_set':
diff --git a/ground_truth_tools.py b/ground_truth_tools.py
@@ -81,7 +81,9 @@ def URIRefify(links):
     return tuple([URIRef(l) for l in links])
 
 
-def get_semantic_associations(fn=None, limit=None):
+def get_semantic_associations(
+        fn=None, limit=None, swap_source_target=False, drop_invalid=False
+):
     if not fn:
         verified_mappings = get_verified_mappings()
         semantic_associations = get_dbpedia_pairs_from_mappings(
@@ -105,7 +107,31 @@ def get_semantic_associations(fn=None, limit=None):
                     break
                 source = from_n3(row['source'].decode('UTF-8'))
                 target = from_n3(row['target'].decode('UTF-8'))
-                semantic_associations.append((source, target))
+
+                for x in (source, target):
+                    # noinspection PyBroadException
+                    try:
+                        x.n3()
+                    except Exception:
+                        if drop_invalid:
+                            logger.warning(
+                                'ignoring ground truth pair %r: %r cannot be '
+                                'serialized as N3',
+                                (row['source'], row['target']), x
+                            )
+                            break
+                        else:
+                            logger.error(
+                                'error in ground truth pair %r: %r cannot be '
+                                'serialized as N3',
+                                (row['source'], row['target']), x
+                            )
+                            raise
+                else:
+                    semantic_associations.append((source, target))
+    if swap_source_target:
+        logger.info('swapping all (source, target) pairs: (s,t) --> (t,s)')
+        semantic_associations = [(t, s) for s, t in semantic_associations]
     return semantic_associations
 
 
diff --git a/run.py b/run.py
@@ -64,6 +64,22 @@
         default=None,
     )
 
+    parser.add_argument(
+        "--swap_source_target",
+        help="allows to turn the ground truth source-target-pairs around for "
+             "all following considerations: (s,t) --> (t,s)",
+        action="store_true",
+        default=False,
+    )
+    parser.add_argument(
+        "--drop_invalid",
+        help="drops invalid ground truth source-target-pairs (i.e., invalid N3 "
+             "pairs (e.g., due to bad (URI) encoding)). Will still warn about "
+             "them.",
+        action="store_true",
+        default=False,
+    )
+
     parser.add_argument(
         "--print_train_test_sets",
         help="prints the sets used for training and testing",