MAINT run pyupgrade (#190)
rth committed Jan 22, 2019
1 parent 403e831 · commit 5c9d722
Showing 21 changed files with 38 additions and 38 deletions.
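pyupgrade rewrites source files to use more modern Python syntax, so the hunks below are a handful of mechanical patterns repeated across the code base: regex literals become raw strings, explicit positional indices are dropped from str.format(), dict()/set() calls wrapping a comprehension become dict/set comprehensions, and redundant u'' prefixes are removed. The snippet below is an illustrative sketch of these rewrites, not code taken from this repository; the variable names and values are hypothetical, and it assumes Python 3.6+, where a non-raw escape such as '\d' already triggers a DeprecationWarning.

    import re

    # Regex literals become raw strings; '\d' in a plain string literal is an
    # invalid escape sequence and warns on Python 3.6+ (SyntaxWarning on 3.12+).
    re.match('\d+', '42')    # before
    re.match(r'\d+', '42')   # after

    # Explicit positional indices are dropped from str.format().
    msg_old = '{0} format is not supported'.format('continuous')
    msg_new = '{} format is not supported'.format('continuous')

    # dict()/set() around a comprehension becomes a dict/set comprehension, and
    # redundant u'' prefixes are removed (all str literals are unicode on Python 3).
    options = {'bind': '127.0.0.1:8000', 'workers': None}  # hypothetical values
    config_old = dict([(k, v) for k, v in options.items() if v is not None])
    config_new = {k: v for k, v in options.items() if v is not None}
    text_old = u'\ufeffdocument text'.replace(u'\ufeff', '')
    text_new = '\ufeffdocument text'.replace('\ufeff', '')

The per-file diffs that follow apply exactly these patterns to the engine modules, the test suite, and the vendored scikit-learn metrics code.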
2 changes: 1 addition & 1 deletion doc/conf.py
@@ -97,7 +97,7 @@ def __getattr__(cls, name):

# Modules for which function level galleries are created.
'doc_module' : ('freediscovery'),
-'filename_pattern': '.*\.py'
+'filename_pattern': r'.*\.py'
}

# General information about the project.
2 changes: 1 addition & 1 deletion freediscovery/cluster/base.py
@@ -22,7 +22,7 @@ def select_top_words(word_list, n=10):
for word in word_list:
word_st = st.stem(word)
if len(word_st) <= 2 or\
-re.match('\d+', word_st) or \
+re.match(r'\d+', word_st) or \
re.match('[^a-zA-Z0-9]', word_st) or\
word in COMMON_FIRST_NAMES or \
word in CUSTOM_STOP_WORDS or\
2 changes: 1 addition & 1 deletion freediscovery/engine/ingestion.py
@@ -29,7 +29,7 @@ def _list_filenames(data_dir, dir_pattern=None, file_pattern=None):

def _infer_document_id_from_path(file_path):
basename = os.path.basename
-document_id = [re.sub('\D', '', basename(el)) for el in file_path]
+document_id = [re.sub(r'\D', '', basename(el)) for el in file_path]
non_digits = [el for el in document_id if not el.isdigit()]
failed_msg = ('Warning: Could not infer document_id from file_path ({}), '
'falling back '
2 changes: 1 addition & 1 deletion freediscovery/engine/tests/test_categorize.py
@@ -36,7 +36,7 @@

fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
vect_uuid = fe.setup()
-fe.ingest(str(data_dir), file_pattern='.*\d.txt')
+fe.ingest(str(data_dir), file_pattern=r'.*\d.txt')


lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=vect_uuid, mode='w')
2 changes: 1 addition & 1 deletion freediscovery/engine/tests/test_cluster.py
@@ -26,7 +26,7 @@ def fd_setup():
dsid = fe.setup(n_features=n_features, use_hashing=False,
stop_words='english',
min_df=0.1, max_df=0.9)
-fe.ingest(data_dir, file_pattern='.*\d.txt')
+fe.ingest(data_dir, file_pattern=r'.*\d.txt')

lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=dsid, mode='w')
lsi.fit_transform(n_components=6)
2 changes: 1 addition & 1 deletion freediscovery/engine/tests/test_integration.py
@@ -39,7 +39,7 @@ def test_features_hashing(use_hashing, use_lsi, method):

fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
uuid = fe.setup(n_features=n_features, use_hashing=use_hashing)
-fe.ingest(data_dir, file_pattern='.*\d.txt')
+fe.ingest(data_dir, file_pattern=r'.*\d.txt')

ground_truth = parse_ground_truth_file(os.path.join(data_dir,
"..", "ground_truth_file.txt"))
2 changes: 1 addition & 1 deletion freediscovery/engine/tests/test_lsi.py
@@ -25,7 +25,7 @@ def test_lsi():

fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
uuid = fe.setup()
-fe.ingest(data_dir, file_pattern='.*\d.txt')
+fe.ingest(data_dir, file_pattern=r'.*\d.txt')

lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid, mode='w')
lsi_res, exp_var = lsi.fit_transform(n_components=n_components, alpha=1.0)
2 changes: 1 addition & 1 deletion freediscovery/engine/tests/test_near_duplicates.py
@@ -18,7 +18,7 @@ def fd_setup(**fe_options):
uuid = fe.setup(n_features=n_features, use_hashing=True,
stop_words='english',
**fe_options)
-fe.ingest(data_dir, file_pattern='.*\d.txt')
+fe.ingest(data_dir, file_pattern=r'.*\d.txt')
return cache_dir, uuid, fe.filenames_, fe


2 changes: 1 addition & 1 deletion freediscovery/engine/tests/test_search.py
@@ -21,7 +21,7 @@ def test_search_wrapper(kind):

fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
vect_uuid = fe.setup()
-fe.ingest(data_dir, file_pattern='.*\d.txt')
+fe.ingest(data_dir, file_pattern=r'.*\d.txt')

if kind == 'semantic':
lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=vect_uuid, mode='w')
22 changes: 11 additions & 11 deletions freediscovery/engine/tests/test_vectorizer.py
@@ -34,7 +34,7 @@ def test_feature_extraction_tokenization(analyzer, ngram_range, use_hashing):
fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
uuid = fe.setup(analyzer=analyzer, ngram_range=ngram_range,
use_hashing=use_hashing)
-fe.ingest(data_dir, file_pattern='.*\d.txt')
+fe.ingest(data_dir, file_pattern=r'.*\d.txt')

res2 = fe._load_features(uuid)
assert isinstance(res2, np.ndarray) or scipy.sparse.issparse(res2), "not an array {}".format(res2)
@@ -51,7 +51,7 @@ def test_feature_extraction_storage():

fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
uuid = fe.setup()
-fe.ingest(data_dir, file_pattern='.*\d.txt')
+fe.ingest(data_dir, file_pattern=r'.*\d.txt')
db = pd.read_pickle(os.path.join(cache_dir, 'ediscovery_cache',
uuid, 'db'))
assert 'file_path' not in db.columns
@@ -68,7 +68,7 @@ def test_feature_extraction_weighting(weighting,

fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
uuid = fe.setup(weighting=weighting, use_hashing=use_hashing)
-fe.ingest(data_dir, file_pattern='.*\d.txt')
+fe.ingest(data_dir, file_pattern=r'.*\d.txt')

res2 = fe._load_features(uuid)
assert isinstance(res2, np.ndarray) or scipy.sparse.issparse(res2), \
@@ -91,7 +91,7 @@ def test_feature_extraction_nfeatures(n_features, weighting, use_hashing):

fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
uuid = fe.setup(n_features=n_features, weighting=weighting, use_hashing=use_hashing)
-fe.ingest(data_dir, file_pattern='.*\d.txt')
+fe.ingest(data_dir, file_pattern=r'.*\d.txt')

res2 = fe._load_features(uuid)
assert isinstance(res2, np.ndarray) or scipy.sparse.issparse(res2), \
@@ -110,7 +110,7 @@ def test_search_filenames(use_hashing):

fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
uuid = fe.setup(use_hashing=use_hashing)
-fe.ingest(data_dir, file_pattern='.*\d.txt')
+fe.ingest(data_dir, file_pattern=r'.*\d.txt')

assert fe.db_ is not None

@@ -239,7 +239,7 @@ def test_sampling_filenames():
with pytest.warns(UserWarning):
# there is a warning because we don't use norm='l2'
uuid = fe.setup(use_hashing=True, **fe_pars)
-fe.ingest(data_dir, file_pattern='.*\d.txt')
+fe.ingest(data_dir, file_pattern=r'.*\d.txt')
X = fe._load_features(uuid)

# don't use any sampling
@@ -313,7 +313,7 @@ def test_feature_extraction_cyrillic(use_hashing):

fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
uuid = fe.setup(use_hashing=use_hashing)
-fe.ingest(data_dir, file_pattern='.*\d.txt')
+fe.ingest(data_dir, file_pattern=r'.*\d.txt')

res2 = fe._load_features(uuid)

@@ -352,9 +352,9 @@ def test_ingestion_batches():
uuid = fe.setup()
with pytest.raises(ValueError):
fe.ingest(vectorize=True) # no ingested files
-fe.ingest(data_dir, file_pattern='.*\d.txt', vectorize=False)
-fe.ingest(data_dir, file_pattern='.*\d.txt', vectorize=False)
-fe.ingest(data_dir, file_pattern='.*\d.txt', vectorize=False)
+fe.ingest(data_dir, file_pattern=r'.*\d.txt', vectorize=False)
+fe.ingest(data_dir, file_pattern=r'.*\d.txt', vectorize=False)
+fe.ingest(data_dir, file_pattern=r'.*\d.txt', vectorize=False)

fe.ingest(vectorize=True)

@@ -404,7 +404,7 @@ def test_non_random_dsid():
fe = FeatureVectorizer(cache_dir=cache_dir, mode='w', dsid=dsid)
uuid = fe.setup()
assert dsid == uuid
-fe.ingest(data_dir, file_pattern='.*\d.txt', vectorize=False)
+fe.ingest(data_dir, file_pattern=r'.*\d.txt', vectorize=False)
# writing with the same name fails
with pytest.raises(WrongParameter):
FeatureVectorizer(cache_dir=cache_dir, mode='w', dsid=dsid)
2 changes: 1 addition & 1 deletion freediscovery/engine/utils.py
@@ -5,7 +5,7 @@
def validate_mid(mid):
"""Validate a user provided dataset id"""

-if not re.match('^[a-zA-Z0-9_\-]+$', mid):
+if not re.match(r'^[a-zA-Z0-9_\-]+$', mid):
raise WrongParameter(('id={} is not valid. '
'It can only contain letters, numbers '
'and "-", "_" characters. ')
4 changes: 2 additions & 2 deletions freediscovery/externals/sklearn/metrics/base.py
@@ -64,12 +64,12 @@ def _average_binary_score(binary_metric, y_true, y_score, average,
"""
average_options = (None, 'micro', 'macro', 'weighted', 'samples')
if average not in average_options:
-raise ValueError('average has to be one of {0}'
+raise ValueError('average has to be one of {}'
''.format(average_options))

y_type = type_of_target(y_true)
if y_type not in ("binary", "multilabel-indicator"):
-raise ValueError("{0} format is not supported".format(y_type))
+raise ValueError("{} format is not supported".format(y_type))

if y_type == "binary":
return binary_metric(y_true, y_score, sample_weight=sample_weight)
6 changes: 3 additions & 3 deletions freediscovery/externals/sklearn/metrics/ranking.py
@@ -603,7 +603,7 @@ def label_ranking_average_precision_score(y_true, y_score):
y_type = type_of_target(y_true)
if (y_type != "multilabel-indicator" and
not (y_type == "binary" and y_true.ndim == 2)):
-raise ValueError("{0} format is not supported".format(y_type))
+raise ValueError("{} format is not supported".format(y_type))

y_true = csr_matrix(y_true)
y_score = -y_score
@@ -674,7 +674,7 @@ def coverage_error(y_true, y_score, sample_weight=None):

y_type = type_of_target(y_true)
if y_type != "multilabel-indicator":
-raise ValueError("{0} format is not supported".format(y_type))
+raise ValueError("{} format is not supported".format(y_type))

if y_true.shape != y_score.shape:
raise ValueError("y_true and y_score have different shape")
@@ -733,7 +733,7 @@ def label_ranking_loss(y_true, y_score, sample_weight=None):

y_type = type_of_target(y_true)
if y_type not in ("multilabel-indicator",):
-raise ValueError("{0} format is not supported".format(y_type))
+raise ValueError("{} format is not supported".format(y_type))

if y_true.shape != y_score.shape:
raise ValueError("y_true and y_score have different shape")
2 changes: 1 addition & 1 deletion freediscovery/interpretation.py
@@ -285,7 +285,7 @@ def _create_random_weights(text, perc_keywords=0.5):
if __name__ == "__main__":
fname = 'data/ds_001/raw/0.7.6.28635.txt'
with open(fname) as in_file: #, encoding='utf-8') as in_file:
-document_text = in_file.read().replace(u'\ufeff','')
+document_text = in_file.read().replace('\ufeff','')
words_weights = _create_random_weights(document_text, 0.2)

COLORMAP = _make_cmap()
4 changes: 2 additions & 2 deletions freediscovery/io.py
@@ -8,7 +8,7 @@
def parse_ground_truth_file(filename):
""" Parse a ground truth file specified by a filename.
Replace '/' by '\' when running in Windows """
-df = pd.read_csv(filename, sep='[\s\t]+',
+df = pd.read_csv(filename, sep='[\\s\t]+',
names=['file_path', 'is_relevant'], engine='python')
if platform.system() == 'Windows':
df.file_path = df.file_path.map(lambda path: path.replace('/', '\\'))
@@ -39,7 +39,7 @@ def parse_smart_tokens(text):
data_key = None

for line in text.splitlines():
-key_match = re.match('^\.(?P<key>[A-Z])\s?(?P<val>.*)', line)
+key_match = re.match(r'^\.(?P<key>[A-Z])\s?(?P<val>.*)', line)
if key_match:
data_key = key_match.group('key')
if data_key == 'I':
4 changes: 2 additions & 2 deletions freediscovery/server/gunicorn.py
@@ -9,8 +9,8 @@ def __init__(self, app, options=None):
super(GunicornApplication, self).__init__()

def load_config(self):
-config = dict([(key, value) for key, value in self.options.items()
-if key in self.cfg.settings and value is not None])
+config = {key: value for key, value in self.options.items()
+if key in self.cfg.settings and value is not None}
for key, value in config.items():
self.cfg.set(key.lower(), value)

2 changes: 1 addition & 1 deletion freediscovery/server/tests/test_categorization.py
@@ -71,7 +71,7 @@ def _api_categorization_wrapper(app, solver, cv, n_categories,
method = V01 + "/feature-extraction/{}".format(dsid)
data = app.get_check(method)

-categories_list = list(set([row['category'] for row in ds_input['dataset']]))
+categories_list = list({row['category'] for row in ds_input['dataset']})

if n_categories_train is None:
training_set = ds_input['training_set']
2 changes: 1 addition & 1 deletion freediscovery/server/tests/test_cluster.py
@@ -106,7 +106,7 @@ def test_api_clustering(app, model, use_lsi, n_clusters, optimal_sampling):
ref_res['cluster_depth'] = 'int'
if not optimal_sampling:
ref_res['cluster_label'] = 'str'
-assert re.match('[^\[]+', row['cluster_label'])
+assert re.match(r'[^\[]+', row['cluster_label'])
assert dict2type(row, max_depth=1) == ref_res
# make sure we have space separated words, not a str(list)
for irow in row['documents']:
4 changes: 2 additions & 2 deletions freediscovery/server/tests/test_exceptions.py
@@ -40,13 +40,13 @@ def test_get_model_train_404(app_notest, method):
@pytest.mark.parametrize("method", ['feature-extraction', 'categorization', 'lsi', 'clustering'])
def test_get_model_predict_404(app_notest, method):

-method = V01 + "/{0}/DOES_NOT_EXISTS/DOES_NOT_EXIST/predict".format(method)
+method = V01 + "/{}/DOES_NOT_EXISTS/DOES_NOT_EXIST/predict".format(method)
with _silent('stderr'):
res = app_notest.get(method)

assert res.status_code == 404

-method = V01 + "/{0}/DOES_NOT_EXISTS/DOES_NOT_EXIST/test".format(method)
+method = V01 + "/{}/DOES_NOT_EXISTS/DOES_NOT_EXIST/test".format(method)
with _silent('stderr'):
res = app_notest.post(method)

2 changes: 1 addition & 1 deletion freediscovery/server/tests/test_search.py
@@ -63,7 +63,7 @@ def test_search(app, method, min_score, max_results):

# check that relevant documents get returned first
res = df.set_index('document_id').merge(id_mapping, left_index=True, right_index=True)
-res['document_id_new'] = res.file_path.str.extract('(\d+)', expand=True).astype('int')
+res['document_id_new'] = res.file_path.str.extract(r'(\d+)', expand=True).astype('int')
res['rank_position'] = np.arange(len(res))
res = res.set_index('document_id_new')
res['ground_truth'] = np.in1d(res.index.values, [1190, 2256]).astype('int')
4 changes: 2 additions & 2 deletions freediscovery/tests/test_datasets.py
@@ -32,7 +32,7 @@ def test_load_20newsgoups_dataset(name):
assert dict2type(dataset[0]) == response_ref
assert dict2type(training_set[1]) == response_ref

-categories = sorted(list(set([row['category'] for row in dataset])))
+categories = sorted(list({row['category'] for row in dataset}))
for categories_sel in \
[[categories[0]],
[categories[1]],
@@ -46,7 +46,7 @@ def test_load_20newsgoups_dataset(name):
for resp in [training_set, dataset]:

assert dict2type(resp[0]) == response_ref
-result_fields = list(set([el['category'] for el in resp]))
+result_fields = list({el['category'] for el in resp})

# the opposite if not always true (e.g. for small training sets)
for key in result_fields: