MAINT run pyupgrade (#190)
rth committed Jan 22, 2019
1 parent 403e831 · commit 5c9d722
Showing 21 changed files with 38 additions and 38 deletions.
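pyupgrade rewrites source files to use more modern Python syntax, so the hunks below are a handful of mechanical patterns repeated across the code base: regex literals become raw strings, explicit positional indices are dropped from str.format(), dict()/set() calls wrapping a comprehension become dict/set comprehensions, and redundant u'' prefixes are removed. The snippet below is an illustrative sketch of these rewrites, not code taken from this repository; the variable names and values are hypothetical, and it assumes Python 3.6+, where a non-raw escape such as '\d' already triggers a DeprecationWarning.

    import re

    # Regex literals become raw strings; '\d' in a plain string literal is an
    # invalid escape sequence and warns on Python 3.6+ (SyntaxWarning on 3.12+).
    re.match('\d+', '42')    # before
    re.match(r'\d+', '42')   # after

    # Explicit positional indices are dropped from str.format().
    msg_old = '{0} format is not supported'.format('continuous')
    msg_new = '{} format is not supported'.format('continuous')

    # dict()/set() around a comprehension becomes a dict/set comprehension, and
    # redundant u'' prefixes are removed (all str literals are unicode on Python 3).
    options = {'bind': '127.0.0.1:8000', 'workers': None}  # hypothetical values
    config_old = dict([(k, v) for k, v in options.items() if v is not None])
    config_new = {k: v for k, v in options.items() if v is not None}
    text_old = u'\ufeffdocument text'.replace(u'\ufeff', '')
    text_new = '\ufeffdocument text'.replace('\ufeff', '')

The per-file diffs that follow apply exactly these patterns to the engine modules, the test suite, and the vendored scikit-learn metrics code.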
2 changes: 1 addition & 1 deletion doc/conf.py
@@ -97,7 +97,7 @@ def __getattr__(cls, name):

# Modules for which function level galleries are created.
'doc_module' : ('freediscovery'),
-'filename_pattern': '.*\.py'
+'filename_pattern': r'.*\.py'
}

# General information about the project.
2 changes: 1 addition & 1 deletion freediscovery/cluster/base.py
@@ -22,7 +22,7 @@ def select_top_words(word_list, n=10):
for word in word_list:
word_st = st.stem(word)
if len(word_st) <= 2 or\
-re.match('\d+', word_st) or \
+re.match(r'\d+', word_st) or \
re.match('[^a-zA-Z0-9]', word_st) or\
word in COMMON_FIRST_NAMES or \
word in CUSTOM_STOP_WORDS or\
2 changes: 1 addition & 1 deletion freediscovery/engine/ingestion.py
@@ -29,7 +29,7 @@ def _list_filenames(data_dir, dir_pattern=None, file_pattern=None):

def _infer_document_id_from_path(file_path):
basename = os.path.basename
-document_id = [re.sub('\D', '', basename(el)) for el in file_path]
+document_id = [re.sub(r'\D', '', basename(el)) for el in file_path]
non_digits = [el for el in document_id if not el.isdigit()]
failed_msg = ('Warning: Could not infer document_id from file_path ({}), '
'falling back '
2 changes: 1 addition & 1 deletion freediscovery/engine/tests/test_categorize.py
@@ -36,7 +36,7 @@

fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
vect_uuid = fe.setup()
-fe.ingest(str(data_dir), file_pattern='.*\d.txt')
+fe.ingest(str(data_dir), file_pattern=r'.*\d.txt')


lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=vect_uuid, mode='w')
2 changes: 1 addition & 1 deletion freediscovery/engine/tests/test_cluster.py
@@ -26,7 +26,7 @@ def fd_setup():
dsid = fe.setup(n_features=n_features, use_hashing=False,
stop_words='english',
min_df=0.1, max_df=0.9)
-fe.ingest(data_dir, file_pattern='.*\d.txt')
+fe.ingest(data_dir, file_pattern=r'.*\d.txt')

lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=dsid, mode='w')
lsi.fit_transform(n_components=6)
2 changes: 1 addition & 1 deletion freediscovery/engine/tests/test_integration.py
@@ -39,7 +39,7 @@ def test_features_hashing(use_hashing, use_lsi, method):

fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
uuid = fe.setup(n_features=n_features, use_hashing=use_hashing)
-fe.ingest(data_dir, file_pattern='.*\d.txt')
+fe.ingest(data_dir, file_pattern=r'.*\d.txt')

ground_truth = parse_ground_truth_file(os.path.join(data_dir,
"..", "ground_truth_file.txt"))
2 changes: 1 addition & 1 deletion freediscovery/engine/tests/test_lsi.py
@@ -25,7 +25,7 @@ def test_lsi():

fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
uuid = fe.setup()
-fe.ingest(data_dir, file_pattern='.*\d.txt')
+fe.ingest(data_dir, file_pattern=r'.*\d.txt')

lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid, mode='w')
lsi_res, exp_var = lsi.fit_transform(n_components=n_components, alpha=1.0)
2 changes: 1 addition & 1 deletion freediscovery/engine/tests/test_near_duplicates.py
@@ -18,7 +18,7 @@ def fd_setup(**fe_options):
uuid = fe.setup(n_features=n_features, use_hashing=True,
stop_words='english',
**fe_options)
-fe.ingest(data_dir, file_pattern='.*\d.txt')
+fe.ingest(data_dir, file_pattern=r'.*\d.txt')
return cache_dir, uuid, fe.filenames_, fe


2 changes: 1 addition & 1 deletion freediscovery/engine/tests/test_search.py
@@ -21,7 +21,7 @@ def test_search_wrapper(kind):

fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
vect_uuid = fe.setup()
-fe.ingest(data_dir, file_pattern='.*\d.txt')
+fe.ingest(data_dir, file_pattern=r'.*\d.txt')

if kind == 'semantic':
lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=vect_uuid, mode='w')
22 changes: 11 additions & 11 deletions freediscovery/engine/tests/test_vectorizer.py
@@ -34,7 +34,7 @@ def test_feature_extraction_tokenization(analyzer, ngram_range, use_hashing):
fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
uuid = fe.setup(analyzer=analyzer, ngram_range=ngram_range,
use_hashing=use_hashing)
-fe.ingest(data_dir, file_pattern='.*\d.txt')
+fe.ingest(data_dir, file_pattern=r'.*\d.txt')

res2 = fe._load_features(uuid)
assert isinstance(res2, np.ndarray) or scipy.sparse.issparse(res2), "not an array {}".format(res2)
@@ -51,7 +51,7 @@ def test_feature_extraction_storage():

fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
uuid = fe.setup()
-fe.ingest(data_dir, file_pattern='.*\d.txt')
+fe.ingest(data_dir, file_pattern=r'.*\d.txt')
db = pd.read_pickle(os.path.join(cache_dir, 'ediscovery_cache',
uuid, 'db'))
assert 'file_path' not in db.columns
@@ -68,7 +68,7 @@ def test_feature_extraction_weighting(weighting,

fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
uuid = fe.setup(weighting=weighting, use_hashing=use_hashing)
-fe.ingest(data_dir, file_pattern='.*\d.txt')
+fe.ingest(data_dir, file_pattern=r'.*\d.txt')

res2 = fe._load_features(uuid)
assert isinstance(res2, np.ndarray) or scipy.sparse.issparse(res2), \
@@ -91,7 +91,7 @@ def test_feature_extraction_nfeatures(n_features, weighting, use_hashing):

fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
uuid = fe.setup(n_features=n_features, weighting=weighting, use_hashing=use_hashing)
-fe.ingest(data_dir, file_pattern='.*\d.txt')
+fe.ingest(data_dir, file_pattern=r'.*\d.txt')

res2 = fe._load_features(uuid)
assert isinstance(res2, np.ndarray) or scipy.sparse.issparse(res2), \
@@ -110,7 +110,7 @@ def test_search_filenames(use_hashing):

fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
uuid = fe.setup(use_hashing=use_hashing)
-fe.ingest(data_dir, file_pattern='.*\d.txt')
+fe.ingest(data_dir, file_pattern=r'.*\d.txt')

assert fe.db_ is not None

@@ -239,7 +239,7 @@ def test_sampling_filenames():
with pytest.warns(UserWarning):
# there is a warning because we don't use norm='l2'
uuid = fe.setup(use_hashing=True, **fe_pars)
-fe.ingest(data_dir, file_pattern='.*\d.txt')
+fe.ingest(data_dir, file_pattern=r'.*\d.txt')
X = fe._load_features(uuid)

# don't use any sampling
@@ -313,7 +313,7 @@ def test_feature_extraction_cyrillic(use_hashing):

fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
uuid = fe.setup(use_hashing=use_hashing)
-fe.ingest(data_dir, file_pattern='.*\d.txt')
+fe.ingest(data_dir, file_pattern=r'.*\d.txt')

res2 = fe._load_features(uuid)

@@ -352,9 +352,9 @@ def test_ingestion_batches():
uuid = fe.setup()
with pytest.raises(ValueError):
fe.ingest(vectorize=True) # no ingested files
-fe.ingest(data_dir, file_pattern='.*\d.txt', vectorize=False)
-fe.ingest(data_dir, file_pattern='.*\d.txt', vectorize=False)
-fe.ingest(data_dir, file_pattern='.*\d.txt', vectorize=False)
+fe.ingest(data_dir, file_pattern=r'.*\d.txt', vectorize=False)
+fe.ingest(data_dir, file_pattern=r'.*\d.txt', vectorize=False)
+fe.ingest(data_dir, file_pattern=r'.*\d.txt', vectorize=False)

fe.ingest(vectorize=True)

@@ -404,7 +404,7 @@ def test_non_random_dsid():
fe = FeatureVectorizer(cache_dir=cache_dir, mode='w', dsid=dsid)
uuid = fe.setup()
assert dsid == uuid
-fe.ingest(data_dir, file_pattern='.*\d.txt', vectorize=False)
+fe.ingest(data_dir, file_pattern=r'.*\d.txt', vectorize=False)
# writing with the same name fails
with pytest.raises(WrongParameter):
FeatureVectorizer(cache_dir=cache_dir, mode='w', dsid=dsid)
2 changes: 1 addition & 1 deletion freediscovery/engine/utils.py
@@ -5,7 +5,7 @@
def validate_mid(mid):
"""Validate a user provided dataset id"""

-if not re.match('^[a-zA-Z0-9_\-]+$', mid):
+if not re.match(r'^[a-zA-Z0-9_\-]+$', mid):
raise WrongParameter(('id={} is not valid. '
'It can only contain letters, numbers '
'and "-", "_" characters. ')
4 changes: 2 additions & 2 deletions freediscovery/externals/sklearn/metrics/base.py
@@ -64,12 +64,12 @@ def _average_binary_score(binary_metric, y_true, y_score, average,
"""
average_options = (None, 'micro', 'macro', 'weighted', 'samples')
if average not in average_options:
-raise ValueError('average has to be one of {0}'
+raise ValueError('average has to be one of {}'
''.format(average_options))

y_type = type_of_target(y_true)
if y_type not in ("binary", "multilabel-indicator"):
-raise ValueError("{0} format is not supported".format(y_type))
+raise ValueError("{} format is not supported".format(y_type))

if y_type == "binary":
return binary_metric(y_true, y_score, sample_weight=sample_weight)
6 changes: 3 additions & 3 deletions freediscovery/externals/sklearn/metrics/ranking.py
@@ -603,7 +603,7 @@ def label_ranking_average_precision_score(y_true, y_score):
y_type = type_of_target(y_true)
if (y_type != "multilabel-indicator" and
not (y_type == "binary" and y_true.ndim == 2)):
-raise ValueError("{0} format is not supported".format(y_type))
+raise ValueError("{} format is not supported".format(y_type))

y_true = csr_matrix(y_true)
y_score = -y_score
@@ -674,7 +674,7 @@ def coverage_error(y_true, y_score, sample_weight=None):

y_type = type_of_target(y_true)
if y_type != "multilabel-indicator":
-raise ValueError("{0} format is not supported".format(y_type))
+raise ValueError("{} format is not supported".format(y_type))

if y_true.shape != y_score.shape:
raise ValueError("y_true and y_score have different shape")
@@ -733,7 +733,7 @@ def label_ranking_loss(y_true, y_score, sample_weight=None):

y_type = type_of_target(y_true)
if y_type not in ("multilabel-indicator",):
-raise ValueError("{0} format is not supported".format(y_type))
+raise ValueError("{} format is not supported".format(y_type))

if y_true.shape != y_score.shape:
raise ValueError("y_true and y_score have different shape")
2 changes: 1 addition & 1 deletion freediscovery/interpretation.py
@@ -285,7 +285,7 @@ def _create_random_weights(text, perc_keywords=0.5):
if __name__ == "__main__":
fname = 'data/ds_001/raw/0.7.6.28635.txt'
with open(fname) as in_file: #, encoding='utf-8') as in_file:
-document_text = in_file.read().replace(u'\ufeff','')
+document_text = in_file.read().replace('\ufeff','')
words_weights = _create_random_weights(document_text, 0.2)

COLORMAP = _make_cmap()
4 changes: 2 additions & 2 deletions freediscovery/io.py
@@ -8,7 +8,7 @@
def parse_ground_truth_file(filename):
""" Parse a ground truth file specified by a filename.
Replace '/' by '\' when running in Windows """
-df = pd.read_csv(filename, sep='[\s\t]+',
+df = pd.read_csv(filename, sep='[\\s\t]+',
names=['file_path', 'is_relevant'], engine='python')
if platform.system() == 'Windows':
df.file_path = df.file_path.map(lambda path: path.replace('/', '\\'))
@@ -39,7 +39,7 @@ def parse_smart_tokens(text):
data_key = None

for line in text.splitlines():
-key_match = re.match('^\.(?P<key>[A-Z])\s?(?P<val>.*)', line)
+key_match = re.match(r'^\.(?P<key>[A-Z])\s?(?P<val>.*)', line)
if key_match:
data_key = key_match.group('key')
if data_key == 'I':
4 changes: 2 additions & 2 deletions freediscovery/server/gunicorn.py
@@ -9,8 +9,8 @@ def __init__(self, app, options=None):
super(GunicornApplication, self).__init__()

def load_config(self):
-config = dict([(key, value) for key, value in self.options.items()
-if key in self.cfg.settings and value is not None])
+config = {key: value for key, value in self.options.items()
+if key in self.cfg.settings and value is not None}
for key, value in config.items():
self.cfg.set(key.lower(), value)

2 changes: 1 addition & 1 deletion freediscovery/server/tests/test_categorization.py
@@ -71,7 +71,7 @@ def _api_categorization_wrapper(app, solver, cv, n_categories,
method = V01 + "/feature-extraction/{}".format(dsid)
data = app.get_check(method)

-categories_list = list(set([row['category'] for row in ds_input['dataset']]))
+categories_list = list({row['category'] for row in ds_input['dataset']})

if n_categories_train is None:
training_set = ds_input['training_set']
2 changes: 1 addition & 1 deletion freediscovery/server/tests/test_cluster.py
@@ -106,7 +106,7 @@ def test_api_clustering(app, model, use_lsi, n_clusters, optimal_sampling):
ref_res['cluster_depth'] = 'int'
if not optimal_sampling:
ref_res['cluster_label'] = 'str'
-assert re.match('[^\[]+', row['cluster_label'])
+assert re.match(r'[^\[]+', row['cluster_label'])
assert dict2type(row, max_depth=1) == ref_res
# make sure we have space separated words, not a str(list)
for irow in row['documents']:
4 changes: 2 additions & 2 deletions freediscovery/server/tests/test_exceptions.py
@@ -40,13 +40,13 @@ def test_get_model_train_404(app_notest, method):
@pytest.mark.parametrize("method", ['feature-extraction', 'categorization', 'lsi', 'clustering'])
def test_get_model_predict_404(app_notest, method):

-method = V01 + "/{0}/DOES_NOT_EXISTS/DOES_NOT_EXIST/predict".format(method)
+method = V01 + "/{}/DOES_NOT_EXISTS/DOES_NOT_EXIST/predict".format(method)
with _silent('stderr'):
res = app_notest.get(method)

assert res.status_code == 404

-method = V01 + "/{0}/DOES_NOT_EXISTS/DOES_NOT_EXIST/test".format(method)
+method = V01 + "/{}/DOES_NOT_EXISTS/DOES_NOT_EXIST/test".format(method)
with _silent('stderr'):
res = app_notest.post(method)

2 changes: 1 addition & 1 deletion freediscovery/server/tests/test_search.py
@@ -63,7 +63,7 @@ def test_search(app, method, min_score, max_results):

# check that relevant documents get returned first
res = df.set_index('document_id').merge(id_mapping, left_index=True, right_index=True)
-res['document_id_new'] = res.file_path.str.extract('(\d+)', expand=True).astype('int')
+res['document_id_new'] = res.file_path.str.extract(r'(\d+)', expand=True).astype('int')
res['rank_position'] = np.arange(len(res))
res = res.set_index('document_id_new')
res['ground_truth'] = np.in1d(res.index.values, [1190, 2256]).astype('int')
4 changes: 2 additions & 2 deletions freediscovery/tests/test_datasets.py
@@ -32,7 +32,7 @@ def test_load_20newsgoups_dataset(name):
assert dict2type(dataset[0]) == response_ref
assert dict2type(training_set[1]) == response_ref

-categories = sorted(list(set([row['category'] for row in dataset])))
+categories = sorted(list({row['category'] for row in dataset}))
for categories_sel in \
[[categories[0]],
[categories[1]],
@@ -46,7 +46,7 @@ def test_load_20newsgoups_dataset(name):
for resp in [training_set, dataset]:

assert dict2type(resp[0]) == response_ref
-result_fields = list(set([el['category'] for el in resp]))
+result_fields = list({el['category'] for el in resp})

# the opposite if not always true (e.g. for small training sets)
for key in result_fields: