
Commit

Merge pull request #570 from HyunjunA/master
Update React and validateDataset.py to make errors on uploads more user-friendly
HyunjunA committed Jan 31, 2023
2 parents ff1896d + 2ce389a commit d8d2045
Showing 3 changed files with 527 additions and 68 deletions.
286 changes: 286 additions & 0 deletions lab/pyutils/validateDataset copy.py
@@ -0,0 +1,286 @@
"""~This file is part of the Aliro library~
Copyright (C) 2023 Epistasis Lab,
Center for Artificial Intelligence Research and Education (CAIRE),
Department of Computational Biomedicine (CBM),
Cedars-Sinai Medical Center.
Aliro is maintained by:
- Hyunjun Choi (hyunjun.choi@cshs.org)
- Miguel Hernandez (miguel.e.hernandez@cshs.org)
- Nick Matsumoto (nicholas.matsumoto@cshs.org)
- Jay Moran (jay.moran@cshs.org)
- and many other generous open source contributors
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
(Autogenerated header, do not modify)
"""
import argparse
import sys
import simplejson
from sklearn.utils import check_X_y, check_array
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
import os
import os.path
import pandas as pd
import numpy as np
import logging
import requests
import time
import traceback
from io import StringIO


logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.INFO)

MIN_ROWS = 10
MIN_COLS = 2
MIN_ROW_PER_CLASS = 2


def validate_data_from_server(file_id, prediction_type, target_field, categories=None, ordinals=None, **kwargs):
# Read the data set into memory
raw_data = get_file_from_server(file_id)
df = pd.read_csv(StringIO(raw_data), sep=None, engine='python', **kwargs)
return validate_data(df, prediction_type, target_field, categories, ordinals)


def validate_data_from_filepath(file_id, prediction_type, target_field, categories=None, ordinals=None, **kwargs):
# Read the data set into memory
df = pd.read_csv(file_id, sep=None, engine='python', **kwargs)
return validate_data(df, prediction_type, target_field, categories, ordinals)


def encode_data(df, target_column, categories, ordinals, encoding_strategy="OneHotEncoder"):
'''
use OneHotEncoder or OrdinalEncoder to convert categorical features
See skl_utils
'''

# check that categorical and ordinal columns can be encoded
if categories or ordinals:
transformers = []
if categories:
if encoding_strategy == "OneHotEncoder":
transformers.append(
("categorical_encoder", OneHotEncoder(), categories))
elif encoding_strategy == "OrdinalEncoder":
transformers.append(
("categorical_encoder", OrdinalEncoder(), categories))
if ordinals:
ordinal_features = sorted(list(ordinals.keys()))
ordinal_map = [ordinals[k] for k in ordinal_features]
transformers.append(("ordinalencoder",
OrdinalEncoder(categories=ordinal_map),
ordinal_features))

ct = ColumnTransformer(
transformers=transformers,
remainder='passthrough',
sparse_threshold=0
)
return ct.fit_transform(df)
else:
return df

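# Hypothetical sketch of the argument shapes encode_data() expects; the
# column names and values below are invented for illustration and are not
# part of the Aliro codebase:
#
#   df = pd.DataFrame({
#       "color": ["red", "blue", "red"],    # categorical feature
#       "size": ["S", "M", "L"],            # ordinal feature
#       "weight": [1.0, 2.5, 3.1],          # numeric feature
#       "class": [0, 1, 0]})                # target column
#   categories = ["color"]                  # list of categorical column names
#   ordinals = {"size": ["S", "M", "L"]}    # dict: column -> ordered values
#   encoded = encode_data(df, "class", categories, ordinals)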

def validate_data(df, prediction_type="classification", target_column=None, categories=None, ordinals=None):
'''
Check that a datafile is valid
@return tuple
boolean - validation result
string - message
'''

if prediction_type not in ["classification", "regression"]:
logger.warn(f"Invalid prediction type: '{prediction_type}'")
return False, f"Invalid prediction type: '{prediction_type}'"

num_df = df

# dimension validation
if df.shape[0] < MIN_ROWS:
msg = "Dataset has dimensions {}, datasets must have at least {} rows.".format(
df.shape, MIN_ROWS)
logger.warn(msg)
return False, msg

if df.shape[1] < MIN_COLS:
msg = "Dataset has dimensions {}, datasets must have at least {} columns.".format(
df.shape, MIN_COLS)
logger.warn(msg)
return False, msg

# target column validation
if target_column is not None:
if target_column not in df.columns:
logger.warn("Target column '" + target_column + "' not in data")
return False, "Target column '" + target_column + "' not in data"
if categories and target_column in categories:
logger.warn("Target column '" + target_column +
"' cannot be a categorical feature")
return False, "Target column '" + target_column + "' cannot be a categorical feature"
if ordinals and target_column in ordinals:
logger.warn("Target column '" + target_column +
"' cannot be an ordinal feature")
return False, "Target column '" + target_column + "' cannot be an ordinal feature"

# check that cat columns can be encoded
if categories or ordinals:
try:
encode_data(df, target_column, categories,
ordinals, "OneHotEncoder")
encode_data(df, target_column, categories,
ordinals, "OrdinalEncoder")
except Exception as e:
logger.warn("encode_data() failed, " + str(e))
return False, "encode_data() failed, " + str(e)

if categories:
num_df = num_df.drop(columns=categories)
if ordinals:
num_df = num_df.drop(columns=list(ordinals.keys()))

# only run the remaining checks if a target column is specified
if target_column:

# classification
if (prediction_type == "classification"):
# target column of classification problem does not need to be numeric
num_df = num_df.drop(columns=target_column, axis=1)

# check that every class has at least MIN_ROW_PER_CLASS rows
counts = df.groupby(target_column).count()
# index the count column by position so a dataset with a single
# feature column does not raise an IndexError
fails_validation = counts[counts.iloc[:, 0] < MIN_ROW_PER_CLASS]
if not fails_validation.empty:
msg = "Classification datasets must have at least {} rows per class, class(es) {} have fewer.".format(
MIN_ROW_PER_CLASS, list(fails_validation.index.values))
logger.warn(msg)
return False, msg

# check that non-cat feature columns contain only numeric data
if (len(num_df.columns)) > 0:
try:
check_array(num_df, dtype=np.float64,
order="C", force_all_finite=True)

except Exception as e:
logger.warn("sklearn.check_array() validation failed: " + str(e))
return False, "sklearn.check_array() validation failed: " + str(e)


return True, None

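# Hypothetical usage sketch of validate_data(); the DataFrame is assumed to
# be loaded elsewhere (e.g. via pd.read_csv).  On success the function
# returns (True, None); on failure it returns (False, "<reason>"), which
# main() below serializes to JSON on stdout:
#
#   ok, error_message = validate_data(df, "classification", target_column="class")
#   if not ok:
#       print(error_message)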

def get_file_from_server(file_id):
'''
Retrieve a file from the main Aliro server
'''
apiPath = 'http://' + os.environ['LAB_HOST'] + ':' + os.environ['LAB_PORT']
path = apiPath + "/api/v1/files/" + file_id

logger.debug("retrieving file:" + file_id)
logger.debug("api path: " + path)

res = None
try:
res = requests.request('GET', path, timeout=15)
except:
logger.error("Unexpected error in get_file_from_server for path 'GET: " +
str(path) + "': " + str(sys.exc_info()[0]))
raise

if res.status_code != requests.codes.ok:
msg = "Request GET status_code not ok, path: '" + \
str(path) + "'' status code: '" + str(res.status_code) + \
"'' response text: " + str(res.text)
logger.error(msg)
raise RuntimeError(msg)

logger.info("File retrieved, file_id: '" + file_id +
"', path: '" + path + "', status_code: " + str(res.status_code))
return res.text

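# Illustrative note: get_file_from_server() requires the LAB_HOST and
# LAB_PORT environment variables to point at the Aliro lab API.  The host,
# port, and fileId below are placeholders, not real values:
#
#   os.environ.setdefault("LAB_HOST", "localhost")
#   os.environ.setdefault("LAB_PORT", "5080")
#   csv_text = get_file_from_server("<fileId>")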

def main():
meta_features_all = []
parser = argparse.ArgumentParser(
description="Validate a dataset", add_help=False)
parser.add_argument('INPUT_FILE', type=str, help='Filepath or fileId.')
parser.add_argument('-target', action='store', dest='TARGET', type=str, default='class',
help='Name of target column', required=False)
parser.add_argument('-identifier_type', action='store', dest='IDENTIFIER_TYPE', type=str, choices=['filepath', 'fileid'], default='filepath',
help='Whether INPUT_FILE is a local filepath or a server fileId')
parser.add_argument('-categorical_features', action='store', dest='JSON_CATEGORIES', type=str, required=False, default=None,
help='JSON list of categorical features')
parser.add_argument('-ordinal_features', action='store', dest='JSON_ORDINALS', type=str, required=False, default=None,
help='JSON dict of ordinal features and their ordered possible values')
parser.add_argument('-prediction_type', action='store', dest='PREDICTION_TYPE', type=str, choices=['classification', 'regression'], default="classification",
help="Classification or regression problem")

args = parser.parse_args()

# set up the file logger
logpath = os.path.join(os.environ['PROJECT_ROOT'], "target/logs")
if not os.path.exists(logpath):
os.makedirs(logpath)

formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
fhandler = logging.FileHandler(
os.path.join(logpath, 'validateDataset.log'))
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)

success = None
errorMessage = None
meta_json = None

categories = None
ordinals = None

try:
if args.JSON_CATEGORIES:
categories = simplejson.loads(args.JSON_CATEGORIES)
if args.JSON_ORDINALS:
ordinals = simplejson.loads(args.JSON_ORDINALS)
prediction_type = args.PREDICTION_TYPE
# print("categories: ")
# print(categories)

if (args.IDENTIFIER_TYPE == 'filepath'):
success, errorMessage = validate_data_from_filepath(
args.INPUT_FILE, prediction_type, args.TARGET, categories, ordinals)
else:
success, errorMessage = validate_data_from_server(
args.INPUT_FILE, prediction_type, args.TARGET, categories, ordinals)
meta_json = simplejson.dumps(
{"success": success, "errorMessage": errorMessage}, ignore_nan=True) # , ensure_ascii=False)
except Exception as e:
logger.error(traceback.format_exc())
meta_json = simplejson.dumps(
{"success": False, "errorMessage": "Exception: " + repr(e)}, ignore_nan=True) # , ensure_ascii=False)

print(meta_json)
sys.stdout.flush()


if __name__ == '__main__':
main()
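# Example invocation; the dataset path and column names are made up, and the
# flags mirror the argparse definitions in main().  PROJECT_ROOT (and
# LAB_HOST/LAB_PORT when using -identifier_type fileid) must be set in the
# environment:
#
#   python validateDataset.py data/example.csv \
#       -target class \
#       -prediction_type classification \
#       -categorical_features '["color"]' \
#       -ordinal_features '{"size": ["S", "M", "L"]}'
#
# The script prints a single JSON object, e.g.
#   {"success": true, "errorMessage": null}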