Update react and validateDataset.py to make errors on uploads more user-friendly #571

Merged
4 commits merged on Jan 31, 2023
286 changes: 286 additions & 0 deletions lab/pyutils/validateDataset copy.py
@@ -0,0 +1,286 @@
"""~This file is part of the Aliro library~

Copyright (C) 2023 Epistasis Lab,
Center for Artificial Intelligence Research and Education (CAIRE),
Department of Computational Biomedicine (CBM),
Cedars-Sinai Medical Center.

Aliro is maintained by:
- Hyunjun Choi (hyunjun.choi@cshs.org)
- Miguel Hernandez (miguel.e.hernandez@cshs.org)
- Nick Matsumoto (nicholas.matsumoto@cshs.org)
- Jay Moran (jay.moran@cshs.org)
- and many other generous open source contributors

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.

(Autogenerated header, do not modify)

"""
import argparse
import sys
import simplejson
from sklearn.utils import check_X_y, check_array
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
import os
import os.path
import pandas as pd
import numpy as np
import logging
import requests
import time
import traceback
from io import StringIO


logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.INFO)

MIN_ROWS = 10
MIN_COLS = 2
MIN_ROW_PER_CLASS = 2


def validate_data_from_server(file_id, prediction_type, target_field, categories=None, ordinals=None, **kwargs):
# Read the data set into memory
raw_data = get_file_from_server(file_id)
df = pd.read_csv(StringIO(raw_data), sep=None, engine='python', **kwargs)
return validate_data(df, prediction_type, target_field, categories, ordinals)


def validate_data_from_filepath(file_id, prediction_type, target_field, categories=None, ordinals=None, **kwargs):
# Read the data set into memory
df = pd.read_csv(file_id, sep=None, engine='python', **kwargs)
return validate_data(df, prediction_type, target_field, categories, ordinals)


def encode_data(df, target_column, categories, ordinals, encoding_strategy="OneHotEncoder"):
'''
use OneHotEncoder or OrdinalEncoder to convert categorical features
See skl_utils
'''
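    # Note on expected argument shapes (inferred from the usage below; example values
    # are illustrative only):
    #   categories: list of categorical column names, e.g. ["sex", "embarked"]
    #   ordinals:   dict mapping an ordinal column name to its ordered values,
    #               e.g. {"size": ["small", "medium", "large"]}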

# check that categorical and ordinal columns can be encoded
if categories or ordinals:
transformers = []
if categories:
if encoding_strategy == "OneHotEncoder":
transformers.append(
("categorical_encoder", OneHotEncoder(), categories))
elif encoding_strategy == "OrdinalEncoder":
transformers.append(
("categorical_encoder", OrdinalEncoder(), categories))
if ordinals:
ordinal_features = sorted(list(ordinals.keys()))
ordinal_map = [ordinals[k] for k in ordinal_features]
transformers.append(("ordinalencoder",
OrdinalEncoder(categories=ordinal_map),
ordinal_features))

ct = ColumnTransformer(
transformers=transformers,
remainder='passthrough',
sparse_threshold=0
)
return ct.fit_transform(df)
else:
return df


def validate_data(df, prediction_type="classification", target_column=None, categories=None, ordinals=None):
'''
Check that a datafile is valid


@return tuple
boolean - validation result
string - message
'''
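    # The returned tuple is (True, None) when the dataset passes every check below,
    # and (False, "<human-readable message>") as soon as one check fails.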

if prediction_type not in ["classification", "regression"]:
logger.warn(f"Invalid prediction type: '{prediction_type}'")
return False, f"Invalid prediction type: '{prediction_type}'"

num_df = df

# dimension validation
    if df.shape[0] < MIN_ROWS:
        logger.warning("Dataset has dimensions {}, datasets must have at least {} rows.".format(
            df.shape, MIN_ROWS))
        return False, "Dataset has dimensions {}, datasets must have at least {} rows.".format(df.shape, MIN_ROWS)

    if df.shape[1] < MIN_COLS:
        logger.warning("Dataset has dimensions {}, datasets must have at least {} columns.".format(
            df.shape, MIN_COLS))
        return False, "Dataset has dimensions {}, datasets must have at least {} columns.".format(df.shape, MIN_COLS)

# target column validation
    if target_column is not None:
        if target_column not in df.columns:
            logger.warning("Target column '" + target_column + "' not in data")
            return False, "Target column '" + target_column + "' not in data"
        if categories and target_column in categories:
            logger.warning("Target column '" + target_column +
                           "' cannot be a categorical feature")
            return False, "Target column '" + target_column + "' cannot be a categorical feature"
        if ordinals and target_column in ordinals:
            logger.warning("Target column '" + target_column +
                           "' cannot be an ordinal feature")
            return False, "Target column '" + target_column + "' cannot be an ordinal feature"

# check that cat columns can be encoded
if categories or ordinals:
try:
encode_data(df, target_column, categories,
ordinals, "OneHotEncoder")
encode_data(df, target_column, categories,
ordinals, "OrdinalEncoder")
except Exception as e:
logger.warn("encode_data() failed, " + str(e))
return False, "encode_data() failed, " + str(e)

if categories:
num_df = num_df.drop(columns=categories)
if ordinals:
num_df = num_df.drop(columns=list(ordinals.keys()))
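    # At this point num_df holds only the target column (if any) plus the feature
    # columns that are expected to be numeric.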

    # the following checks only apply if a target column is specified
if target_column:

# classification
if (prediction_type == "classification"):
# target column of classification problem does not need to be numeric
num_df = num_df.drop(columns=target_column, axis=1)

# Check rows per class
counts = df.groupby(target_column).count()
fails_validation = counts[counts[counts.columns[1]]
< MIN_ROW_PER_CLASS]
if (not fails_validation.empty):
msg = "Classification datasets must have at least 2 rows per class, class(es) '{}' have only 1 row.".format(
list(fails_validation.index.values))
logger.warn(msg)
return False, msg

# check that non-cat feature columns contain only numeric data
if (len(num_df.columns)) > 0:
try:
check_array(num_df, dtype=np.float64,
order="C", force_all_finite=True)

except Exception as e:
logger.warn("sklearn.check_array() validation " + str(e))
return False, "sklearn.check_array() validation " + str(e)

return True, None


def get_file_from_server(file_id):
'''
Retrieve a file from the main Aliro server
'''
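    # The API location is taken from the LAB_HOST / LAB_PORT environment variables;
    # e.g. with LAB_HOST=localhost and LAB_PORT=5080 (illustrative values), the request
    # below becomes GET http://localhost:5080/api/v1/files/<file_id>.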
apiPath = 'http://' + os.environ['LAB_HOST'] + ':' + os.environ['LAB_PORT']
path = apiPath + "/api/v1/files/" + file_id

logger.debug("retrieving file:" + file_id)
logger.debug("api path: " + path)

res = None
try:
res = requests.request('GET', path, timeout=15)
    except Exception:
logger.error("Unexpected error in get_file_from_server for path 'GET: " +
str(path) + "': " + str(sys.exc_info()[0]))
raise

if res.status_code != requests.codes.ok:
msg = "Request GET status_code not ok, path: '" + \
str(path) + "'' status code: '" + str(res.status_code) + \
"'' response text: " + str(res.text)
logger.error(msg)
raise RuntimeError(msg)

logger.info("File retrieved, file_id: '" + file_id +
"', path: '" + path + "', status_code: " + str(res.status_code))
return res.text


def main():
parser = argparse.ArgumentParser(
description="Validate a dataset", add_help=False)
parser.add_argument('INPUT_FILE', type=str, help='Filepath or fileId.')
parser.add_argument('-target', action='store', dest='TARGET', type=str, default='class',
help='Name of target column', required=False)
    parser.add_argument('-identifier_type', action='store', dest='IDENTIFIER_TYPE', type=str, choices=['filepath', 'fileid'], default='filepath',
                        help='Whether INPUT_FILE is a filepath or a file id')
parser.add_argument('-categorical_features', action='store', dest='JSON_CATEGORIES', type=str, required=False, default=None,
help='JSON list of categorical features')
    parser.add_argument('-ordinal_features', action='store', dest='JSON_ORDINALS', type=str, required=False, default=None,
                        help='JSON dict of ordinal features and possible values')
parser.add_argument('-prediction_type', action='store', dest='PREDICTION_TYPE', type=str, choices=['classification', 'regression'], default="classification",
help="Classification or regression problem")

args = parser.parse_args()

# set up the file logger
logpath = os.path.join(os.environ['PROJECT_ROOT'], "target/logs")
if not os.path.exists(logpath):
os.makedirs(logpath)

formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
fhandler = logging.FileHandler(
os.path.join(logpath, 'validateDataset.log'))
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)

success = None
errorMessage = None
meta_json = None

categories = None
ordinals = None

try:
if args.JSON_CATEGORIES:
categories = simplejson.loads(args.JSON_CATEGORIES)
if args.JSON_ORDINALS:
ordinals = simplejson.loads(args.JSON_ORDINALS)
prediction_type = args.PREDICTION_TYPE
# print("categories: ")
# print(categories)

if (args.IDENTIFIER_TYPE == 'filepath'):
success, errorMessage = validate_data_from_filepath(
args.INPUT_FILE, prediction_type, args.TARGET, categories, ordinals)
else:
success, errorMessage = validate_data_from_server(
args.INPUT_FILE, prediction_type, args.TARGET, categories, ordinals)
meta_json = simplejson.dumps(
{"success": success, "errorMessage": errorMessage}, ignore_nan=True) # , ensure_ascii=False)
except Exception as e:
logger.error(traceback.format_exc())
meta_json = simplejson.dumps(
{"success": False, "errorMessage": "Exception: " + repr(e)}, ignore_nan=True) # , ensure_ascii=False)

print(meta_json)
sys.stdout.flush()


if __name__ == '__main__':
main()