In [31]:
import json
from itertools import product
import logging

# import pipeliner
# import indexer
from core.trainingjob import TrainingJob
import core.datacache as datacache
import core.indexer as indexer


logging.basicConfig(
    level=logging.INFO,
    format="'%(name)s - %(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler("debug.log"),
        logging.StreamHandler()
    ]
)


def generate_combinations(steps):
    methods = []
    for step in steps.values():
        methods.append([(m, step["type"]) for m in step['method']])
    return list(product(*methods))


# TRAINING
training_config = {
    "seed": [42],  # int
    "random_scope": "dataset",  # str - ["dataset", "global", "models"]
    # List[str] - [path1, path2, ...]
    "paths": ["data/_RefSet/ALPINE_C_424_Murguzur_RMSE1.16", "data/_Raisin/Raisin_Tavernier_830_GFratio"],
    # List[(Splitter, Dict)]  - [splitter, params] -> to create dataset indexes trees
    "indexation": [],
    "pre_indexation": {
        "step_1": {
            "type": "filter",
            "method": ["A", "B", "C"],  # List[List[(TransformerMixin, Dict)]]
        },
        "step_2": {
            "type": "filter",
            "method": ["D", "E"],  # List[List[(TransformerMixin, Dict)]]
        }
    },
    "post_indexation": {
        "step_1": {
            "type": "augmentation",
            "method": [],  # List[List[(TransformerMixin, Dict)]]
        },
        "step_2": {
            "type": "preprocessing",
            "method": [],  # List[List[(TransformerMixin, Dict)]]
        },
    },
    "models": [
        # List[(Estimator, Dict)]  - [estimator, params] -> to create sklearn pipeline
        [],
    ],
}

# def


def train(config):
    for dataset_path in config["paths"]:
        # 1. Load data
        dataset_uid, dataset_name, data = datacache.register_dataset(dataset_path)
        print(dataset_uid, dataset_name)
        print("*"*100)
        # # 2 .Filter data
        # pre_indexation_steps = generate_combinations(config["pre_indexation"])
        # The above code is converting the data in a pandas DataFrame called `data` to numeric format
        # using the `pd.to_numeric()` function. The `args=("coerce",)` argument is used to convert any
        # non-numeric values to NaN (Not a Number) values. This is useful for cleaning and preparing data
        # for analysis.
        # print(json.dumps(pre_indexation_steps, indent=4))
        # # 3. Indexation
        # if not pre_indexation_steps:
        #     pre_indexation_steps = [None]

        # for pre_indexation_step in pre_indexation_steps:
        #     indexations = indexer.get(dataset, config["indexation"], pre_indexation_step)
        #     post_indexation_steps = generate_combinations(config["post_indexation"])
        #     runs = generate_combinations(indexations, post_indexation_steps, config["models"])
        #     for run in runs:
        #         scheduler.add(run)


train(training_config)

'root - 2023-06-02 14:29:33,118 [INFO] Loading file: data\_RefSet\ALPINE_C_424_Murguzur_RMSE1.16\Xcal.csv.gz
'root - 2023-06-02 14:29:33,133 [INFO] Header inference: True
'root - 2023-06-02 14:29:33,135 [INFO] Delimiter inference: ;
'root - 2023-06-02 14:29:34,850 [INFO] Data shape: (361, 2151)
'root - 2023-06-02 14:29:34,868 [INFO] Data shape after dropna(all) cols: (361, 2151)
'root - 2023-06-02 14:29:34,887 [INFO] Data shape after dropna(all) rows: (361, 2151)
'root - 2023-06-02 14:29:34,899 [INFO] data_sample:
[[1.0522316  1.0564474 ]
 [0.91029036 0.9535987 ]]
'root - 2023-06-02 14:29:34,902 [INFO] Loading file: data\_RefSet\ALPINE_C_424_Murguzur_RMSE1.16\Xval.csv.gz
'root - 2023-06-02 14:29:34,915 [INFO] Header inference: True
'root - 2023-06-02 14:29:34,916 [INFO] Delimiter inference: ;
'root - 2023-06-02 14:29:35,679 [INFO] Data shape: (63, 2151)
'root - 2023-06-02 14:29:35,691 [INFO] Data shape after dropna(all) cols: (63, 2151)
'root - 2023-06-02 14:29:35,703 [INFO] Data shape

2cf7ed39d11ff271 ALPINE_C_424_Murguzur_RMSE1.16
****************************************************************************************************


'root - 2023-06-02 14:29:35,950 [INFO] Data shape after dropna(all) cols: (664, 125)
'root - 2023-06-02 14:29:35,954 [INFO] Data shape after dropna(all) rows: (664, 125)
'root - 2023-06-02 14:29:35,956 [INFO] Rows to remove: [432, 434, 428, 438]
'root - 2023-06-02 14:29:35,958 [INFO] data_sample:
[[0.2384144 0.2335801]
 [0.248429  0.2475333]]
'root - 2023-06-02 14:29:35,960 [INFO] Loading file: data\_Raisin\Raisin_Tavernier_830_GFratio\Xval.csv
'root - 2023-06-02 14:29:35,963 [INFO] Header inference: False
'root - 2023-06-02 14:29:35,964 [INFO] Delimiter inference: ;
'root - 2023-06-02 14:29:36,028 [INFO] Data shape: (166, 125)
'root - 2023-06-02 14:29:36,032 [INFO] Data shape after dropna(all) cols: (166, 125)
'root - 2023-06-02 14:29:36,036 [INFO] Data shape after dropna(all) rows: (166, 125)
'root - 2023-06-02 14:29:36,037 [INFO] data_sample:
[[0.2509339 0.2475901]
 [0.2625283 0.2588682]]
'root - 2023-06-02 14:29:36,040 [INFO] Loading file: data\_Raisin\Raisin_Tavernier_830_GFratio\

72cbaebc4c46174c Raisin_Tavernier_830_GFratio
****************************************************************************************************


In [30]:
%load_ext autoreload
%autoreload 2
import core.datacache as datacache
import numpy as np
import logging

logging.basicConfig(
    level=logging.INFO,
    format="'%(name)s - %(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler("debug.log"),
        # logging.StreamHandler()
    ]
)

# data = datacache.load_csv("data/_RefSet/ALPINE_C_424_Murguzur_RMSE1.16/XCal.csv.gz")
# cache_hash, dataset_name, cache = datacache.register_dataset("data/test_data")
# print(cache_hash, dataset_name)
# for k, v in cache.items():
#     if isinstance(v, np.ndarray):
#         print(k, v.shape)

cache_hash, dataset_name, cache = datacache.register_dataset("data/test_data_2")
print(cache_hash, dataset_name)
for k, v in cache.items():
    if isinstance(v, np.ndarray):
        print(k, v.shape)


# data.isna().any(axis=0)

# hex(hash(str(data)))



'root - 2023-06-02 14:28:51,969 [INFO] Loading file: data\test_data_2\Xcal - Copie.csv
'root - 2023-06-02 14:28:51,973 [INFO] Header inference: False
'root - 2023-06-02 14:28:51,974 [INFO] Delimiter inference: ;
'root - 2023-06-02 14:28:52,158 [INFO] Data shape: (664, 125)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


'root - 2023-06-02 14:28:52,163 [INFO] Data shape after dropna(all) cols: (664, 125)
'root - 2023-06-02 14:28:52,167 [INFO] Data shape after dropna(all) rows: (664, 125)
'root - 2023-06-02 14:28:52,169 [INFO] Rows to remove: [432, 434, 428, 438]
'root - 2023-06-02 14:28:52,172 [INFO] data_sample:
[[0.2384144 0.2335801]
 [0.248429  0.2475333]]
'root - 2023-06-02 14:28:52,174 [INFO] Loading file: data\test_data_2\Xcal.csv
'root - 2023-06-02 14:28:52,177 [INFO] Header inference: False
'root - 2023-06-02 14:28:52,178 [INFO] Delimiter inference: ;
'root - 2023-06-02 14:28:52,351 [INFO] Data shape: (664, 125)
'root - 2023-06-02 14:28:52,356 [INFO] Data shape after dropna(all) cols: (664, 125)
'root - 2023-06-02 14:28:52,360 [INFO] Data shape after dropna(all) rows: (664, 125)
'root - 2023-06-02 14:28:52,362 [INFO] Rows to remove: [432, 434, 428, 438]
'root - 2023-06-02 14:28:52,364 [INFO] data_sample:
[[0.2384144 0.2335801]
 [0.248429  0.2475333]]
'root - 2023-06-02 14:28:52,367 [INFO] Loadi

b9489c9781d884a test_data_2


In [6]:
import numpy as np


def remove_nan_rows(X, y):
    nan_indexes_X = np.isnan(X).any(axis=1)
    nan_indexes_y = np.isnan(y).any(axis=0)

    removed_indexes_X = np.where(nan_indexes_X)[0]
    removed_indexes_y = np.where(nan_indexes_y)[0]
    

    matching_indexes_X = np.delete(np.arange(len(X)), removed_indexes_X)
    matching_indexes_y = np.delete(np.arange(len(y)), removed_indexes_y)

    X_matched = X[matching_indexes_X]
    y_matched = y[matching_indexes_y]

    return X_matched, y_matched


# Example usage
# Assuming you have X_train and y_train as the original arrays

# Generate example arrays with NaN values
X_train = np.array([[1, 2, np.nan], [4, 5, 6], [np.nan, 8, 9]])
y_train = np.array([10, 11, np.nan])

# Remove rows with NaN values and get matching arrays
X_matched, y_matched = remove_nan_rows(X_train, y_train)

print("X_matched:\n", X_matched)
print("y_matched:\n", y_matched)

X_matched:
 [[4. 5. 6.]]
y_matched:
 [11. nan]
