In [1]:
%load_ext autoreload
%autoreload 2

import functools
import gc
import logging
import pickle
import sys
from pathlib import Path
from typing import Dict, List, Tuple, Callable

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
#import tensorflow_addons as tfa
from IPython.display import clear_output
#from keras.regularizers import L1L2
from multiprocess import Pool
from multiprocess.dummy import Pool as ThreadPool
from tqdm import tqdm
import lightgbm as lgb


logging.basicConfig(level=logging.DEBUG, format='%(asctime)s :: %(funcName)s :: %(message)s')

from ccf.models_cust import ModelLGB
from ccf.datasets import get_sample, get_initial_setting, get_left_right_name, get_std
from sklearn.metrics import r2_score
from ccf.utils import delete_objs, cuttoff_ts_in_df, natural_reindex, sort_df, create_block_vars_from_separate
from ccf.analytics import get_analytics_row, get_pairs_rel_analytics, paired_test_greedy_fs
#from ccf.callbacks import FrequencyCallback, FrequencyEpoch
from ccf.preprocess import get_sample_2d_lgb
from tensorflow.keras.callbacks import EarlyStopping
pd.set_option('display.max_columns', 10_000)

2022-10-30 12:06:15,475 :: <module> :: Falling back to TensorFlow client; we recommended you install the Cloud TPU client directly with pip install cloud-tpu-client.
2022-10-30 12:06:15,646 :: <module> :: Creating converter from 7 to 5
2022-10-30 12:06:15,647 :: <module> :: Creating converter from 5 to 7
2022-10-30 12:06:15,647 :: <module> :: Creating converter from 7 to 5
2022-10-30 12:06:15,648 :: <module> :: Creating converter from 5 to 7


In [2]:
def one_factor_choose(base, target_type, corr, full_f_list):
    """Call ``paired_test_greedy_fs`` for the last feature of ``full_f_list``.

    The last entry of ``full_f_list`` is passed as the only candidate and all
    entries before it as the current feature list. Ten fold descriptors, a
    ``ModelLGB`` factory and a sample-loading callable are built here and
    forwarded as well. ``clear_output()`` is called once the run returns.

    Args:
        base: Root directory; interpolated into every path via f-strings.
        target_type: Sub-directory of ``base`` holding the fold files; also
            the prefix of the experiment name.
        corr: Number whose ``int(corr * 10)`` value becomes the suffix of the
            experiment name.
        full_f_list: Sliceable sequence of feature names with at least one
            element.

    Returns:
        None.
    """
    # Split the incoming list: everything but the tail is the current set,
    # the tail is the single candidate under test.
    current_feature_list = full_f_list[:-1]
    features_path = Path(f'{base}/features_1f.parquet.gzip')

    # One [X, train_val, val, features] descriptor per fold, numbered 1..10.
    folds = [
        [
            Path(f'{base}/{target_type}/X_{fold_no}.parquet.gzip'),
            Path(f'{base}/{target_type}/train_val_{fold_no}.parquet.gzip'),
            Path(f'{base}/{target_type}/val_{fold_no}.parquet.gzip'),
            Path(f'{base}/features_1f.parquet.gzip'),
        ]
        for fold_no in range(1, 11)
    ]

    list_candidates = sorted([full_f_list[-1]])

    # No observation caps are applied to any of the splits.
    count_obs_train = count_obs_val_train = count_obs_val = None

    selection_rule = {
        "field_name": "rel_diff_macro_lower_boot_95",
        "ascending": True,
    }

    dict_fields, _, _ = get_initial_setting(features_path, count_cuttoff=0)

    corr_tag = str(int(corr * 10))
    experiment_name = f'{target_type}_block_vars_corr_0{corr_tag}'
    analytics_path = Path(f'{base}/one_factor/')

    def get_sample_func(possible_feature_list, base_path, count_obs, scaler, features_path):
        # `scaler` is accepted for signature compatibility but not forwarded.
        return get_sample_2d_lgb(
            possible_feature_list,
            base_path,
            count_obs,
            features_path,
            categoricals=[],
            experiment_name=None,
            keys=["time"],
        )

    logging.debug(f"experiment_name == {experiment_name}")

    booster_params = {
        "num_leaves": 131_072,
        "max_bin": 256,
        "learning_rate": 0.01,
        "objective": "regression",
        "metric": "rmse",
        "max_depth": 6,
        "feature_fraction": 1.0,
        "feature_fraction_bynode": 0.6,
    }
    curr_setting = {
        "verbose_eval": 50,
        "num_boost_round": 500,
        "early_stopping_rounds": 50,
        "params": booster_params,
    }

    def model_class(train_matrix_shape, name):
        # `train_matrix_shape` is part of the expected signature but unused here.
        return ModelLGB(
            save_path=Path(f'{base}') / Path("saved_models"),
            name=name,
            metric=r2_score,
            learning_setting=curr_setting,
        )

    pred_iter_perf = [0.0] * 50

    paired_test_greedy_fs(
        current_feature_list,
        list_candidates,
        dict_fields,
        folds,
        count_obs_train,
        count_obs_val,
        experiment_name,
        model_class,
        analytics_path,
        selection_rule,
        get_sample_func,
        pred_iter_perf,
        count_obs_val_train,
        print_iteration=False,
        count_iteration=5,
    )
    clear_output()

In [3]:
# Driver: for each target type and correlation threshold, read the selected
# variable list from the analytics CSV and run the one-factor check on it.
base = '../../Storage/alber/'
for target_type in ['ret_1_10_folds_exp']:
    for corr in (0.5, 0.6, 0.7, 0.8, 0.9, 1.0):
        # The CSV name encodes the threshold as "0" + int(corr * 10), e.g. 0.5 -> "05".
        corr_tag = int(corr * 10)
        analytics_csv = f'{base}/{target_type}/analytics/one_factor_block_vars_vars_60_corr_0{corr_tag}.csv'
        full_f_list = list(pd.read_csv(analytics_csv).new_var)
        print(target_type, corr, len(full_f_list))
        one_factor_choose(base, target_type, corr, full_f_list)