In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import warnings
# Silence every warning for the rest of the session. This also hides
# deprecation warnings raised by the libraries used in the cells below.
warnings.filterwarnings("ignore")

In [2]:
# Ask whether the notebook runs on the VM; the answer selects the data root
# used when the training file is loaded. Only "yes" enables it (any letter
# case, surrounding whitespace ignored); every other answer means False.
vm = input("Running on the VM? [yes/no]: ").strip().lower() == 'yes'

yes


In [3]:
# Ask whether the all-features training cell should run. Only "yes" enables
# it (any letter case, surrounding whitespace ignored); every other answer
# means False.
train_mode = input("Run the all-features training? [yes/no]: ").strip().lower() == 'yes'

yes


In [4]:
# Data root: the /mnt/cephfs location when running on the VM, otherwise paths
# are resolved relative to the current working directory.
path = '/mnt/cephfs/ml_data/mc_2021/' if vm else ''

# Read the processed training sample and keep only the rows whose `edepR`
# value is below 17.2.
data_real = pd.read_csv(
    '{}processed_data/ProcessedTrainReal/ProcessedTrain_1M.csv.gz'.format(path)
).loc[lambda frame: frame['edepR'] < 17.2]

In [5]:
from sklearn.metrics import mean_squared_error

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Each absolute error is divided by the corresponding entry of ``y_true``,
    so the reference values must be passed first.

    Parameters:
        y_true: array-like of reference values (used as the denominator).
        y_pred: array-like of estimates, same shape as ``y_true``.

    Returns:
        The mean of ``|y_true - y_pred| / |y_true|`` multiplied by 100.
    """
    truth = np.array(y_true)
    estimate = np.array(y_pred)
    relative_error = np.abs((truth - estimate) / truth)
    return relative_error.mean() * 100

In [None]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
from sklearn.model_selection import KFold

if train_mode:
    # Baseline run: 10-fold cross-validation of XGBoost on every feature
    # column. The MAPE/RMSE mean and std over the folds are saved to
    # 'feature_selection/all_features_metrics.npz'.
    size = int(1e6)
    # The last 5 columns are not used as features; the first of them
    # (index n_feats) is the regression target.
    n_feats = len(data_real.columns) - 5

    # Convert once, under a separate name, so `data_real` stays a DataFrame:
    # the feature-selection cell further down still calls `data_real.iloc`.
    data_values = np.array(data_real)

    # Rows past `size` form the early-stopping validation set.
    # NOTE(review): this slice is empty whenever the filtered frame holds at
    # most `size` rows -- confirm the input really has more than 1e6 rows.
    X_val = data_values[size:, :n_feats]
    y_val = data_values[size:, n_feats]
    train_values = data_values[:size]

    n_folds = 10
    rmse_scores = []
    mape_scores = []

    # `shuffle` must be passed by keyword: it is keyword-only in current
    # scikit-learn releases and a positional `True` raises TypeError there.
    kfold = KFold(n_splits=n_folds, shuffle=True, random_state=22)
    for train, test in tqdm(kfold.split(train_values), "Folds... ",
                            total=n_folds, leave=False):
        xgbreg = XGBRegressor(
                max_depth=9,
                learning_rate=0.08,
                n_estimators=3000,
                n_jobs=10
        )

        X_train = train_values[train][:, :n_feats]
        y_train = train_values[train][:, n_feats]

        X_test = train_values[test][:, :n_feats]
        y_test = train_values[test][:, n_feats]

        xgbreg.fit(X_train, y_train,
                   verbose=False,
                   eval_set=[(X_val, y_val)],
                   early_stopping_rounds=5)

        y_predict = xgbreg.predict(X_test)
        rmse = mean_squared_error(y_test, y_predict)**0.5
        # Truth first: the helper divides by its first argument, so swapping
        # the arguments would normalise the error by the prediction.
        mape = mean_absolute_percentage_error(y_test, y_predict)
        rmse_scores.append(rmse)
        mape_scores.append(mape)

    # Row 0: MAPE (mean, std); row 1: RMSE (mean, std).
    result = np.array([[np.mean(mape_scores), np.std(mape_scores)], [np.mean(rmse_scores), np.std(rmse_scores)]])
    np.savez_compressed('feature_selection/all_features_metrics.npz', a=result)
else:
    # Training skipped: reuse the metrics written by an earlier training run
    # so that later cells reading `result` still have it.
    result = np.load('feature_selection/all_features_metrics.npz')['a']

Folds... : 0it [00:00, ?it/s]

In [None]:
# Show the fold-aggregated metrics: row 0 is MAPE (mean, std), row 1 is RMSE (mean, std).
print(result)

In [None]:
# Greedy forward feature selection: every round adds the one remaining feature
# whose inclusion gives the lowest 5-fold CV MAPE, and stops once the selected
# subset's MAPE is within `eps` of the all-features baseline.

# Row 0 of the saved array is [mean MAPE, std MAPE] of the all-features run.
# Both values come from 'all_features_metrics.npz', the file the training cell
# writes; loading the tolerance from 'all_features_metric.npz' (no trailing
# "s") would fail with FileNotFoundError.
baseline = np.load('feature_selection/all_features_metrics.npz', allow_pickle=True)['a']
all_features_metric = baseline[0][0]
eps = baseline[0][1]

opt_features = []
current_metrics = []
current_metric = 100

# Loop-invariant views: every column except the last 5 is a candidate feature,
# and the first of those last 5 columns is the target.
feature_frame = data_real.iloc[:, :-5]
target = data_real.iloc[:, -5]

features = feature_frame.columns
# Also stop once every candidate has been selected; otherwise the next round
# would call np.argmin on an empty list and raise ValueError.
while abs(all_features_metric - current_metric) > eps and len(features) > 0:
    metrics = []
    for feature in tqdm(features, "Features loop"):
        xgbreg = XGBRegressor(
            max_depth=9,
            learning_rate=0.08,
            n_estimators=3000,
            random_state=22,
        )

        scores = cross_val_score(
            xgbreg,
            feature_frame[opt_features+[feature]],
            target,
            cv=5,
            n_jobs=5,
            verbose=100,
            scoring='neg_mean_absolute_percentage_error'
        )

        # The scorer returns negated fractions; convert to a positive percentage.
        metric = -100*scores.mean()
        metrics.append(metric)

    best_metric_ind = np.argmin(metrics)
    current_metric = metrics[best_metric_ind]
    current_metrics.append(current_metric)
    opt_features.append(features[best_metric_ind])
    features = features.drop(features[best_metric_ind])

    print(current_metrics)
    print(opt_features)

    # Checkpoint after every round so an interrupted run keeps its progress.
    np.savez_compressed('feature_selection/opt_features.npz', a=np.array(opt_features))
    np.savez_compressed('feature_selection/current_metrics.npz', a=np.array(current_metrics))

In [None]:
# Write the selected feature names and the per-round metrics to disk, each as
# array `a` of its own compressed .npz file.
for npz_path, values in (
    ('feature_selection/opt_features.npz', opt_features),
    ('feature_selection/current_metrics.npz', current_metrics),
):
    np.savez_compressed(npz_path, a=np.array(values))