In [1]:
import os,gc,pickle
import joblib 
from tqdm import tqdm
import pandas as pd
import polars as pl
import lightgbm as lgb
import xgboost as xgb
import catboost as cbt
import numpy as np 
from sklearn.metrics import r2_score

from joblib import Parallel, delayed

import kaggle_evaluation.jane_street_inference_server

# !pip install lightgbm==4.2.0 -i https://mirrors.aliyun.com/pypi/simple/
# !pip install catboost==1.2.7 -i https://mirrors.aliyun.com/pypi/simple/
# !pip install xgboost==2.0.3 -i https://mirrors.aliyun.com/pypi/simple/
# !pip install joblib==1.4.2 -i https://mirrors.aliyun.com/pypi/simple/


def reduce_mem_usage(df, float16_as32=True):
    #memory_usage()是df每列的内存使用量,sum是对它们求和, B->KB->MB
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:#遍历每列的列名
        col_type = df[col].dtype#列名的type
        if col_type != object and str(col_type)!='category':#不是object也就是说这里处理的是数值类型的变量
            c_min,c_max = df[col].min(),df[col].max() #求出这列的最大值和最小值
            if str(col_type)[:3] == 'int':#如果是int类型的变量,不管是int8,int16,int32还是int64
                #如果这列的取值范围是在int8的取值范围内,那就对类型进行转换 (-128 到 127)
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                #如果这列的取值范围是在int16的取值范围内,那就对类型进行转换(-32,768 到 32,767)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                #如果这列的取值范围是在int32的取值范围内,那就对类型进行转换(-2,147,483,648到2,147,483,647)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                #如果这列的取值范围是在int64的取值范围内,那就对类型进行转换(-9,223,372,036,854,775,808到9,223,372,036,854,775,807)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:#如果是浮点数类型.
                #如果数值在float16的取值范围内,如果觉得需要更高精度可以考虑float32
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    if float16_as32:#如果数据需要更高的精度可以选择float32
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float16)  
                #如果数值在float32的取值范围内，对它进行类型转换
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                #如果数值在float64的取值范围内，对它进行类型转换
                else:
                    df[col] = df[col].astype(np.float64)
    #计算一下结束后的内存
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    #相比一开始的内存减少了百分之多少
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [2]:
feature_names = [f"feature_{i:02d}" for i in range(79)] + [f"responder_{idx}_lag_1" for idx in range(9)]

label_name = 'responder_6'
weight_name = 'weight'
train_data_path = '/kaggle/input/js24-preprocessing-create-lags/training.parquet'
valid_data_path = '/kaggle/input/js24-preprocessing-create-lags/validation.parquet'

train_df = pl.scan_parquet(train_data_path)
valid_df = pl.scan_parquet(valid_data_path)


In [3]:
train_df = train_df.collect().to_pandas()
train_df = reduce_mem_usage(train_df, False)

valid_df = valid_df.collect().to_pandas()
valid_df = reduce_mem_usage(valid_df, False)

X_train = train_df[feature_names].values
y_train = train_df[label_name].values
w_train = train_df[weight_name].values
X_valid = valid_df[feature_names].values
y_valid = valid_df[label_name].values
w_valid = valid_df[weight_name].values


Memory usage of dataframe is 8119.52 MB
Memory usage after optimization is: 4109.88 MB
Decreased by 49.4%
Memory usage of dataframe is 418.00 MB
Memory usage after optimization is: 211.58 MB
Decreased by 49.4%


In [4]:
use_es = False
if not use_es:
    X_train = np.concatenate((X_train,X_valid), axis=0)
    y_train = np.concatenate((y_train,y_valid), axis=0)
    w_train = np.concatenate((w_train,w_valid), axis=0)

# early stop

In [5]:
if use_es:
    lgb_params = {
        'objective': 'regression',
        "device"           : "gpu",
        'metric': 'l2',                                      # Root Mean Squared Error
        'boosting_type': 'gbdt',                               # Gradient Boosted Decision Trees
        "colsample_bytree" : 0.8,
        "subsample"        : 0.8,
        "num_leaves"        : 31,
        'learning_rate': 0.05,
        'n_estimators':   1000,
    }
    model = lgb.LGBMRegressor(**lgb_params)
    
    model.fit(X_train, y_train, w_train,
                          # eval_metric=[r2_lgb],
                          eval_metric = 'l1',
                          eval_set=[(X_valid, y_valid, w_valid)], 
                          callbacks=[
                              lgb.early_stopping(70), 
                              lgb.log_evaluation(10)
                          ])
    
    with open(f"lgb_model_use_es.pkl", "wb") as fp:
        pickle.dump(model, fp)
else:
    lgb_params = {
        'objective': 'regression',
        "device"           : "gpu",
        'metric': 'l2',                                      # Root Mean Squared Error
        'boosting_type': 'gbdt',                               # Gradient Boosted Decision Trees
        "colsample_bytree" : 0.8,
        "subsample"        : 0.8,
        "num_leaves"        : 31,
        'learning_rate': 0.05,
        'n_estimators':   260,
    }
    model = lgb.LGBMRegressor(**lgb_params)
    
    model.fit(X_train, y_train, w_train,
                          # eval_metric=[r2_lgb],
                          eval_metric = 'l1',
                          eval_set=[(X_valid, y_valid, w_valid)],
             )
    
    with open(f"lgb_model_no_es.pkl", "wb") as fp:
        pickle.dump(model, fp)

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 21735
[LightGBM] [Info] Number of data points in the train set: 22104280, number of used features: 88
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 88 dense feature groups (1855.07 MB) transferred to GPU in 1.185340 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score -0.001785


In [6]:
if use_es:
    XGB_Params = {
        'learning_rate': 0.05,
        'max_depth': 7,
        'n_estimators': 1000,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'tree_method': 'gpu_hist',
        'device' : 'cuda',
        'n_gpus' : 2,
        # 'eval_metric': 'rmse',
        'eval_metric':'rmse',
    }
    
    model = xgb.XGBRegressor(**XGB_Params)
    
    model.fit(X_train, y_train, sample_weight=w_train, 
                          eval_set=[(X_valid, y_valid)], 
                          sample_weight_eval_set=[w_valid], 
                          verbose=10, 
                          early_stopping_rounds=60)
    
    with open(f"xgb_model_use_es.pkl", "wb") as fp:
        pickle.dump(model, fp)

else:
    XGB_Params = {
    'learning_rate': 0.05,
    'max_depth': 7,
    'n_estimators': 80,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'tree_method': 'gpu_hist',
    'device' : 'cuda',
    'n_gpus' : 2,
    # 'eval_metric': 'rmse',
    'eval_metric':'rmse',
}

    model = xgb.XGBRegressor(**XGB_Params)
    
    model.fit(X_train, y_train, sample_weight=w_train, 
                          eval_set=[(X_valid, y_valid)], 
                          sample_weight_eval_set=[w_valid], 
                          verbose=10, )
    
    with open(f"xgb_model_no_es.pkl", "wb") as fp:
        pickle.dump(model, fp)


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "n_gpus" } are not used.



[0]	validation_0-rmse:0.73081
[10]	validation_0-rmse:0.72971
[20]	validation_0-rmse:0.72908
[30]	validation_0-rmse:0.72861
[40]	validation_0-rmse:0.72833
[50]	validation_0-rmse:0.72806
[60]	validation_0-rmse:0.72783
[70]	validation_0-rmse:0.72761
[79]	validation_0-rmse:0.72739



    E.g. tree_method = "hist", device = "cuda"



In [7]:
if use_es:
    CBT_Params = {'task_type':'GPU',
            'loss_function':'RMSE',
            'eval_metric' : 'RMSE',
           # 'bagging_temperature' : 0.50,
           'iterations'          : 1000 ,
           'learning_rate'       : 0.05,
           'max_depth'           : 8,
           # 'l2_leaf_reg'         : 1.25,
           'min_data_in_leaf'    : 100,
           # 'random_strength'     : 0.25, 
           'verbose'             : 0,
          }
    
    model = cbt.CatBoostRegressor(**CBT_Params)
    
    evalset = cbt.Pool(X_valid, y_valid, weight=w_valid) 
    # Train CatBoost model with early stopping and verbose logging
    model.fit(X_train, y_train, sample_weight=w_train, 
              eval_set=[evalset], 
              verbose=10, 
              early_stopping_rounds=80)
    
    with open(f"cbt_model_use_es.pkl", "wb") as fp:
        pickle.dump(model, fp)

else:
    CBT_Params = {'task_type':'GPU',
            'loss_function':'RMSE',
            'eval_metric' : 'RMSE',
           # 'bagging_temperature' : 0.50,
           'iterations'          : 315 ,
           'learning_rate'       : 0.05,
           'max_depth'           : 8,
           # 'l2_leaf_reg'         : 1.25,
           'min_data_in_leaf'    : 100,
           # 'random_strength'     : 0.25, 
           'verbose'             : 0,
          }
    
    model = cbt.CatBoostRegressor(**CBT_Params)
    
    evalset = cbt.Pool(X_valid, y_valid, weight=w_valid) 
    # Train CatBoost model with early stopping and verbose logging
    model.fit(X_train, y_train, sample_weight=w_train, 
              eval_set=[evalset], 
              verbose=10, )
    
    with open(f"cbt_model_no_es.pkl", "wb") as fp:
        pickle.dump(model, fp)

0:	learn: 0.8292765	test: 0.7308430	best: 0.7308430 (0)	total: 13.8s	remaining: 1h 12m 12s
10:	learn: 0.8273700	test: 0.7299705	best: 0.7299705 (10)	total: 15.7s	remaining: 7m 12s
20:	learn: 0.8263006	test: 0.7295384	best: 0.7295384 (20)	total: 17.4s	remaining: 4m 4s
30:	learn: 0.8255918	test: 0.7292741	best: 0.7292741 (30)	total: 19.2s	remaining: 2m 55s
40:	learn: 0.8250007	test: 0.7290924	best: 0.7290924 (40)	total: 21s	remaining: 2m 20s
50:	learn: 0.8245178	test: 0.7289595	best: 0.7289595 (50)	total: 22.8s	remaining: 1m 58s
60:	learn: 0.8239851	test: 0.7287952	best: 0.7287952 (60)	total: 24.7s	remaining: 1m 42s
70:	learn: 0.8235898	test: 0.7286631	best: 0.7286631 (70)	total: 26.5s	remaining: 1m 30s
80:	learn: 0.8231172	test: 0.7285458	best: 0.7285458 (80)	total: 28.3s	remaining: 1m 21s
90:	learn: 0.8227517	test: 0.7284619	best: 0.7284619 (90)	total: 30.2s	remaining: 1m 14s
100:	learn: 0.8223580	test: 0.7283723	best: 0.7283723 (100)	total: 32s	remaining: 1m 7s
110:	learn: 0.8219460	t

# no early stop

In [8]:
# # Define Parameters
# train_data = lgb.Dataset(X_train, label=y_train, weight=w_train)
# valid_data = lgb.Dataset(X_valid, label=y_valid, weight=w_valid, reference=train_data)

# lgb_params = {
#     'objective': 'regression',
#     "device"           : "gpu",
#     'metric': 'l2',                                      # Root Mean Squared Error
#     'boosting_type': 'gbdt',                               # Gradient Boosted Decision Trees
#     "colsample_bytree" : 0.8,
#     "subsample"        : 0.8,
#     "num_leaves"        : 31,
#     'learning_rate': 0.05,
#     'n_estimators':   1000,
# }

# # Train the model
# lgbm_model = lgb.train(
#     params,
#     train_data,
#     valid_sets=[train_data, valid_data],
#     callbacks=[
#                 lgb.early_stopping(100), 
#                 lgb.log_evaluation(10)],
# )

# lgbm_model.save_model(f"lgbm_model_offline.json")

# if valid_data is not None:
#     y_pred_valid = lgbm_model.predict(X_valid)
#     valid_score = r2_score(y_valid, y_pred_valid, sample_weight=w_valid )
#     print('valid score:',valid_score)

In [9]:
# def get_model(model_name, n_estimators=None):
#     # XGBoost parameters
    
#     XGB_Params = {
#         'learning_rate': 0.05,
#         'max_depth': 7,
#         'n_estimators': 1000 if n_estimators is None else n_estimators,
#         'subsample': 0.8,
#         'colsample_bytree': 0.8,
#         'reg_alpha': 1,
#         'reg_lambda': 2,
#         'random_state': 2024,
#         'tree_method': 'gpu_hist',
#         'device' : 'cuda',
#         'n_gpus' : 2,
#         # 'eval_metric': 'rmse',
#         'eval_metric':r2_xgb,
#     }

#     LGB_Params={"device"           : "gpu",
#              "objective"        : "regression_l2",
#              "metrics"          : "custom",
#              "n_estimators"     :  1000 if n_estimators is None else n_estimators,
#              "max_depth"        : 8,
#              "learning_rate"    : 0.05,
#              "colsample_bytree" : 0.8,
#              "subsample"        : 0.80,
#              "random_state"     : 2024,
#              "reg_alpha"        : 0.1,
#              "reg_lambda"       : 1.0,
#              "verbosity"        : -1,
#          }
    
#     CBT_Params = {'task_type':'GPU',
#            'random_state':2024,
#            'eval_metric'         : r2_cbt(),
#             'loss_function':'RMSE',
#             # 'eval_metric' : 'RMSE',
#            # 'bagging_temperature' : 0.50,
#            'iterations'          : 1000 if n_estimators is None else n_estimators,
#            'learning_rate'       : 0.05,
#            'max_depth'           : 8,
#            'l2_leaf_reg'         : 1.25,
#            # 'min_data_in_leaf'    : 24,
#            # 'random_strength'     : 0.25, 
#            'verbose'             : 0,
#           }
    
#     if model_name == 'xgb':
#         model = xgb.XGBRegressor(**XGB_Params)
#     if model_name == 'lgb':
#         model = lgb.LGBMRegressor(**LGB_Params)
#     if model_name == 'cbt':
#         model = cbt.CatBoostRegressor(**CBT_Params)
#     return model

In [10]:



# def train_model(train_data, valid_data=None,model_name='lgb',n_estimators=None ):
    
#     # Get the model from the dictionary
#     model = get_model(model_name, n_estimators)
    
#     X_train, y_train, w_train = train_data
#     if valid_data is not None:
#         X_valid, y_valid, w_valid = valid_data
        
#     # Train the model based on the type (LightGBM, XGBoost, or CatBoost)
#     if model_name == 'lgb':
#         # Train LightGBM model with early stopping and evaluation logging
#         if valid_data is not None:
#             model.fit(X_train, y_train, w_train,
#                       # eval_metric=[r2_lgb],
#                       eval_metric = 'l1',
#                       eval_set=[(X_valid, y_valid, w_valid)], 
#                       callbacks=[
#                           lgb.early_stopping(100), 
#                           lgb.log_evaluation(10)
#                       ])
#         else:
#             model.fit(X_train,y_train,sample_weight=w_train)
        
#     elif model_name == 'cbt':
#         # Prepare evaluation set for CatBoost
#         if valid_data is not None:
#             evalset = cbt.Pool(X_valid, y_valid, weight=w_valid)
            
#             # Train CatBoost model with early stopping and verbose logging
#             model.fit(X_train, y_train, sample_weight=w_train, 
#                       eval_set=[evalset], 
#                       verbose=10, 
#                       early_stopping_rounds=100)
#         else:
#             model.fit(X_train,y_train,sample_weight=w_train)
        
#     else:
#         # Train XGBoost model with early stopping and verbose logging
#         if valid_data is not None:
#             model.fit(X_train, y_train, sample_weight=w_train, 
#                       eval_set=[(X_valid, y_valid)], 
#                       sample_weight_eval_set=[w_valid], 
#                       verbose=10, 
#                       early_stopping_rounds=100)
#         else:
#             model.fit(X_train,y_train,sample_weight=w_train)
        
#     if valid_data is not None:
#         y_pred_valid = model.predict(X_valid)
#         valid_score = r2_score(y_valid, y_pred_valid, sample_weight=w_valid )
#         print('valid score:',valid_score)
        
#     # # Save the trained model to a file
#     # joblib.dump(model, f'./models/{model_name}_train{train_date_start}-{train_date_end}_valid{valid_date_start}-{valid_date_end}_score-{valid_score:.6f}.model')
#     with open(f"{model_name}.pkl", "wb") as fp:
#             pickle.dump(model, fp)




In [11]:
# model_name = 'cbt'
# num_trees = None

# train_model(train_data, valid_data, model_name, num_trees)