In [1]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn import model_selection
from sklearn import metrics

from xgboost import XGBRegressor

In [2]:
# Load the competition train/test CSVs (train first, then test);
# index_col=0 makes the first column of each file the row index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../input/30-days-of-ml/train.csv',
                     '../input/30-days-of-ml/test.csv')
)

In [3]:
# Preview the first five training rows to sanity-check the column layout.
df_train.head(n=5)

Unnamed: 0_level_0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,B,B,B,C,B,B,A,E,C,N,...,0.400361,0.160266,0.310921,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985,8.113634
2,B,B,A,A,B,D,A,F,A,O,...,0.533087,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233
3,A,A,A,C,B,D,A,D,A,F,...,0.650609,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351
4,B,B,A,C,B,D,A,E,C,K,...,0.66898,0.239061,0.732948,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682,8.049253
6,A,A,A,C,B,D,A,E,A,N,...,0.686964,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.97226


In [4]:
# Separate the regression target from the feature columns.
y_train = df_train['target']
X_train = df_train.drop(columns='target')

# The test frame has no target column to remove, so it is used as-is
# (this is an alias of df_test, not a copy).
X_test = df_test

In [5]:
# Partition feature names by a substring test on the column name:
# names containing 'cat' are treated as categorical, all others as numeric.
cat_col = [name for name in X_train.columns if 'cat' in name]
num_col = [name for name in X_train.columns if 'cat' not in name]

In [6]:
# Per-fold test-set predictions; one array is appended per fold so they can
# be averaged across folds after the loop.
y_test_pred = []

kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, valid_idx) in enumerate(kf.split(X_train)):

    # Build this fold's train / validation parts. .copy() ensures the encoded
    # columns written below never modify X_train itself.
    X_train_f = X_train.iloc[train_idx].copy()
    y_train_f = y_train.iloc[train_idx]

    X_valid_f = X_train.iloc[valid_idx].copy()
    y_valid_f = y_train.iloc[valid_idx]

    # Fresh copy every fold so df_test keeps its raw categorical values.
    X_test = df_test.copy()

    # Encode categorical variables. The encoder is fitted on the fold's
    # training part only and then applied to the validation and test frames.
    # NOTE(review): with the default OrdinalEncoder settings a category that
    # is absent from the training part should raise on transform — confirm
    # this cannot happen for this dataset.
    encoder = preprocessing.OrdinalEncoder()
    X_train_f[cat_col] = encoder.fit_transform(X_train_f[cat_col])
    X_valid_f[cat_col] = encoder.transform(X_valid_f[cat_col])
    X_test[cat_col] = encoder.transform(X_test[cat_col])

    # Modeling. random_state=fold gives every fold's model a different seed.
    # NOTE(review): tree_method='gpu_hist', gpu_id and predictor are reported
    # as deprecated in XGBoost 2.x (in favour of device='cuda' with
    # tree_method='hist') — confirm against the installed version before
    # changing them.
    model = XGBRegressor(random_state=fold,
                         tree_method='gpu_hist',
                         gpu_id=0,
                         predictor='gpu_predictor')
    model.fit(X_train_f, y_train_f)

    # Validation RMSE. Arguments follow the documented (y_true, y_pred) order,
    # and the root is taken explicitly instead of passing `squared=False`,
    # which newer scikit-learn releases no longer accept.
    y_pred_f = model.predict(X_valid_f)
    rmse = np.sqrt(metrics.mean_squared_error(y_valid_f, y_pred_f))
    # Label folds 1..n_splits using the loop counter (previously a literal 1
    # was printed for every fold).
    print(f'fold-{fold + 1} rmse : {rmse:.5f}')

    y_test_f = model.predict(X_test)
    y_test_pred.append(y_test_f)
    

fold-1 rmse : 0.72457
fold-1 rmse : 0.72425
fold-1 rmse : 0.72707
fold-1 rmse : 0.72684
fold-1 rmse : 0.72572


In [7]:
# Stack the per-fold prediction arrays as columns (one column per fold),
# then average across folds to get a single prediction per test row.
fold_matrix = np.column_stack(y_test_pred)
y_test_pred = fold_matrix.mean(axis=1)