In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [2]:
# Load the fold-annotated training set, the test set and the submission template.
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
# Take an explicit copy: new columns are written into df_test below, and doing
# that on a slice of the original frame triggers SettingWithCopyWarning.
df_test = df_test[useful_features].copy()

# Read the fold labels from the data instead of hard-coding their count, so the
# test-set average below always divides by the number of folds actually used.
fold_ids = sorted(df["kfold"].unique().tolist())

# Out-of-fold target encoding: each row's encoding for a categorical column is
# the mean target of that category computed on the *other* folds, so a row never
# contributes its own target to its own feature.
for col in object_cols:
    enc_name = f"tar_enc_{col}"
    temp_df = []
    temp_test_feat = None
    for fold in fold_ids:
        xtrain = df[df.kfold != fold].reset_index(drop=True)
        xvalid = df[df.kfold == fold].reset_index(drop=True)
        # category -> mean target, learned from the training folds only
        feat = xtrain.groupby(col)["target"].agg("mean").to_dict()
        # Categories that do not occur in the training folds map to NaN.
        xvalid.loc[:, enc_name] = xvalid[col].map(feat)
        temp_df.append(xvalid)
        fold_test_feat = df_test[col].map(feat)
        if temp_test_feat is None:
            temp_test_feat = fold_test_feat
        else:
            temp_test_feat = temp_test_feat + fold_test_feat

    # Test rows receive the average of the per-fold encodings.
    df_test.loc[:, enc_name] = temp_test_feat / len(fold_ids)
    # Rebuild df from the encoded validation parts (rows end up grouped by fold).
    # ignore_index avoids the duplicated 0..n-1 labels each part carries after
    # its reset_index.
    df = pd.concat(temp_df, ignore_index=True)

In [3]:
# Show (rows, columns) of df after the tar_enc_* columns were added.
df.shape

(300000, 37)

In [4]:
# Preview the first rows of df, including the new tar_enc_* columns.
df.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,tar_enc_cat0,tar_enc_cat1,tar_enc_cat2,tar_enc_cat3,tar_enc_cat4,tar_enc_cat5,tar_enc_cat6,tar_enc_cat7,tar_enc_cat8,tar_enc_cat9
0,1,B,B,B,C,B,B,A,E,C,...,8.245979,8.20385,8.22478,8.236717,8.240572,8.229516,8.240567,8.240285,8.280709,8.249782
1,8,B,A,A,A,B,D,A,E,C,...,8.245979,8.276689,8.244491,8.274495,8.240572,8.250754,8.240567,8.240285,8.280709,8.259165
2,13,A,B,A,C,B,B,A,E,A,...,8.23897,8.20385,8.244491,8.236717,8.240572,8.229516,8.240567,8.240285,8.230681,8.249782
3,14,B,B,A,C,B,D,A,E,C,...,8.245979,8.20385,8.244491,8.236717,8.240572,8.250754,8.240567,8.240285,8.280709,8.234356
4,25,B,B,A,C,B,D,A,E,C,...,8.245979,8.20385,8.244491,8.236717,8.240572,8.250754,8.240567,8.240285,8.280709,8.259165


In [5]:
# Rebuild the feature list now that df also carries the tar_enc_* columns.
useful_features = [
    name
    for name in df.columns
    if name not in ("id", "target", "kfold")
]
# startswith("cat") selects only the raw categorical columns; the tar_enc_cat*
# columns contain "cat" as well but begin with "tar_enc_", so they are excluded.
object_cols = [name for name in useful_features if name.startswith("cat")]
# Give the test frame the same columns, in the same order, as the training features.
df_test = df_test[useful_features]

In [6]:
# Per-fold training: fit one XGBoost model per held-out fold, record its
# validation RMSE, and keep its test-set predictions for later averaging.
final_predictions = []
scores = []
# Iterate over the fold labels present in the data rather than a fixed count.
for fold in sorted(df["kfold"].unique().tolist()):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    # Copy the feature slices: the categorical columns are overwritten below,
    # and assigning into a slice triggers SettingWithCopyWarning.
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    # Fit the encoder on the training folds only, then reuse it for the
    # validation fold and the test set.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    # Hyperparameters are kept exactly as originally written.
    # NOTE(review): colsample_bytree=0 is outside the (0, 1] range given in the
    # XGBoost parameter docs, scale_pos_weight is documented as a class-balance
    # setting, and base_score=0.4 is far from the ~8.2 target means shown in the
    # df.head() output -- confirm these are intentional.
    model = XGBRegressor(
        learning_rate=0.1,
        verbosity=1,
        reg_lambda=1,
        base_score=0.4,
        colsample_bytree=0,
        max_delta_step=3,
        scale_pos_weight=2,
        n_estimators=1300,
        n_jobs=1,
        min_child_weight=1,
        random_state=42,
        tree_method='gpu_hist',
        gpu_id=0,
        predictor="gpu_predictor",
    )
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)
    # RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error is
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

# Mean and spread of the validation RMSE across folds.
print(np.mean(scores), np.std(scores))

0 0.7188693163925176
1 0.7182497490019608
2 0.7196814238237107
3 0.7223268757040336
4 0.7970886037217841
0.7352431937288013 0.030953951973226007


In [7]:
# Stack the per-fold test predictions as columns (one column per fold) and
# average across folds to get a single prediction per test row.
fold_matrix = np.column_stack(final_predictions)
preds = fold_matrix.mean(axis=1)

In [8]:
# Write the averaged predictions into the submission template and save it.
# Item assignment is used instead of `sample_submission.target = ...`:
# attribute assignment only updates a column that already exists and would
# otherwise set a plain Python attribute, leaving the CSV without predictions.
sample_submission["target"] = preds
sample_submission.to_csv("submission4.csv", index=False)