In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import optuna
import xgboost as xgb
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Load the competition tables; the first CSV column becomes the row index
# instead of a regular column.
train = pd.read_csv(
    "../input/30daysofml/train.csv",
    index_col=0,
)
test = pd.read_csv(
    "../input/30daysofml/test.csv",
    index_col=0,
)

In [None]:
# Replace every object-dtype column with ordinal codes. The encoder is fitted
# on train only and the learned mapping is then applied to both frames.
object_cols = [name for name, dtype in train.dtypes.items() if dtype == "object"]
oe = OrdinalEncoder()
oe.fit(train[object_cols])
train[object_cols] = oe.transform(train[object_cols])
test[object_cols] = oe.transform(test[object_cols])


In [None]:
# Feature column names: everything in train except the id and the target.
columns = [name for name in train.columns if name not in {'id', 'target'}]

In [None]:
# Split the training frame into the feature matrix and the regression target.
X = train.drop(columns=['target'])
y = train['target']

In [None]:
def objective(trial,data=X,target=y):
    """Optuna objective: hold-out RMSE of an XGBoost regressor for one trial.

    Splits ``data``/``target`` 80/20 with a fixed seed, fits an
    ``XGBRegressor`` on the larger part with hyperparameters sampled from
    ``trial``, and scores it on the smaller part. The same hold-out part also
    drives early stopping, so the returned score is measured on data the
    stopping rule has seen.

    Args:
        trial: Optuna trial used to sample ``lambda``, ``learning_rate`` and
            ``max_depth``.
        data: Feature frame. Defaults to the module-level ``X`` as bound when
            this cell was executed.
        target: Target values aligned with ``data``. Defaults to the
            module-level ``y`` as bound when this cell was executed.

    Returns:
        Root-mean-squared error on the 20% hold-out split (lower is better).
    """
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, test_size=0.2, random_state=42
    )

    # `suggest_float` is the non-deprecated spelling of `suggest_uniform`;
    # it samples uniformly from the same closed range.
    params = {
        'tree_method': 'gpu_hist',
        'lambda': trial.suggest_float('lambda', 5, 7),
        'learning_rate': trial.suggest_float('learning_rate', 1e-2, 1e-1),
        'n_estimators': 2000,  # upper bound only; early stopping picks the real count
        'random_state': 42,
        'max_depth': trial.suggest_int('max_depth', 2, 15),
        # Currently disabled search dimensions; uncomment to tune them as well.
        #'alpha': trial.suggest_float('alpha', 13, 17),
        #'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.55),
        #'subsample': trial.suggest_float('subsample', 0.75, 0.84),
        #'gamma': trial.suggest_float('gamma', 0, 0.4),
        #'min_child_weight': trial.suggest_int('min_child_weight', 2, 15),
    }

    model = xgb.XGBRegressor(**params)
    model.fit(
        train_x,
        train_y,
        eval_set=[(test_x, test_y)],
        early_stopping_rounds=10,
        verbose=False,
    )

    preds = model.predict(test_x)
    rmse = mean_squared_error(test_y, preds, squared=False)

    return rmse


In [None]:
# Run the hyperparameter search, minimising the RMSE returned by `objective`.
# The sampler is seeded so the sequence of suggested hyperparameters is
# repeatable, in line with the fixed random_state=42 used for the splits and
# models elsewhere in this notebook. TPE is Optuna's default sampler, so only
# the seed changes.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=300)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
# Objective value of every trial, in the order the trials were run.
optuna.visualization.plot_optimization_history(study=study)

In [None]:
# One panel per searched hyperparameter: sampled value against objective value.
optuna.visualization.plot_slice(study=study)

In [None]:
# Contour of the objective over pairs of searched hyperparameters.
# Only names that `objective` actually samples may be listed here: asking for
# a parameter the study never saw makes Optuna raise
# "ValueError: Parameter ... does not exist in your study."
optuna.visualization.plot_contour(study, params=[
                            #'max_depth',
                            'lambda',
                            'learning_rate'])

In [None]:
# Relative importance of each searched hyperparameter for the objective.
optuna.visualization.plot_param_importances(study=study)

In [None]:
# Hard-coded XGBoost parameters used for the final models below.
# These values are typed in by hand; they are not read from `study`.
Best_trial = {
    'tree_method': 'gpu_hist',
    'max_depth': 5,
    'lambda': 5.911138269296357,
    'alpha': 14.464864964905939,
    'colsample_bytree': 0.511076352867773,
    'subsample': 0.8222132607803955,
    'learning_rate': 0.05860971842786365,
    'gamma': 0.23279873834112796,
    'n_estimators': 2000,
    'random_state': 42,
}

In [None]:
# 5-fold cross-validation with the fixed parameters: each fold's model is
# early-stopped on its validation fold, scored on that fold, and adds an
# equal 1/n_splits share to the averaged test prediction.
preds = np.zeros(test.shape[0])
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse = []
n = 0

for fit_rows, val_rows in kf.split(train[columns], train['target']):
    X_fit = train[columns].iloc[fit_rows]
    X_hold = train[columns].iloc[val_rows]
    y_fit = train['target'].iloc[fit_rows]
    y_hold = train['target'].iloc[val_rows]

    model = xgb.XGBRegressor(**Best_trial)
    model.fit(
        X_fit,
        y_fit,
        eval_set=[(X_hold, y_hold)],
        early_stopping_rounds=10,
        verbose=False,
    )

    preds += model.predict(test[columns]) / kf.n_splits

    fold_rmse = mean_squared_error(y_hold, model.predict(X_hold), squared=False)
    rmse.append(fold_rmse)

    # Report the 1-based fold number together with that fold's RMSE.
    n += 1
    print(n, rmse[-1])

In [None]:
# Average validation RMSE across the folds.
np.asarray(rmse).mean()

In [None]:
# Single-model check on a 70/15/15 split: fit on 70%, early-stop on the first
# 15% (te1) and report RMSE on the untouched second 15% (te2).
train_x, test_x, train_y, test_y = train_test_split(
    X, y, test_size=0.3, random_state=42
)
te1_x, te2_x, te1_y, te2_y = train_test_split(
    test_x, test_y, test_size=0.5, random_state=42
)

model = xgb.XGBRegressor(**Best_trial)
model.fit(
    train_x,
    train_y,
    eval_set=[(te1_x, te1_y)],
    early_stopping_rounds=100,
    verbose=True,
)

holdout_pred = model.predict(te2_x)
print(mean_squared_error(te2_y, holdout_pred, squared=False))

# NOTE(review): this rebinds `preds`, so the fold-averaged predictions built in
# the K-fold cell are discarded and the submission uses this single model.
preds = np.zeros(test.shape[0]) + model.predict(test[columns])

In [None]:
# Assemble the submission table (row id from the test index plus the current
# `preds`) and write it without the DataFrame's own index.
submission_columns = {
    'Id': test.index,
    'target': preds,
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)