In [1]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neural_network import MLPRegressor

In [22]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, num_rounds=1000):
    """Train an XGBoost linear-booster regressor and predict on ``test_X``.

    Parameters
    ----------
    train_X, train_y
        Training features and targets; wrapped in an ``xgb.DMatrix``.
    test_X
        Features to predict on.
    test_y : optional
        Targets for ``test_X``. When given, ``test_X`` doubles as a validation
        set: train/test metrics are reported each round and training stops
        once the validation metric has not improved for 20 rounds.
    feature_names : optional
        Column names forwarded to every ``xgb.DMatrix``. ``None`` (default)
        leaves feature naming to XGBoost.
    num_rounds : int
        Maximum number of boosting rounds.

    Returns
    -------
    tuple
        ``(pred_test_y, model)``: predictions for ``test_X`` and the trained
        booster.
    """
    # NOTE(review): 'reg:linear' and 'silent' look like legacy spellings; newer
    # XGBoost releases may expect 'reg:squarederror' / 'verbosity' - confirm
    # against the installed version before changing them.
    params = {
        'booster': 'gblinear',
        'objective': 'reg:linear',
        'silent': 1,
        # Tuning knobs that were tried and left disabled:
        # 'eta': 0.1,
        # 'max_depth': 10,
        # 'min_child_weight': 1,
        # 'subsample': 0.9,
    }
    plst = list(params.items())

    xgtrain = xgb.DMatrix(train_X, label=train_y, feature_names=feature_names)

    if test_y is not None:
        # Labelled test data: monitor both sets and stop early on the last
        # watchlist entry ('test').
        xgtest = xgb.DMatrix(test_X, label=test_y, feature_names=feature_names)
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        # No labels available: train for the full num_rounds.
        xgtest = xgb.DMatrix(test_X, feature_names=feature_names)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

In [3]:
# Directory holding the input CSVs; "" resolves to the current working directory.
data_path = ""
# os.path.join adds the separator when needed, so data_path works with or
# without a trailing slash (plain string concatenation did not).
train_file = os.path.join(data_path, "save_train.csv")
test_file = os.path.join(data_path, "save_test.csv")
train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)
print(train_df.shape)
print(test_df.shape)

(25000, 386)
(25000, 385)


In [4]:
# Regression target.
train_y = train_df["reference"]
# Drop the target by name instead of slicing off the last column, so the
# features stay correct even if "reference" is not the final column.
train_X = train_df.drop("reference", axis=1)
# NOTE(review): test_df carries an "id" column (used when writing the
# submission); if train_df has one too it is currently fed to the model as a
# feature - confirm that is intended.
test_X = test_df

In [23]:
# Quick sanity check: 5-fold split, but only the first fold is trained and
# evaluated (the loop exits after one iteration).
kf = model_selection.KFold(n_splits=5)
for dev_index, val_index in kf.split(range(train_X.shape[0])):
    dev_X, val_X = train_X.iloc[dev_index, :], train_X.iloc[val_index, :]
    # KFold yields positional indices, so the target must be indexed by
    # position as well; label-based train_y[...] is only right when the
    # Series happens to have a default 0..n-1 index.
    dev_y, val_y = train_y.iloc[dev_index], train_y.iloc[val_index]
    preds, model = runXGB(dev_X, dev_y, val_X, val_y, num_rounds=3000)
    break

[0]	train-rmse:13.1592	test-rmse:13.2107
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 20 rounds.
[1]	train-rmse:11.3183	test-rmse:11.3391
[2]	train-rmse:10.1385	test-rmse:10.1168
[3]	train-rmse:9.8963	test-rmse:9.89742
[4]	train-rmse:9.48241	test-rmse:9.54467
[5]	train-rmse:9.17702	test-rmse:9.26847
[6]	train-rmse:8.98461	test-rmse:9.09265
[7]	train-rmse:8.79298	test-rmse:8.91174
[8]	train-rmse:8.68097	test-rmse:8.81257
[9]	train-rmse:8.61839	test-rmse:8.75486
[10]	train-rmse:8.55185	test-rmse:8.68713
[11]	train-rmse:8.51108	test-rmse:8.65615
[12]	train-rmse:8.48358	test-rmse:8.64092
[13]	train-rmse:8.45424	test-rmse:8.59561
[14]	train-rmse:8.41976	test-rmse:8.55093
[15]	train-rmse:8.39339	test-rmse:8.5184
[16]	train-rmse:8.3651	test-rmse:8.4847
[17]	train-rmse:8.3422	test-rmse:8.45385
[18]	train-rmse:8.32317	test-rmse:8.43157
[19]	train-rmse:8.31191	test-rmse:8.41293
[20]	train-rmse:8.29945	test-rms

In [13]:
# Refit on the full training set and predict the unlabeled test set. No test_y
# is passed, so runXGB trains for all num_rounds without early stopping.
preds, model = runXGB(
    train_X=train_X,
    train_y=train_y,
    test_X=test_X,
    num_rounds=3000,
)

In [15]:
# Assemble the submission: "id" first, then the predicted "reference" value.
out_df = pd.DataFrame({"reference": preds})
out_df.insert(loc=0, column="id", value=test_df["id"])
out_df.to_csv("xgboost_pro1.csv", index=False)