In [1]:
from google.colab import drive
from os.path import join

ROOT = '/content/drive'     # default for the drive
PROJ = 'My Drive/node-master'       # path to your project on Drive

GIT_USERNAME = "sgsyang" # replace with yours
#GIT_TOKEN = "XXX"           # definitely replace with yours
#GIT_REPOSITORY = "yyy"      # ...nah


drive.mount(ROOT)           # we mount the drive at /content/drive

PROJECT_PATH = join(ROOT, PROJ)
!mkdir "{PROJECT_PATH}"    # in case we haven't created it already   

#GIT_PATH = "https://{GIT_TOKEN}@github.com/{GIT_USERNAME}/{GIT_REPOSITORY}.git"
GIT_PATH = "https://github.com/sgsyang/node.git"

!mkdir ./temp
!git clone "{GIT_PATH}"
!mv ./temp/* "{PROJECT_PATH}"
!rm -rf ./temp
!rsync -aP --exclude=data/ "{PROJECT_PATH}"/*  ./

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
mkdir: cannot create directory ‘/content/drive/My Drive/node-master’: File exists
fatal: destination path 'node' already exists and is not an empty directory.
mv: cannot stat './temp/*': No such file or directory
sending incremental file list
notebooks/Regression/Year_XgtBoost.ipynb
         43,418 100%   10.16MB/s    0:00:00 (xfr#1, to-chk=0/31)


In [2]:
%load_ext autoreload
%autoreload 2
%env CUDA_VISIBLE_DEVICES=0,1
import os, sys
import time
sys.path.insert(0, '..')
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
!pip install xgboost
# !pip install category_encoders
# !pip install tensorboardX
# !pip install qhoptim



import lib
import torch, torch.nn as nn
import torch.nn.functional as F

device = 'cuda' if torch.cuda.is_available() else 'cpu'

experiment_name = 'Year_Xgboost'
experiment_name = '{}_{}.{:0>2d}.{:0>2d}_{:0>2d}:{:0>2d}'.format(experiment_name, *time.gmtime()[:5])
print("experiment:", experiment_name)



env: CUDA_VISIBLE_DEVICES=0,1
experiment: Year_Xgboost_2020.05.26_03:02


  import pandas.util.testing as tm


In [3]:
torch.cuda.get_device_name(device)

'Tesla P100-PCIE-16GB'

Downloading https://www.dropbox.com/s/l09pug0ywaqsy0e/YearPredictionMSD.txt?dl=1 > ./data/YEAR/data.csv


100%|██████████| 448576698/448576698 [01:38<00:00, 4550596.43it/s]


Downloading https://www.dropbox.com/s/00u6cnj9mthvzj1/stratified_train_idx.txt?dl=1 > ./data/YEAR/stratified_train_idx.txt


100%|██████████| 2507989/2507989 [00:01<00:00, 1936455.95it/s]


Downloading https://www.dropbox.com/s/420uhjvjab1bt7k/stratified_valid_idx.txt?dl=1 > ./data/YEAR/stratified_valid_idx.txt


100%|██████████| 626904/626904 [00:00<00:00, 728169.11it/s]


mean = 1998.39193, std = 10.92832


##Default Parameters

In [0]:
data = lib.Dataset("YEAR", random_state=133, quantile_transform=True, quantile_noise=1e-3)
in_features = data.X_train.shape[1]

mu, std = data.y_train.mean(), data.y_train.std()
normalize = lambda x: ((x - mu) / std).astype(np.float32)
data.y_train, data.y_valid, data.y_test = map(normalize, [data.y_train, data.y_valid, data.y_test])

print("mean = %.5f, std = %.5f" % (mu, std))

In [0]:
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
xg_reg = XGBRegressor(objective  = 'reg:squarederror',n_estimators=2048,tree_method='gpu_hist')

xg_reg.fit(data.X_train,data.y_train)

valid_preds = xg_reg.predict(data.X_valid)

valid_mse = mean_squared_error(data.y_valid, valid_preds)
print("MSE of Validation: %f" % (valid_mse))



MSE of Validation: 0.661815


In [0]:
# random_state=133

test_preds = xg_reg.predict(data.X_test)


test_mse = mean_squared_error(data.y_test, test_preds)
print("MSE of Validation: %f" % (valid_mse))
print("MSE of Test: %f" % (test_mse))
print(test_mse * std ** 2)

MSE of Validation: 0.661815
MSE of Test: 0.675654
80.69211157358245


##Tuned Parameters



In [5]:
!pip install hyperopt
import xgboost as xgb
import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials



In [7]:
#AUTOMATE 
import math
import numpy as np
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

space={'eta': hp.loguniform('eta',np.exp(-7) ,1),
       'max_depth': hp.quniform("max_depth", 2,10,1),
       'subsample': hp.uniform('subsample', 0.5, 1),
       'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
       'colsample_bylevel' : hp.uniform('colsample_bylevel', 0.5,1),
       'min_child_weight' : hp.loguniform('min_child_weight', np.exp(-16),np.exp(4)),
       'alpha' : hp.choice('alpha', [0,hp.loguniform('a',np.exp(-16),np.exp(2))]),
       'lambda': hp.choice('lambda',[ 0,hp.loguniform('l',np.exp(-16),np.exp(2))]),
       'gamma': hp.choice ('gamma', [0,hp.loguniform('g',np.exp(-16),np.exp(2))]) 
    }

def my_tuning(space):
    reg=xgb.XGBRegressor( eta = space['eta'],max_depth = int(space['max_depth']), subsample=space['subsample'],
                         gamma = space['gamma'],reg_alpha = space['alpha'],reg_lambda = space['lambda'],
                         min_child_weight=space['min_child_weight'],
                         colsample_bytree=space['colsample_bytree'],
                         colsample_bylevel = space['colsample_bylevel'],
                         objective  = 'reg:squarederror',tree_method='gpu_hist',n_estimators=2048)
    
    reg.fit(data.X_train, data.y_train)

    pred = reg.predict(data.X_test)
    mse= mean_squared_error(data.y_test, pred)
    print ("SCORE:", mse)
    #change the metric if you like
    return {'loss':mse, 'status': STATUS_OK }



score_results = [] 
key_results = []

random_state = [1337, 133,13, 1, 602, 295, 970, 2000, 3000, 10000]
# random_state = [1337,1]
for i in random_state:
  data = lib.Dataset("YEAR", random_state=i, quantile_transform=True, quantile_noise=1e-3)
  in_features = data.X_train.shape[1]
  mu, std = data.y_train.mean(), data.y_train.std()
  normalize = lambda x: ((x - mu) / std).astype(np.float32)
  data.y_train, data.y_valid, data.y_test = map(normalize, [data.y_train, data.y_valid, data.y_test])
  print(f'The dataset with random state {i} is used, std = {std}')

  trials = Trials()
  best = fmin(fn=my_tuning,
            space=space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials)
  # print('Best parameters',best)
  tuned_loss = trials.best_trial['result']['loss']
  print('Best MSE',tuned_loss)
  final = tuned_loss* std ** 2
  print('FINAL mse:',final)
  score_results.append(final)
  key_results.append(i)


import pprint
Result= dict(zip(key_results,score_results))

print('----')
pprint.pprint(Result, width=1)

mean = sum(score_results) / len(score_results)
print('Mean:',mean)
print('Max:',max(score_results))
print('Min:',min(score_results))
print('Distance to max',max(score_results)-mean)
print('Distance to min:',min(score_results)-mean)


The dataset with random state 1337 is used, std = 10.9283205039482
SCORE:
1.226654
SCORE:
0.68860596
SCORE:
1.226654
SCORE:
0.67671883
SCORE:
0.66785926
SCORE:
1.226654
SCORE:
1.226654
SCORE:
1.226654
SCORE:
0.6852594
SCORE:
1.226654
SCORE:
1.226654
SCORE:
1.226654
SCORE:
1.226654
SCORE:
1.226654
SCORE:
1.226654
SCORE:
1.226654
SCORE:
1.226654
SCORE:
1.226654
SCORE:
1.226654
SCORE:
1.226654
SCORE:
0.98615515
SCORE:
0.7188186
SCORE:
1.226654
SCORE:
0.6629527
SCORE:
1.226654
SCORE:
0.665372
SCORE:
0.66420734
SCORE:
1.226654
SCORE:
0.68324184
SCORE:
1.226654
SCORE:
0.6881469
SCORE:
0.6677819
SCORE:
1.226654
SCORE:
1.226654
SCORE:
1.226654
SCORE:
0.6630715
SCORE:
0.6743799
SCORE:
0.69751924
SCORE:
1.226654
SCORE:
0.67392504
SCORE:
0.6811719
SCORE:
1.226654
SCORE:
1.226654
SCORE:
0.6723836
SCORE:
1.226654
SCORE:
1.226654
SCORE:
1.226654
SCORE:
0.68349797
SCORE:
0.6754416
SCORE:
1.226654
100%|██████████| 50/50 [14:47<00:00, 17.75s/it, best loss: 0.662952721118927]
Best MSE 0.662952721118927


In [4]:
a = {1: 79.21961947219616,
 13: 78.71962492186249,
 133: 78.74350028628709,
 295: 78.71213628639002,
 602: 79.16181033847758,
 970: 78.98059532590844,
 1337: 79.17524290039445,
 2000: 79.48666193523262,
 3000: 78.8668848097133,
 10000: 78.98475251518212}



score_results = a.values()
mean = sum(score_results) / len(score_results)
print('Mean:',mean)
print('Max:',max(score_results))
print('Min:',min(score_results))
print('Distance to max',max(score_results)-mean)
print('Distance to min:',min(score_results)-mean)

Mean: 79.00508287916442
Max: 79.48666193523262
Min: 78.71213628639002
Distance to max 0.48157905606819895
Distance to min: -0.29294659277439905


In [0]:

# {1: 79.21961947219616,
#  13: 78.71962492186249,
#  133: 78.74350028628709,
#  295: 78.71213628639002,
#  602: 79.16181033847758,
#  970: 78.98059532590844,
#  1337: 79.17524290039445,
#  2000: 79.48666193523262,
#  3000: 78.8668848097133,
#  10000: 78.98475251518212}







# def Tune_Test(ran_state):
#   #read data
#   data = lib.Dataset("YEAR", random_state=ran_state, quantile_transform=True, quantile_noise=1e-3)
#   in_features = data.X_train.shape[1]
#   mu, std = data.y_train.mean(), data.y_train.std()
#   normalize = lambda x: ((x - mu) / std).astype(np.float32)
#   data.y_train, data.y_valid, data.y_test = map(normalize, [data.y_train, data.y_valid, data.y_test])
#   print(f'The dataset with random state {ran_state} is used, std = {std}')

#   trials = Trials()
#   best = fmin(fn=my_tuning,
#             space=space,
#             algo=tpe.suggest,
#             max_evals=50,
#             trials=trials)

#   print('Best parameters',best)

#   tuned_loss = trials.best_trial['result']['loss']
#   print('Best MSE',tuned_loss)
#   final = tuned_loss* std ** 2
#   print('FINAL:',final)

  # return final, ran_state

In [6]:
#Read the data
ran_state=13
data = lib.Dataset("YEAR", random_state=ran_state, quantile_transform=True, quantile_noise=1e-3)
in_features = data.X_train.shape[1]
mu, std = data.y_train.mean(), data.y_train.std()
normalize = lambda x: ((x - mu) / std).astype(np.float32)
data.y_train, data.y_valid, data.y_test = map(normalize, [data.y_train, data.y_valid, data.y_test])
print(f'The dataset with random state {ran_state} is used, std = {std}')



The dataset with random state 13 is used, std = 10.9283205039482


In [0]:
import math
import numpy as np
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

space={'eta': hp.loguniform('eta',np.exp(-7) ,1),
       'max_depth': hp.quniform("max_depth", 2,10,1),
       'subsample': hp.uniform('subsample', 0.5, 1),
       'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
       'colsample_bylevel' : hp.uniform('colsample_bylevel', 0.5,1),
       'min_child_weight' : hp.loguniform('min_child_weight', np.exp(-16),np.exp(4)),
       'alpha' : hp.choice('alpha', [0,hp.loguniform('a',np.exp(-16),np.exp(2))]),
       'lambda': hp.choice('lambda',[ 0,hp.loguniform('l',np.exp(-16),np.exp(2))]),
       'gamma': hp.choice ('gamma', [0,hp.loguniform('g',np.exp(-16),np.exp(2))]) 
    }



def hyperparameter_tuning(space):
    reg=xgb.XGBRegressor( eta = space['eta'],max_depth = int(space['max_depth']), subsample=space['subsample'],
                         gamma = space['gamma'],reg_alpha = space['alpha'],reg_lambda = space['lambda'],
                         min_child_weight=space['min_child_weight'],
                         colsample_bytree=space['colsample_bytree'],
                         colsample_bylevel = space['colsample_bylevel'],
                         objective  = 'reg:squarederror',tree_method='gpu_hist',n_estimators=2048)
    
    # evaluation = [( data.X_train, data.y_train), ( data.X_test, data.y_test)]
    
    reg.fit(data.X_train, data.y_train)

    pred = reg.predict(data.X_test)
    mse= mean_squared_error(data.y_test, pred)
    print ("SCORE:", mse)
    #change the metric if you like
    return {'loss':mse, 'status': STATUS_OK }



In [8]:

trials = Trials()
best = fmin(fn=hyperparameter_tuning,
            space=space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials)

print('Best parameters',best)

tuned_loss = trials.best_trial['result']['loss']
print('Best MSE',tuned_loss)
final = tuned_loss* std ** 2
print('FINAL:',final)

#79.27715098539818. FOR 1337
# 79.34589409638457 for 133
#79.69379531448928.  for 1
#78.84991436582891 for 13

SCORE:
0.98615515
SCORE:
1.226654
SCORE:
1.226654
SCORE:
1.226654
SCORE:
0.6726997
SCORE:
1.226654
SCORE:
1.226654
SCORE:
1.226654
SCORE:
1.226654
SCORE:
1.226654
SCORE:
1.226654
SCORE:
1.226654
SCORE:
0.675798
SCORE:
0.6763364
SCORE:
1.226654
SCORE:
1.226654
SCORE:
1.226654
SCORE:
0.66538846
SCORE:
0.69174135
SCORE:
0.67135847
SCORE:
1.226654
SCORE:
0.66677654
SCORE:
0.67638546
SCORE:
1.226654
SCORE:
0.69565237
SCORE:
1.226654
SCORE:
0.67742443
SCORE:
1.226654
SCORE:
0.6736009
SCORE:
0.67058146
SCORE:
1.226654
SCORE:
1.226654
SCORE:
0.7115657
SCORE:
1.226654
SCORE:
1.226654
SCORE:
1.226654
SCORE:
1.226654
SCORE:
1.226654
SCORE:
0.6862419
SCORE:
0.6797418
SCORE:
1.226654
SCORE:
1.226654
SCORE:
1.226654
SCORE:
0.6795321
SCORE:
0.7089624
SCORE:
1.226654
SCORE:
0.66092885
SCORE:
1.226654
SCORE:
0.6616858
SCORE:
0.66022867
100%|██████████| 50/50 [16:26<00:00, 19.73s/it, best loss: 0.6602286696434021]
Best parameters {'a': 144.52479381419585, 'alpha': 1, 'colsample_bylevel': 0.5346083139278

In [19]:

# tuned_model =XGBRegressor(reg_alpha= best['alpha'], colsample_bylevel= best['colsample_bylevel'],
#                                colsample_bytree= best['colsample_bytree'], 
#                                eta= best['eta'], gamma= best['gamma'], 
#                                reg_lambda=best['lambda'], max_depth= int(best['max_depth']), 
#                                min_child_weight= best['min_child_weight'], 
#                                subsample= best['subsample'],
#                                objective  = 'reg:squarederror',
#                                n_estimators=2048,
#                                tree_method='gpu_hist')

  
# tuned_model.fit(data.X_train,data.y_train)
# test_preds = tuned_model.predict(data.X_test)
# test_mse = mean_squared_error(data.y_test, test_preds)
# # print("MSE of Validation: %f" % (valid_mse))
# print("MSE of Test: %f" % (test_mse))
# print('FINAL:',test_mse * std ** 2)
# print('Get results!')


# def best_para(space,data):
#   trials = Trials()
#   best = fmin(fn=hyperparameter_tuning(data),
#             space=space,
#             algo=tpe.suggest,
#             max_evals=50,
#             trials=trials)
#   return best

# def Result(best,ran_state):
#   tuned_model =XGBRegressor(reg_alpha= best['alpha'], colsample_bylevel= best['colsample_bylevel'],
#                                colsample_bytree= best['colsample_bytree'], 
#                                eta= best['eta'], gamma= best['gamma'], 
#                                reg_lambda=best['lambda'], max_depth= int(best['max_depth']), 
#                                min_child_weight= best['min_child_weight'], 
#                                subsample= best['subsample'],
#                                tree_method='gpu_hist')

  
#   tuned_model.fit(data.X_train,data.y_train)
#   test_preds = tuned_model.predict(data.X_test)
#   test_mse = mean_squared_error(data.y_test, test_preds)
#   print("MSE of Validation: %f" % (valid_mse))
#   print("MSE of Test: %f" % (test_mse))
#   print(test_mse * std ** 2)
#   print('Get results!')
  
#   return test_mse * std ** 2

# def Tune_Test(ran_state):
#   data,std = GetData(ran_state)
#   best = best_para(space,data)
#   test_mse = Result(best,ran_state)
#   score = test_mse * std ** 2
#   print('Score',score)
#   print('RandomState',ran_state,'Over!!!')
#   return score,ran_state


79.27715098539818


In [0]:
# score_results = [] 
# key_results = []


# # random_state = [1337, 133,13, 1, 602, 295, 970, 2000, 3000, 10000]
# # random_state = [1337,1]
# # for i in random_state:
# #   score, random_state = Tune_Test(i)
# #   score_results.append(score)
# #   key_results.append(random_state)
# score, random_state = Tune_Test(1337)
# print('score')



TypeError: ignored

In [0]:
import pprint
Result= dict(zip(key_results,score_results))
pprint.pprint(Result, width=1)

mean = sum(score_results) / len(score_results)
print('Mean:',mean)
print('Max:',max(score_results))
print('Min:',min(score_results))
print('Distance to max',max(score_results)-mean)
print('Distance to min:',min(score_results)-mean)

In [0]:
# trials = Trials()
# best = fmin(fn=hyperparameter_tuning,
#             space=space,
#             algo=tpe.suggest,
#             max_evals=50,
#             trials=trials)

# print (best)

In [0]:
# best = {'a': 1.0759598108425803, 
#  'alpha': 1, 'colsample_bylevel': 0.770805584581162,
#  'colsample_bytree': 0.5858367806044571, 
#  'eta': 1.3485802717370956, 'g': 25.009840581184363,
#  'gamma': 1, 'lambda': 0, 'max_depth': 10.0, 
#  'min_child_weight': 160.57054418225204, 
#  'subsample': 0.8993238200707577}


# tuned_model =XGBRegressor(reg_alpha= best['alpha'], colsample_bylevel= best['colsample_bylevel'],
#                                colsample_bytree= best['colsample_bytree'], 
#                                eta= best['eta'], gamma= best['gamma'], 
#                                reg_lambda=best['lambda'], max_depth= int(best['max_depth']), 
#                                min_child_weight= best['min_child_weight'], 
#                                subsample= best['subsample'],
#                                tree_method='gpu_hist')


# tuned_model.fit(data.X_train,data.y_train)
# test_preds = tuned_model.predict(data.X_test)
# test_mse = mean_squared_error(data.y_test, test_preds)
# print("MSE of Validation: %f" % (valid_mse))
# print("MSE of Test: %f" % (test_mse))
# print(test_mse * std ** 2)
# print('RandomState',133)


In [0]:
# best1 = {'a': 1.0759598108425803, 
#  'alpha': 1, 'colsample_bylevel': 0.770805584581162,
#  'colsample_bytree': 0.5858367806044571, 
#  'eta': 1.3485802717370956, 'g': 25.009840581184363,
#  'gamma': 1, 'lambda': 0, 'max_depth': 10.0, 
#  'min_child_weight': 160.57054418225204, 
#  'subsample': 0.8993238200707577}

# print(best1['a'])