In [1]:
import xgboost as xgb
import numpy as np

import matplotlib.pyplot as plt

%matplotlib qt

In [2]:
version = "v3a"

CONJUNCTIONS_REFS = np.load(f"./../processed_data_chorus_neural_network/STAGE_4/{version}/MODEL_READY_DATA_{version}.npz")

TRAINING_FEATURES = CONJUNCTIONS_REFS["FEATURES"]
TRAINING_LABELS = CONJUNCTIONS_REFS["LABELS"]

VALIDATION_FEATURES = CONJUNCTIONS_REFS["VALIDATION_FEATURES"]
VALIDATION_LABELS = CONJUNCTIONS_REFS["VALIDATION_LABELS"]

BINS = CONJUNCTIONS_REFS["BINS"]

CONJUNCTIONS_REFS.close()

print(f"Training set shape: {TRAINING_FEATURES.shape, TRAINING_LABELS.shape}")
print(f"Validation set shape: {VALIDATION_FEATURES.shape, VALIDATION_LABELS.shape}")
print(f"Bins used: {BINS}")
print(f"Number of bins: {len(BINS)}")


print(VALIDATION_LABELS)

Training set shape: ((21012169, 15), (21012169, 1))
Validation set shape: ((560971, 15), (560971, 1))
Bins used: [1.00000000e-01 1.58489319e-01 2.51188643e-01 3.98107171e-01
 6.30957344e-01 1.00000000e+00 1.29154967e+00 1.66810054e+00
 2.15443469e+00 2.78255940e+00 3.59381366e+00 4.64158883e+00
 5.99484250e+00 7.74263683e+00 1.00000000e+01 1.58489319e+01
 2.51188643e+01 3.98107171e+01 6.30957344e+01 1.00000000e+02
 3.16227766e+02 1.00000000e+03]
Number of bins: 22
[[6]
 [6]
 [6]
 ...
 [7]
 [7]
 [7]]


In [3]:
training_set = xgb.DMatrix(TRAINING_FEATURES[0:10000000], TRAINING_LABELS[0:10000000])
validation_set = xgb.DMatrix(VALIDATION_FEATURES[0:10000000], VALIDATION_LABELS[0:10000000])

In [4]:
param = {}
# use softmax multi-class classification
param['objective'] = 'multi:softmax'
# scale weight of positive examples
param['eta'] = 0.1
param['max_depth'] = 6
param['nthread'] = 8
param['num_class'] = len(BINS) + 1
param['device'] = "cuda"
param["lambda"] = 100
param["alpha"] = 0
param["gamma"] = 0.1

watchlist = [(training_set, 'train'), (validation_set, 'validation')]

num_round = 100

model = xgb.train(params=param, 
                  dtrain=training_set, 
                  num_boost_round=num_round, 
                  evals=watchlist,
                  xgb_model=f"./../processed_data_chorus_neural_network/TRAINED_MODELS/XGBOOST_CLASSIFIER_v3a.model")

model.save_model(f"./../processed_data_chorus_neural_network/TRAINED_MODELS/XGBOOST_CLASSIFIER_v3a.model")

[0]	train-mlogloss:3.02754	validation-mlogloss:3.06218
[1]	train-mlogloss:2.99795	validation-mlogloss:3.04140
[2]	train-mlogloss:2.91504	validation-mlogloss:2.98271
[3]	train-mlogloss:2.90429	validation-mlogloss:2.97489
[4]	train-mlogloss:2.83466	validation-mlogloss:2.92458
[5]	train-mlogloss:2.82822	validation-mlogloss:2.91989
[6]	train-mlogloss:2.76803	validation-mlogloss:2.87578
[7]	train-mlogloss:2.76442	validation-mlogloss:2.87315
[8]	train-mlogloss:2.76176	validation-mlogloss:2.87117
[9]	train-mlogloss:2.75920	validation-mlogloss:2.86933
[10]	train-mlogloss:2.75569	validation-mlogloss:2.86674
[11]	train-mlogloss:2.70325	validation-mlogloss:2.82781
[12]	train-mlogloss:2.70137	validation-mlogloss:2.82640
[13]	train-mlogloss:2.69968	validation-mlogloss:2.82512
[14]	train-mlogloss:2.69760	validation-mlogloss:2.82363


KeyboardInterrupt: 

In [None]:
model = xgb.Booster({'nthread': 4})  # init model
model.load_model(f"./../processed_data_chorus_neural_network/TRAINED_MODELS/XG_BOOSTED_REGRESSION_ON_CLASSIFIED_INPUT.model")  # load model data

pred = (model.predict(validation_set)).astype(int)

num_correct = np.sum(pred.flatten() == VALIDATION_LABELS.flatten())
print(num_correct / len(VALIDATION_LABELS))

print(np.nanmean(pred.flatten() - VALIDATION_LABELS.flatten()))

plt.scatter(model.predict(validation_set), VALIDATION_LABELS[0:10000000, :])

0.21714491479951725
-0.807359025689385


<matplotlib.collections.PathCollection at 0x1f6f2478ad0>

In [26]:
print(model.get_score(importance_type='gain'))
print(model.get_score(importance_type='weight'))

chorus_real = np.zeros_like(VALIDATION_LABELS)
indices_within_bounds = (VALIDATION_LABELS.flatten() != 0) & (VALIDATION_LABELS.flatten() != 22)
chorus_real[indices_within_bounds] = BINS[VALIDATION_LABELS[indices_within_bounds]]
chorus_real[VALIDATION_LABELS.flatten() == 0] = 0.1
chorus_real[VALIDATION_LABELS.flatten() == 22] = 1000


chorus_pred = np.zeros_like(pred)
indices_within_bounds = (pred.flatten() != 0) & (pred.flatten() != 22)
chorus_pred[indices_within_bounds] = BINS[pred[indices_within_bounds]]
chorus_pred[pred.flatten() == 0] = 0.1
chorus_pred[pred.flatten() == 22] = 1000


{'f0': 2319.209228515625, 'f1': 249.33517456054688, 'f2': 579.41357421875, 'f3': 475.2789001464844, 'f4': 312.008544921875, 'f5': 193.44424438476562, 'f6': 232.8834991455078, 'f7': 141.73191833496094, 'f8': 219.95263671875, 'f9': 493.7401428222656, 'f10': 204.27955627441406, 'f11': 163.28846740722656, 'f12': 161.9849853515625, 'f13': 149.22296142578125, 'f14': 163.7195281982422}
{'f0': 12876.0, 'f1': 18441.0, 'f2': 2220.0, 'f3': 1008.0, 'f4': 994.0, 'f5': 742.0, 'f6': 1033.0, 'f7': 883.0, 'f8': 972.0, 'f9': 2778.0, 'f10': 13262.0, 'f11': 10936.0, 'f12': 12550.0, 'f13': 10837.0, 'f14': 10783.0}
[7 7 7 ... 6 6 6]


In [6]:
plt.scatter(chorus_real, chorus_pred)
plt.xscale("log")
plt.yscale("log")

NameError: name 'chorus_real' is not defined

In [32]:
model.get_score(importance_type='gain')


{'f0': 2319.209228515625,
 'f1': 249.33517456054688,
 'f2': 579.41357421875,
 'f3': 475.2789001464844,
 'f4': 312.008544921875,
 'f5': 193.44424438476562,
 'f6': 232.8834991455078,
 'f7': 141.73191833496094,
 'f8': 219.95263671875,
 'f9': 493.7401428222656,
 'f10': 204.27955627441406,
 'f11': 163.28846740722656,
 'f12': 161.9849853515625,
 'f13': 149.22296142578125,
 'f14': 163.7195281982422}