## XGBoost is evaluated only on the Word2Vec features; the Tf-Idf matrix causes memory issues with XGBoost

In [1]:
from sklearn.model_selection import cross_val_score
import pandas
import pickle
import numpy

from sklearn.model_selection import cross_validate

from time import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn import metrics

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix


In [2]:
# Booster configuration shared by every xgb.train call in this notebook.
# NOTE(review): XGBoost 2.0+ deprecates 'gpu_hist'/'gpu_id' in favour of
# tree_method='hist' with device='cuda' -- confirm against the installed version.
params = dict(
    max_depth=6,                # maximum depth of each tree
    objective='multi:softmax',  # multiclass objective; predictions are class ids
    num_class=14,               # number of classes (labels are shifted to 0..13)
    tree_method='gpu_hist',     # histogram-based tree construction on the GPU
    gpu_id=0,                   # ordinal of the GPU device to use
)

### full dataset

In [3]:
# Load the lemmatized, pre-vectorized train/test splits. Each pickle unpacks to
# a 3-tuple; only the first item (labels) and the third (features) are kept.
# The context managers close the file handles, which bare open() calls leak.
# NOTE(review): pickle.load can execute arbitrary code -- only load pickles
# produced by this project's own preprocessing.
with open("dbpedia_csv/" + "/train_lem_preprocessed_vectorized.pickle", "rb") as train_file:
    y_train, _, x_train = pickle.load(train_file)
with open("dbpedia_csv/" + "/test_lem_preprocessed_vectorized.pickle", "rb") as test_file:
    y_test, _, x_test = pickle.load(test_file)

In [4]:
## XGBoost's multiclass objective needs class ids in 0..13, so both label
## lists are moved down by one.
## NOTE: this cell rebinds its own inputs -- running it twice without
## reloading the data shifts the labels a second time.

y_train = [label - 1 for label in y_train]
y_test = [label - 1 for label in y_test]

In [5]:
# Wrap both splits in XGBoost's DMatrix container. Only the training matrix
# carries labels; the test matrix is used for prediction alone.
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test)

In [6]:
### Drop the notebook's references to the raw feature matrices (training and
### prediction go through dtrain/dtest from here on) and run a collection pass.

import gc
import sys

del x_train, x_test
gc.collect()

7

In [7]:
# Fit the booster and time the call. No num_boost_round is passed, so
# xgb.train uses its library default (10 per the XGBoost docs -- confirm for
# the installed version).
fit_started = time()
bst = xgb.train(params, dtrain)
training_time = time() - fit_started
print("train time: %0.3fs" % training_time)

# Predict on the test matrix and time it separately.
predict_started = time()
pred = bst.predict(dtest)
test_time = time() - predict_started
print("test time:  %0.3fs" % test_time)



train time: 81.498s
test time:  0.163s


In [8]:
# Overall accuracy on the test split.
score1 = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score1)

# Per-class precision/recall/F1; the classes are reported as "0".."13".
class_names = list(map(str, range(14)))
print(metrics.classification_report(y_test, pred, target_names=class_names))

print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred))

accuracy:   0.930
              precision    recall  f1-score   support

           0       0.87      0.84      0.86      5000
           1       0.94      0.95      0.95      5000
           2       0.83      0.80      0.81      5000
           3       0.97      0.97      0.97      5000
           4       0.93      0.93      0.93      5000
           5       0.93      0.95      0.94      5000
           6       0.92      0.92      0.92      5000
           7       0.96      0.97      0.97      5000
           8       0.99      0.98      0.99      5000
           9       0.97      0.97      0.97      5000
          10       0.97      0.97      0.97      5000
          11       0.93      0.96      0.94      5000
          12       0.94      0.95      0.94      5000
          13       0.85      0.86      0.86      5000

    accuracy                           0.93     70000
   macro avg       0.93      0.93      0.93     70000
weighted avg       0.93      0.93      0.93     70000

confusi

### half of the dataset

In [9]:
# Reload the lemmatized, pre-vectorized splits. Each pickle unpacks to a
# 3-tuple; only the first item (labels) and the third (features) are kept.
# The context managers close the file handles, which bare open() calls leak.
# NOTE(review): pickle.load can execute arbitrary code -- only load pickles
# produced by this project's own preprocessing.
with open("dbpedia_csv/" + "/train_lem_preprocessed_vectorized.pickle", "rb") as train_file:
    y_train, _, x_train = pickle.load(train_file)
with open("dbpedia_csv/" + "/test_lem_preprocessed_vectorized.pickle", "rb") as test_file:
    y_test, _, x_test = pickle.load(test_file)

# Keep every 2nd training example (half of the training set); labels and
# features use the same stride so they stay aligned. The test set is untouched.
y_train = y_train[::2]
x_train = x_train[::2]

In [10]:
## XGBoost's multiclass objective needs class ids in 0..13, so both label
## lists are moved down by one.
## NOTE: this cell rebinds its own inputs -- running it twice without
## reloading the data shifts the labels a second time.

y_train = [label - 1 for label in y_train]
y_test = [label - 1 for label in y_test]

In [11]:
# Wrap both splits in XGBoost's DMatrix container. Only the training matrix
# carries labels; the test matrix is used for prediction alone.
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test)

In [12]:
### Drop the notebook's references to the raw feature matrices (training and
### prediction go through dtrain/dtest from here on) and run a collection pass.

import gc
import sys

del x_train, x_test
gc.collect()

7

In [13]:
# Fit the booster on the half-size training matrix and time the call.
# xgb.train is called without num_boost_round, so the library default applies.
fit_started = time()
bst = xgb.train(params, dtrain)
training_time = time() - fit_started
print("train time: %0.3fs" % training_time)

# Predict on the full test matrix and time it separately.
predict_started = time()
pred = bst.predict(dtest)
test_time = time() - predict_started
print("test time:  %0.3fs" % test_time)


train time: 45.361s
test time:  0.188s


In [14]:
# Overall accuracy on the test split.
score1 = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score1)

# Per-class precision/recall/F1; the classes are reported as "0".."13".
class_names = list(map(str, range(14)))
print(metrics.classification_report(y_test, pred, target_names=class_names))

print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred))

accuracy:   0.930
              precision    recall  f1-score   support

           0       0.87      0.84      0.86      5000
           1       0.94      0.95      0.94      5000
           2       0.83      0.79      0.81      5000
           3       0.97      0.97      0.97      5000
           4       0.93      0.93      0.93      5000
           5       0.94      0.95      0.94      5000
           6       0.93      0.92      0.93      5000
           7       0.96      0.98      0.97      5000
           8       0.99      0.98      0.99      5000
           9       0.97      0.97      0.97      5000
          10       0.97      0.97      0.97      5000
          11       0.93      0.96      0.95      5000
          12       0.93      0.94      0.94      5000
          13       0.84      0.86      0.85      5000

    accuracy                           0.93     70000
   macro avg       0.93      0.93      0.93     70000
weighted avg       0.93      0.93      0.93     70000

confusi

### one fourth of the dataset


In [15]:
# Reload the lemmatized, pre-vectorized splits. Each pickle unpacks to a
# 3-tuple; only the first item (labels) and the third (features) are kept.
# The context managers close the file handles, which bare open() calls leak.
# NOTE(review): pickle.load can execute arbitrary code -- only load pickles
# produced by this project's own preprocessing.
with open("dbpedia_csv/" + "/train_lem_preprocessed_vectorized.pickle", "rb") as train_file:
    y_train, _, x_train = pickle.load(train_file)
with open("dbpedia_csv/" + "/test_lem_preprocessed_vectorized.pickle", "rb") as test_file:
    y_test, _, x_test = pickle.load(test_file)

# Keep every 4th training example (a quarter of the training set); labels and
# features use the same stride so they stay aligned. The test set is untouched.
y_train = y_train[::4]
x_train = x_train[::4]


## LABELS MUST BE FROM 0 TO 13 BECAUSE OF XGBOOST IMPLEMENTATION
## (stored labels are shifted down by one to get there)

y_test = [label - 1 for label in y_test]
y_train = [label - 1 for label in y_train]

In [16]:
# Wrap both splits in XGBoost's DMatrix container. Only the training matrix
# carries labels; the test matrix is used for prediction alone.
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test)

In [17]:
### Drop the notebook's references to the raw feature matrices (training and
### prediction go through dtrain/dtest from here on) and run a collection pass.

import gc
import sys

del x_train, x_test
gc.collect()

37

In [18]:
# Fit the booster on the quarter-size training matrix and time the call.
# xgb.train is called without num_boost_round, so the library default applies.
fit_started = time()
bst = xgb.train(params, dtrain)
training_time = time() - fit_started
print("train time: %0.3fs" % training_time)

# Predict on the full test matrix and time it separately.
predict_started = time()
pred = bst.predict(dtest)
test_time = time() - predict_started
print("test time:  %0.3fs" % test_time)


train time: 26.628s
test time:  0.163s


In [19]:
# Overall accuracy on the test split.
score1 = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score1)

# Per-class precision/recall/F1; the classes are reported as "0".."13".
class_names = list(map(str, range(14)))
print(metrics.classification_report(y_test, pred, target_names=class_names))

print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred))

accuracy:   0.928
              precision    recall  f1-score   support

           0       0.86      0.84      0.85      5000
           1       0.94      0.94      0.94      5000
           2       0.82      0.79      0.81      5000
           3       0.97      0.97      0.97      5000
           4       0.92      0.93      0.93      5000
           5       0.93      0.95      0.94      5000
           6       0.93      0.92      0.92      5000
           7       0.96      0.97      0.97      5000
           8       0.99      0.98      0.98      5000
           9       0.97      0.97      0.97      5000
          10       0.97      0.97      0.97      5000
          11       0.93      0.96      0.94      5000
          12       0.93      0.94      0.94      5000
          13       0.85      0.86      0.85      5000

    accuracy                           0.93     70000
   macro avg       0.93      0.93      0.93     70000
weighted avg       0.93      0.93      0.93     70000

confusi

### tf-idf

### one sixth of the dataset

In [3]:
# Load the preprocessed train/test splits; the third item of each pickle is
# fed to TfidfVectorizer further down in this cell.
# The context managers close the file handles, which bare open() calls leak.
# NOTE(review): pickle.load can execute arbitrary code -- only load pickles
# produced by this project's own preprocessing.
with open("dbpedia_csv/" + "/train_preprocessed.pickle", "rb") as train_file:
    y_train, _, x_train = pickle.load(train_file)
with open("dbpedia_csv/" + "/test_preprocessed.pickle", "rb") as test_file:
    y_test, _, x_test = pickle.load(test_file)


# Keep every 6th training example (one sixth of the training set); labels and
# inputs use the same stride so they stay aligned. The test set is untouched.
y_train = y_train[::6]
x_train = x_train[::6]

# XGBoost needs class ids in 0..13, so shift the labels down by one.
y_train = [label - 1 for label in y_train]
y_test = [label - 1 for label in y_test]


## TFIDF VECTORIZATION

# Build a default-configured TfidfVectorizer and fit it on the training
# inputs; the timer covers both construction and fit_transform.
tic = time()
tf_vectorizer = TfidfVectorizer()
x_train = tf_vectorizer.fit_transform(x_train)
duration = time() - tic
print("Time taken to extract features from training data : %f seconds" % duration)
print("n_samples: %d, n_features: %d" % x_train.shape)

# Transform the test inputs with the vectorizer fitted on the training data
# (transform only -- nothing is re-fitted on the test set).
tic = time()
x_test = tf_vectorizer.transform(x_test)
duration = time() - tic
print("Time taken to extract features from test data : %f seconds" % duration)
print("n_samples: %d, n_features: %d" % x_test.shape)



# Wrap the TF-IDF matrices in XGBoost's DMatrix container; only the training
# matrix carries labels.
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test)


# Drop the references to the TF-IDF matrices and the fitted vectorizer, then
# run a collection pass before training.
import gc
import sys

del x_train, x_test, tf_vectorizer
gc.collect()


## TRAIN AND FIT CLASSIFIER

# NOTE: in the recorded run this training call raised XGBoostError (CUDA out
# of memory inside gpu_hist), so the evaluation below produced no output.
fit_started = time()
bst = xgb.train(params, dtrain)
training_time = time() - fit_started
print("train time: %0.3fs" % training_time)

# Predict on the test matrix and time it separately.
predict_started = time()
y_pred = bst.predict(dtest)
test_time = time() - predict_started
print("test time:  %0.3fs" % test_time)

# Performance measures: overall accuracy, per-class report, confusion matrix.
score1 = metrics.accuracy_score(y_test, y_pred)
print("accuracy:   %0.3f" % score1)

class_names = list(map(str, range(14)))
print(metrics.classification_report(y_test, y_pred, target_names=class_names))

print("confusion matrix:")
print(metrics.confusion_matrix(y_test, y_pred))

print('------------------------------')

Time taken to extract features from training data : 2.241900 seconds
n_samples: 93334, n_features: 215944
Time taken to extract features from test data : 1.360001 seconds
n_samples: 70000, n_features: 215944


XGBoostError: [01:23:04] C:/buildkite-agent/builds/buildkite-windows-cpu-autoscaling-group-i-030221e36e1a46bfb-1/xgboost/xgboost-ci-windows/src/tree/updater_gpu_hist.cu:784: Exception in gpu_hist: [01:23:04] c:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-030221e36e1a46bfb-1\xgboost\xgboost-ci-windows\src\data\../common/device_helpers.cuh:431: Memory allocation error on worker 0: bad allocation: cudaErrorMemoryAllocation: out of memory
- Free memory: 1103390311
- Requested memory: 2426767616



### half of the dataset

In [None]:
# Load the preprocessed train/test splits; the third item of each pickle is
# fed to TfidfVectorizer further down in this cell.
# The context managers close the file handles, which bare open() calls leak.
# NOTE(review): pickle.load can execute arbitrary code -- only load pickles
# produced by this project's own preprocessing.
with open("dbpedia_csv/" + "/train_preprocessed.pickle", "rb") as train_file:
    y_train, _, x_train = pickle.load(train_file)
with open("dbpedia_csv/" + "/test_preprocessed.pickle", "rb") as test_file:
    y_test, _, x_test = pickle.load(test_file)


# Keep every 2nd training example (half of the training set); labels and
# inputs use the same stride so they stay aligned. The test set is untouched.
y_train = y_train[::2]
x_train = x_train[::2]

# XGBoost needs class ids in 0..13, so shift the labels down by one.
y_train = [label - 1 for label in y_train]
y_test = [label - 1 for label in y_test]


## TFIDF VECTORIZATION

# Build a default-configured TfidfVectorizer and fit it on the training
# inputs; the timer covers both construction and fit_transform.
tic = time()
tf_vectorizer = TfidfVectorizer()
x_train = tf_vectorizer.fit_transform(x_train)
duration = time() - tic
print("Time taken to extract features from training data : %f seconds" % duration)
print("n_samples: %d, n_features: %d" % x_train.shape)

# Transform the test inputs with the vectorizer fitted on the training data
# (transform only -- nothing is re-fitted on the test set).
tic = time()
x_test = tf_vectorizer.transform(x_test)
duration = time() - tic
print("Time taken to extract features from test data : %f seconds" % duration)
print("n_samples: %d, n_features: %d" % x_test.shape)



# Wrap the TF-IDF matrices in XGBoost's DMatrix container; only the training
# matrix carries labels.
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test)


# Drop the references to the TF-IDF matrices and the fitted vectorizer, then
# run a collection pass before training.
import gc
import sys

del x_train, x_test, tf_vectorizer
gc.collect()


## TRAIN AND FIT CLASSIFIER

# Fit the booster and time the call; xgb.train is called without
# num_boost_round, so the library default applies.
fit_started = time()
bst = xgb.train(params, dtrain)
training_time = time() - fit_started
print("train time: %0.3fs" % training_time)

# Predict on the test matrix and time it separately.
predict_started = time()
y_pred = bst.predict(dtest)
test_time = time() - predict_started
print("test time:  %0.3fs" % test_time)

# Performance measures: overall accuracy, per-class report, confusion matrix.
score1 = metrics.accuracy_score(y_test, y_pred)
print("accuracy:   %0.3f" % score1)

class_names = list(map(str, range(14)))
print(metrics.classification_report(y_test, y_pred, target_names=class_names))

print("confusion matrix:")
print(metrics.confusion_matrix(y_test, y_pred))

print('------------------------------')

### full dataset

In [None]:
# Load the preprocessed train/test splits; the third item of each pickle is
# fed to TfidfVectorizer further down in this cell.
# The context managers close the file handles, which bare open() calls leak.
# NOTE(review): pickle.load can execute arbitrary code -- only load pickles
# produced by this project's own preprocessing.
with open("dbpedia_csv/" + "/train_preprocessed.pickle", "rb") as train_file:
    y_train, _, x_train = pickle.load(train_file)
with open("dbpedia_csv/" + "/test_preprocessed.pickle", "rb") as test_file:
    y_test, _, x_test = pickle.load(test_file)

# XGBoost needs class ids in 0..13, so shift the labels down by one.
y_train = [label - 1 for label in y_train]
y_test = [label - 1 for label in y_test]


## TFIDF VECTORIZATION

# Build a default-configured TfidfVectorizer and fit it on the training
# inputs; the timer covers both construction and fit_transform.
tic = time()
tf_vectorizer = TfidfVectorizer()
x_train = tf_vectorizer.fit_transform(x_train)
duration = time() - tic
print("Time taken to extract features from training data : %f seconds" % duration)
print("n_samples: %d, n_features: %d" % x_train.shape)

# Transform the test inputs with the vectorizer fitted on the training data
# (transform only -- nothing is re-fitted on the test set).
tic = time()
x_test = tf_vectorizer.transform(x_test)
duration = time() - tic
print("Time taken to extract features from test data : %f seconds" % duration)
print("n_samples: %d, n_features: %d" % x_test.shape)



# Wrap the TF-IDF matrices in XGBoost's DMatrix container; only the training
# matrix carries labels.
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test)


# Drop the references to the TF-IDF matrices and the fitted vectorizer, then
# run a collection pass before training.
import gc
import sys

del x_train, x_test, tf_vectorizer
gc.collect()


## TRAIN AND FIT CLASSIFIER

# Fit the booster and time the call; xgb.train is called without
# num_boost_round, so the library default applies.
fit_started = time()
bst = xgb.train(params, dtrain)
training_time = time() - fit_started
print("train time: %0.3fs" % training_time)

# Predict on the test matrix and time it separately.
predict_started = time()
y_pred = bst.predict(dtest)
test_time = time() - predict_started
print("test time:  %0.3fs" % test_time)

# Performance measures: overall accuracy, per-class report, confusion matrix.
score1 = metrics.accuracy_score(y_test, y_pred)
print("accuracy:   %0.3f" % score1)

class_names = list(map(str, range(14)))
print(metrics.classification_report(y_test, y_pred, target_names=class_names))

print("confusion matrix:")
print(metrics.confusion_matrix(y_test, y_pred))

print('------------------------------')