In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

from sklearn.metrics import confusion_matrix

from sklearn.ensemble import RandomForestClassifier

from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

from sklearn.preprocessing import StandardScaler, MinMaxScaler

## word2vec

### full dataset

In [2]:
# Load the pre-vectorized DBpedia splits (word2vec features, per the section
# heading). Each pickle unpacks to a 3-tuple; only the first item (labels) and
# the third item (features) are used in this cell.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("dbpedia_csv/train_lem_preprocessed_vectorized.pickle", "rb") as train_file:
    y_train, _, x_train = pickle.load(train_file)
with open("dbpedia_csv/test_lem_preprocessed_vectorized.pickle", "rb") as test_file:
    y_test, _, x_test = pickle.load(test_file)


## TRAIN AND FIT CLASSIFIER

t = time()

# verbose must be a non-negative int (recent scikit-learn releases reject
# negative values); 1 keeps the joblib progress log. random_state pins the
# per-split feature sampling so the run can be reproduced.
forest = RandomForestClassifier(
    bootstrap=False,
    n_estimators=200,
    verbose=1,
    n_jobs=-1,
    max_depth=40,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)
forest.fit(x_train, y_train)

training_time = time() - t
print("train time: %0.3fs" % training_time)


## EVALUATE ON THE TEST SPLIT

t = time()
y_pred = forest.predict(x_test)

test_time = time() - t
print("test time:  %0.3fs" % test_time)

# compute the performance measures
score1 = metrics.accuracy_score(y_test, y_pred)
print("accuracy:   %0.3f" % score1)

# The report rows are named "0".."13", i.e. 14 classes are expected.
print(metrics.classification_report(y_test, y_pred, target_names=[str(x) for x in range(14)]))

print("confusion matrix:")
print(metrics.confusion_matrix(y_test, y_pred))

print('------------------------------')

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 13.0min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 44.7min finished


train time: 2681.743s


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:    0.7s


test time:  3.112s
accuracy:   0.952
              precision    recall  f1-score   support

           0       0.91      0.87      0.89      5000
           1       0.96      0.97      0.96      5000
           2       0.89      0.85      0.87      5000
           3       0.98      0.98      0.98      5000
           4       0.95      0.96      0.96      5000
           5       0.96      0.98      0.97      5000
           6       0.96      0.94      0.95      5000
           7       0.97      0.99      0.98      5000
           8       1.00      0.98      0.99      5000
           9       0.99      0.99      0.99      5000
          10       0.99      0.99      0.99      5000
          11       0.95      0.97      0.96      5000
          12       0.96      0.96      0.96      5000
          13       0.88      0.91      0.89      5000

    accuracy                           0.95     70000
   macro avg       0.95      0.95      0.95     70000
weighted avg       0.95      0.95      0.95

[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    2.7s finished


### half of the dataset

In [3]:
# Load the pre-vectorized DBpedia splits (word2vec features, per the section
# heading). Each pickle unpacks to a 3-tuple; only the first item (labels) and
# the third item (features) are used in this cell.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("dbpedia_csv/train_lem_preprocessed_vectorized.pickle", "rb") as train_file:
    y_train, _, x_train = pickle.load(train_file)
with open("dbpedia_csv/test_lem_preprocessed_vectorized.pickle", "rb") as test_file:
    y_test, _, x_test = pickle.load(test_file)

# Keep every 2nd training example (half of the training set). The test split
# is left complete so scores stay comparable across training-set sizes.
y_train = y_train[::2]
x_train = x_train[::2]
assert len(x_train) == len(y_train), "features and labels are misaligned"


## TRAIN AND FIT CLASSIFIER

t = time()

# verbose must be a non-negative int (recent scikit-learn releases reject
# negative values); 1 keeps the joblib progress log. random_state pins the
# per-split feature sampling so the run can be reproduced.
forest = RandomForestClassifier(
    bootstrap=False,
    n_estimators=200,
    verbose=1,
    n_jobs=-1,
    max_depth=40,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)
forest.fit(x_train, y_train)

training_time = time() - t
print("train time: %0.3fs" % training_time)


## EVALUATE ON THE TEST SPLIT

t = time()
y_pred = forest.predict(x_test)

test_time = time() - t
print("test time:  %0.3fs" % test_time)

# compute the performance measures
score1 = metrics.accuracy_score(y_test, y_pred)
print("accuracy:   %0.3f" % score1)

# The report rows are named "0".."13", i.e. 14 classes are expected.
print(metrics.classification_report(y_test, y_pred, target_names=[str(x) for x in range(14)]))

print("confusion matrix:")
print(metrics.confusion_matrix(y_test, y_pred))

print('------------------------------')

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 19.3min finished


train time: 1159.352s


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:    0.7s


test time:  2.761s
accuracy:   0.950
              precision    recall  f1-score   support

           0       0.91      0.87      0.89      5000
           1       0.96      0.96      0.96      5000
           2       0.89      0.84      0.86      5000
           3       0.98      0.99      0.98      5000
           4       0.95      0.96      0.95      5000
           5       0.96      0.98      0.97      5000
           6       0.95      0.94      0.95      5000
           7       0.97      0.99      0.98      5000
           8       1.00      0.98      0.99      5000
           9       0.98      0.98      0.98      5000
          10       0.99      0.98      0.98      5000
          11       0.94      0.97      0.96      5000
          12       0.96      0.95      0.96      5000
          13       0.87      0.91      0.89      5000

    accuracy                           0.95     70000
   macro avg       0.95      0.95      0.95     70000
weighted avg       0.95      0.95      0.95

[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    2.5s finished


### one fourth of the dataset

In [4]:
# Load the pre-vectorized DBpedia splits (word2vec features, per the section
# heading). Each pickle unpacks to a 3-tuple; only the first item (labels) and
# the third item (features) are used in this cell.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("dbpedia_csv/train_lem_preprocessed_vectorized.pickle", "rb") as train_file:
    y_train, _, x_train = pickle.load(train_file)
with open("dbpedia_csv/test_lem_preprocessed_vectorized.pickle", "rb") as test_file:
    y_test, _, x_test = pickle.load(test_file)

# Keep every 4th training example (a quarter of the training set). The test
# split is left complete so scores stay comparable across training-set sizes.
y_train = y_train[::4]
x_train = x_train[::4]
assert len(x_train) == len(y_train), "features and labels are misaligned"


## TRAIN AND FIT CLASSIFIER

t = time()

# verbose must be a non-negative int (recent scikit-learn releases reject
# negative values); 1 keeps the joblib progress log. random_state pins the
# per-split feature sampling so the run can be reproduced.
forest = RandomForestClassifier(
    bootstrap=False,
    n_estimators=200,
    verbose=1,
    n_jobs=-1,
    max_depth=40,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)
forest.fit(x_train, y_train)

training_time = time() - t
print("train time: %0.3fs" % training_time)


## EVALUATE ON THE TEST SPLIT

t = time()
y_pred = forest.predict(x_test)

test_time = time() - t
print("test time:  %0.3fs" % test_time)

# compute the performance measures
score1 = metrics.accuracy_score(y_test, y_pred)
print("accuracy:   %0.3f" % score1)

# The report rows are named "0".."13", i.e. 14 classes are expected.
print(metrics.classification_report(y_test, y_pred, target_names=[str(x) for x in range(14)]))

print("confusion matrix:")
print(metrics.confusion_matrix(y_test, y_pred))

print('------------------------------')

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  8.3min finished


train time: 499.013s


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:    0.6s


test time:  2.561s
accuracy:   0.947
              precision    recall  f1-score   support

           0       0.90      0.86      0.88      5000
           1       0.95      0.96      0.96      5000
           2       0.88      0.83      0.85      5000
           3       0.98      0.98      0.98      5000
           4       0.95      0.95      0.95      5000
           5       0.95      0.97      0.96      5000
           6       0.95      0.94      0.94      5000
           7       0.97      0.98      0.98      5000
           8       1.00      0.98      0.99      5000
           9       0.98      0.98      0.98      5000
          10       0.98      0.98      0.98      5000
          11       0.94      0.97      0.96      5000
          12       0.95      0.95      0.95      5000
          13       0.87      0.91      0.89      5000

    accuracy                           0.95     70000
   macro avg       0.95      0.95      0.95     70000
weighted avg       0.95      0.95      0.95

[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    2.2s finished


## TF-IDF

### full dataset

In [7]:
# Load the preprocessed DBpedia splits. Each pickle unpacks to a 3-tuple; only
# the first item (labels) and the third item (documents fed to the vectorizer)
# are used in this cell.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("dbpedia_csv/train_preprocessed.pickle", "rb") as train_file:
    y_train, _, x_train = pickle.load(train_file)
with open("dbpedia_csv/test_preprocessed.pickle", "rb") as test_file:
    y_test, _, x_test = pickle.load(test_file)


## TFIDF VECTORIZATION

t = time()  # timing is informational only

# TF-IDF vectorizer with default settings
tf_vectorizer = TfidfVectorizer()

# Fit the vocabulary/IDF weights on the training documents only ...
x_train = tf_vectorizer.fit_transform(x_train)

duration = time() - t
print("Time taken to extract features from training data : %f seconds" % (duration))
print("n_samples: %d, n_features: %d" % x_train.shape)

# ... and reuse them unchanged for the test documents.
t = time()
x_test = tf_vectorizer.transform(x_test)

duration = time() - t
print("Time taken to extract features from test data : %f seconds" % (duration))
print("n_samples: %d, n_features: %d" % x_test.shape)


## TRAIN AND FIT CLASSIFIER

t = time()

# verbose must be a non-negative int (recent scikit-learn releases reject
# negative values); 1 keeps the joblib progress log. random_state pins the
# per-split feature sampling so the run can be reproduced.
forest = RandomForestClassifier(
    bootstrap=False,
    n_estimators=100,
    verbose=1,
    n_jobs=-1,
    max_depth=15,
    random_state=42,
)
forest.fit(x_train, y_train)

training_time = time() - t
print("train time: %0.3fs" % training_time)


## EVALUATE ON THE TEST SPLIT

t = time()
y_pred = forest.predict(x_test)

test_time = time() - t
print("test time:  %0.3fs" % test_time)

# compute the performance measures
score1 = metrics.accuracy_score(y_test, y_pred)
print("accuracy:   %0.3f" % score1)

# The report rows are named "0".."13", i.e. 14 classes are expected.
print(metrics.classification_report(y_test, y_pred, target_names=[str(x) for x in range(14)]))

print("confusion matrix:")
print(metrics.confusion_matrix(y_test, y_pred))

print('------------------------------')

Time taken to extract features from training data : 14.677125 seconds
n_samples: 560000, n_features: 698699
Time taken to extract features from test data : 1.621698 seconds
n_samples: 70000, n_features: 698699


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    8.7s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.


train time: 9.826s


[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:    0.4s


test time:  0.848s
accuracy:   0.885
              precision    recall  f1-score   support

           0       0.87      0.74      0.80      5000
           1       0.84      0.91      0.88      5000
           2       0.89      0.58      0.70      5000
           3       0.91      0.95      0.93      5000
           4       0.91      0.90      0.91      5000
           5       0.89      0.95      0.92      5000
           6       0.91      0.83      0.87      5000
           7       0.92      0.95      0.93      5000
           8       0.92      0.99      0.95      5000
           9       0.85      0.83      0.84      5000
          10       0.86      0.93      0.89      5000
          11       0.87      0.98      0.92      5000
          12       0.90      0.96      0.93      5000
          13       0.84      0.89      0.86      5000

    accuracy                           0.89     70000
   macro avg       0.89      0.89      0.88     70000
weighted avg       0.89      0.89      0.88

[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.7s finished


### half of the dataset

In [8]:
# Load the preprocessed DBpedia splits. Each pickle unpacks to a 3-tuple; only
# the first item (labels) and the third item (documents fed to the vectorizer)
# are used in this cell.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("dbpedia_csv/train_preprocessed.pickle", "rb") as train_file:
    y_train, _, x_train = pickle.load(train_file)
with open("dbpedia_csv/test_preprocessed.pickle", "rb") as test_file:
    y_test, _, x_test = pickle.load(test_file)

# Keep every 2nd training example (half of the training set). The test split
# is left complete so scores stay comparable across training-set sizes.
y_train = y_train[::2]
x_train = x_train[::2]
assert len(x_train) == len(y_train), "documents and labels are misaligned"


## TFIDF VECTORIZATION

t = time()  # timing is informational only

# TF-IDF vectorizer with default settings
tf_vectorizer = TfidfVectorizer()

# Fit the vocabulary/IDF weights on the (subsampled) training documents only ...
x_train = tf_vectorizer.fit_transform(x_train)

duration = time() - t
print("Time taken to extract features from training data : %f seconds" % (duration))
print("n_samples: %d, n_features: %d" % x_train.shape)

# ... and reuse them unchanged for the test documents.
t = time()
x_test = tf_vectorizer.transform(x_test)

duration = time() - t
print("Time taken to extract features from test data : %f seconds" % (duration))
print("n_samples: %d, n_features: %d" % x_test.shape)


## TRAIN AND FIT CLASSIFIER

t = time()

# verbose must be a non-negative int (recent scikit-learn releases reject
# negative values); 1 keeps the joblib progress log. random_state pins the
# per-split feature sampling so the run can be reproduced.
forest = RandomForestClassifier(
    bootstrap=False,
    n_estimators=100,
    verbose=1,
    n_jobs=-1,
    max_depth=15,
    random_state=42,
)
forest.fit(x_train, y_train)

training_time = time() - t
print("train time: %0.3fs" % training_time)


## EVALUATE ON THE TEST SPLIT

t = time()
y_pred = forest.predict(x_test)

test_time = time() - t
print("test time:  %0.3fs" % test_time)

# compute the performance measures
score1 = metrics.accuracy_score(y_test, y_pred)
print("accuracy:   %0.3f" % score1)

# The report rows are named "0".."13", i.e. 14 classes are expected.
print(metrics.classification_report(y_test, y_pred, target_names=[str(x) for x in range(14)]))

print("confusion matrix:")
print(metrics.confusion_matrix(y_test, y_pred))

print('------------------------------')

Time taken to extract features from training data : 7.142597 seconds
n_samples: 280000, n_features: 445084
Time taken to extract features from test data : 1.608977 seconds
n_samples: 70000, n_features: 445084


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    4.7s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.


train time: 5.132s


[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:    0.3s


test time:  0.722s
accuracy:   0.880
              precision    recall  f1-score   support

           0       0.84      0.82      0.83      5000
           1       0.88      0.92      0.90      5000
           2       0.87      0.62      0.72      5000
           3       0.91      0.96      0.94      5000
           4       0.91      0.91      0.91      5000
           5       0.90      0.90      0.90      5000
           6       0.92      0.83      0.87      5000
           7       0.85      0.93      0.89      5000
           8       0.92      0.96      0.94      5000
           9       0.87      0.83      0.85      5000
          10       0.86      0.88      0.87      5000
          11       0.85      0.99      0.91      5000
          12       0.89      0.97      0.93      5000
          13       0.85      0.82      0.83      5000

    accuracy                           0.88     70000
   macro avg       0.88      0.88      0.88     70000
weighted avg       0.88      0.88      0.88

[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.6s finished


### one fourth of the dataset

In [9]:
# Load the preprocessed DBpedia splits. Each pickle unpacks to a 3-tuple; only
# the first item (labels) and the third item (documents fed to the vectorizer)
# are used in this cell.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("dbpedia_csv/train_preprocessed.pickle", "rb") as train_file:
    y_train, _, x_train = pickle.load(train_file)
with open("dbpedia_csv/test_preprocessed.pickle", "rb") as test_file:
    y_test, _, x_test = pickle.load(test_file)

# Keep every 4th training example (a quarter of the training set). The test
# split is left complete so scores stay comparable across training-set sizes.
y_train = y_train[::4]
x_train = x_train[::4]
assert len(x_train) == len(y_train), "documents and labels are misaligned"


## TFIDF VECTORIZATION

t = time()  # timing is informational only

# TF-IDF vectorizer with default settings
tf_vectorizer = TfidfVectorizer()

# Fit the vocabulary/IDF weights on the (subsampled) training documents only ...
x_train = tf_vectorizer.fit_transform(x_train)

duration = time() - t
print("Time taken to extract features from training data : %f seconds" % (duration))
print("n_samples: %d, n_features: %d" % x_train.shape)

# ... and reuse them unchanged for the test documents.
t = time()
x_test = tf_vectorizer.transform(x_test)

duration = time() - t
print("Time taken to extract features from test data : %f seconds" % (duration))
print("n_samples: %d, n_features: %d" % x_test.shape)


## TRAIN AND FIT CLASSIFIER

t = time()

# verbose must be a non-negative int (recent scikit-learn releases reject
# negative values); 1 keeps the joblib progress log. random_state pins the
# per-split feature sampling so the run can be reproduced.
forest = RandomForestClassifier(
    bootstrap=False,
    n_estimators=100,
    verbose=1,
    n_jobs=-1,
    max_depth=15,
    random_state=42,
)
forest.fit(x_train, y_train)

training_time = time() - t
print("train time: %0.3fs" % training_time)


## EVALUATE ON THE TEST SPLIT

t = time()
y_pred = forest.predict(x_test)

test_time = time() - t
print("test time:  %0.3fs" % test_time)

# compute the performance measures
score1 = metrics.accuracy_score(y_test, y_pred)
print("accuracy:   %0.3f" % score1)

# The report rows are named "0".."13", i.e. 14 classes are expected.
print(metrics.classification_report(y_test, y_pred, target_names=[str(x) for x in range(14)]))

print("confusion matrix:")
print(metrics.confusion_matrix(y_test, y_pred))

print('------------------------------')

Time taken to extract features from training data : 3.604377 seconds
n_samples: 140000, n_features: 282474
Time taken to extract features from test data : 1.477304 seconds
n_samples: 70000, n_features: 282474


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.


train time: 2.388s


[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:    0.3s


test time:  0.623s
accuracy:   0.880
              precision    recall  f1-score   support

           0       0.87      0.76      0.81      5000
           1       0.85      0.92      0.88      5000
           2       0.86      0.56      0.68      5000
           3       0.94      0.96      0.95      5000
           4       0.91      0.90      0.91      5000
           5       0.90      0.91      0.90      5000
           6       0.89      0.82      0.85      5000
           7       0.87      0.95      0.91      5000
           8       0.92      0.98      0.95      5000
           9       0.94      0.78      0.85      5000
          10       0.83      0.95      0.88      5000
          11       0.83      0.98      0.90      5000
          12       0.88      0.98      0.93      5000
          13       0.85      0.87      0.86      5000

    accuracy                           0.88     70000
   macro avg       0.88      0.88      0.88     70000
weighted avg       0.88      0.88      0.88

[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.5s finished
