In [1]:
import itertools
import time

import fasttext
import pandas as pd
from datasets import load_dataset
from sklearn.metrics import accuracy_score

In [2]:
# Load the DBpedia-14 dataset and print the returned object, which lists the
# available splits together with their columns and row counts.
dbpedia = load_dataset(path="fancyzhx/dbpedia_14")
print(dbpedia)

DatasetDict({
    train: Dataset({
        features: ['label', 'title', 'content'],
        num_rows: 560000
    })
    test: Dataset({
        features: ['label', 'title', 'content'],
        num_rows: 70000
    })
})


In [3]:
# Inspect the train split column by column: container type, element type and
# first value. Each column is fetched once and reused, because indexing a split
# by column name hands back the entire column (a list of 560,000 items in the
# recorded output) on every access.
train_split = dbpedia['train']
for column in ('title', 'content', 'label'):
    values = train_split[column]
    print(type(values))
    print(type(values[0]))
    print(values[0])

<class 'list'>
<class 'str'>
E. D. Abbott Ltd
<class 'list'>
<class 'str'>
 Abbott of Farnham E D Abbott Limited was a British coachbuilding business based in Farnham Surrey trading under that name from 1929. A major part of their output was under sub-contract to motor vehicle manufacturers. Their business closed in 1972.
<class 'list'>
<class 'int'>
0


In [4]:
# Inspect the test split column by column: container type, element type and
# first value. Each column is fetched once and reused, because indexing a split
# by column name hands back the entire column (a list of 70,000 items in the
# recorded output) on every access.
test_split = dbpedia['test']
for column in ('title', 'content', 'label'):
    values = test_split[column]
    print(type(values))
    print(type(values[0]))
    print(values[0])

<class 'list'>
<class 'str'>
TY KU
<class 'list'>
<class 'str'>
 TY KU /taɪkuː/ is an American alcoholic beverage company that specializes in sake and other spirits. The privately-held company was founded in 2004 and is headquartered in New York City New York. While based in New York TY KU's beverages are made in Japan through a joint venture with two sake breweries. Since 2011 TY KU's growth has extended its products into all 50 states.
<class 'list'>
<class 'int'>
0


In [5]:
def print_results(N, p, r, k=1):
    """Print the sample count, precision and recall of an evaluation run.

    Output is three tab-separated lines: ``N``, ``P@k`` and ``R@k``, with the
    two metrics rounded to three decimals.

    Args:
        N: Number of evaluated examples.
        p: Precision value to report.
        r: Recall value to report.
        k: Cut-off shown in the ``P@k`` / ``R@k`` row labels. Defaults to 1,
            the value that used to be hard-coded here.
    """
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(k, p))
    print("R@{}\t{:.3f}".format(k, r))

In [6]:
# Alias the loaded dataset under a generic name; the cells that build the
# fastText input files read from `dataset` rather than from `dbpedia` directly.
dataset = dbpedia

In [7]:
# Build the fastText training file: one example per line, formatted as
# "__label__<id> <title> <content>".
titles = dataset['train']['title']
contents = dataset['train']['content']
# Join title and content with an explicit space so the last title word can
# never fuse with the first content word, and replace embedded newlines so
# that every example occupies exactly one line of the file.
train_texts = [
    f"{title} {content}".replace("\n", " ")
    for title, content in zip(titles, contents)
]
train_labels = dataset['train']['label']

# fastText reads its input as UTF-8 and the dataset text is not pure ASCII, so
# pin the encoding instead of relying on the platform default.
with open("train.txt", "w", encoding="utf-8") as f:
    f.writelines(
        f"__label__{label} {text}\n"
        for text, label in zip(train_texts, train_labels)
    )

In [8]:
# Build the fastText test file: one example per line, formatted as
# "__label__<id> <title> <content>".
titles = dataset['test']['title']
contents = dataset['test']['content']
# Join title and content with an explicit space so the last title word can
# never fuse with the first content word, and replace embedded newlines so
# that every example occupies exactly one line. test_texts is also fed to
# model.predict() later, which handles one line of text per example.
test_texts = [
    f"{title} {content}".replace("\n", " ")
    for title, content in zip(titles, contents)
]
test_labels = dataset['test']['label']

# fastText reads its input as UTF-8 and the dataset text is not pure ASCII, so
# pin the encoding instead of relying on the platform default.
with open("test.txt", "w", encoding="utf-8") as f:
    f.writelines(
        f"__label__{label} {text}\n"
        for text, label in zip(test_texts, test_labels)
    )

In [9]:
# Hyper-parameter grid for the unigram model (wordNgrams=1).
lr_list = [0.05, 0.1, 0.25, 0.5]
dim_list = [10] # hidden units
ngrams_list = [1]
epoch_list = [5]

# Best scores seen so far and the configuration that produced them.
best_p, best_r = 0, 0
best_l, best_d, best_n, best_e = 0, 0, 0, 0

# NOTE(review): candidates are ranked on test.txt, the same split that is later
# used to report accuracy, so that figure is optimistically biased. Consider
# ranking on a held-out slice of the training data instead.
# itertools.product walks the grid in the same order as nested loops would
# (learning rate outermost, epoch innermost).
for lr, dim, ngrams, epoch in itertools.product(lr_list, dim_list, ngrams_list, epoch_list):
    print(f"Training model with lr={lr}, dim={dim}, epoch={epoch}, and wordNgrams={ngrams}")
    model = fasttext.train_supervised(input='train.txt', lr=lr, dim=dim, epoch=epoch, wordNgrams=ngrams)
    N, p, r = model.test('test.txt')
    print_results(N, p, r)
    # Rank by precision and break ties on recall. Requiring BOTH metrics to
    # improve strictly would discard a candidate that improves one of them
    # while merely matching the other.
    if (p, r) > (best_p, best_r):
        best_p, best_r = p, r
        best_l, best_d, best_n, best_e = lr, dim, ngrams, epoch
    print()

Training model with lr=0.05, dim=10, epoch=5, and wordNgrams=1


Read 28M words
Number of words:  1215996
Number of labels: 14
Progress: 100.0% words/sec/thread: 6994424 lr:  0.000000 avg.loss:  0.090955 ETA:   0h 0m 0s


N	70000
P@1	0.827
R@1	0.827

Training model with lr=0.1, dim=10, epoch=5, and wordNgrams=1


Read 28M words
Number of words:  1215996
Number of labels: 14
Progress: 100.0% words/sec/thread: 7552344 lr:  0.000000 avg.loss:  0.062388 ETA:   0h 0m 0s
Read 4M words

N	70000
P@1	0.884
R@1	0.884

Training model with lr=0.25, dim=10, epoch=5, and wordNgrams=1


Read 28M words
Number of words:  1215996
Number of labels: 14
Progress: 100.0% words/sec/thread: 7053838 lr:  0.007099 avg.loss:  0.044221 ETA:   0h 0m 0s

N	70000
P@1	0.916
R@1	0.916

Training model with lr=0.5, dim=10, epoch=5, and wordNgrams=1


Progress: 100.0% words/sec/thread: 6997383 lr:  0.000000 avg.loss:  0.043780 ETA:   0h 0m 0s
Read 28M words
Number of words:  1215996
Number of labels: 14
Progress: 100.0% words/sec/thread: 7250915 lr:  0.000000 avg.loss:  0.039895 ETA:   0h 0m 0s


N	70000
P@1	0.922
R@1	0.922



In [10]:
# Retrain from scratch with the hyper-parameters selected by the grid search.
# start_time is consumed by the following cell, which stops the timer.
best_config = {"lr": best_l, "dim": best_d, "epoch": best_e, "wordNgrams": best_n}
print(f"Training the best model with lr={best_l}, dim={best_d}, epoch={best_e}, and wordNgrams={best_n}")

start_time = time.time()
model = fasttext.train_supervised(input='train.txt', verbose=2, **best_config)

Training the best model with lr=0.5, dim=10, epoch=5, and wordNgrams=1


Read 28M words
Number of words:  1215996
Number of labels: 14
Progress: 100.0% words/sec/thread: 6520413 lr:  0.000000 avg.loss:  0.037551 ETA:   0h 0m 0s


In [11]:
# Stop the timer that the training cell started.
# NOTE(review): the clock is started and stopped in different cells, so the
# reported figure also includes any pause between running them.
end_time = time.time()
elapsed_time = end_time - start_time
print("Elapsed time:", elapsed_time, "seconds")

# Classify the whole test set in a single call: predict() accepts a list of
# texts and returns one label sequence per text, which avoids one Python-level
# call per test example.
predicted_labels = model.predict(test_texts)[0]
# The first entry of each sequence is the top label, e.g. "__label__3"; strip
# the prefix to recover the integer class id.
predictions = [int(labels[0].split('__label__')[1]) for labels in predicted_labels]
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)

Elapsed time: 4.667471170425415 seconds
Accuracy: 0.9262857142857143


In [12]:
# Hyper-parameter grid for the bigram model (wordNgrams=2).
lr_list = [0.05, 0.1, 0.25, 0.5]
dim_list = [10] # hidden units
ngrams_list = [2]
epoch_list = [5]

# Best scores seen so far and the configuration that produced them.
best_p, best_r = 0, 0
best_l, best_d, best_n, best_e = 0, 0, 0, 0

# NOTE(review): candidates are ranked on test.txt, the same split that is later
# used to report accuracy, so that figure is optimistically biased. Consider
# ranking on a held-out slice of the training data instead.
# itertools.product walks the grid in the same order as nested loops would
# (learning rate outermost, epoch innermost).
for lr, dim, ngrams, epoch in itertools.product(lr_list, dim_list, ngrams_list, epoch_list):
    print(f"Training model with lr={lr}, dim={dim}, epoch={epoch}, and wordNgrams={ngrams}")
    model = fasttext.train_supervised(input='train.txt', lr=lr, dim=dim, epoch=epoch, wordNgrams=ngrams)
    N, p, r = model.test('test.txt')
    print_results(N, p, r)
    # Rank by precision and break ties on recall. Requiring BOTH metrics to
    # improve strictly would discard a candidate that improves one of them
    # while merely matching the other.
    if (p, r) > (best_p, best_r):
        best_p, best_r = p, r
        best_l, best_d, best_n, best_e = lr, dim, ngrams, epoch
    print()

Training model with lr=0.05, dim=10, epoch=5, and wordNgrams=2


Read 28M words
Number of words:  1215996
Number of labels: 14
Progress: 100.0% words/sec/thread: 4779354 lr:  0.000000 avg.loss:  0.136605 ETA:   0h 0m 0s


N	70000
P@1	0.909
R@1	0.909

Training model with lr=0.1, dim=10, epoch=5, and wordNgrams=2


Read 28M words
Number of words:  1215996
Number of labels: 14
Progress: 100.0% words/sec/thread: 4665925 lr: -0.000000 avg.loss:  0.078056 ETA:   0h 0m 0s

N	70000
P@1	0.923
R@1	0.923

Training model with lr=0.25, dim=10, epoch=5, and wordNgrams=2


Progress: 100.0% words/sec/thread: 4665783 lr:  0.000000 avg.loss:  0.078056 ETA:   0h 0m 0s
Read 28M words
Number of words:  1215996
Number of labels: 14
Progress: 100.0% words/sec/thread: 5060565 lr:  0.000000 avg.loss:  0.040261 ETA:   0h 0m 0s


N	70000
P@1	0.900
R@1	0.900

Training model with lr=0.5, dim=10, epoch=5, and wordNgrams=2


Read 28M words
Number of words:  1215996
Number of labels: 14
Progress:  98.6% words/sec/thread: 5236550 lr:  0.007136 avg.loss:  0.031728 ETA:   0h 0m 0s

N	70000
P@1	0.924
R@1	0.924



Progress: 100.0% words/sec/thread: 5174915 lr:  0.000000 avg.loss:  0.031415 ETA:   0h 0m 0s


In [13]:
# Retrain from scratch with the hyper-parameters selected by the grid search.
# start_time is consumed by the following cell, which stops the timer.
best_config = {"lr": best_l, "dim": best_d, "epoch": best_e, "wordNgrams": best_n}
print(f"Training the best model with lr={best_l}, dim={best_d}, epoch={best_e}, and wordNgrams={best_n}")

start_time = time.time()
model = fasttext.train_supervised(input='train.txt', verbose=2, **best_config)

Read 3M words

Training the best model with lr=0.5, dim=10, epoch=5, and wordNgrams=2


Read 28M words
Number of words:  1215996
Number of labels: 14
Progress: 100.0% words/sec/thread: 4583524 lr:  0.000000 avg.loss:  0.033793 ETA:   0h 0m 0s


In [14]:
# Stop the timer that the training cell started.
# NOTE(review): the clock is started and stopped in different cells, so the
# reported figure also includes any pause between running them.
end_time = time.time()
elapsed_time = end_time - start_time
print("Elapsed time:", elapsed_time, "seconds")

# Classify the whole test set in a single call: predict() accepts a list of
# texts and returns one label sequence per text, which avoids one Python-level
# call per test example.
predicted_labels = model.predict(test_texts)[0]
# The first entry of each sequence is the top label, e.g. "__label__3"; strip
# the prefix to recover the integer class id.
predictions = [int(labels[0].split('__label__')[1]) for labels in predicted_labels]
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)

Elapsed time: 6.000260829925537 seconds
Accuracy: 0.9301857142857143
