In [1]:
import time
import fasttext
import pandas as pd
from datasets import load_dataset
from sklearn.metrics import accuracy_score

In [2]:
ag_news = load_dataset("ag_news")
print(ag_news)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})


In [3]:
print(type(ag_news['train']['text']))
print(type(ag_news['train']['text'][0]))
print(ag_news['train']['text'][0])
print(type(ag_news['train']['label']))
print(type(ag_news['train']['label'][0]))
print(ag_news['train']['label'][0])

<class 'list'>
<class 'str'>
Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.
<class 'list'>
<class 'int'>
2


In [4]:
print(type(ag_news['test']['text']))
print(type(ag_news['test']['text'][0]))
print(ag_news['test']['text'][0])
print(type(ag_news['test']['label']))
print(type(ag_news['test']['label'][0]))
print(ag_news['test']['label'][0])

<class 'list'>
<class 'str'>
Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.
<class 'list'>
<class 'int'>
2


In [5]:
def print_results(N, p, r):
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(1, p))
    print("R@{}\t{:.3f}".format(1, r))

In [6]:
dataset = ag_news

In [7]:
train_texts = dataset['train']['text']
train_labels = dataset['train']['label']

with open("train.txt", "w") as f:
    for text, label in zip(train_texts, train_labels):
        f.write(f"__label__{label} {text}\n")

In [8]:
test_texts = dataset['test']['text']
test_labels = dataset['test']['label']

with open("test.txt", "w") as f:
    for text, label in zip(test_texts, test_labels):
        f.write(f"__label__{label} {text}\n")

In [9]:
lr_list = [0.05, 0.1, 0.25, 0.5]
dim_list = [10] # hidden units
ngrams_list = [1]
epoch_list = [5]

best_p, best_r = 0, 0
best_l, best_d, best_n, best_e = 0, 0, 0, 0

for l in lr_list:
    for d in dim_list:
        for n in ngrams_list:
            for e in epoch_list:
                print(f"Training model with lr={l}, dim={d}, epoch={e}, and wordNgrams={n}")
                model = fasttext.train_supervised(input='train.txt', lr=l, dim=d, epoch=e, wordNgrams=n)
                N, p, r = model.test('test.txt')
                print_results(N, p, r)
                if p > best_p and r > best_r:
                    best_p, best_r = p, r
                    best_l, best_d, best_n, best_e = l, d, n, e
                print()

Training model with lr=0.05, dim=10, epoch=5, and wordNgrams=1


Read 4M words
Number of words:  188111
Number of labels: 4
Progress: 100.0% words/sec/thread: 8123139 lr:  0.000000 avg.loss:  0.312559 ETA:   0h 0m 0s
Read 1M words

N	7600
P@1	0.913
R@1	0.913

Training model with lr=0.1, dim=10, epoch=5, and wordNgrams=1


Read 4M words
Number of words:  188111
Number of labels: 4
Progress: 100.0% words/sec/thread: 8270125 lr:  0.000000 avg.loss:  0.243690 ETA:   0h 0m 0s
Read 1M words

N	7600
P@1	0.914
R@1	0.914

Training model with lr=0.25, dim=10, epoch=5, and wordNgrams=1


Read 4M words
Number of words:  188111
Number of labels: 4
Progress: 100.0% words/sec/thread: 8216088 lr:  0.000000 avg.loss:  0.214442 ETA:   0h 0m 0s
Read 1M words

N	7600
P@1	0.914
R@1	0.914

Training model with lr=0.5, dim=10, epoch=5, and wordNgrams=1


Read 4M words
Number of words:  188111
Number of labels: 4
Progress:  52.5% words/sec/thread: 8726498 lr:  0.237640 avg.loss:  0.292023 ETA:   0h 0m 0s

N	7600
P@1	0.913
R@1	0.913



Progress: 100.0% words/sec/thread: 8250533 lr:  0.000000 avg.loss:  0.208962 ETA:   0h 0m 0s


In [10]:
print(f"Training the best model with lr={best_l}, dim={best_d}, epoch={best_e}, and wordNgrams={best_n}")

start_time = time.time()
model = fasttext.train_supervised(input='train.txt', lr=best_l, dim=best_d, epoch=best_e, wordNgrams=best_n, verbose=2)

Read 1M words

Training the best model with lr=0.1, dim=10, epoch=5, and wordNgrams=1


Read 4M words
Number of words:  188111
Number of labels: 4
Progress: 100.0% words/sec/thread: 8191716 lr:  0.000000 avg.loss:  0.229802 ETA:   0h 0m 0s


In [11]:
end_time = time.time()
elapsed_time = end_time - start_time
print("Elapsed time:", elapsed_time, "seconds")

predictions = []
for text in test_texts:
    prediction = model.predict(text)
    predictions.append(int(prediction[0][0].split('__label__')[1]))
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)

Elapsed time: 0.6706719398498535 seconds
Accuracy: 0.9167105263157894


In [12]:
lr_list = [0.05, 0.1, 0.25, 0.5]
dim_list = [10] # hidden units
ngrams_list = [2]
epoch_list = [5]

best_p, best_r = 0, 0
best_l, best_d, best_n, best_e = 0, 0, 0, 0

for l in lr_list:
    for d in dim_list:
        for n in ngrams_list:
            for e in epoch_list:
                print(f"Training model with lr={l}, dim={d}, epoch={e}, and wordNgrams={n}")
                model = fasttext.train_supervised(input='train.txt', lr=l, dim=d, epoch=e, wordNgrams=n)
                N, p, r = model.test('test.txt')
                print_results(N, p, r)
                if p > best_p and r > best_r:
                    best_p, best_r = p, r
                    best_l, best_d, best_n, best_e = l, d, n, e
                print()

Training model with lr=0.05, dim=10, epoch=5, and wordNgrams=2


Read 4M words
Number of words:  188111
Number of labels: 4
Progress: 100.0% words/sec/thread: 4733911 lr:  0.000000 avg.loss:  0.355953 ETA:   0h 0m 0s
Read 3M words

N	7600
P@1	0.911
R@1	0.911

Training model with lr=0.1, dim=10, epoch=5, and wordNgrams=2


Read 4M words
Number of words:  188111
Number of labels: 4
Progress: 100.0% words/sec/thread: 5474497 lr:  0.000000 avg.loss:  0.225286 ETA:   0h 0m 0s
Read 4M words

N	7600
P@1	0.917
R@1	0.917

Training model with lr=0.25, dim=10, epoch=5, and wordNgrams=2


Read 4M words
Number of words:  188111
Number of labels: 4
Progress: 100.0% words/sec/thread: 4732799 lr:  0.000000 avg.loss:  0.143234 ETA:   0h 0m 0s


N	7600
P@1	0.919
R@1	0.919

Training model with lr=0.5, dim=10, epoch=5, and wordNgrams=2


Read 4M words
Number of words:  188111
Number of labels: 4
Progress:  85.2% words/sec/thread: 5687812 lr:  0.074111 avg.loss:  0.148198 ETA:   0h 0m 0s

N	7600
P@1	0.917
R@1	0.917



Progress: 100.0% words/sec/thread: 5538546 lr:  0.000000 avg.loss:  0.123914 ETA:   0h 0m 0s


In [13]:
print(f"Training the best model with lr={best_l}, dim={best_d}, epoch={best_e}, and wordNgrams={best_n}")

start_time = time.time()
model = fasttext.train_supervised(input='train.txt', lr=best_l, dim=best_d, epoch=best_e, wordNgrams=best_n, verbose=2)

Read 3M words

Training the best model with lr=0.25, dim=10, epoch=5, and wordNgrams=2


Read 4M words
Number of words:  188111
Number of labels: 4
Progress: 100.0% words/sec/thread: 5465766 lr:  0.000000 avg.loss:  0.145287 ETA:   0h 0m 0s


In [14]:
end_time = time.time()
elapsed_time = end_time - start_time
print("Elapsed time:", elapsed_time, "seconds")

predictions = []
for text in test_texts:
    prediction = model.predict(text)
    predictions.append(int(prediction[0][0].split('__label__')[1]))
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)

Elapsed time: 0.908433198928833 seconds
Accuracy: 0.9186842105263158
