In [1]:
import time
import fasttext
import pandas as pd
from datasets import load_dataset
from sklearn.metrics import accuracy_score

In [2]:
yahoo_answers = load_dataset("yahoo_answers_topics")
print(yahoo_answers)

DatasetDict({
    train: Dataset({
        features: ['id', 'topic', 'question_title', 'question_content', 'best_answer'],
        num_rows: 1400000
    })
    test: Dataset({
        features: ['id', 'topic', 'question_title', 'question_content', 'best_answer'],
        num_rows: 60000
    })
})


In [3]:
print(type(yahoo_answers['train']['question_title']))
print(type(yahoo_answers['train']['question_title'][0]))
print(yahoo_answers['train']['question_title'][0])
print(type(yahoo_answers['train']['question_content']))
print(type(yahoo_answers['train']['question_content'][0]))
print(yahoo_answers['train']['question_content'][0])
print(type(yahoo_answers['train']['best_answer']))
print(type(yahoo_answers['train']['best_answer'][0]))
print(yahoo_answers['train']['best_answer'][0])
print(type(yahoo_answers['train']['topic']))
print(type(yahoo_answers['train']['topic'][0]))
print(yahoo_answers['train']['topic'][0])

<class 'list'>
<class 'str'>
why doesn't an optical mouse work on a glass table?
<class 'list'>
<class 'str'>
or even on some surfaces?
<class 'list'>
<class 'str'>
Optical mice use an LED and a camera to rapidly capture images of the surface beneath the mouse.  The infomation from the camera is analyzed by a DSP (Digital Signal Processor) and used to detect imperfections in the underlying surface and determine motion. Some materials, such as glass, mirrors or other very shiny, uniform surfaces interfere with the ability of the DSP to accurately analyze the surface beneath the mouse.  \nSince glass is transparent and very uniform, the mouse is unable to pick up enough imperfections in the underlying surface to determine motion.  Mirrored surfaces are also a problem, since they constantly reflect back the same image, causing the DSP not to recognize motion properly. When the system is unable to see surface changes associated with movement, the mouse will not work properly.
<class 'list'

In [4]:
print(type(yahoo_answers['test']['question_title']))
print(type(yahoo_answers['test']['question_title'][0]))
print(yahoo_answers['test']['question_title'][0])
print(type(yahoo_answers['test']['question_content']))
print(type(yahoo_answers['test']['question_content'][0]))
print(yahoo_answers['test']['question_content'][0])
print(type(yahoo_answers['test']['best_answer']))
print(type(yahoo_answers['test']['best_answer'][0]))
print(yahoo_answers['test']['best_answer'][0])
print(type(yahoo_answers['test']['topic']))
print(type(yahoo_answers['test']['topic'][0]))
print(yahoo_answers['test']['topic'][0])

<class 'list'>
<class 'str'>
What makes friendship click?
<class 'list'>
<class 'str'>
How does the spark keep going?
<class 'list'>
<class 'str'>
good communication is what does it.  Can you move beyond small talk and say what's really on your mind.  If you start doing this, my expereince is that potentially good friends will respond or shun you.  Then you know who the really good friends are.
<class 'list'>
<class 'int'>
8


In [5]:
def print_results(N, p, r):
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(1, p))
    print("R@{}\t{:.3f}".format(1, r))

In [6]:
dataset = yahoo_answers

In [7]:
titles = dataset['train']['question_title']
contents = dataset['train']['question_content']
answers = dataset['train']['best_answer']
train_texts = [str(x) + str(y) + str(z) for x, y, z in zip(titles, contents, answers)]
train_labels = dataset['train']['topic']

with open("train.txt", "w") as f:
    for text, label in zip(train_texts, train_labels):
        f.write(f"__label__{label} {text}\n")

In [8]:
titles = dataset['test']['question_title']
contents = dataset['test']['question_content']
answers = dataset['test']['best_answer']
test_texts = [str(x) + str(y) + str(z) for x, y, z in zip(titles, contents, answers)]
test_labels = dataset['test']['topic']

with open("test.txt", "w") as f:
    for text, label in zip(test_texts, test_labels):
        f.write(f"__label__{label} {text}\n")

In [9]:
lr_list = [0.05, 0.1, 0.25, 0.5]
dim_list = [10] # hidden units
ngrams_list = [1]
epoch_list = [5]

best_p, best_r = 0, 0
best_l, best_d, best_n, best_e = 0, 0, 0, 0

for l in lr_list:
    for d in dim_list:
        for n in ngrams_list:
            for e in epoch_list:
                print(f"Training model with lr={l}, dim={d}, epoch={e}, and wordNgrams={n}")
                model = fasttext.train_supervised(input='train.txt', lr=l, dim=d, epoch=e, wordNgrams=n)
                N, p, r = model.test('test.txt')
                print_results(N, p, r)
                if p > best_p and r > best_r:
                    best_p, best_r = p, r
                    best_l, best_d, best_n, best_e = l, d, n, e
                print()

Training model with lr=0.05, dim=10, epoch=5, and wordNgrams=1


Read 128M words
Number of words:  5783083
Number of labels: 10
Progress: 100.0% words/sec/thread: 8334691 lr:  0.000000 avg.loss:  1.074399 ETA:   0h 0m 0s


N	60000
P@1	0.700
R@1	0.700

Training model with lr=0.1, dim=10, epoch=5, and wordNgrams=1


Read 128M words
Number of words:  5783083
Number of labels: 10
Progress: 100.0% words/sec/thread: 8251633 lr:  0.000000 avg.loss:  1.057265 ETA:   0h 0m 0s


N	60000
P@1	0.698
R@1	0.698

Training model with lr=0.25, dim=10, epoch=5, and wordNgrams=1


Read 128M words
Number of words:  5783083
Number of labels: 10
Progress: 100.0% words/sec/thread: 8379705 lr:  0.000000 avg.loss:  1.046224 ETA:   0h 0m 0s


N	60000
P@1	0.697
R@1	0.697

Training model with lr=0.5, dim=10, epoch=5, and wordNgrams=1


Read 128M words
Number of words:  5783083
Number of labels: 10
Progress: 100.0% words/sec/thread: 8212604 lr:  0.000000 avg.loss:  1.056955 ETA:   0h 0m 0s

N	60000
P@1	0.697
R@1	0.697






In [10]:
print(f"Training the best model with lr={best_l}, dim={best_d}, epoch={best_e}, and wordNgrams={best_n}")

start_time = time.time()
model = fasttext.train_supervised(input='train.txt', lr=best_l, dim=best_d, epoch=best_e, wordNgrams=best_n, verbose=2)

Read 3M words

Training the best model with lr=0.05, dim=10, epoch=5, and wordNgrams=1


Read 128M words
Number of words:  5783083
Number of labels: 10
Progress: 100.0% words/sec/thread: 8324836 lr:  0.000000 avg.loss:  1.076735 ETA:   0h 0m 0s


In [11]:
end_time = time.time()
elapsed_time = end_time - start_time
print("Elapsed time:", elapsed_time, "seconds")

predictions = []
for text in test_texts:
    prediction = model.predict(text)
    predictions.append(int(prediction[0][0].split('__label__')[1]))
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)

Elapsed time: 18.771555185317993 seconds
Accuracy: 0.698


In [12]:
lr_list = [0.05, 0.1, 0.25, 0.5]
dim_list = [10] # hidden units
ngrams_list = [2]
epoch_list = [5]

best_p, best_r = 0, 0
best_l, best_d, best_n, best_e = 0, 0, 0, 0

for l in lr_list:
    for d in dim_list:
        for n in ngrams_list:
            for e in epoch_list:
                print(f"Training model with lr={l}, dim={d}, epoch={e}, and wordNgrams={n}")
                model = fasttext.train_supervised(input='train.txt', lr=l, dim=d, epoch=e, wordNgrams=n)
                N, p, r = model.test('test.txt')
                print_results(N, p, r)
                if p > best_p and r > best_r:
                    best_p, best_r = p, r
                    best_l, best_d, best_n, best_e = l, d, n, e
                print()

Training model with lr=0.05, dim=10, epoch=5, and wordNgrams=2


Read 128M words
Number of words:  5783083
Number of labels: 10
Progress: 100.0% words/sec/thread: 5305582 lr:  0.000000 avg.loss:  0.947236 ETA:   0h 0m 0s


N	60000
P@1	0.701
R@1	0.701

Training model with lr=0.1, dim=10, epoch=5, and wordNgrams=2


Read 128M words
Number of words:  5783083
Number of labels: 10
Progress: 100.0% words/sec/thread: 5262699 lr:  0.000000 avg.loss:  0.827007 ETA:   0h 0m 0s
Read 3M words

N	60000
P@1	0.695
R@1	0.695

Training model with lr=0.25, dim=10, epoch=5, and wordNgrams=2


Read 128M words
Number of words:  5783083
Number of labels: 10
Progress: 100.0% words/sec/thread: 5186229 lr:  0.000000 avg.loss:  0.770433 ETA:   0h 0m 0s


N	60000
P@1	0.686
R@1	0.686

Training model with lr=0.5, dim=10, epoch=5, and wordNgrams=2


Read 128M words
Number of words:  5783083
Number of labels: 10
Progress:  99.8% words/sec/thread: 5289914 lr:  0.001174 avg.loss:  0.769883 ETA:   0h 0m 0s

N	60000
P@1	0.679
R@1	0.679



Progress: 100.0% words/sec/thread: 5271720 lr:  0.000000 avg.loss:  0.768592 ETA:   0h 0m 0s


In [13]:
print(f"Training the best model with lr={best_l}, dim={best_d}, epoch={best_e}, and wordNgrams={best_n}")

start_time = time.time()
model = fasttext.train_supervised(input='train.txt', lr=best_l, dim=best_d, epoch=best_e, wordNgrams=best_n, verbose=2)

Training the best model with lr=0.05, dim=10, epoch=5, and wordNgrams=2


Read 128M words
Number of words:  5783083
Number of labels: 10
Progress: 100.0% words/sec/thread: 5188727 lr:  0.000000 avg.loss:  0.942776 ETA:   0h 0m 0s


In [14]:
end_time = time.time()
elapsed_time = end_time - start_time
print("Elapsed time:", elapsed_time, "seconds")

predictions = []
for text in test_texts:
    prediction = model.predict(text)
    predictions.append(int(prediction[0][0].split('__label__')[1]))
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)

Elapsed time: 25.628034114837646 seconds
Accuracy: 0.7007
