In [1]:
import time
import fasttext
import pandas as pd
from datasets import load_dataset
from sklearn.metrics import accuracy_score

In [2]:
amazon_polarity = load_dataset("amazon_polarity")
print(amazon_polarity)

DatasetDict({
    train: Dataset({
        features: ['label', 'title', 'content'],
        num_rows: 3600000
    })
    test: Dataset({
        features: ['label', 'title', 'content'],
        num_rows: 400000
    })
})


In [3]:
print(type(amazon_polarity['train']['title']))
print(type(amazon_polarity['train']['title'][0]))
print(amazon_polarity['train']['title'][0])
print(type(amazon_polarity['train']['content']))
print(type(amazon_polarity['train']['content'][0]))
print(amazon_polarity['train']['content'][0])
print(type(amazon_polarity['train']['label']))
print(type(amazon_polarity['train']['label'][0]))
print(amazon_polarity['train']['label'][0])

<class 'list'>
<class 'str'>
Stuning even for the non-gamer
<class 'list'>
<class 'str'>
This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^
<class 'list'>
<class 'int'>
1


In [4]:
print(type(amazon_polarity['test']['title']))
print(type(amazon_polarity['test']['title'][0]))
print(amazon_polarity['test']['title'][0])
print(type(amazon_polarity['test']['content']))
print(type(amazon_polarity['test']['content'][0]))
print(amazon_polarity['test']['content'][0])
print(type(amazon_polarity['test']['label']))
print(type(amazon_polarity['test']['label'][0]))
print(amazon_polarity['test']['label'][0])

<class 'list'>
<class 'str'>
Great CD
<class 'list'>
<class 'str'>
My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I'm in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life's hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing "Who was that singing ?"
<class 'list'>
<class 'int'>
1


In [5]:
def print_results(N, p, r):
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(1, p))
    print("R@{}\t{:.3f}".format(1, r))

In [6]:
dataset = amazon_polarity

In [7]:
titles = dataset['train']['title']
contents = dataset['train']['content']
train_texts = [str(x) + str(y) for x, y in zip(titles, contents)]
train_labels = dataset['train']['label']

with open("train.txt", "w") as f:
    for text, label in zip(train_texts, train_labels):
        f.write(f"__label__{label} {text}\n")

In [8]:
titles = dataset['test']['title']
contents = dataset['test']['content']
test_texts = [str(x) + str(y) for x, y in zip(titles, contents)]
test_labels = dataset['test']['label']

with open("test.txt", "w") as f:
    for text, label in zip(test_texts, test_labels):
        f.write(f"__label__{label} {text}\n")

In [9]:
lr_list = [0.05, 0.1, 0.25, 0.5]
dim_list = [10] # hidden units
ngrams_list = [1]
epoch_list = [5]

best_p, best_r = 0, 0
best_l, best_d, best_n, best_e = 0, 0, 0, 0

for l in lr_list:
    for d in dim_list:
        for n in ngrams_list:
            for e in epoch_list:
                print(f"Training model with lr={l}, dim={d}, epoch={e}, and wordNgrams={n}")
                model = fasttext.train_supervised(input='train.txt', lr=l, dim=d, epoch=e, wordNgrams=n)
                N, p, r = model.test('test.txt')
                print_results(N, p, r)
                if p > best_p and r > best_r:
                    best_p, best_r = p, r
                    best_l, best_d, best_n, best_e = l, d, n, e
                print()

Training model with lr=0.05, dim=10, epoch=5, and wordNgrams=1


Read 286M words
Number of words:  6228957
Number of labels: 2
Progress: 100.0% words/sec/thread: 8295131 lr:  0.000000 avg.loss:  0.248864 ETA:   0h 0m 0s


N	400000
P@1	0.910
R@1	0.910

Training model with lr=0.1, dim=10, epoch=5, and wordNgrams=1


Read 286M words
Number of words:  6228957
Number of labels: 2
Progress: 100.0% words/sec/thread: 8169291 lr:  0.000000 avg.loss:  0.248696 ETA:   0h 0m 0s


N	400000
P@1	0.910
R@1	0.910

Training model with lr=0.25, dim=10, epoch=5, and wordNgrams=1


Read 286M words
Number of words:  6228957
Number of labels: 2
Progress: 100.0% words/sec/thread: 7071227 lr:  0.000000 avg.loss:  0.248313 ETA:   0h 0m 0s


N	400000
P@1	0.910
R@1	0.910

Training model with lr=0.5, dim=10, epoch=5, and wordNgrams=1


Read 286M words
Number of words:  6228957
Number of labels: 2
Progress:  99.9% words/sec/thread: 5620893 lr:  0.000280 avg.loss:  0.248156 ETA:   0h 0m 0s

N	400000
P@1	0.910
R@1	0.910



Progress: 100.0% words/sec/thread: 5608102 lr:  0.000000 avg.loss:  0.248150 ETA:   0h 0m 0s


In [10]:
print(f"Training the best model with lr={best_l}, dim={best_d}, epoch={best_e}, and wordNgrams={best_n}")

start_time = time.time()
model = fasttext.train_supervised(input='train.txt', lr=best_l, dim=best_d, epoch=best_e, wordNgrams=best_n, verbose=2)

Read 4M words

Training the best model with lr=0.1, dim=10, epoch=5, and wordNgrams=1


Read 286M words
Number of words:  6228957
Number of labels: 2
Progress: 100.0% words/sec/thread: 6828899 lr:  0.000000 avg.loss:  0.248553 ETA:   0h 0m 0s


In [11]:
end_time = time.time()
elapsed_time = end_time - start_time
print("Elapsed time:", elapsed_time, "seconds")

predictions = []
for text in test_texts:
    prediction = model.predict(text)
    predictions.append(int(prediction[0][0].split('__label__')[1]))
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)

Elapsed time: 44.175086975097656 seconds
Accuracy: 0.90998


In [12]:
lr_list = [0.05, 0.1, 0.25, 0.5]
dim_list = [10] # hidden units
ngrams_list = [2]
epoch_list = [5]

best_p, best_r = 0, 0
best_l, best_d, best_n, best_e = 0, 0, 0, 0

for l in lr_list:
    for d in dim_list:
        for n in ngrams_list:
            for e in epoch_list:
                print(f"Training model with lr={l}, dim={d}, epoch={e}, and wordNgrams={n}")
                model = fasttext.train_supervised(input='train.txt', lr=l, dim=d, epoch=e, wordNgrams=n)
                N, p, r = model.test('test.txt')
                print_results(N, p, r)
                if p > best_p and r > best_r:
                    best_p, best_r = p, r
                    best_l, best_d, best_n, best_e = l, d, n, e
                print()

Training model with lr=0.05, dim=10, epoch=5, and wordNgrams=2


Read 286M words
Number of words:  6228957
Number of labels: 2
Progress: 100.0% words/sec/thread: 4717671 lr:  0.000000 avg.loss:  0.160333 ETA:   0h 0m 0s


N	400000
P@1	0.934
R@1	0.934

Training model with lr=0.1, dim=10, epoch=5, and wordNgrams=2


Read 286M words
Number of words:  6228957
Number of labels: 2
Progress: 100.0% words/sec/thread: 4720375 lr:  0.000000 avg.loss:  0.156409 ETA:   0h 0m 0s


N	400000
P@1	0.932
R@1	0.932

Training model with lr=0.25, dim=10, epoch=5, and wordNgrams=2


Read 286M words
Number of words:  6228957
Number of labels: 2
Progress: 100.0% words/sec/thread: 4826574 lr:  0.000000 avg.loss:  0.154817 ETA:   0h 0m 0s


N	400000
P@1	0.931
R@1	0.931

Training model with lr=0.5, dim=10, epoch=5, and wordNgrams=2


Read 286M words
Number of words:  6228957
Number of labels: 2
Progress: 100.0% words/sec/thread: 4666947 lr:  0.000224 avg.loss:  0.155216 ETA:   0h 0m 0s

N	400000
P@1	0.931
R@1	0.931



Progress: 100.0% words/sec/thread: 4657849 lr:  0.000000 avg.loss:  0.155179 ETA:   0h 0m 0s


In [13]:
print(f"Training the best model with lr={best_l}, dim={best_d}, epoch={best_e}, and wordNgrams={best_n}")

start_time = time.time()
model = fasttext.train_supervised(input='train.txt', lr=best_l, dim=best_d, epoch=best_e, wordNgrams=best_n, verbose=2)

Read 4M words

Training the best model with lr=0.05, dim=10, epoch=5, and wordNgrams=2


Read 286M words
Number of words:  6228957
Number of labels: 2
Progress: 100.0% words/sec/thread: 4712501 lr:  0.000000 avg.loss:  0.160551 ETA:   0h 0m 0s


In [14]:
end_time = time.time()
elapsed_time = end_time - start_time
print("Elapsed time:", elapsed_time, "seconds")

predictions = []
for text in test_texts:
    prediction = model.predict(text)
    predictions.append(int(prediction[0][0].split('__label__')[1]))
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)

Elapsed time: 57.58466625213623 seconds
Accuracy: 0.9336325
