In [1]:
import time
import fasttext
import pandas as pd
from datasets import load_dataset
from sklearn.metrics import accuracy_score

In [2]:
column_names = ['label', 'title', 'content']
amazon_full_train = pd.read_csv('train.csv', header=None, names=column_names)
amazon_full_train.head()

Unnamed: 0,label,title,content
0,3,more like funchuck,Gave this to my dad for a gag gift after direc...
1,5,Inspiring,I hope a lot of people hear this cd. We need m...
2,5,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
3,4,Chrono Cross OST,The music of Yasunori Misuda is without questi...
4,5,Too good to be true,Probably the greatest soundtrack in history! U...


In [3]:
amazon_full_test = pd.read_csv('test.csv', header=None, names=column_names)
amazon_full_test.head()

Unnamed: 0,label,title,content
0,1,mens ultrasheer,"This model may be ok for sedentary types, but ..."
1,4,Surprisingly delightful,This is a fast read filled with unexpected hum...
2,2,"Works, but not as advertised",I bought one of these chargers..the instructio...
3,2,Oh dear,I was excited to find a book ostensibly about ...
4,2,Incorrect disc!,"I am a big JVC fan, but I do not like this mod..."


In [4]:
def print_results(N, p, r):
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(1, p))
    print("R@{}\t{:.3f}".format(1, r))

In [5]:
titles = amazon_full_train['title'].tolist()
contents = amazon_full_train['content'].tolist()
train_texts = [str(x) + str(y) for x, y in zip(titles, contents)]
train_labels = amazon_full_train['label'].tolist()

with open("train.txt", "w") as f:
    for text, label in zip(train_texts, train_labels):
        f.write(f"__label__{label} {text}\n")

In [6]:
titles = amazon_full_test['title'].tolist()
contents = amazon_full_test['content'].tolist()
test_texts = [str(x) + str(y) for x, y in zip(titles, contents)]
test_labels = amazon_full_test['label'].tolist()

with open("test.txt", "w") as f:
    for text, label in zip(test_texts, test_labels):
        f.write(f"__label__{label} {text}\n")

In [7]:
lr_list = [0.05, 0.1, 0.25, 0.5]
dim_list = [10] # hidden units
ngrams_list = [1]
epoch_list = [5]

best_p, best_r = 0, 0
best_l, best_d, best_n, best_e = 0, 0, 0, 0

for l in lr_list:
    for d in dim_list:
        for n in ngrams_list:
            for e in epoch_list:
                print(f"Training model with lr={l}, dim={d}, epoch={e}, and wordNgrams={n}")
                model = fasttext.train_supervised(input='train.txt', lr=l, dim=d, epoch=e, wordNgrams=n)
                N, p, r = model.test('test.txt')
                print_results(N, p, r)
                if p > best_p and r > best_r:
                    best_p, best_r = p, r
                    best_l, best_d, best_n, best_e = l, d, n, e
                print()

Training model with lr=0.05, dim=10, epoch=5, and wordNgrams=1


Read 243M words
Number of words:  5472064
Number of labels: 5
Progress: 100.0% words/sec/thread: 5868247 lr:  0.000000 avg.loss:  1.082846 ETA:   0h 0m 0s


N	650000
P@1	0.552
R@1	0.552

Training model with lr=0.1, dim=10, epoch=5, and wordNgrams=1


Read 243M words
Number of words:  5472064
Number of labels: 5
Progress: 100.0% words/sec/thread: 6615638 lr:  0.000000 avg.loss:  1.080539 ETA:   0h 0m 0s


N	650000
P@1	0.552
R@1	0.552

Training model with lr=0.25, dim=10, epoch=5, and wordNgrams=1


Read 243M words
Number of words:  5472064
Number of labels: 5
Progress: 100.0% words/sec/thread: 6988232 lr:  0.000000 avg.loss:  1.080027 ETA:   0h 0m 0s
Read 4M words

N	650000
P@1	0.552
R@1	0.552

Training model with lr=0.5, dim=10, epoch=5, and wordNgrams=1


Read 243M words
Number of words:  5472064
Number of labels: 5
Progress: 100.0% words/sec/thread: 7128577 lr:  0.000000 avg.loss:  1.081235 ETA:   0h 0m 0s


N	650000
P@1	0.552
R@1	0.552



In [8]:
print(f"Training the best model with lr={best_l}, dim={best_d}, epoch={best_e}, and wordNgrams={best_n}")

start_time = time.time()
model = fasttext.train_supervised(input='train.txt', lr=best_l, dim=best_d, epoch=best_e, wordNgrams=best_n, verbose=2)

Training the best model with lr=0.25, dim=10, epoch=5, and wordNgrams=1


Read 243M words
Number of words:  5472064
Number of labels: 5
Progress: 100.0% words/sec/thread: 7162327 lr:  0.000000 avg.loss:  1.079740 ETA:   0h 0m 0s


In [9]:
end_time = time.time()
elapsed_time = end_time - start_time
print("Elapsed time:", elapsed_time, "seconds")

predictions = []
for text in test_texts:
    prediction = model.predict(text)
    predictions.append(int(prediction[0][0].split('__label__')[1]))
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)

Elapsed time: 36.26824998855591 seconds
Accuracy: 0.5520030769230769


In [10]:
lr_list = [0.05, 0.1, 0.25, 0.5]
dim_list = [10] # hidden units
ngrams_list = [2]
epoch_list = [5]

best_p, best_r = 0, 0
best_l, best_d, best_n, best_e = 0, 0, 0, 0

for l in lr_list:
    for d in dim_list:
        for n in ngrams_list:
            for e in epoch_list:
                print(f"Training model with lr={l}, dim={d}, epoch={e}, and wordNgrams={n}")
                model = fasttext.train_supervised(input='train.txt', lr=l, dim=d, epoch=e, wordNgrams=n)
                N, p, r = model.test('test.txt')
                print_results(N, p, r)
                if p > best_p and r > best_r:
                    best_p, best_r = p, r
                    best_l, best_d, best_n, best_e = l, d, n, e
                print()

Training model with lr=0.05, dim=10, epoch=5, and wordNgrams=2


Read 243M words
Number of words:  5472064
Number of labels: 5
Progress: 100.0% words/sec/thread: 4726466 lr:  0.000000 avg.loss:  0.925387 ETA:   0h 0m 0s


N	650000
P@1	0.579
R@1	0.579

Training model with lr=0.1, dim=10, epoch=5, and wordNgrams=2


Read 243M words
Number of words:  5472064
Number of labels: 5
Progress: 100.0% words/sec/thread: 4581883 lr:  0.000000 avg.loss:  0.896259 ETA:   0h 0m 0s100.0% words/sec/thread: 4581901 lr: -0.000000 avg.loss:  0.896259 ETA:   0h 0m 0s
Read 4M words

N	650000
P@1	0.570
R@1	0.570

Training model with lr=0.25, dim=10, epoch=5, and wordNgrams=2


Read 243M words
Number of words:  5472064
Number of labels: 5
Progress: 100.0% words/sec/thread: 4139524 lr:  0.000000 avg.loss:  0.885213 ETA:   0h 0m 0s


N	650000
P@1	0.559
R@1	0.559

Training model with lr=0.5, dim=10, epoch=5, and wordNgrams=2


Read 243M words
Number of words:  5472064
Number of labels: 5
Progress: 100.0% words/sec/thread: 4128622 lr:  0.000211 avg.loss:  0.877894 ETA:   0h 0m 0s

N	650000
P@1	0.555
R@1	0.555



Progress: 100.0% words/sec/thread: 4120134 lr:  0.000000 avg.loss:  0.877879 ETA:   0h 0m 0s


In [11]:
print(f"Training the best model with lr={best_l}, dim={best_d}, epoch={best_e}, and wordNgrams={best_n}")

start_time = time.time()
model = fasttext.train_supervised(input='train.txt', lr=best_l, dim=best_d, epoch=best_e, wordNgrams=best_n, verbose=2)

Read 4M words

Training the best model with lr=0.05, dim=10, epoch=5, and wordNgrams=2


Read 243M words
Number of words:  5472064
Number of labels: 5
Progress: 100.0% words/sec/thread: 4293795 lr:  0.000000 avg.loss:  0.925026 ETA:   0h 0m 0s


In [12]:
end_time = time.time()
elapsed_time = end_time - start_time
print("Elapsed time:", elapsed_time, "seconds")

predictions = []
for text in test_texts:
    prediction = model.predict(text)
    predictions.append(int(prediction[0][0].split('__label__')[1]))
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)

Elapsed time: 53.69218111038208 seconds
Accuracy: 0.5792323076923077
