In [1]:
import itertools
import time

import fasttext
import pandas as pd
from datasets import load_dataset
from sklearn.metrics import accuracy_score

In [2]:
# Load the `yelp_review_full` dataset and show which splits and columns it provides.
yelp_full = load_dataset(path="yelp_review_full")
print(yelp_full)

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})


In [3]:
# Peek at the train split: for each column show the container type, the element
# type and the first value. The column is fetched once per iteration instead of
# once per print, because every `yelp_full['train'][column]` access builds the
# whole column again.
for column in ('text', 'label'):
    values = yelp_full['train'][column]
    print(type(values))
    print(type(values[0]))
    print(values[0])

<class 'list'>
<class 'str'>
dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank.
<class 'list'>
<class 'int'>
4


In [4]:
# Peek at the test split: for each column show the container type, the element
# type and the first value. The column is fetched once per iteration instead of
# once per print, because every `yelp_full['test'][column]` access builds the
# whole column again.
for column in ('text', 'label'):
    values = yelp_full['test'][column]
    print(type(values))
    print(type(values[0]))
    print(values[0])

<class 'list'>
<class 'str'>
I got 'new' tires from them and within two weeks got a flat. I took my car to a local mechanic to see if i could get the hole patched, but they said the reason I had a flat was because the previous patch had blown - WAIT, WHAT? I just got the tire and never needed to have it patched? This was supposed to be a new tire. \nI took the tire over to Flynn's and they told me that someone punctured my tire, then tried to patch it. So there are resentful tire slashers? I find that very unlikely. After arguing with the guy and telling him that his logic was far fetched he said he'd give me a new tire \"this time\". \nI will never go back to Flynn's b/c of the way this guy treated me and the simple fact that they gave me a used tire!
<class 'list'>
<class 'int'>
0


In [5]:
def print_results(N, p, r, k=1):
    """Print an evaluation summary as three tab-separated lines.

    Args:
        N: Number of evaluated examples (printed as-is via ``str``).
        p: Precision at ``k``; printed with three decimals.
        r: Recall at ``k``; printed with three decimals.
        k: Cut-off shown in the ``P@k`` / ``R@k`` row labels. Defaults to 1,
            which reproduces the previous hard-coded output.
    """
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(k, p))
    print("R@{}\t{:.3f}".format(k, r))

In [6]:
# Alias read by the cells that export train.txt / test.txt; presumably kept so a
# different dataset can be swapped in at a single place.
dataset = yelp_full

In [7]:
train_texts = dataset['train']['text']
train_labels = dataset['train']['label']

# Export the train split in fastText's supervised format: one example per line,
# "__label__<label> <text>".
# - The encoding is pinned to UTF-8 (what fastText expects) instead of relying
#   on the platform default.
# - Real newline characters inside a review are replaced by a space; otherwise
#   one review would be split into several lines, the later ones without a label.
with open("train.txt", "w", encoding="utf-8") as f:
    for text, label in zip(train_texts, train_labels):
        single_line_text = text.replace("\n", " ")
        f.write(f"__label__{label} {single_line_text}\n")

In [8]:
test_texts = dataset['test']['text']
test_labels = dataset['test']['label']

# Export the test split in fastText's supervised format: one example per line,
# "__label__<label> <text>".
# - The encoding is pinned to UTF-8 (what fastText expects) instead of relying
#   on the platform default.
# - Real newline characters inside a review are replaced by a space; otherwise
#   one review would be split into several lines, the later ones without a label.
with open("test.txt", "w", encoding="utf-8") as f:
    for text, label in zip(test_texts, test_labels):
        single_line_text = text.replace("\n", " ")
        f.write(f"__label__{label} {single_line_text}\n")

In [9]:
# Hyperparameter grid: every combination of the four lists is trained once.
lr_list = [0.05, 0.1, 0.25, 0.5]
dim_list = [10]  # hidden units
ngrams_list = [1]
epoch_list = [5]

# Best scores seen so far and the settings that produced them
# (read by the cell that retrains the best model).
best_p, best_r = 0, 0
best_l, best_d, best_n, best_e = 0, 0, 0, 0

# NOTE(review): candidates are scored on test.txt, the same split that is later
# used to report accuracy, so the reported number is optimistically biased.
# A held-out validation split would be the cleaner choice.
# itertools.product iterates in the same order as the equivalent nested loops.
for lr, dim, ngrams, epoch in itertools.product(lr_list, dim_list, ngrams_list, epoch_list):
    print(f"Training model with lr={lr}, dim={dim}, epoch={epoch}, and wordNgrams={ngrams}")
    model = fasttext.train_supervised(input='train.txt', lr=lr, dim=dim, epoch=epoch, wordNgrams=ngrams)
    N, p, r = model.test('test.txt')
    print_results(N, p, r)
    # Rank by precision first, recall second. The previous test
    # (`p > best_p and r > best_r`) discarded a candidate that improved one
    # metric while merely tying the other.
    if (p, r) > (best_p, best_r):
        best_p, best_r = p, r
        best_l, best_d, best_n, best_e = lr, dim, ngrams, epoch
    print()

Training model with lr=0.05, dim=10, epoch=5, and wordNgrams=1


Read 88M words
Number of words:  1622077
Number of labels: 5
Progress: 100.0% words/sec/thread: 6756598 lr:  0.000000 avg.loss:  1.004073 ETA:   0h 0m 0s100.0% words/sec/thread: 6756636 lr: -0.000000 avg.loss:  1.004073 ETA:   0h 0m 0s
Read 3M words

N	50000
P@1	0.599
R@1	0.599

Training model with lr=0.1, dim=10, epoch=5, and wordNgrams=1


Read 88M words
Number of words:  1622077
Number of labels: 5
Progress: 100.0% words/sec/thread: 6733874 lr: -0.000000 avg.loss:  0.990898 ETA:   0h 0m 0s 0.000000 avg.loss:  0.990898 ETA:   0h 0m 0s
Read 3M words

N	50000
P@1	0.601
R@1	0.601

Training model with lr=0.25, dim=10, epoch=5, and wordNgrams=1


Read 88M words
Number of words:  1622077
Number of labels: 5
Progress: 100.0% words/sec/thread: 3729118 lr:  0.000000 avg.loss:  0.988271 ETA:   0h 0m 0s


N	50000
P@1	0.601
R@1	0.601

Training model with lr=0.5, dim=10, epoch=5, and wordNgrams=1


Read 88M words
Number of words:  1622077
Number of labels: 5
Progress: 100.0% words/sec/thread: 3641016 lr:  0.000217 avg.loss:  0.991617 ETA:   0h 0m 0s  4.1% words/sec/thread: 4177928 lr:  0.479626 avg.loss:  1.192482 ETA:   0h 0m14s

N	50000
P@1	0.601
R@1	0.601



Progress: 100.0% words/sec/thread: 3620663 lr:  0.000000 avg.loss:  0.991511 ETA:   0h 0m 0s


In [10]:
print(f"Training the best model with lr={best_l}, dim={best_d}, epoch={best_e}, and wordNgrams={best_n}")

# Retrain with the selected hyperparameters and time the training call inside
# this cell, so the measurement does not depend on when the next cell is run.
# `start_time` is kept as a time.time() timestamp because the following cell reads it.
start_time = time.time()
model = fasttext.train_supervised(input='train.txt', lr=best_l, dim=best_d, epoch=best_e, wordNgrams=best_n, verbose=2)
train_seconds = time.time() - start_time
print(f"Training time: {train_seconds:.2f} seconds")

Read 1M words

Training the best model with lr=0.1, dim=10, epoch=5, and wordNgrams=1


Read 88M words
Number of words:  1622077
Number of labels: 5
Progress: 100.0% words/sec/thread: 6646488 lr:  0.000000 avg.loss:  0.992639 ETA:   0h 0m 0s


In [11]:
# `start_time` is set in the preceding training cell, so this interval also
# covers any pause between running that cell and this one.
end_time = time.time()
elapsed_time = end_time - start_time
print("Elapsed time:", elapsed_time, "seconds")

# Predict the whole test set in one call: given a list of texts, predict()
# returns one list of labels per text (plus the matching probabilities).
# list(...) guarantees a plain list, which is what fastText checks for.
predicted_labels, _ = model.predict(list(test_texts))
# Each entry looks like ['__label__3']; strip the prefix to recover the integer class.
predictions = [int(labels[0].split('__label__')[1]) for labels in predicted_labels]
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)

Elapsed time: 19.619973182678223 seconds
Accuracy: 0.60076


In [12]:
# Hyperparameter grid: every combination of the four lists is trained once.
lr_list = [0.05, 0.1, 0.25, 0.5]
dim_list = [10]  # hidden units
ngrams_list = [2]
epoch_list = [5]

# Best scores seen so far and the settings that produced them
# (read by the cell that retrains the best model).
best_p, best_r = 0, 0
best_l, best_d, best_n, best_e = 0, 0, 0, 0

# NOTE(review): candidates are scored on test.txt, the same split that is later
# used to report accuracy, so the reported number is optimistically biased.
# A held-out validation split would be the cleaner choice.
# itertools.product iterates in the same order as the equivalent nested loops.
for lr, dim, ngrams, epoch in itertools.product(lr_list, dim_list, ngrams_list, epoch_list):
    print(f"Training model with lr={lr}, dim={dim}, epoch={epoch}, and wordNgrams={ngrams}")
    model = fasttext.train_supervised(input='train.txt', lr=lr, dim=dim, epoch=epoch, wordNgrams=ngrams)
    N, p, r = model.test('test.txt')
    print_results(N, p, r)
    # Rank by precision first, recall second. The previous test
    # (`p > best_p and r > best_r`) discarded a candidate that improved one
    # metric while merely tying the other.
    if (p, r) > (best_p, best_r):
        best_p, best_r = p, r
        best_l, best_d, best_n, best_e = lr, dim, ngrams, epoch
    print()

Training model with lr=0.05, dim=10, epoch=5, and wordNgrams=2


Read 88M words
Number of words:  1622077
Number of labels: 5
Progress:  99.9% words/sec/thread: 4492938 lr:  0.000028 avg.loss:  0.901889 ETA:   0h 0m 0s

N	50000
P@1	0.626
R@1	0.626

Training model with lr=0.1, dim=10, epoch=5, and wordNgrams=2


Progress: 100.0% words/sec/thread: 4462135 lr:  0.000000 avg.loss:  0.901800 ETA:   0h 0m 0s
Read 88M words
Number of words:  1622077
Number of labels: 5
Progress: 100.0% words/sec/thread: 2456696 lr:  0.000000 avg.loss:  0.833739 ETA:   0h 0m 0s


N	50000
P@1	0.625
R@1	0.625

Training model with lr=0.25, dim=10, epoch=5, and wordNgrams=2


Read 88M words
Number of words:  1622077
Number of labels: 5
Progress: 100.0% words/sec/thread: 2582966 lr:  0.000000 avg.loss:  0.783145 ETA:   0h 0m 0s


N	50000
P@1	0.617
R@1	0.617

Training model with lr=0.5, dim=10, epoch=5, and wordNgrams=2


Read 88M words
Number of words:  1622077
Number of labels: 5
Progress: 100.0% words/sec/thread: 4346398 lr:  0.000000 avg.loss:  0.774995 ETA:   0h 0m 0s


N	50000
P@1	0.612
R@1	0.612



In [13]:
print(f"Training the best model with lr={best_l}, dim={best_d}, epoch={best_e}, and wordNgrams={best_n}")

# Retrain with the selected hyperparameters and time the training call inside
# this cell, so the measurement does not depend on when the next cell is run.
# `start_time` is kept as a time.time() timestamp because the following cell reads it.
start_time = time.time()
model = fasttext.train_supervised(input='train.txt', lr=best_l, dim=best_d, epoch=best_e, wordNgrams=best_n, verbose=2)
train_seconds = time.time() - start_time
print(f"Training time: {train_seconds:.2f} seconds")

Training the best model with lr=0.05, dim=10, epoch=5, and wordNgrams=2


Read 88M words
Number of words:  1622077
Number of labels: 5
Progress: 100.0% words/sec/thread: 2789307 lr:  0.000000 avg.loss:  0.906262 ETA:   0h 0m 0s


In [14]:
# `start_time` is set in the preceding training cell, so this interval also
# covers any pause between running that cell and this one.
end_time = time.time()
elapsed_time = end_time - start_time
print("Elapsed time:", elapsed_time, "seconds")

# Predict the whole test set in one call: given a list of texts, predict()
# returns one list of labels per text (plus the matching probabilities).
# list(...) guarantees a plain list, which is what fastText checks for.
predicted_labels, _ = model.predict(list(test_texts))
# Each entry looks like ['__label__3']; strip the prefix to recover the integer class.
predictions = [int(labels[0].split('__label__')[1]) for labels in predicted_labels]
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)

Elapsed time: 27.480775833129883 seconds
Accuracy: 0.6254
