In [1]:
import time
import fasttext
import pandas as pd
from datasets import load_dataset
from sklearn.metrics import accuracy_score

In [2]:
# Load the "yelp_polarity" dataset through datasets.load_dataset and print the
# returned object so its splits, columns and row counts are visible below.
yelp_polarity = load_dataset(path="yelp_polarity")
print(yelp_polarity)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 560000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 38000
    })
})


In [3]:
# Peek at the training split: for each column print the container type, the
# type of its first element, and the first element itself.
# Each column is fetched once and reused; the previous version indexed
# yelp_polarity['train'][<column>] again for every single print, fetching the
# whole column three times per column.
train_split = yelp_polarity['train']
for column_name in ('text', 'label'):
    column_values = train_split[column_name]
    first_value = column_values[0]
    print(type(column_values))
    print(type(first_value))
    print(first_value)

<class 'list'>
<class 'str'>
Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff.  It seems that his staff simply never answers the phone.  It usually takes 2 hours of repeated calling to get an answer.  Who has time for that or wants to deal with it?  I have run into this problem with many other doctors and I just don't get it.  You have office workers, you have patients with medical needs, why isn't anyone answering the phone?  It's incomprehensible and not work the aggravation.  It's with regret that I feel that I have to give Dr. Goldberg 2 stars.
<class 'list'>
<class 'int'>
0


In [4]:
# Peek at the test split: for each column print the container type, the type
# of its first element, and the first element itself.
# Each column is fetched once and reused; the previous version indexed
# yelp_polarity['test'][<column>] again for every single print, fetching the
# whole column three times per column.
test_split = yelp_polarity['test']
for column_name in ('text', 'label'):
    column_values = test_split[column_name]
    first_value = column_values[0]
    print(type(column_values))
    print(type(first_value))
    print(first_value)

<class 'list'>
<class 'str'>
Contrary to other reviews, I have zero complaints about the service or the prices. I have been getting tire service here for the past 5 years now, and compared to my experience with places like Pep Boys, these guys are experienced and know what they're doing. \nAlso, this is one place that I do not feel like I am being taken advantage of, just because of my gender. Other auto mechanics have been notorious for capitalizing on my ignorance of cars, and have sucked my bank account dry. But here, my service and road coverage has all been well explained - and let up to me to decide. \nAnd they just renovated the waiting room. It looks a lot better than it did in previous years.
<class 'list'>
<class 'int'>
1


In [5]:
def print_results(N, p, r, k=1):
    """Print the sample count, precision@k and recall@k as tab-separated lines.

    Args:
        N: Number of evaluated examples; printed as-is.
        p: Precision value, printed with three decimals.
        r: Recall value, printed with three decimals.
        k: Cut-off shown in the ``P@k`` / ``R@k`` labels. Defaults to 1, which
            reproduces the labels that used to be hard-coded.
    """
    print(f"N\t{N}")
    print(f"P@{k}\t{p:.3f}")
    print(f"R@{k}\t{r:.3f}")

In [6]:
# Alias the loaded corpus under a generic name; the cells below read `dataset`.
dataset = yelp_polarity

In [7]:
# Export the training split to train.txt, one example per line, written as
# "__label__<label> <text>" (the file later passed to fasttext.train_supervised).
train_texts = dataset['train']['text']
train_labels = dataset['train']['label']

# The encoding is pinned to UTF-8 so the file content does not depend on the
# platform's default codec. CR/LF characters inside a review are replaced by
# spaces, because a raw line break would split one example over several lines.
with open("train.txt", "w", encoding="utf-8") as f:
    for text, label in zip(train_texts, train_labels):
        single_line_text = text.replace("\r", " ").replace("\n", " ")
        f.write(f"__label__{label} {single_line_text}\n")

In [8]:
# Export the test split to test.txt, one example per line, written as
# "__label__<label> <text>" (the file later passed to model.test).
test_texts = dataset['test']['text']
test_labels = dataset['test']['label']

# The encoding is pinned to UTF-8 so the file content does not depend on the
# platform's default codec. CR/LF characters inside a review are replaced by
# spaces, because a raw line break would split one example over several lines.
with open("test.txt", "w", encoding="utf-8") as f:
    for text, label in zip(test_texts, test_labels):
        single_line_text = text.replace("\r", " ").replace("\n", " ")
        f.write(f"__label__{label} {single_line_text}\n")

In [9]:
# Grid search over fastText hyper-parameters for the unigram model.
# NOTE(review): every candidate is scored on test.txt and the winner is picked
# from those scores, so the test set doubles as the model-selection set.
# Consider holding out a validation split for the search.
lr_list = [0.05, 0.1, 0.25, 0.5]
dim_list = [10] # hidden units
ngrams_list = [1]
epoch_list = [5]

best_p, best_r = 0, 0
best_l, best_d, best_n, best_e = 0, 0, 0, 0

# Flattened grid, in the same order nested loops over lr -> dim -> ngrams ->
# epoch would visit the combinations.
candidates = [
    (lr, dim, ngrams, epochs)
    for lr in lr_list
    for dim in dim_list
    for ngrams in ngrams_list
    for epochs in epoch_list
]

for lr, dim, ngrams, epochs in candidates:
    print(f"Training model with lr={lr}, dim={dim}, epoch={epochs}, and wordNgrams={ngrams}")
    model = fasttext.train_supervised(
        input='train.txt',
        lr=lr,
        dim=dim,
        epoch=epochs,
        wordNgrams=ngrams,
    )
    n_examples, precision, recall = model.test('test.txt')
    print_results(n_examples, precision, recall)

    # A candidate replaces the current best only when it is strictly better on
    # both precision and recall.
    is_improvement = precision > best_p and recall > best_r
    if is_improvement:
        best_p, best_r = precision, recall
        best_l, best_d, best_n, best_e = lr, dim, ngrams, epochs
    print()

Training model with lr=0.05, dim=10, epoch=5, and wordNgrams=1


Read 75M words
Number of words:  1446643
Number of labels: 2
Progress: 100.0% words/sec/thread: 6585161 lr:  0.000000 avg.loss:  0.212021 ETA:   0h 0m 0s


N	38000
P@1	0.934
R@1	0.934

Training model with lr=0.1, dim=10, epoch=5, and wordNgrams=1


Read 75M words
Number of words:  1446643
Number of labels: 2
Progress:  99.2% words/sec/thread: 8732574 lr:  0.000810 avg.loss:  0.208317 ETA:   0h 0m 0s

N	38000
P@1	0.935
R@1	0.935

Training model with lr=0.25, dim=10, epoch=5, and wordNgrams=1


Progress: 100.0% words/sec/thread: 8655631 lr:  0.000000 avg.loss:  0.207631 ETA:   0h 0m 0s
Read 75M words
Number of words:  1446643
Number of labels: 2
Progress: 100.0% words/sec/thread: 8865514 lr:  0.000000 avg.loss:  0.208330 ETA:   0h 0m 0s


N	38000
P@1	0.935
R@1	0.935

Training model with lr=0.5, dim=10, epoch=5, and wordNgrams=1


Read 75M words
Number of words:  1446643
Number of labels: 2
Progress:  99.2% words/sec/thread: 4891677 lr:  0.004075 avg.loss:  0.207632 ETA:   0h 0m 0s

N	38000
P@1	0.935
R@1	0.935



Progress: 100.0% words/sec/thread: 4884994 lr:  0.000000 avg.loss:  0.207058 ETA:   0h 0m 0s


In [10]:
# Retrain on train.txt with the configuration selected by the grid search.
best_config = {
    "lr": best_l,
    "dim": best_d,
    "epoch": best_e,
    "wordNgrams": best_n,
}
print(f"Training the best model with lr={best_l}, dim={best_d}, epoch={best_e}, and wordNgrams={best_n}")

# start_time is read by the following cell to report the elapsed time.
start_time = time.time()
model = fasttext.train_supervised(input='train.txt', verbose=2, **best_config)

Training the best model with lr=0.5, dim=10, epoch=5, and wordNgrams=1


Read 75M words
Number of words:  1446643
Number of labels: 2
Progress: 100.0% words/sec/thread: 4807764 lr:  0.000000 avg.loss:  0.208177 ETA:   0h 0m 0s


In [11]:
# Report how long it has been since start_time was set (just before training
# in the previous cell). Because the clock is stopped here, the figure also
# includes any pause between running the two cells.
end_time = time.time()
elapsed_time = end_time - start_time
print("Elapsed time:", elapsed_time, "seconds")

# Score the whole test set with a single batched predict() call instead of one
# call per review. For a list input, predict() returns one list of labels per
# text; the top label looks like "__label__<int>", so strip the prefix and
# convert the rest to int to compare against test_labels.
label_prefix = '__label__'
predicted_labels = model.predict(list(test_texts))[0]
predictions = [int(labels[0][len(label_prefix):]) for labels in predicted_labels]
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)

Elapsed time: 17.376765966415405 seconds
Accuracy: 0.9349473684210526


In [12]:
# Grid search over fastText hyper-parameters for the bigram model
# (wordNgrams=2).
# NOTE(review): every candidate is scored on test.txt and the winner is picked
# from those scores, so the test set doubles as the model-selection set.
# Consider holding out a validation split for the search.
lr_list = [0.05, 0.1, 0.25, 0.5]
dim_list = [10] # hidden units
ngrams_list = [2]
epoch_list = [5]

best_p, best_r = 0, 0
best_l, best_d, best_n, best_e = 0, 0, 0, 0

# Flattened grid, in the same order nested loops over lr -> dim -> ngrams ->
# epoch would visit the combinations.
candidates = [
    (lr, dim, ngrams, epochs)
    for lr in lr_list
    for dim in dim_list
    for ngrams in ngrams_list
    for epochs in epoch_list
]

for lr, dim, ngrams, epochs in candidates:
    print(f"Training model with lr={lr}, dim={dim}, epoch={epochs}, and wordNgrams={ngrams}")
    model = fasttext.train_supervised(
        input='train.txt',
        lr=lr,
        dim=dim,
        epoch=epochs,
        wordNgrams=ngrams,
    )
    n_examples, precision, recall = model.test('test.txt')
    print_results(n_examples, precision, recall)

    # A candidate replaces the current best only when it is strictly better on
    # both precision and recall.
    is_improvement = precision > best_p and recall > best_r
    if is_improvement:
        best_p, best_r = precision, recall
        best_l, best_d, best_n, best_e = lr, dim, ngrams, epochs
    print()

Training model with lr=0.05, dim=10, epoch=5, and wordNgrams=2


Read 75M words
Number of words:  1446643
Number of labels: 2
Progress: 100.0% words/sec/thread: 4631250 lr:  0.000000 avg.loss:  0.151864 ETA:   0h 0m 0s


N	38000
P@1	0.947
R@1	0.947

Training model with lr=0.1, dim=10, epoch=5, and wordNgrams=2


Read 75M words
Number of words:  1446643
Number of labels: 2
Progress: 100.0% words/sec/thread: 3104696 lr:  0.000000 avg.loss:  0.130290 ETA:   0h 0m 0s


N	38000
P@1	0.949
R@1	0.949

Training model with lr=0.25, dim=10, epoch=5, and wordNgrams=2


Read 75M words
Number of words:  1446643
Number of labels: 2
Progress: 100.0% words/sec/thread: 2464531 lr:  0.000000 avg.loss:  0.119147 ETA:   0h 0m 0s
Read 1M words

N	38000
P@1	0.950
R@1	0.950

Training model with lr=0.5, dim=10, epoch=5, and wordNgrams=2


Read 75M words
Number of words:  1446643
Number of labels: 2
Progress: 100.0% words/sec/thread: 4218194 lr:  0.000000 avg.loss:  0.119323 ETA:   0h 0m 0s


N	38000
P@1	0.950
R@1	0.950



In [13]:
# Retrain on train.txt with the configuration selected by the grid search.
best_config = {
    "lr": best_l,
    "dim": best_d,
    "epoch": best_e,
    "wordNgrams": best_n,
}
print(f"Training the best model with lr={best_l}, dim={best_d}, epoch={best_e}, and wordNgrams={best_n}")

# start_time is read by the following cell to report the elapsed time.
start_time = time.time()
model = fasttext.train_supervised(input='train.txt', verbose=2, **best_config)

Training the best model with lr=0.5, dim=10, epoch=5, and wordNgrams=2


Read 75M words
Number of words:  1446643
Number of labels: 2
Progress: 100.0% words/sec/thread: 4042177 lr:  0.000000 avg.loss:  0.118038 ETA:   0h 0m 0s


In [14]:
# Report how long it has been since start_time was set (just before training
# in the previous cell). Because the clock is stopped here, the figure also
# includes any pause between running the two cells.
end_time = time.time()
elapsed_time = end_time - start_time
print("Elapsed time:", elapsed_time, "seconds")

# Score the whole test set with a single batched predict() call instead of one
# call per review. For a list input, predict() returns one list of labels per
# text; the top label looks like "__label__<int>", so strip the prefix and
# convert the rest to int to compare against test_labels.
label_prefix = '__label__'
predicted_labels = model.predict(list(test_texts))[0]
predictions = [int(labels[0][len(label_prefix):]) for labels in predicted_labels]
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)

Elapsed time: 17.57801914215088 seconds
Accuracy: 0.95
