In [None]:
# Import packages
import zipfile
import urllib.request

import numpy as np
import sklearn.metrics
import sklearn.ensemble

from datasets import load_dataset

import fasttext

import seaborn as sns
sns.set_style('whitegrid')
sns.set_context('talk')

In [None]:
# Fetch the IMDB review corpus. Later cells index `df` by split name
# ('train' / 'test') and read the 'text' and 'label' columns of each split.
df = load_dataset(path='imdb')

In [None]:
# Preprocess the data
def write_fasttext_file(path, texts, labels):
    """Write labelled examples to ``path`` in fastText's supervised format.

    Every example becomes exactly one line of the form
    ``__label__<label> <text>``.

    Args:
        path: Destination file; it is overwritten if it already exists.
        texts: Iterable of review strings.
        labels: Iterable of labels, paired with ``texts`` by position.
    """
    # Explicit UTF-8 so the output does not depend on the platform's default
    # encoding (which can raise UnicodeEncodeError on non-UTF-8 locales).
    with open(path, 'w', encoding='utf-8') as f:
        for text, label in zip(texts, labels):
            # One example per line: an embedded line break would cut the
            # review short and leave its tail as a separate, unlabeled line.
            single_line = text.replace('\r', ' ').replace('\n', ' ')
            f.write(f'__label__{label} {single_line}\n')


write_fasttext_file('train.txt', df['train']['text'], df['train']['label'])
write_fasttext_file('test.txt', df['test']['text'], df['test']['label'])

In [None]:
# Fit a supervised fastText classifier on the labelled training file,
# leaving every hyperparameter at the library default.
model = fasttext.train_supervised(input='train.txt')

In [None]:
# Evaluate the model
def print_results(N, p, r, k=1):
    """Print an evaluation triple as three tab-separated lines.

    The call site in this notebook passes the tuple returned by
    ``model.test`` (per the fastText docs: sample count, precision, recall).

    Args:
        N: Value shown on the ``N`` line (converted with ``str``).
        p: Precision, printed with three decimals on the ``P@k`` line.
        r: Recall, printed with three decimals on the ``R@k`` line.
        k: Cutoff shown in the ``P@k`` / ``R@k`` labels. Defaults to 1 so
            existing three-argument calls print exactly what they did before;
            pass the same ``k`` that was given to ``model.test``.
    """
    print(f"N\t{N}")
    print(f"P@{k}\t{p:.3f}")
    print(f"R@{k}\t{r:.3f}")

# Score the held-out file, then report the three values it returns.
n_examples, precision_at_1, recall_at_1 = model.test('test.txt')
print_results(n_examples, precision_at_1, recall_at_1)

In [None]:
# Predict labels for test data
test_split = df['test']  # look the split up once instead of once per column access
true_labels = list(test_split['label'])

# fastText's predict() rejects input containing '\n' (it raises ValueError),
# so flatten any line breaks before scoring.
test_texts = [
    text.replace('\r', ' ').replace('\n', ' ')
    for text in test_split['text']
]

# Passing a list scores every text in one call; the first element of the
# result holds one list of labels per input text (top-1 by default).
labels_per_text, _ = model.predict(test_texts)
pred_labels = [
    int(labels[0].replace('__label__', ''))
    for labels in labels_per_text
]

# The report below pairs predictions with truths by position, so a length
# mismatch would silently produce meaningless metrics.
assert len(pred_labels) == len(true_labels)

# Generate classification report
print(sklearn.metrics.classification_report(true_labels, pred_labels, target_names=['negative', 'positive']))