In [3]:
import pandas as pd
from joblib import dump, load
from scipy.sparse import load_npz
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

from service.utils import read_data, load_count_vectorizer

In [4]:
experement_id = 20

Read data vectorized

In [18]:
data = load_npz(f'../../local_data/data_vectorized_{experement_id}.npz')
labels = pd.read_csv('../../local_data/data_labels.csv')['labels']

Read raw data and vectorize it

In [5]:
df = read_data()

100%|██████████| 250/250 [00:54<00:00,  4.56it/s]


   title                          text  score
0    NaN  Tik tok is the best app ever      5
1    NaN                     I love it      5
2    NaN                          Nice      5
3    NaN                         GREAT      5
4    NaN                          Good      3


In [7]:
vectorizer = load_count_vectorizer(f'./models/vectorizer_{experement_id}.sav', f'./models/vocabulary_{experement_id}.sav')

In [10]:
data = vectorizer.transform(df['text'])
labels = df['labels']

Train model

In [11]:
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.25, random_state=42)

In [13]:
model = MultinomialNB()

In [104]:
batch_size = 10_000
for start in range(0, X_train.shape[0], batch_size):
    x_batch, y_batch = X_train[start: start+batch_size], y_train[start: start+batch_size]
    model.partial_fit(x_batch, y_batch, classes=[0, 1])

AttributeError: 'MultinomialNB' object has no attribute 'feature_count_'

In [14]:
model.fit(X_train, y_train)

MultinomialNB()

In [15]:
dump(model, f'./models/model_{experement_id}.sav')

['./models/model_20.sav']

Evaluate model

In [16]:
model = load(f'./models/model_{experement_id}.sav')

In [17]:
predictions = model.predict(X_test)

In [18]:
print(f'Test sample size: {X_test.shape[0]}')
print(f'recall: {recall_score(y_test, predictions)}')
print(f'precision: {precision_score(y_test, predictions)}')
print(f'f1 score: {f1_score(y_test, predictions)}')
print(f'accuracy: {accuracy_score(y_test, predictions)}')

Test sample size: 6250000
recall: 0.9272153831327257
precision: 0.9249004930267707
f1 score: 0.9260564914326264
accuracy: 0.88322976
