In [2]:
import pprint
import random
import pandas as pd
import numpy as np
from joblib import dump, load
from scipy.sparse import load_npz, save_npz
from sklearn.metrics import recall_score, precision_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB

from service.utils import read_data, load_tf_idf_vectorizer

In [3]:
pp = pprint.PrettyPrinter()


Read data vectorized

In [1]:
data = load_npz(f'../../../local_data/data_vectorized_22.npz')
labels = pd.read_csv('../../../local_data/data_labels.csv')['labels']
len(labels)

In [4]:
experement_id = 22

Read raw data and vectorize it

In [5]:
df = read_data()

100%|██████████| 250/250 [00:52<00:00,  4.73it/s]


   title                          text  score
0    NaN  Tik tok is the best app ever      5
1    NaN                     I love it      5
2    NaN                          Nice      5
3    NaN                         GREAT      5
4    NaN                          Good      3


In [7]:
vectorizer = load_tf_idf_vectorizer(f'../models/vectorizer_{experement_id}.sav', f'../models/vectorizer_params_{experement_id}.sav')

In [9]:
data = vectorizer.transform(df['text'])
labels = df['labels']

Train model

In [10]:
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.25, random_state=42)
print(len(y_train), len(y_test))

18750000 6250000


In [2]:
# save_npz(f'../../../local_data/data_vectorized_20.npz', data[:1_000_000])

In [3]:
def fix_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)

In [4]:
def validate(params, X, Y, folds=10):
    fix_seed()
    grid = GridSearchCV(model, params, cv=folds, n_jobs=-1, scoring='roc_auc', return_train_score=True)
    grid.fit(X, Y)
    pp.pprint(grid.best_score_)
    pp.pprint(grid.best_params_)
    pp.pprint(grid.cv_results_)

In [15]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
validate({
    'C': [0.21],
}, data[:100_000], labels[:100_000], folds=5)

0.9200078988724026
{'C': 0.21}
{'mean_fit_time': array([1.40378661]),
 'mean_score_time': array([0.00599866]),
 'mean_test_score': array([0.9200079]),
 'mean_train_score': array([0.94297891]),
 'param_C': masked_array(data=[0.21],
             mask=[False],
       fill_value='?',
            dtype=object),
 'params': [{'C': 0.21}],
 'rank_test_score': array([1]),
 'split0_test_score': array([0.89869699]),
 'split0_train_score': array([0.94806563]),
 'split1_test_score': array([0.93544061]),
 'split1_train_score': array([0.93972115]),
 'split2_test_score': array([0.9157085]),
 'split2_train_score': array([0.94312258]),
 'split3_test_score': array([0.92510069]),
 'split3_train_score': array([0.94143908]),
 'split4_test_score': array([0.9250927]),
 'split4_train_score': array([0.94254613]),
 'std_fit_time': array([0.0542502]),
 'std_score_time': array([4.90331443e-06]),
 'std_test_score': array([0.0123499]),
 'std_train_score': array([0.00279524])}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [37]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(C=0.21, n_jobs=-1)

In [11]:
model = MultinomialNB()

In [None]:
batch_size = 10_000
for start in range(0, X_train.shape[0], batch_size):
    print(start)
    x_batch, y_batch = X_train[start: start+batch_size], y_train[start: start+batch_size]
    model.partial_fit(x_batch, y_batch, classes=[0, 1])

In [12]:
model.fit(X_train, y_train)

MultinomialNB()

In [15]:
model = load('../models/model_default.sav')

In [None]:
dump(model, f'../models/model_{experement_id}.sav')

Evaluate model

In [47]:
# model = load(f'../models/model_{experement_id}.sav')

In [16]:
prediction_probas = model.predict_proba(X_test)
predictions = np.argmax(prediction_probas, axis=1)

In [17]:
print(f'Test sample size: {X_test.shape[0]}')
print(f'recall: {recall_score(y_test, predictions)}')
print(f'precision: {precision_score(y_test, predictions)}')
print(f'f1 score: {f1_score(y_test, predictions)}')
print(f'roc-auc: {roc_auc_score(y_test, predictions)}')

Test sample size: 6250000
recall: 0.9574519023489609
precision: 0.9168702185070395
f1 score: 0.9367217354461519
roc-auc: 0.8168068987799224
