In [1]:
import os
import plotly.graph_objects as go
from tqdm import tqdm

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

Интеграция с Weights&Biases

In [2]:
import wandb
wandb.init(project="review_classifier")

wandb: Wandb version 0.8.36 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


W&B Run: https://app.wandb.ai/datasciensyash/review_classifier/runs/1km9lsx9

Загрузка датасета

    train_df: Датасет с тренировочным набором данных
    test_df: Датасет с набором данных для тестирования

In [3]:
train_file = 'dataset/train.pkl'
test_file = 'dataset/test.pkl'

In [4]:
train_df = pd.read_pickle(train_file)
test_df = pd.read_pickle(test_file)

Подготовка корпуса для обучения модели.

In [5]:
corpus_train = train_df['Text'].values

Подготовка гиперпараметров для обучения модели.

In [6]:
import random

random_state = 1024 #Фиксируем для воспроизводимости

vector_hparams = {
    
    'min_df': 11,  #Минимальное количество вхождений токена в корпусе для добавления его в словарь
    'max_df': 0.8953, #Максимальная доля документов, в которые входит токен, для добавления его в словарь

}

model_hparams = {
    
    'solver': 'newton-cg',
    'max_iter': 134,
    
}


Подготовка векторизатора для нашего корпуса.

In [7]:
vectorizer = TfidfVectorizer(**vector_hparams)
train_features = vectorizer.fit_transform(corpus_train)

Инициализация модели

In [8]:
model = LogisticRegression(**model_hparams, random_state=random_state)

Обучение модели

In [9]:
#Кросс-валидация
cv_num = 5
scores = cross_val_score(model, train_features, train_df['Target'], cv=cv_num)

print('Mean acc.:', np.mean(scores))
wandb.log({'score': np.mean(scores)})

fig = go.Figure(data=[go.Bar(x=[i for i in range(cv_num)], y=scores, text=np.round(scores, 3), textposition='auto')])
fig.show()

Mean acc.: 0.8652025041152165


wandb: Wandb version 0.8.36 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


In [10]:
#Обучение
model = model.fit(train_features, train_df['Target'])

Тестирование модели

In [11]:
test_features = vectorizer.transform(test_df['Text'].values)
predictions = model.predict_proba(test_features)

In [12]:
print(classification_report(test_df['Target'], predictions.argmax(axis=1)))

              precision    recall  f1-score   support

           0       0.88      0.89      0.89     12361
           1       0.89      0.88      0.89     12440

    accuracy                           0.89     24801
   macro avg       0.89      0.89      0.89     24801
weighted avg       0.89      0.89      0.89     24801



In [13]:
round(accuracy_score(test_df['Target'], predictions.argmax(axis=1)), 2)

0.89

In [25]:
from sklearn.metrics import log_loss
log_loss(test_df['Target'], predictions)

0.31452161622527924

Исследование неправильно классифицированных примеров

In [15]:
test_df['Predicted'] = predictions.argmax(axis=1)
test_df['Score'] = predictions.T[1]
test_errors = test_df[test_df['Predicted'] != test_df['Target']]
test_errors

Unnamed: 0,Target,Text,Predicted,Score
20,1,I have certainly not seen all of Jean Rollin's...,0,0.467276
27,1,While I can't say whether or not Larry Hama ev...,0,0.279051
42,1,"Naturally, along with everyone else, I was pri...",0,0.402039
69,1,"For late-80s cheese, this really isn't so bad....",0,0.411714
73,1,"Madonna gets into action, again and she fails ...",0,0.305384
...,...,...,...,...
24989,0,"Some describe CALIGULIA as ""the"" most controve...",1,0.522786
24992,0,many people said this was a great movie with H...,1,0.707492
24997,0,The basic genre is a thriller intercut with an...,1,0.599365
24998,0,Four things intrigued me as to this film - fir...,1,0.535890


In [16]:
import plotly.express as px
fig = px.histogram(test_errors, x='Score', color='Target')
fig.show() #Как мы видим, распределение ошибок нормальное, что соответствует принципу сильного классификатора

Исследование предсказываемых значений. Это важно при дальнейшем переводе из бинарной классификации в рейтинг (от 0 до 10)

In [17]:
test_df['Predicted'] = predictions.argmax(axis=1)
test_df['Score'] = predictions.T[1]
fig = px.histogram(test_df, x='Score')
fig.show()

In [18]:
test_df.to_csv('Predictions.csv')

In [19]:
import pickle
with open('./models/model.pkl', 'wb') as file:
    pickle.dump(model, file)
with open('./models/vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)

In [20]:
data = vectorizer.transform([""])
model.predict_proba(data)

array([[0.49248003, 0.50751997]])

In [21]:
a = np.array([1, 2, 3])
b = np.array([8, 9, 10])

np.stack([a, b], axis=1).tolist()

[[1, 8], [2, 9], [3, 10]]