In [2]:
import pickle
import pandas as pd


from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

import spacy
nlp = spacy.load('en_core_web_sm')

In [3]:
def text_preprocessor(text: str):
    doc = nlp(text)
    tokens = [ i.lemma_ for i in doc]   
    text = " ".join(tokens) 
    return text

In [5]:
with open('../dataset_raw.pickle', 'rb') as handle:
    data = pickle.load(handle)

In [6]:
data = pd.DataFrame.from_dict(data)

In [7]:
data.head()

Unnamed: 0,example,category
0,"But when we reached Shanghai, I felt so terrif...",№1 ЛСВ 1 осмотр
1,the one event that Pliny is famous for actuall...,№1 ЛСВ 1 осмотр
2,"In an unprecedented legal move, Mr Justice Hod...",№1 ЛСВ 1 осмотр
3,Doctor Kagawa upped his estimate of latency to...,№1 ЛСВ 1 осмотр
4,and suddenly Eva was above the park and Zozobr...,№1 ЛСВ 1 осмотр


In [8]:
data['example'] = data['example'].apply(text_preprocessor)

In [9]:
data.head()

Unnamed: 0,example,category
0,"but when we reach Shanghai , I feel so terrifi...",№1 ЛСВ 1 осмотр
1,the one event that Pliny be famous for actuall...,№1 ЛСВ 1 осмотр
2,"in an unprecedented legal move , Mr Justice Ho...",№1 ЛСВ 1 осмотр
3,Doctor Kagawa up his estimate of latency to 7 ...,№1 ЛСВ 1 осмотр
4,and suddenly Eva be above the park and Zozobra...,№1 ЛСВ 1 осмотр


In [11]:
train, test = train_test_split(data, test_size=0.3, stratify=data['category'])

In [13]:
df_train = train.groupby(['category'])['category'].count()
print(df_train)

category
№1 ЛСВ 1 осмотр            70
№10 ЛСВ 4.2                 9
№2 ЛСВ 1.1 просмотр        41
№3 ЛСВ 2 обзор           1349
№4 ЛСВ 3 вид               57
№5 ЛСВ 3.1 пейзаж        1156
№6 ЛСВ 3.2 картинка       191
№7 ЛСВ 3.4 ракурс          81
№8 ЛСВ 4 точка зрения    2093
№9 ЛСВ 4.1                 21
Name: category, dtype: int64


In [14]:
df_test = test.groupby(['category'])['category'].count()
print(df_test)

category
№1 ЛСВ 1 осмотр           30
№10 ЛСВ 4.2                4
№2 ЛСВ 1.1 просмотр       17
№3 ЛСВ 2 обзор           578
№4 ЛСВ 3 вид              25
№5 ЛСВ 3.1 пейзаж        496
№6 ЛСВ 3.2 картинка       82
№7 ЛСВ 3.4 ракурс         35
№8 ЛСВ 4 точка зрения    897
№9 ЛСВ 4.1                 9
Name: category, dtype: int64


In [16]:
train_X = train['example']
train_y = train['category']
test_X = test['example']
test_y = test['category']

In [17]:
classification_pipeline = Pipeline(steps=[
    ('count_vectorizer', CountVectorizer()),
    ('naive_bayes_classifier', MultinomialNB())
])

In [19]:
from time import time
t = time()
classification_pipeline.fit(train_X, train_y)
training_time = time() - t
print("train time: %0.3fs" % training_time)

train time: 0.200s


In [20]:
y_pred = classification_pipeline.predict(test_X)

In [21]:
print(classification_report(y_pred, test_y))

                       precision    recall  f1-score   support

      №1 ЛСВ 1 осмотр       0.00      0.00      0.00         0
          №10 ЛСВ 4.2       0.00      0.00      0.00         0
  №2 ЛСВ 1.1 просмотр       0.00      0.00      0.00         0
       №3 ЛСВ 2 обзор       0.77      0.70      0.73       634
         №4 ЛСВ 3 вид       0.00      0.00      0.00         0
    №5 ЛСВ 3.1 пейзаж       0.69      0.71      0.70       485
  №6 ЛСВ 3.2 картинка       0.00      0.00      0.00         0
    №7 ЛСВ 3.4 ракурс       0.00      0.00      0.00         0
№8 ЛСВ 4 точка зрения       0.95      0.81      0.87      1054
           №9 ЛСВ 4.1       0.00      0.00      0.00         0

             accuracy                           0.75      2173
            macro avg       0.24      0.22      0.23      2173
         weighted avg       0.84      0.75      0.79      2173



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
filename = 'MultinominalNB + lemmatization'

In [23]:
with open(f'{filename}.bin', 'wb') as output_stream:
    pickle.dump(classification_pipeline, output_stream)