# Preparing the data

## Importing libraries, downloading the model

In [1]:
import pandas
import sklearn
import numpy
import spacy
import sys
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
import pickle

#Besides modules, I will also import some functions written by me from my functions.py file
from functions import predict_new, predict2, model_analysis2, fivegram_pos_extractor_from_sentence, fivegram_pos_extractor_unique

print(pandas.__version__)
print(sklearn.__version__)
print(numpy.__version__)

2.0.3
1.2.2
1.25.2


In [2]:
# Small Russian model:
# !python -m spacy download ru_core_news_sm
# nlp = spacy.load('ru_core_news_sm')

# Large Russian model:
!python -m spacy download ru_core_news_lg
nlp = spacy.load('ru_core_news_lg')

Collecting ru-core-news-lg==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_lg-3.7.0/ru_core_news_lg-3.7.0-py3-none-any.whl (513.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m513.4/513.4 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting pymorphy3>=1.0.0 (from ru-core-news-lg==3.7.0)
  Downloading pymorphy3-2.0.2-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.8/53.8 kB[0m [31m889.3 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting dawg-python>=0.7.1 (from pymorphy3>=1.0.0->ru-core-news-lg==3.7.0)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting pymorphy3-dicts-ru (from pymorphy3>=1.0.0->ru-core-news-lg==3.7.0)
  Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pymorphy

## Making lists from csv files

In [3]:
# Let's load the training data from a csv file
train_set = pandas.read_csv('./train_data_2024.csv', encoding='utf-8')
train_set.head()

Unnamed: 0,text,author
0,Он старался не развлекаться и не портить себе ...,Tolstoy
1,Всегда этак у меня перед припадком бывает.,Chekhov
2,"Катерина Николаевна тут же и. отказала ему, ...",Dostoevsky
3,Анна Андреевна.,Gogol
4,"— То, что я видел сейчас, хуже всякой простуды...",Chekhov


In [4]:
test_set = pandas.read_csv('./test_data_2024.csv', encoding='utf-8')
test_set.head()

Unnamed: 0,text,author
0,— Это мы понимаем… Мы ведь не все отвинчиваем…...,Chekhov
1,Да что я?,Gogol
2,"Я думаю, у меня горло замерзло от проклятого м...",Gogol
3,"На деда, несмотря на весь страх, смех напал, к...",Gogol
4,"Действительно, влияние товарищей оказало на не...",Tolstoy


In [5]:
train_sentences = train_set['text'].to_list()
train_authors = train_set['author'].to_list()

test_sentences = test_set['text'].to_list()
test_authors = test_set['author'].to_list()

print(len(train_authors), len(test_authors))

10000 1000


## Preparing the feature set

In [6]:
train_doc_sentences = nlp.pipe(train_sentences)

unique_5gram_POS_seq = fivegram_pos_extractor_unique(train_doc_sentences)
print(unique_5gram_POS_seq[:10])
print(len(unique_5gram_POS_seq))

[('CCONJ', 'ADV', 'VERB', 'PUNCT', 'SPACE'), ('VERB', 'NOUN', 'CCONJ', 'AUX', 'ADJ'), ('DET', 'PUNCT', 'PART', 'PART', 'PRON'), ('NOUN', 'PUNCT', 'SCONJ', 'PART', 'ADJ'), ('PRON', 'VERB', 'NOUN', 'PROPN', 'ADP'), ('ADP', 'NOUN', 'AUX', 'DET', 'NOUN'), ('ADP', 'ADP', 'ADP', 'PRON', 'ADP'), ('PART', 'PUNCT', 'ADJ', 'NOUN', 'NOUN'), ('ADP', 'ADJ', 'PUNCT', 'NOUN', 'ADV'), ('ADV', 'NOUN', 'NUM', 'NOUN', 'PUNCT')]
48239


In [7]:
# We are creating a matrix with zero vectors for each review (in training set and test set)
train_features_matrix = numpy.zeros((len(train_sentences), len(unique_5gram_POS_seq)))
print(train_features_matrix.shape)

test_features_matrix = numpy.zeros((len(test_sentences), len(unique_5gram_POS_seq)))
print(test_features_matrix.shape)

(10000, 48239)
(1000, 48239)


# Modifying the feature vectors

## Visualisation tests

In [None]:
train_doc_sentences = nlp.pipe(train_sentences)

for sentence, author in zip(train_doc_sentences, train_authors):
    print(sentence)
    fivegram_pos_list = fivegram_pos_extractor_from_sentence(sentence)
    print(fivegram_pos_list)
    for fivegram in unique_5gram_POS_seq:
      if fivegram in fivegram_pos_list:
        print(fivegram)
        fivegram_id = unique_5gram_POS_seq.index(fivegram)
        print(fivegram_id)
        sys.exit()

Он старался не развлекаться и не портить себе впечатления, глядя на махание руками белогалстучного капельмейстера, всегда так неприятно развлекающее музыкальное внимание, на дам в шляпах, старательно для концерта завязавших себе уши лентами, и на все эти лица, или ничем не занятые, или занятые самыми разнообразными интересами, но только не музыкой.
[('PUNCT', 'CCONJ', 'ADP', 'DET', 'DET'), ('NOUN', 'VERB', 'PRON', 'NOUN', 'NOUN'), ('DET', 'NOUN', 'PUNCT', 'CCONJ', 'PRON'), ('NOUN', 'NOUN', 'ADJ', 'NOUN', 'PUNCT'), ('NOUN', 'NOUN', 'PUNCT', 'CCONJ', 'ADP'), ('CCONJ', 'PART', 'PART', 'NOUN', 'PUNCT'), ('ADJ', 'NOUN', 'PUNCT', 'CCONJ', 'PART'), ('PART', 'VERB', 'CCONJ', 'PART', 'VERB'), ('DET', 'DET', 'NOUN', 'PUNCT', 'CCONJ'), ('NOUN', 'PUNCT', 'VERB', 'ADP', 'NOUN'), ('NOUN', 'PUNCT', 'ADV', 'ADV', 'ADJ'), ('PUNCT', 'ADV', 'ADP', 'NOUN', 'VERB'), ('PRON', 'NOUN', 'NOUN', 'PUNCT', 'CCONJ'), ('PRON', 'VERB', 'PART', 'VERB', 'CCONJ'), ('PUNCT', 'VERB', 'ADP', 'NOUN', 'NOUN'), ('ADP', 'NOUN

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


Visualisation test 2:

In [None]:
train_doc_sentences = nlp.pipe(train_sentences)

for sentence, author, feature_vector in zip(train_doc_sentences, train_authors, train_features_matrix):
    print('Author:', author)
    print(sentence)
    fivegram_pos_list = fivegram_pos_extractor_from_sentence(sentence)
    print(fivegram_pos_list)
    for fivegram in unique_5gram_POS_seq:
      if fivegram in fivegram_pos_list:
        fivegram_id = unique_5gram_POS_seq.index(fivegram)
        print(fivegram_id)
        print(fivegram)
        feature_vector[fivegram_id] = 1
        print(feature_vector.tolist())
        sys.exit()


Author: Tolstoy
Он старался не развлекаться и не портить себе впечатления, глядя на махание руками белогалстучного капельмейстера, всегда так неприятно развлекающее музыкальное внимание, на дам в шляпах, старательно для концерта завязавших себе уши лентами, и на все эти лица, или ничем не занятые, или занятые самыми разнообразными интересами, но только не музыкой.
[('PUNCT', 'CCONJ', 'ADP', 'DET', 'DET'), ('NOUN', 'VERB', 'PRON', 'NOUN', 'NOUN'), ('DET', 'NOUN', 'PUNCT', 'CCONJ', 'PRON'), ('NOUN', 'NOUN', 'ADJ', 'NOUN', 'PUNCT'), ('NOUN', 'NOUN', 'PUNCT', 'CCONJ', 'ADP'), ('CCONJ', 'PART', 'PART', 'NOUN', 'PUNCT'), ('ADJ', 'NOUN', 'PUNCT', 'CCONJ', 'PART'), ('PART', 'VERB', 'CCONJ', 'PART', 'VERB'), ('DET', 'DET', 'NOUN', 'PUNCT', 'CCONJ'), ('NOUN', 'PUNCT', 'VERB', 'ADP', 'NOUN'), ('NOUN', 'PUNCT', 'ADV', 'ADV', 'ADJ'), ('PUNCT', 'ADV', 'ADP', 'NOUN', 'VERB'), ('PRON', 'NOUN', 'NOUN', 'PUNCT', 'CCONJ'), ('PRON', 'VERB', 'PART', 'VERB', 'CCONJ'), ('PUNCT', 'VERB', 'ADP', 'NOUN', 'NOUN'

SystemExit: 

## Writing a function for feature vector modification

In [8]:
def modify_feature_vectors(doc_sentences, features_matrix):
  for sentence, feature_vector in zip(doc_sentences, features_matrix):
    fivegram_pos_list = fivegram_pos_extractor_from_sentence(sentence)
    for fivegram in unique_5gram_POS_seq:
      if fivegram in fivegram_pos_list:
        fivegram_id = unique_5gram_POS_seq.index(fivegram)
        feature_vector[fivegram_id] = 1
  return features_matrix

In [9]:
train_features_matrix = numpy.zeros((len(train_sentences), len(unique_5gram_POS_seq)))
train_doc_sentences = nlp.pipe(train_sentences)

train_features_matrix_final = modify_feature_vectors(train_doc_sentences, train_features_matrix)

# Training

In [10]:
lr_5gram_pos = LogisticRegression()

# Train the model on the data, storing the information learned from the data
# Model is learning the relationship between digits (x_train) and labels (y_train)
lr_5gram_pos.fit(train_features_matrix_final, train_authors)

print(lr_5gram_pos.classes_)

['Chekhov' 'Dostoevsky' 'Gogol' 'Tolstoy']


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Modifying the test set feature vectors

In [11]:
test_doc_sentences = nlp.pipe(test_sentences)
test_features_matrix = numpy.zeros((len(test_sentences), len(unique_5gram_POS_seq)))

test_features_matrix_final = modify_feature_vectors(test_doc_sentences, test_features_matrix)

# Making predictions

In [12]:
predict_data = [test_sentences, test_features_matrix_final, test_authors, lr_5gram_pos]

In [13]:
predict_new(0, predict_data)
predict_new(1, predict_data)
predict_new(2, predict_data)
predict_new(3, predict_data)
predict_new(4, predict_data)
predict_new(5, predict_data)

— Это мы понимаем… Мы ведь не все отвинчиваем… оставляем… Не без ума делаем… понимаем….
[0. 0. 0. ... 0. 0. 0.]
Chekhov

Prediction:
['Dostoevsky']
[[0.07954554 0.76340857 0.01529742 0.14174847]]
--------------------------------------------------------

Да что я?
[0. 0. 0. ... 0. 0. 0.]
Gogol

Prediction:
['Chekhov']
[[0.27410382 0.22192951 0.2642981  0.23966857]]
--------------------------------------------------------

Я думаю, у меня горло замерзло от проклятого морозу.
[0. 0. 0. ... 0. 0. 0.]
Gogol

Prediction:
['Gogol']
[[0.09934069 0.166255   0.41837945 0.31602485]]
--------------------------------------------------------

На деда, несмотря на весь страх, смех напал, когда увидел, как черти с собачьими мордами, на немецких ножках, вертя хвостами, увивались около ведьм, будто парни около красных девушек; а музыканты тузили себя в щеки кулаками, словно в бубны, и свистали носами, как в валторны.
[0. 0. 0. ... 0. 0. 0.]
Gogol

Prediction:
['Gogol']
[[0.00387534 0.01507045 0.95536332

In [14]:
test_predictions_pos = lr_5gram_pos.predict(test_features_matrix_final)

In [15]:
predict2(test_predictions_pos, test_authors)

Dostoevsky (Incorrect:Chekhov)
Chekhov (Incorrect:Gogol)
Gogol (Correct:Gogol)
Gogol (Correct:Gogol)
Chekhov (Incorrect:Tolstoy)
Tolstoy (Correct:Tolstoy)
Gogol (Incorrect:Chekhov)
Tolstoy (Correct:Tolstoy)
Dostoevsky (Incorrect:Gogol)
Tolstoy (Correct:Tolstoy)


#Analysing the model

In [16]:
model_analysis2(lr_5gram_pos, unique_5gram_POS_seq)

Class: Chekhov
Feature: ('NOUN', 'PUNCT', 'SPACE', 'PROPN', 'PUNCT'), Coefficient: 1.6194
Feature: ('VERB', 'PUNCT', 'SPACE', 'PROPN', 'PUNCT'), Coefficient: 1.5375
Feature: ('NOUN', 'PUNCT', 'ADV', 'PUNCT', 'PUNCT'), Coefficient: 1.1549
Feature: ('NOUN', 'PUNCT', 'SPACE', 'ADJ', 'PUNCT'), Coefficient: 1.0608
Feature: ('VERB', 'ADP', 'ADJ', 'NOUN', 'CCONJ'), Coefficient: 1.0209
Intercept: 0.0955

Class: Dostoevsky
Feature: ('PRON', 'PRON', 'ADV', 'VERB', 'PUNCT'), Coefficient: 1.0744
Feature: ('ADV', 'ADP', 'PRON', 'VERB', 'PUNCT'), Coefficient: 1.0424
Feature: ('CCONJ', 'ADJ', 'NOUN', 'PUNCT', 'PUNCT'), Coefficient: 1.0422
Feature: ('SPACE', 'ADJ', 'SPACE', 'NOUN', 'PUNCT'), Coefficient: 1.0118
Feature: ('VERB', 'ADP', 'PROPN', 'PROPN', 'PUNCT'), Coefficient: 1.0032
Intercept: -0.1157

Class: Gogol
Feature: ('VERB', 'PUNCT', 'ADJ', 'NOUN', 'PUNCT'), Coefficient: 1.1651
Feature: ('ADV', 'VERB', 'PUNCT', 'SCONJ', 'VERB'), Coefficient: 1.0487
Feature: ('VERB', 'NOUN', 'PUNCT', 'VERB', 'A

# Saving the model

In [17]:
# Save to file in the current working directory
pkl_filename = "logreg_POS.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(lr_5gram_pos, file)

# Evaluating the model

## Dummy model

In [None]:
dummy_predictions = ['Dostoevsky'] * len(test_sentences)
print(classification_report(test_authors, dummy_predictions))

              precision    recall  f1-score   support

     Chekhov       0.00      0.00      0.00       250
  Dostoevsky       0.25      1.00      0.40       250
       Gogol       0.00      0.00      0.00       250
     Tolstoy       0.00      0.00      0.00       250

    accuracy                           0.25      1000
   macro avg       0.06      0.25      0.10      1000
weighted avg       0.06      0.25      0.10      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## POS Model

In [18]:
print(classification_report(test_authors, test_predictions_pos))

              precision    recall  f1-score   support

     Chekhov       0.32      0.52      0.40       250
  Dostoevsky       0.39      0.29      0.33       250
       Gogol       0.34      0.27      0.30       250
     Tolstoy       0.37      0.32      0.34       250

    accuracy                           0.35      1000
   macro avg       0.36      0.35      0.34      1000
weighted avg       0.36      0.35      0.34      1000

