# Preparing the data

## Importing libraries, downloading the model

In [1]:
import pandas
import sklearn
import numpy
import spacy
from collections import Counter
import sys
from sklearn.linear_model import LogisticRegression
import pickle
from sklearn.metrics import classification_report

#Besides modules, I will also import some functions written by me from my functions.py file
from functions import predict_new, predict2, model_analysis2, fivegram_pos_extractor_from_sentence

print(pandas.__version__)
print(sklearn.__version__)
print(numpy.__version__)

2.0.3
1.2.2
1.25.2


In [2]:
# Small Russian model:
# !python -m spacy download ru_core_news_sm
# nlp = spacy.load('ru_core_news_sm')

# Large Russian model:
!python -m spacy download ru_core_news_lg
nlp = spacy.load('ru_core_news_lg')

Collecting ru-core-news-lg==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_lg-3.7.0/ru_core_news_lg-3.7.0-py3-none-any.whl (513.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m513.4/513.4 MB[0m [31m908.3 kB/s[0m eta [36m0:00:00[0m
Collecting pymorphy3>=1.0.0 (from ru-core-news-lg==3.7.0)
  Downloading pymorphy3-2.0.2-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.8/53.8 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dawg-python>=0.7.1 (from pymorphy3>=1.0.0->ru-core-news-lg==3.7.0)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting pymorphy3-dicts-ru (from pymorphy3>=1.0.0->ru-core-news-lg==3.7.0)
  Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pymorphy

## Making lists from csv files

In [3]:
# Let's load the training data from a csv file
train_set = pandas.read_csv('./train_data_2024.csv', encoding='utf-8')
train_set.head()

Unnamed: 0,text,author
0,Он старался не развлекаться и не портить себе ...,Tolstoy
1,Всегда этак у меня перед припадком бывает.,Chekhov
2,"Катерина Николаевна тут же и. отказала ему, ...",Dostoevsky
3,Анна Андреевна.,Gogol
4,"— То, что я видел сейчас, хуже всякой простуды...",Chekhov


In [4]:
test_set = pandas.read_csv('./test_data_2024.csv', encoding='utf-8')
test_set.head()

Unnamed: 0,text,author
0,— Это мы понимаем… Мы ведь не все отвинчиваем…...,Chekhov
1,Да что я?,Gogol
2,"Я думаю, у меня горло замерзло от проклятого м...",Gogol
3,"На деда, несмотря на весь страх, смех напал, к...",Gogol
4,"Действительно, влияние товарищей оказало на не...",Tolstoy


In [5]:
train_sentences = train_set['text'].to_list()
train_authors = train_set['author'].to_list()

test_sentences = test_set['text'].to_list()
test_authors = test_set['author'].to_list()

print(len(train_authors), len(test_authors))

10000 1000


## Preparing the feature set

We can simply copy the contents of fivegrams_list from the Data_analysis notebook

In [6]:
fivegrams_list = [('VERB', 'ADP', 'ADJ', 'NOUN', 'PUNCT'), ('PRON', 'VERB', 'ADP', 'NOUN', 'PUNCT'), ('VERB', 'ADP', 'NOUN', 'CCONJ', 'VERB'), ('PUNCT', 'VERB', 'ADP', 'NOUN', 'PUNCT'), ('X', 'X', 'X', 'X', 'X'), ('NOUN', 'ADP', 'ADJ', 'NOUN', 'PUNCT'), ('VERB', 'ADP', 'NOUN', 'PUNCT', 'PUNCT'), ('VERB', 'NOUN', 'ADP', 'NOUN', 'PUNCT'), ('NOUN', 'PUNCT', 'VERB', 'NOUN', 'PUNCT'), ('VERB', 'ADP', 'DET', 'NOUN', 'PUNCT'), ('NOUN', 'PUNCT', 'VERB', 'ADP', 'NOUN'), ('VERB', 'PUNCT', 'SCONJ', 'PRON', 'VERB')]
print(len(fivegrams_list))

12


In [7]:
# We are creating a matrix with zero vectors for each review (in training set and test set)
train_features_matrix = numpy.zeros((len(train_sentences), len(fivegrams_list)))
print(train_features_matrix.shape)

test_features_matrix = numpy.zeros((len(test_sentences), len(fivegrams_list)))
print(test_features_matrix.shape)

(10000, 12)
(1000, 12)


# Modifying the feature vectors

## Visualisation tests

In [None]:
train_doc_sentences = nlp.pipe(train_sentences)

for sentence, author in zip(train_doc_sentences, train_authors):
    print(author)
    print(sentence)
    fivegram_pos_list = fivegram_pos_extractor_from_sentence(sentence)
    print(fivegram_pos_list)
    for fivegram in fivegrams_list:
      if fivegram in fivegram_pos_list:
        print(fivegram)
        fivegram_id = fivegrams_list.index(fivegram)
        print(fivegram_id)
        sys.exit()

Tolstoy
Он старался не развлекаться и не портить себе впечатления, глядя на махание руками белогалстучного капельмейстера, всегда так неприятно развлекающее музыкальное внимание, на дам в шляпах, старательно для концерта завязавших себе уши лентами, и на все эти лица, или ничем не занятые, или занятые самыми разнообразными интересами, но только не музыкой.
[('ADJ', 'ADJ', 'NOUN', 'PUNCT', 'CCONJ'), ('ADJ', 'VERB', 'ADJ', 'NOUN', 'PUNCT'), ('NOUN', 'PUNCT', 'ADP', 'VERB', 'ADP'), ('PRON', 'VERB', 'PART', 'VERB', 'CCONJ'), ('NOUN', 'PUNCT', 'CCONJ', 'ADP', 'DET'), ('NOUN', 'VERB', 'PRON', 'NOUN', 'NOUN'), ('PRON', 'PART', 'VERB', 'PUNCT', 'CCONJ'), ('VERB', 'ADP', 'NOUN', 'NOUN', 'ADJ'), ('ADV', 'ADP', 'NOUN', 'VERB', 'PRON'), ('PART', 'VERB', 'PUNCT', 'CCONJ', 'VERB'), ('ADP', 'NOUN', 'PUNCT', 'ADV', 'ADP'), ('PART', 'VERB', 'PRON', 'NOUN', 'PUNCT'), ('VERB', 'ADJ', 'ADJ', 'NOUN', 'PUNCT'), ('ADJ', 'NOUN', 'PUNCT', 'ADV', 'ADV'), ('ADJ', 'NOUN', 'PUNCT', 'CCONJ', 'PART'), ('ADP', 'DET',

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


Visualisation test 2:

In [None]:
train_doc_sentences = nlp.pipe(train_sentences)

for sentence, author, feature_vector in zip(train_doc_sentences, train_authors, train_features_matrix):
    print('Author:', author)
    print(sentence)
    fivegram_pos_list = fivegram_pos_extractor_from_sentence(sentence)
    print(fivegram_pos_list)
    for fivegram in fivegrams_list:
      if fivegram in fivegram_pos_list:
        fivegram_id = fivegrams_list.index(fivegram)
        print(fivegram_id)
        print(fivegram)
        feature_vector[fivegram_id] = 1
        print(feature_vector.tolist())
        sys.exit()

Author: Tolstoy
Он старался не развлекаться и не портить себе впечатления, глядя на махание руками белогалстучного капельмейстера, всегда так неприятно развлекающее музыкальное внимание, на дам в шляпах, старательно для концерта завязавших себе уши лентами, и на все эти лица, или ничем не занятые, или занятые самыми разнообразными интересами, но только не музыкой.
[('ADJ', 'ADJ', 'NOUN', 'PUNCT', 'CCONJ'), ('ADJ', 'VERB', 'ADJ', 'NOUN', 'PUNCT'), ('NOUN', 'PUNCT', 'ADP', 'VERB', 'ADP'), ('PRON', 'VERB', 'PART', 'VERB', 'CCONJ'), ('NOUN', 'PUNCT', 'CCONJ', 'ADP', 'DET'), ('NOUN', 'VERB', 'PRON', 'NOUN', 'NOUN'), ('PRON', 'PART', 'VERB', 'PUNCT', 'CCONJ'), ('VERB', 'ADP', 'NOUN', 'NOUN', 'ADJ'), ('ADV', 'ADP', 'NOUN', 'VERB', 'PRON'), ('PART', 'VERB', 'PUNCT', 'CCONJ', 'VERB'), ('ADP', 'NOUN', 'PUNCT', 'ADV', 'ADP'), ('PART', 'VERB', 'PRON', 'NOUN', 'PUNCT'), ('VERB', 'ADJ', 'ADJ', 'NOUN', 'PUNCT'), ('ADJ', 'NOUN', 'PUNCT', 'ADV', 'ADV'), ('ADJ', 'NOUN', 'PUNCT', 'CCONJ', 'PART'), ('ADP'

SystemExit: 

## Writing a function for feature vector modification

In [8]:
def modify_feature_vectors(doc_sentences, features_matrix):
  for sentence, feature_vector in zip(doc_sentences, features_matrix):
    fivegram_pos_list = fivegram_pos_extractor_from_sentence(sentence)
    for fivegram in fivegrams_list:
      if fivegram in fivegram_pos_list:
        fivegram_id = fivegrams_list.index(fivegram)
        feature_vector[fivegram_id] = 1
  return features_matrix

In [9]:
train_features_matrix = numpy.zeros((len(train_sentences), len(fivegrams_list)))
train_doc_sentences = nlp.pipe(train_sentences)

train_features_matrix_final = modify_feature_vectors(train_doc_sentences, train_features_matrix)

# Training

In [10]:
lr_common_5grams = LogisticRegression()

# Train the model on the data, storing the information learned from the data
# Model is learning the relationship between digits (x_train) and labels (y_train)
lr_common_5grams.fit(train_features_matrix_final, train_authors)

print(lr_common_5grams.classes_)

['Chekhov' 'Dostoevsky' 'Gogol' 'Tolstoy']


# Modifying the test set feature vectors

In [11]:
test_doc_sentences = nlp.pipe(test_sentences)
test_features_matrix = numpy.zeros((len(test_sentences), len(fivegrams_list)))

test_features_matrix_final = modify_feature_vectors(test_doc_sentences, test_features_matrix)

# Making predictions

In [12]:
predict_data = [test_sentences, test_features_matrix_final, test_authors, lr_common_5grams]

In [13]:
predict_new(0, predict_data)
predict_new(1, predict_data)
predict_new(2, predict_data)
predict_new(3, predict_data)
predict_new(4, predict_data)
predict_new(5, predict_data)

— Это мы понимаем… Мы ведь не все отвинчиваем… оставляем… Не без ума делаем… понимаем….
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Chekhov

Prediction:
['Dostoevsky']
[[0.24577425 0.26058165 0.24666239 0.24698171]]
--------------------------------------------------------

Да что я?
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Gogol

Prediction:
['Dostoevsky']
[[0.24577425 0.26058165 0.24666239 0.24698171]]
--------------------------------------------------------

Я думаю, у меня горло замерзло от проклятого морозу.
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Gogol

Prediction:
['Gogol']
[[0.24837281 0.22928623 0.26998664 0.25235432]]
--------------------------------------------------------

На деда, несмотря на весь страх, смех напал, когда увидел, как черти с собачьими мордами, на немецких ножках, вертя хвостами, увивались около ведьм, будто парни около красных девушек; а музыканты тузили себя в щеки кулаками, словно в бубны, и свистали носами, как в валторны.
[0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 1. 0.]
Gogo

In [14]:
test_predictions_common_5grams = lr_common_5grams.predict(test_features_matrix_final)

In [15]:
predict2(test_predictions_common_5grams, test_authors)

Dostoevsky (Incorrect:Chekhov)
Dostoevsky (Incorrect:Gogol)
Gogol (Correct:Gogol)
Gogol (Correct:Gogol)
Dostoevsky (Incorrect:Tolstoy)
Dostoevsky (Incorrect:Tolstoy)
Dostoevsky (Incorrect:Chekhov)
Dostoevsky (Incorrect:Tolstoy)
Dostoevsky (Incorrect:Gogol)
Dostoevsky (Incorrect:Tolstoy)


#Analysing the model

In [16]:
model_analysis2(lr_common_5grams, fivegrams_list)

Class: Chekhov
Feature: ('X', 'X', 'X', 'X', 'X'), Coefficient: -1.0598
Feature: ('VERB', 'ADP', 'NOUN', 'CCONJ', 'VERB'), Coefficient: 0.9268
Feature: ('VERB', 'ADP', 'NOUN', 'PUNCT', 'PUNCT'), Coefficient: 0.8900
Feature: ('VERB', 'NOUN', 'ADP', 'NOUN', 'PUNCT'), Coefficient: 0.4444
Feature: ('NOUN', 'PUNCT', 'VERB', 'NOUN', 'PUNCT'), Coefficient: 0.2382
Intercept: -0.0168

Class: Dostoevsky
Feature: ('VERB', 'NOUN', 'ADP', 'NOUN', 'PUNCT'), Coefficient: -0.6810
Feature: ('VERB', 'ADP', 'NOUN', 'PUNCT', 'PUNCT'), Coefficient: -0.6681
Feature: ('NOUN', 'PUNCT', 'VERB', 'NOUN', 'PUNCT'), Coefficient: -0.5400
Feature: ('VERB', 'ADP', 'NOUN', 'CCONJ', 'VERB'), Coefficient: -0.4840
Feature: ('NOUN', 'ADP', 'ADJ', 'NOUN', 'PUNCT'), Coefficient: -0.4053
Intercept: 0.0418

Class: Gogol
Feature: ('X', 'X', 'X', 'X', 'X'), Coefficient: -1.0453
Feature: ('VERB', 'ADP', 'NOUN', 'CCONJ', 'VERB'), Coefficient: -0.5891
Feature: ('NOUN', 'PUNCT', 'VERB', 'NOUN', 'PUNCT'), Coefficient: 0.3716
Feature

# Saving the model

In [17]:
# Save to file in the current working directory
pkl_filename = "logreg_common_5grams.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(lr_common_5grams, file)

# Evaluating the model

## Dummy model

In [None]:
dummy_predictions = ['Dostoevsky'] * len(test_sentences)
print(classification_report(test_authors, dummy_predictions))

              precision    recall  f1-score   support

     Chekhov       0.00      0.00      0.00       250
  Dostoevsky       0.25      1.00      0.40       250
       Gogol       0.00      0.00      0.00       250
     Tolstoy       0.00      0.00      0.00       250

    accuracy                           0.25      1000
   macro avg       0.06      0.25      0.10      1000
weighted avg       0.06      0.25      0.10      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Common POS Model

In [18]:
print(classification_report(test_authors, test_predictions_common_5grams))

              precision    recall  f1-score   support

     Chekhov       0.17      0.02      0.04       250
  Dostoevsky       0.26      0.90      0.40       250
       Gogol       0.19      0.04      0.07       250
     Tolstoy       0.36      0.05      0.09       250

    accuracy                           0.25      1000
   macro avg       0.24      0.25      0.15      1000
weighted avg       0.24      0.25      0.15      1000

