# Preparing the data

##Importing libraries, downloading the model

In [None]:
import pandas
import sklearn
import numpy
import spacy
import string
import sys
from sklearn.linear_model import LogisticRegression
import pickle
from sklearn.metrics import classification_report
import re
from typing import List

#Besides modules, I will also import some functions written by me from my functions.py file
from functions import predict_new, predict2, model_analysis2

print(pandas.__version__)
print(sklearn.__version__)
print(numpy.__version__)

2.0.3
1.2.2
1.25.2


In [None]:
# Small Russian model:
# !python -m spacy download ru_core_news_sm
# nlp = spacy.load('ru_core_news_sm')

# Large Russian model:
!python -m spacy download ru_core_news_lg
nlp = spacy.load('ru_core_news_lg')

Collecting ru-core-news-lg==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_lg-3.7.0/ru_core_news_lg-3.7.0-py3-none-any.whl (513.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m513.4/513.4 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting pymorphy3>=1.0.0 (from ru-core-news-lg==3.7.0)
  Downloading pymorphy3-2.0.2-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.8/53.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dawg-python>=0.7.1 (from pymorphy3>=1.0.0->ru-core-news-lg==3.7.0)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting pymorphy3-dicts-ru (from pymorphy3>=1.0.0->ru-core-news-lg==3.7.0)
  Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m54.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pymorphy3-

##Making lists from csv files

In [None]:
# Let's load the training data from a csv file
train_set = pandas.read_csv('./train_data_2024.csv', encoding='utf-8')
train_set.head()

Unnamed: 0,text,author
0,Он старался не развлекаться и не портить себе ...,Tolstoy
1,Всегда этак у меня перед припадком бывает.,Chekhov
2,"Катерина Николаевна тут же и. отказала ему, ...",Dostoevsky
3,Анна Андреевна.,Gogol
4,"— То, что я видел сейчас, хуже всякой простуды...",Chekhov


In [None]:
test_set = pandas.read_csv('./test_data_2024.csv', encoding='utf-8')
test_set.head()

Unnamed: 0,text,author
0,— Это мы понимаем… Мы ведь не все отвинчиваем…...,Chekhov
1,Да что я?,Gogol
2,"Я думаю, у меня горло замерзло от проклятого м...",Gogol
3,"На деда, несмотря на весь страх, смех напал, к...",Gogol
4,"Действительно, влияние товарищей оказало на не...",Tolstoy


In [None]:
train_sentences = train_set['text'].to_list()
train_authors = train_set['author'].to_list()

test_sentences = test_set['text'].to_list()
test_authors = test_set['author'].to_list()

print(len(train_authors), len(test_authors))

10000 1000


## Preparing the feature set

The function below allows us to take any regex pattern and a list of sentences and get a string of unique characters. The elements of that string, be they digits, word characters or punctuation will be used by us as features to train our models.

In [None]:
def regex_feature_maker(regex_pattern: str, sentences: List[str]) -> str:
  """
  Extract unique characters from a list of sentences using a regex pattern.

  This function applies a given regex pattern to each sentence in the input list,
  collects all unique matches, and returns them as a single string.

  Args:
      regex_pattern (str): The regular expression pattern to apply.
      sentences (List[str]): A list of sentences to process.

  Returns:
      str: A string containing all unique characters or patterns found.
  """
  pattern = re.compile(regex_pattern)
  unique_chars_set = set()
  for sentence in sentences:
      matches = pattern.findall(sentence)
      if matches:
          unique_chars_set.update(matches)

  unique_chars = "".join(unique_chars_set)
  return unique_chars

Using the function defined above, we can get different strings that can be used as features for our model.

In [None]:
whitespaces = regex_feature_maker(r'[\s]', train_sentences)
digits = regex_feature_maker(r'[\d]', train_sentences)
non_russian_characters = regex_feature_maker(r'[^ЁёА-я\s\d\W]', train_sentences)
russian_characters = regex_feature_maker(r'[ЁёА-я]', train_sentences)
word_characters = regex_feature_maker(r'[\w]', train_sentences)
non_whitespace_chars = regex_feature_maker(r'[\S]', train_sentences)
punctuation = regex_feature_maker(r'[^\w\s]', train_sentences)

In [None]:
feature_sets = [
    ("Whitespace characters", whitespaces),
    ("Digits", digits),
    ("Non-Russian characters", non_russian_characters),
    ("Russian characters", russian_characters),
    ("Word characters", word_characters),
    ("Non-whitespace characters", non_whitespace_chars),
    ("Punctuation", punctuation)
]

for description, chars in feature_sets:
    print(f"{description}:")
    print(f"  Count: {len(chars)}")
    print(f"  Characters: {list(chars) if description == 'Whitespace characters' else chars}")
    print()

Whitespace characters:
  Count: 2
  Characters: ['\t', ' ']

Digits:
  Count: 10
  Characters: 0715843629

Non-Russian characters:
  Count: 53
  Characters: TeèMLCóvôDfcduXzUxNqJbjàhBKIaoRêkpéHmOrPVGtQElASngsiy

Russian characters:
  Count: 64
  Characters: ЗгйФУрХтЛИЕмЦЬГЯъщНМшвоЩуВКюАЫяОихЮелЧЙЖкьШТСзыбРёжпчДнБПасдэфЭц

Word characters:
  Count: 127
  Characters: TóХLмЦcЯdzUНМqàKaRКАpхЙrСыEбжДП4ЗeрCЕvГъjhЩIoВkЮе9VGкьQРп7БsфэiMФУèтИDfЬXJшоêюЫяОHЧOШ6ёnнg8ЭyгйЛôu3щNxbв0B1у5иéлmЖPtТзlAчSасдц2

Non-whitespace characters:
  Count: 158
  Characters: Tó(ХL«мЦcЯ[d́zUНМqàK—aRКАpхЙr̀СыE№б–жДП»4ЗeрCЕv-Гъ!j.hЩIoВkЮе9VG&кьQР]п7БsфэiM)}ФУèтИDfЬXJш{#"о“êюЫяО,HЧOШ<6ёnнg8Эyгй'Лô*u3’щNxbв0B1у5;>иéлmЖPtТзlAчS?асд:ц…2„

Punctuation:
  Count: 31
  Characters: }'(«*-[́!’.{#—"“;>,̀&<№–]?»:…)„



Now to train a model that uses any of the strings above as features all we need to do is assign the variable containing that string to a new "unique_chars" variable.

In [103]:
unique_chars = non_whitespace_chars

# We are creating a matrix with zero vectors for each review (in training set and test set)
train_features_matrix = numpy.zeros((len(train_sentences), len(unique_chars)))
print(train_features_matrix.shape)

test_features_matrix = numpy.zeros((len(test_sentences), len(unique_chars)))
print(test_features_matrix.shape)

(10000, 158)
(1000, 158)


# Modifying the feature vectors

## Visualisation tests

In [None]:
counter = 0

for sentence, author in zip(train_sentences, train_authors):
    for char in sentence:
      if char in unique_chars:
        print('Author:', author)
        print(sentence)
        char_id = unique_chars.index(char)
        print(char_id)
        # break
    counter +=1
    if counter == 40:
      # break
      sys.exit()

Author: Tolstoy
Он старался не развлекаться и не портить себе впечатления, глядя на махание руками белогалстучного капельмейстера, всегда так неприятно развлекающее музыкальное внимание, на дам в шляпах, старательно для концерта завязавших себе уши лентами, и на все эти лица, или ничем не занятые, или занятые самыми разнообразными интересами, но только не музыкой.
0
Author: Tolstoy
Он старался не развлекаться и не портить себе впечатления, глядя на махание руками белогалстучного капельмейстера, всегда так неприятно развлекающее музыкальное внимание, на дам в шляпах, старательно для концерта завязавших себе уши лентами, и на все эти лица, или ничем не занятые, или занятые самыми разнообразными интересами, но только не музыкой.
0
Author: Tolstoy
Он старался не развлекаться и не портить себе впечатления, глядя на махание руками белогалстучного капельмейстера, всегда так неприятно развлекающее музыкальное внимание, на дам в шляпах, старательно для концерта завязавших себе уши лентами, и на

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


Visualisation test 2:

In [None]:
counter = 0

for sentence, author, feature_vector in zip(train_sentences, train_authors, train_features_matrix):
    for char in sentence:
      if char in unique_chars:
        print('Author:', author)
        print(sentence)
        char_id = unique_chars.index(char)
        print(char_id)
        feature_vector[char_id] = 1
        print(feature_vector.tolist())
        # sys.exit()
    counter +=1
    if counter == 40:
      break

Author: Tolstoy
Он старался не развлекаться и не портить себе впечатления, глядя на махание руками белогалстучного капельмейстера, всегда так неприятно развлекающее музыкальное внимание, на дам в шляпах, старательно для концерта завязавших себе уши лентами, и на все эти лица, или ничем не занятые, или занятые самыми разнообразными интересами, но только не музыкой.
0
[1.0, 0.0]
Author: Tolstoy
Он старался не развлекаться и не портить себе впечатления, глядя на махание руками белогалстучного капельмейстера, всегда так неприятно развлекающее музыкальное внимание, на дам в шляпах, старательно для концерта завязавших себе уши лентами, и на все эти лица, или ничем не занятые, или занятые самыми разнообразными интересами, но только не музыкой.
0
[1.0, 0.0]
Author: Tolstoy
Он старался не развлекаться и не портить себе впечатления, глядя на махание руками белогалстучного капельмейстера, всегда так неприятно развлекающее музыкальное внимание, на дам в шляпах, старательно для концерта завязавших 

## Writing a function for vector modification

In [68]:
def modify_feature_vectors(sentences, features_matrix):
  for sentence, feature_vector in zip(sentences, features_matrix):
    for char in sentence:
      if char in unique_chars:
        char_id = unique_chars.index(char)
        feature_vector[char_id] = 1
  return features_matrix

In [104]:
train_features_matrix = numpy.zeros((len(train_sentences), len(unique_chars)))
train_features_matrix_final = modify_feature_vectors(train_sentences, train_features_matrix)

print(train_features_matrix_final[36:40])

[[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
  0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
  0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0.
  0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0.
  1. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1.

# Training

In [105]:
lr_non_w = LogisticRegression()

# Train the model on the data, storing the information learned from the dat`a
# Model is learning the relationship between digits (x_train) and labels (y_train)
lr_non_w.fit(train_features_matrix_final, train_authors)

print(lr_non_w.classes_)

['Chekhov' 'Dostoevsky' 'Gogol' 'Tolstoy']


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Modifying the test set feature vectors

In [106]:
test_features_matrix = numpy.zeros((len(test_sentences), len(unique_chars)))
test_features_matrix_final = modify_feature_vectors(test_sentences, test_features_matrix)

print(test_features_matrix_final[35:40])

[[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1.
  0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1.
  0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0.
  0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0.
  0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0.
  0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0.
  1. 0. 0. 1. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1.
  0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0.
  0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0.
  1. 0. 0. 1. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 1.

# Making predictions

In [107]:
predict_data = [test_sentences, test_features_matrix_final, test_authors, lr_non_w]

In [108]:
predict_new(0, predict_data)
predict_new(28, predict_data)
predict_new(39, predict_data)

— Это мы понимаем… Мы ведь не все отвинчиваем… оставляем… Не без ума делаем… понимаем….
[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0.
 1. 0. 0. 1. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0.]
Chekhov

Prediction:
['Chekhov']
[[0.92468247 0.02456515 0.02081541 0.02993698]]
--------------------------------------------------------

А вишь, подрядье-то!
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1.
 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.

In [109]:
test_predictions_char = lr_non_w.predict(test_features_matrix_final)

In [110]:
predict2(test_predictions_char, test_authors)

Chekhov (Correct:Chekhov)
Chekhov (Incorrect:Gogol)
Dostoevsky (Incorrect:Gogol)
Gogol (Correct:Gogol)
Tolstoy (Correct:Tolstoy)
Tolstoy (Correct:Tolstoy)
Tolstoy (Incorrect:Chekhov)
Tolstoy (Correct:Tolstoy)
Gogol (Correct:Gogol)
Dostoevsky (Incorrect:Tolstoy)


#Analysing the model

In [111]:
model_analysis2(lr_non_w, unique_chars)

Class: Chekhov
Feature: ", Coefficient: -2.2505
Feature: —, Coefficient: 1.4233
Feature: I, Coefficient: -1.3221
Feature: …, Coefficient: 1.0884
Feature: 2, Coefficient: -1.0812
Intercept: 0.1436

Class: Dostoevsky
Feature: —, Coefficient: -1.3775
Feature: Ж, Coefficient: -1.1847
Feature: ", Coefficient: 1.1110
Feature: e, Coefficient: 0.7934
Feature: Ч, Coefficient: -0.7237
Intercept: -0.3903

Class: Gogol
Feature: ё, Coefficient: -1.4180
Feature: Л, Coefficient: -0.9613
Feature: 6, Coefficient: -0.8751
Feature: i, Coefficient: 0.8296
Feature: >, Coefficient: 0.8098
Intercept: 0.2757

Class: Tolstoy
Feature: ́, Coefficient: 1.3238
Feature: X, Coefficient: 1.2543
Feature: 6, Coefficient: 1.1635
Feature: ", Coefficient: 1.1129
Feature: 8, Coefficient: 0.9731
Intercept: -0.0290



# Saving the model

In [113]:
# Save to file in the current working directory
pkl_filename = "logreg_non-whitespace_chars.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(lr_non_w, file)

# Evaluating the model

## Dummy Model

In [None]:
dummy_predictions = ['Dostoevsky'] * len(test_sentences)
print(classification_report(test_authors, dummy_predictions))

              precision    recall  f1-score   support

     Chekhov       0.00      0.00      0.00       250
  Dostoevsky       0.25      1.00      0.40       250
       Gogol       0.00      0.00      0.00       250
     Tolstoy       0.00      0.00      0.00       250

    accuracy                           0.25      1000
   macro avg       0.06      0.25      0.10      1000
weighted avg       0.06      0.25      0.10      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Characters model

In [114]:
print(classification_report(test_authors, test_predictions_char))

              precision    recall  f1-score   support

     Chekhov       0.56      0.55      0.56       250
  Dostoevsky       0.43      0.41      0.42       250
       Gogol       0.44      0.51      0.47       250
     Tolstoy       0.47      0.43      0.45       250

    accuracy                           0.48      1000
   macro avg       0.48      0.47      0.47      1000
weighted avg       0.48      0.47      0.47      1000

