# Preparing the data

## Importing libraries, downloading the model

In [11]:
import pandas
import sklearn
import numpy
import spacy
from collections import Counter
import pickle
import sys
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

#Besides modules, I will also import some functions written by me from my functions.py file
from functions import predict_new, predict2, model_analysis2, fivegram_pos_extractor_from_sentence

print(pandas.__version__)
print(sklearn.__version__)
print(numpy.__version__)

2.0.3
1.2.2
1.25.2


In [12]:
# Small Russian model:
# !python -m spacy download ru_core_news_sm
# nlp = spacy.load('ru_core_news_sm')

# Large Russian model:
!python -m spacy download ru_core_news_lg
nlp = spacy.load('ru_core_news_lg')

Collecting ru-core-news-lg==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_lg-3.7.0/ru_core_news_lg-3.7.0-py3-none-any.whl (513.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m513.4/513.4 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('ru_core_news_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


## Making lists from csv files

In [13]:
# Let's load the training data from a csv file
train_set = pandas.read_csv('./train_data_2024.csv', encoding='utf-8')
train_set.head()

Unnamed: 0,text,author
0,Он старался не развлекаться и не портить себе ...,Tolstoy
1,Всегда этак у меня перед припадком бывает.,Chekhov
2,"Катерина Николаевна тут же и. отказала ему, ...",Dostoevsky
3,Анна Андреевна.,Gogol
4,"— То, что я видел сейчас, хуже всякой простуды...",Chekhov


In [14]:
test_set = pandas.read_csv('./test_data_2024.csv', encoding='utf-8')
test_set.head()

Unnamed: 0,text,author
0,— Это мы понимаем… Мы ведь не все отвинчиваем…...,Chekhov
1,Да что я?,Gogol
2,"Я думаю, у меня горло замерзло от проклятого м...",Gogol
3,"На деда, несмотря на весь страх, смех напал, к...",Gogol
4,"Действительно, влияние товарищей оказало на не...",Tolstoy


In [15]:
train_sentences = train_set['text'].to_list()
train_authors = train_set['author'].to_list()

test_sentences = test_set['text'].to_list()
test_authors = test_set['author'].to_list()

print(len(train_authors), len(test_authors))

10000 1000


## Preparing the feature set

We can simply copy the contents of fivegrams_list from the Data_analysis notebook

In [16]:
fivegrams_list = [('VERB', 'ADP', 'ADJ', 'NOUN', 'PUNCT'), ('PRON', 'VERB', 'ADP', 'NOUN', 'PUNCT'), ('VERB', 'ADP', 'NOUN', 'CCONJ', 'VERB'), ('PUNCT', 'VERB', 'ADP', 'NOUN', 'PUNCT'), ('X', 'X', 'X', 'X', 'X'), ('NOUN', 'ADP', 'ADJ', 'NOUN', 'PUNCT'), ('VERB', 'ADP', 'NOUN', 'PUNCT', 'PUNCT'), ('VERB', 'NOUN', 'ADP', 'NOUN', 'PUNCT'), ('NOUN', 'PUNCT', 'VERB', 'NOUN', 'PUNCT'), ('VERB', 'ADP', 'DET', 'NOUN', 'PUNCT'), ('NOUN', 'PUNCT', 'VERB', 'ADP', 'NOUN'), ('VERB', 'PUNCT', 'SCONJ', 'PRON', 'VERB')]
print(len(fivegrams_list))

12


In [17]:
entity_types = ['PER', 'LOC', 'ORG']

# We are creating a matrix with zero vectors for each review (in training set and test set)
train_features_matrix = numpy.zeros((len(train_sentences), 3 + len(fivegrams_list)))
print(train_features_matrix.shape)

test_features_matrix = numpy.zeros((len(test_sentences), 3 + len(fivegrams_list)))
print(test_features_matrix.shape)

(10000, 15)
(1000, 15)


# Modifying the feature vectors

## Visualisation tests

In [None]:
train_doc_sentences = nlp.pipe(train_sentences)

counter = 0

for sentence, author in zip(train_doc_sentences, train_authors):
    print(author)
    print(sentence)
    NEs_in_sentence = [entity.label_ for entity in sentence.ents]
    print(NEs_in_sentence)
    for entity_type in entity_types:
      if entity_type in NEs_in_sentence:
        print(entity_type)
    fivegram_pos_list = fivegram_pos_extractor_from_sentence(sentence)
    print(fivegram_pos_list)
    for fivegram in fivegrams_list:
      if fivegram in fivegram_pos_list:
        print(fivegram)
        fivegram_id = fivegrams_list.index(fivegram)
        print(fivegram_id)
    print()
    counter +=1
    if counter == 5:
        break

Tolstoy
Он старался не развлекаться и не портить себе впечатления, глядя на махание руками белогалстучного капельмейстера, всегда так неприятно развлекающее музыкальное внимание, на дам в шляпах, старательно для концерта завязавших себе уши лентами, и на все эти лица, или ничем не занятые, или занятые самыми разнообразными интересами, но только не музыкой.
[]
[('NOUN', 'PUNCT', 'ADP', 'VERB', 'ADP'), ('DET', 'DET', 'NOUN', 'PUNCT', 'CCONJ'), ('PART', 'VERB', 'PUNCT', 'CCONJ', 'VERB'), ('ADP', 'NOUN', 'VERB', 'PRON', 'NOUN'), ('ADP', 'DET', 'DET', 'NOUN', 'PUNCT'), ('CCONJ', 'PART', 'PART', 'NOUN', 'PUNCT'), ('PUNCT', 'CCONJ', 'PART', 'PART', 'NOUN'), ('CCONJ', 'PART', 'VERB', 'PRON', 'NOUN'), ('VERB', 'ADP', 'NOUN', 'NOUN', 'ADJ'), ('ADJ', 'VERB', 'ADJ', 'NOUN', 'PUNCT'), ('NOUN', 'NOUN', 'ADJ', 'NOUN', 'PUNCT'), ('ADV', 'ADP', 'NOUN', 'VERB', 'PRON'), ('PRON', 'PART', 'VERB', 'PUNCT', 'CCONJ'), ('PRON', 'VERB', 'PART', 'VERB', 'CCONJ'), ('ADJ', 'ADJ', 'NOUN', 'PUNCT', 'CCONJ'), ('PUNC

Visualisation test 2:

In [None]:
# Process a text
train_features_matrix = numpy.zeros((len(train_sentences), 3 + len(fivegrams_list)))
train_doc_sentences = nlp.pipe(train_sentences)

counter = 0
# loop over each review, label and feature vector at the same time (zip)
for sentence, author, feature_vector in zip(train_doc_sentences, train_authors, train_features_matrix):
    print('Author:', author)
    print(sentence)
    NEs_in_sentence = [entity.label_ for entity in sentence.ents]
    #print(tokens_list)
    for entity_type in entity_types:
      if entity_type in NEs_in_sentence:
        entity_id = entity_types.index(entity_type)
        print(entity_type)
        print(entity_id)
        feature_vector[entity_id] = 1
        print(feature_vector)
    fivegram_pos_list = fivegram_pos_extractor_from_sentence(sentence)
    print(fivegram_pos_list)
    for fivegram in fivegrams_list:
      if fivegram in fivegram_pos_list:
        fivegram_id = fivegrams_list.index(fivegram) + 3
        print(fivegram_id)
        print(fivegram)
        feature_vector[fivegram_id] = 1
        print(feature_vector)
    print()
    counter +=1
    if counter == 10:
        break

Author: Tolstoy
Он старался не развлекаться и не портить себе впечатления, глядя на махание руками белогалстучного капельмейстера, всегда так неприятно развлекающее музыкальное внимание, на дам в шляпах, старательно для концерта завязавших себе уши лентами, и на все эти лица, или ничем не занятые, или занятые самыми разнообразными интересами, но только не музыкой.
[('NOUN', 'PUNCT', 'ADP', 'VERB', 'ADP'), ('DET', 'DET', 'NOUN', 'PUNCT', 'CCONJ'), ('PART', 'VERB', 'PUNCT', 'CCONJ', 'VERB'), ('ADP', 'NOUN', 'VERB', 'PRON', 'NOUN'), ('ADP', 'DET', 'DET', 'NOUN', 'PUNCT'), ('CCONJ', 'PART', 'PART', 'NOUN', 'PUNCT'), ('PUNCT', 'CCONJ', 'PART', 'PART', 'NOUN'), ('CCONJ', 'PART', 'VERB', 'PRON', 'NOUN'), ('VERB', 'ADP', 'NOUN', 'NOUN', 'ADJ'), ('ADJ', 'VERB', 'ADJ', 'NOUN', 'PUNCT'), ('NOUN', 'NOUN', 'ADJ', 'NOUN', 'PUNCT'), ('ADV', 'ADP', 'NOUN', 'VERB', 'PRON'), ('PRON', 'PART', 'VERB', 'PUNCT', 'CCONJ'), ('PRON', 'VERB', 'PART', 'VERB', 'CCONJ'), ('ADJ', 'ADJ', 'NOUN', 'PUNCT', 'CCONJ'), (

## Writing a function for feature vector modification

In [18]:
def modify_feature_vectors(doc_sentences, features_matrix):
  for sentence, feature_vector in zip(doc_sentences, features_matrix):
    NEs_in_sentence = [entity.label_ for entity in sentence.ents]
    for entity_type in entity_types:
      if entity_type in NEs_in_sentence:
        entity_id = entity_types.index(entity_type)
        feature_vector[entity_id] = 1

    fivegram_pos_list = fivegram_pos_extractor_from_sentence(sentence)
    for fivegram in fivegrams_list:
      if fivegram in fivegram_pos_list:
        fivegram_id = fivegrams_list.index(fivegram) + 3
        feature_vector[fivegram_id] = 1

  return features_matrix

In [19]:
train_features_matrix = numpy.zeros((len(train_sentences), 3 + len(fivegrams_list)))
train_doc_sentences = nlp.pipe(train_sentences)

train_features_matrix_final = modify_feature_vectors(train_doc_sentences, train_features_matrix)

# Training

In [20]:
lr7_NER_and_common_5grams = LogisticRegression()

# Train the model on the data, storing the information learned from the dat`a
# Model is learning the relationship between digits (x_train) and labels (y_train)
lr7_NER_and_common_5grams.fit(train_features_matrix_final, train_authors)

print(lr7_NER_and_common_5grams.classes_)

['Chekhov' 'Dostoevsky' 'Gogol' 'Tolstoy']


# Modifying the test set feature vectors

In [21]:
test_doc_sentences = nlp.pipe(test_sentences)
test_features_matrix = numpy.zeros((len(test_sentences), 3 + len(fivegrams_list)))

test_features_matrix_final = modify_feature_vectors(test_doc_sentences, test_features_matrix)

# Making predictions

In [22]:
predict_data = [test_sentences, test_features_matrix_final, test_authors, lr7_NER_and_common_5grams]

In [23]:
predict_new(0, predict_data)
predict_new(1, predict_data)
predict_new(2, predict_data)
predict_new(3, predict_data)
predict_new(4, predict_data)
predict_new(5, predict_data)

— Это мы понимаем… Мы ведь не все отвинчиваем… оставляем… Не без ума делаем… понимаем….
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Chekhov

Prediction:
['Dostoevsky']
[[0.24466936 0.26670898 0.25764907 0.2309726 ]]
--------------------------------------------------------

Да что я?
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Gogol

Prediction:
['Dostoevsky']
[[0.24466936 0.26670898 0.25764907 0.2309726 ]]
--------------------------------------------------------

Я думаю, у меня горло замерзло от проклятого морозу.
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Gogol

Prediction:
['Gogol']
[[0.24897658 0.23538726 0.28700098 0.22863519]]
--------------------------------------------------------

На деда, несмотря на весь страх, смех напал, когда увидел, как черти с собачьими мордами, на немецких ножках, вертя хвостами, увивались около ведьм, будто парни около красных девушек; а музыканты тузили себя в щеки кулаками, словно в бубны, и свистали носами, как в валторны.
[0. 0. 0. 0. 0.

In [24]:
test_predictions_NER_common_5grams = lr7_NER_and_common_5grams.predict(test_features_matrix_final)

In [25]:
predict2(test_predictions_NER_common_5grams, test_authors)

Dostoevsky (Incorrect:Chekhov)
Dostoevsky (Incorrect:Gogol)
Gogol (Correct:Gogol)
Gogol (Correct:Gogol)
Dostoevsky (Incorrect:Tolstoy)
Tolstoy (Correct:Tolstoy)
Dostoevsky (Incorrect:Chekhov)
Dostoevsky (Incorrect:Tolstoy)
Dostoevsky (Incorrect:Gogol)
Dostoevsky (Incorrect:Tolstoy)


#Analysing the model

In [26]:
features_list = entity_types + fivegrams_list
model_analysis2(lr7_NER_and_common_5grams, features_list, top_n=20)

Class: Chekhov
Feature: ('X', 'X', 'X', 'X', 'X'), Coefficient: -1.0036
Feature: ('VERB', 'ADP', 'NOUN', 'CCONJ', 'VERB'), Coefficient: 0.9345
Feature: ('VERB', 'ADP', 'NOUN', 'PUNCT', 'PUNCT'), Coefficient: 0.8803
Feature: ORG, Coefficient: -0.5815
Feature: ('VERB', 'NOUN', 'ADP', 'NOUN', 'PUNCT'), Coefficient: 0.4428
Feature: LOC, Coefficient: -0.2767
Feature: ('NOUN', 'PUNCT', 'VERB', 'NOUN', 'PUNCT'), Coefficient: 0.2490
Feature: ('VERB', 'ADP', 'DET', 'NOUN', 'PUNCT'), Coefficient: -0.1674
Feature: ('NOUN', 'PUNCT', 'VERB', 'ADP', 'NOUN'), Coefficient: -0.1335
Feature: ('NOUN', 'ADP', 'ADJ', 'NOUN', 'PUNCT'), Coefficient: -0.0969
Feature: ('PUNCT', 'VERB', 'ADP', 'NOUN', 'PUNCT'), Coefficient: -0.0954
Feature: ('VERB', 'PUNCT', 'SCONJ', 'PRON', 'VERB'), Coefficient: -0.0917
Feature: PER, Coefficient: 0.0778
Feature: ('VERB', 'ADP', 'ADJ', 'NOUN', 'PUNCT'), Coefficient: 0.0199
Feature: ('PRON', 'VERB', 'ADP', 'NOUN', 'PUNCT'), Coefficient: -0.0150
Intercept: -0.0201

Class: Dostoev

# Saving the model

In [27]:
# Save to file in the current working directory
pkl_filename = "logreg_NER_and_common_5grams.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(lr7_NER_and_common_5grams, file)

# Evaluating the model

## Dummy model

In [None]:
dummy_predictions = ['Dostoevsky'] * len(test_sentences)
print(classification_report(test_authors, dummy_predictions))

              precision    recall  f1-score   support

     Chekhov       0.00      0.00      0.00       250
  Dostoevsky       0.25      1.00      0.40       250
       Gogol       0.00      0.00      0.00       250
     Tolstoy       0.00      0.00      0.00       250

    accuracy                           0.25      1000
   macro avg       0.06      0.25      0.10      1000
weighted avg       0.06      0.25      0.10      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## NER + Common POS Model

In [28]:
print(classification_report(test_authors, test_predictions_NER_common_5grams))

              precision    recall  f1-score   support

     Chekhov       0.17      0.02      0.04       250
  Dostoevsky       0.28      0.66      0.39       250
       Gogol       0.28      0.06      0.10       250
     Tolstoy       0.28      0.36      0.31       250

    accuracy                           0.27      1000
   macro avg       0.25      0.27      0.21      1000
weighted avg       0.25      0.27      0.21      1000

