# Preparing the data

## Importing libraries, downloading the model

In [None]:
import pandas
import sklearn
import numpy

# Record the versions of the core libraries this notebook was run with.
for library in (pandas, sklearn, numpy):
    print(library.__version__)

1.5.3
1.2.2
1.22.4


In [None]:
import spacy
# Alternative: the small Russian model. Left disabled; enable these two lines
# (and disable the large-model lines below) to use it instead.
# !python -m spacy download ru_core_news_sm
# nlp = spacy.load('ru_core_news_sm')

# Large Russian model: download the package through a shell command, then load
# it as the pipeline `nlp` that the rest of the notebook uses.
!python -m spacy download ru_core_news_lg
nlp = spacy.load('ru_core_news_lg')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ru-core-news-lg==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_lg-3.5.0/ru_core_news_lg-3.5.0-py3-none-any.whl (513.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m513.4/513.4 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting pymorphy3>=1.0.0 (from ru-core-news-lg==3.5.0)
  Downloading pymorphy3-1.2.0-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.4/55.4 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dawg-python>=0.7.1 (from pymorphy3>=1.0.0->ru-core-news-lg==3.5.0)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting docopt>=0.6 (from pymorphy3>=1.0.0->ru-core-news-lg==3.5.0)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pymorphy3-dicts-ru (from pymorphy3>=1.0.0->ru

## Making lists and doc objects from csv files

In [None]:
# Read the training split from its CSV file into a DataFrame.
train_set = pandas.read_csv(
    './train_data.csv',
    encoding='utf-8',
)

In [None]:
# Read the held-out test split from its CSV file into a DataFrame.
test_set = pandas.read_csv(
    './test_data.csv',
    encoding='utf-8',
)

In [None]:
# Pull the sentence texts and their author labels out of each split as plain lists.
train_sentences, train_authors = (
    train_set[column].tolist() for column in ('text', 'author')
)
test_sentences, test_authors = (
    test_set[column].tolist() for column in ('text', 'author')
)

# Sanity check: number of labelled examples in each split.
print(len(train_authors), len(test_authors))

10000 1000


In [None]:
# Run both splits through the spaCy pipeline.
train_doc_sentences, test_doc_sentences = (
    nlp.pipe(train_sentences),
    nlp.pipe(test_sentences),
)

## Preparing the feature set

In [None]:
# Training sentences of each author, selected by the value of the 'author' column.
dostoyevsky_data = train_set.loc[train_set['author'] == 'Dostoevsky', 'text'].tolist()
tolstoy_data = train_set.loc[train_set['author'] == 'Tolstoy', 'text'].tolist()
chekhov_data = train_set.loc[train_set['author'] == 'Chekhov', 'text'].tolist()
gogol_data = train_set.loc[train_set['author'] == 'Gogol', 'text'].tolist()

# Run each author's sentences through the spaCy pipeline.
dostoyevsky_data_doc, tolstoy_data_doc, chekhov_data_doc, gogol_data_doc = (
    nlp.pipe(author_texts)
    for author_texts in (dostoyevsky_data, tolstoy_data, chekhov_data, gogol_data)
)

In [None]:
from collections import Counter

def fivegram_pos_count(list_of_doc_sentences, n=5, top_k=5):
    """Return the most frequent POS-tag n-grams found across a collection of docs.

    Args:
        list_of_doc_sentences: Iterable of parsed documents. Each document must
            be iterable and yield tokens exposing a ``pos_`` attribute. A
            one-shot iterator is accepted; it is consumed exactly once.
        n: Number of consecutive tokens per n-gram. Defaults to 5, which is the
            behaviour the rest of the notebook relies on.
        top_k: How many of the most frequent n-grams to return. Defaults to 5.

    Returns:
        A list of at most ``top_k`` tuples of POS tags, most frequent first.
        N-grams with equal counts keep the order in which they were first seen.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    # Count while iterating instead of first materialising every n-gram in a list.
    ngram_counts = Counter()
    for doc in list_of_doc_sentences:
        # Read each token's tag once rather than re-slicing the doc per window.
        pos_tags = [token.pos_ for token in doc]
        # Documents shorter than n yield an empty range and contribute nothing.
        ngram_counts.update(
            tuple(pos_tags[start:start + n])
            for start in range(len(pos_tags) - n + 1)
        )
    return [ngram for ngram, _ in ngram_counts.most_common(top_k)]

def fivegram_pos_extractor_from_sentence(doc, n=5):
    """Return the distinct POS-tag n-grams that occur in a single document.

    Args:
        doc: Parsed document; must be iterable and yield tokens exposing a
            ``pos_`` attribute.
        n: Number of consecutive tokens per n-gram. Defaults to 5.

    Returns:
        A list of unique tuples of POS tags, in order of first occurrence.
        The list is empty when the document has fewer than ``n`` tokens.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    pos_tags = [token.pos_ for token in doc]
    windows = (
        tuple(pos_tags[start:start + n])
        for start in range(len(pos_tags) - n + 1)
    )
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result is the same on every run (iterating a set of string tuples is not).
    return list(dict.fromkeys(windows))

In [None]:
from collections import Counter

# Most frequent POS five-grams of each author.
five_fivegrams_dostoyevsky = fivegram_pos_count(dostoyevsky_data_doc)
five_fivegrams_tolstoy = fivegram_pos_count(tolstoy_data_doc)
five_fivegrams_chekhov = fivegram_pos_count(chekhov_data_doc)
five_fivegrams_gogol = fivegram_pos_count(gogol_data_doc)

# Union of the per-author lists; each entry becomes one feature column.
# dict.fromkeys drops duplicates but keeps first-seen order, so the column
# order is identical on every kernel restart. With list(set(...)) the order
# depends on string hash randomisation, which would make a saved model's
# columns impossible to reproduce in a new session.
fivegrams_list = list(dict.fromkeys(
    five_fivegrams_dostoyevsky
    + five_fivegrams_tolstoy
    + five_fivegrams_chekhov
    + five_fivegrams_gogol
))


[('NOUN', 'PUNCT', 'VERB', 'ADP', 'NOUN'), ('PUNCT', 'VERB', 'ADP', 'NOUN', 'PUNCT'), ('VERB', 'ADP', 'ADJ', 'NOUN', 'PUNCT'), ('NOUN', 'ADP', 'ADJ', 'NOUN', 'PUNCT'), ('NOUN', 'VERB', 'ADP', 'NOUN', 'PUNCT')]


In [None]:
# Show each author's top five-grams, then the size and content of the merged list.
for author_fivegrams in (
    five_fivegrams_dostoyevsky,
    five_fivegrams_tolstoy,
    five_fivegrams_chekhov,
    five_fivegrams_gogol,
):
    print(author_fivegrams)
print(len(fivegrams_list), fivegrams_list)

[('VERB', 'ADP', 'DET', 'NOUN', 'PUNCT'), ('PUNCT', 'SPACE', 'PUNCT', 'VERB', 'PRON'), ('NOUN', 'PUNCT', 'SPACE', 'PUNCT', 'SPACE'), ('VERB', 'PUNCT', 'SPACE', 'PUNCT', 'SPACE'), ('VERB', 'PRON', 'ADP', 'NOUN', 'PUNCT')]
[('X', 'X', 'X', 'X', 'X'), ('PUNCT', 'VERB', 'ADP', 'NOUN', 'PUNCT'), ('X', 'X', 'X', 'X', 'PUNCT'), ('NOUN', 'PUNCT', 'VERB', 'ADP', 'NOUN'), ('NOUN', 'ADP', 'ADJ', 'NOUN', 'PUNCT')]
[('NOUN', 'PUNCT', 'VERB', 'ADP', 'NOUN'), ('PUNCT', 'VERB', 'ADP', 'NOUN', 'PUNCT'), ('PUNCT', 'PUNCT', 'VERB', 'PRON', 'PUNCT'), ('NOUN', 'VERB', 'ADP', 'NOUN', 'PUNCT'), ('ADJ', 'PUNCT', 'ADJ', 'NOUN', 'PUNCT')]
[('NOUN', 'PUNCT', 'VERB', 'ADP', 'NOUN'), ('PUNCT', 'VERB', 'ADP', 'NOUN', 'PUNCT'), ('VERB', 'ADP', 'ADJ', 'NOUN', 'PUNCT'), ('NOUN', 'ADP', 'ADJ', 'NOUN', 'PUNCT'), ('NOUN', 'VERB', 'ADP', 'NOUN', 'PUNCT')]
14 [('NOUN', 'ADP', 'ADJ', 'NOUN', 'PUNCT'), ('NOUN', 'PUNCT', 'SPACE', 'PUNCT', 'SPACE'), ('NOUN', 'PUNCT', 'VERB', 'ADP', 'NOUN'), ('ADJ', 'PUNCT', 'ADJ', 'NOUN', 'PUN

In [None]:
import numpy

# One all-zero feature vector per sentence (training and test set alike),
# with one column per five-gram in fivegrams_list.
train_features_matrix = numpy.zeros(shape=(len(train_sentences), len(fivegrams_list)))
print(train_features_matrix.shape)

test_features_matrix = numpy.zeros(shape=(len(test_sentences), len(fivegrams_list)))
print(test_features_matrix.shape)

(10000, 14)
(1000, 14)


# Modifying the feature vectors

## Visualisation tests

In [None]:
import sys  # no longer used in this cell; kept so the notebook's imports are unchanged

# Start a fresh pass over the training sentences.
train_doc_sentences = nlp.pipe(train_sentences)

for sentence, author in zip(train_doc_sentences, train_authors):
    print(author)
    print(sentence)
    fivegram_pos_list = fivegram_pos_extractor_from_sentence(sentence)
    print(fivegram_pos_list)
    # First reference five-gram (in fivegrams_list order) found in this sentence.
    fivegram = next(
        (candidate for candidate in fivegrams_list if candidate in fivegram_pos_list),
        None,
    )
    if fivegram is not None:
        print(fivegram)
        fivegram_id = fivegrams_list.index(fivegram)
        print(fivegram_id)
        # Stop after the first hit. `break` ends the cell cleanly, whereas
        # sys.exit() left a SystemExit in the cell output.
        break

Dostoevsky
Но каково же было мое изумление, когда Наташа с первых же слов остановила меня и сказала, что нечего ее утешать, что она уже пять дней, как знает про это..     – Боже мой!
[('VERB', 'PUNCT', 'SCONJ', 'VERB', 'PRON'), ('DET', 'NOUN', 'PUNCT', 'SCONJ', 'PROPN'), ('VERB', 'PRON', 'CCONJ', 'VERB', 'PUNCT'), ('SPACE', 'PUNCT', 'NOUN', 'DET', 'PUNCT'), ('PRON', 'PUNCT', 'SPACE', 'PUNCT', 'NOUN'), ('ADJ', 'PART', 'NOUN', 'VERB', 'PRON'), ('PROPN', 'ADP', 'ADJ', 'PART', 'NOUN'), ('ADP', 'PRON', 'PUNCT', 'SPACE', 'PUNCT'), ('PUNCT', 'SPACE', 'PUNCT', 'NOUN', 'DET'), ('SCONJ', 'PRON', 'ADV', 'NUM', 'NOUN'), ('NOUN', 'VERB', 'PRON', 'CCONJ', 'VERB'), ('ADJ', 'PART', 'AUX', 'DET', 'NOUN'), ('AUX', 'DET', 'NOUN', 'PUNCT', 'SCONJ'), ('PRON', 'CCONJ', 'VERB', 'PUNCT', 'SCONJ'), ('SCONJ', 'VERB', 'ADP', 'PRON', 'PUNCT'), ('VERB', 'PUNCT', 'SCONJ', 'PRON', 'ADV'), ('PART', 'AUX', 'DET', 'NOUN', 'PUNCT'), ('NOUN', 'PUNCT', 'SCONJ', 'PROPN', 'ADP'), ('PUNCT', 'SCONJ', 'VERB', 'ADP', 'PRON'), (

SystemExit: ignored

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


Visualisation test 2:

In [None]:
import sys  # no longer used in this cell; kept so the notebook's imports are unchanged

# Start a fresh pass over the training sentences.
train_doc_sentences = nlp.pipe(train_sentences)

for sentence, author, feature_vector in zip(train_doc_sentences, train_authors, train_features_matrix):
    print('Author:', author)
    print(sentence)
    fivegram_pos_list = fivegram_pos_extractor_from_sentence(sentence)
    print(fivegram_pos_list)
    # First reference five-gram (in fivegrams_list order) found in this sentence.
    fivegram = next(
        (candidate for candidate in fivegrams_list if candidate in fivegram_pos_list),
        None,
    )
    if fivegram is not None:
        fivegram_id = fivegrams_list.index(fivegram)
        print(fivegram_id)
        print(fivegram)
        # Demonstrate the update on a copy: feature_vector is a row of
        # train_features_matrix, and a visualisation should not leave a stray 1
        # in the matrix that is later used for training.
        demo_vector = feature_vector.copy()
        demo_vector[fivegram_id] = 1
        print(demo_vector.tolist())
        # Stop after the first hit. `break` ends the cell cleanly, whereas
        # sys.exit() left a SystemExit in the cell output.
        break

Author: Dostoevsky
Но каково же было мое изумление, когда Наташа с первых же слов остановила меня и сказала, что нечего ее утешать, что она уже пять дней, как знает про это..     – Боже мой!
[('VERB', 'PUNCT', 'SCONJ', 'VERB', 'PRON'), ('DET', 'NOUN', 'PUNCT', 'SCONJ', 'PROPN'), ('VERB', 'PRON', 'CCONJ', 'VERB', 'PUNCT'), ('SPACE', 'PUNCT', 'NOUN', 'DET', 'PUNCT'), ('PRON', 'PUNCT', 'SPACE', 'PUNCT', 'NOUN'), ('ADJ', 'PART', 'NOUN', 'VERB', 'PRON'), ('PROPN', 'ADP', 'ADJ', 'PART', 'NOUN'), ('ADP', 'PRON', 'PUNCT', 'SPACE', 'PUNCT'), ('PUNCT', 'SPACE', 'PUNCT', 'NOUN', 'DET'), ('SCONJ', 'PRON', 'ADV', 'NUM', 'NOUN'), ('NOUN', 'VERB', 'PRON', 'CCONJ', 'VERB'), ('ADJ', 'PART', 'AUX', 'DET', 'NOUN'), ('AUX', 'DET', 'NOUN', 'PUNCT', 'SCONJ'), ('PRON', 'CCONJ', 'VERB', 'PUNCT', 'SCONJ'), ('SCONJ', 'VERB', 'ADP', 'PRON', 'PUNCT'), ('VERB', 'PUNCT', 'SCONJ', 'PRON', 'ADV'), ('PART', 'AUX', 'DET', 'NOUN', 'PUNCT'), ('NOUN', 'PUNCT', 'SCONJ', 'PROPN', 'ADP'), ('PUNCT', 'SCONJ', 'VERB', 'ADP', 'P

SystemExit: ignored

## Writing a function for feature vector modification

In [None]:
def modify_feature_vectors(doc_sentences, features_matrix, fivegrams=None):
    """Flag, for every sentence, which reference POS five-grams occur in it.

    Args:
        doc_sentences: Iterable of parsed documents, paired positionally with
            the rows of ``features_matrix``. It is consumed once.
        features_matrix: Two-dimensional matrix whose rows support item
            assignment by column index. It is modified IN PLACE.
        fivegrams: Optional sequence of reference n-grams, one per column of
            ``features_matrix``. Defaults to the notebook-level
            ``fivegrams_list`` when omitted.

    Returns:
        The same ``features_matrix`` object, with entry ``[row, column]`` set
        to 1 wherever the row's sentence contains the column's n-gram.

    Note:
        ``zip`` stops at the shorter input, so an already exhausted document
        iterator leaves the matrix untouched without raising an error.
    """
    if fivegrams is None:
        fivegrams = fivegrams_list
    for sentence, feature_vector in zip(doc_sentences, features_matrix):
        # A set gives constant-time membership checks for the loop below.
        sentence_fivegrams = set(fivegram_pos_extractor_from_sentence(sentence))
        # enumerate supplies the column index directly, avoiding a list.index
        # scan for every match.
        for column, fivegram in enumerate(fivegrams):
            if fivegram in sentence_fivegrams:
                feature_vector[column] = 1
    return features_matrix

In [None]:
# Start from a fresh pass over the training sentences and an all-zero matrix
# (one row per sentence, one column per reference five-gram).
train_doc_sentences = nlp.pipe(train_sentences)
train_features_matrix = numpy.zeros(shape=(len(train_sentences), len(fivegrams_list)))

In [None]:
# Fill in the training feature vectors.
train_features_matrix_final = modify_feature_vectors(
    doc_sentences=train_doc_sentences,
    features_matrix=train_features_matrix,
)

# Training

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier with default settings: the model learns
# the relationship between the five-gram feature vectors and the author labels.
# fit() returns the estimator itself, so construction and training are chained.
lr_common_5grams = LogisticRegression().fit(train_features_matrix_final, train_authors)

# Show the class labels seen during fitting and the hyper-parameters in effect.
print(lr_common_5grams.classes_)
print(lr_common_5grams.get_params())

['Chekhov' 'Dostoevsky' 'Gogol' 'Tolstoy']
{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


# Modifying the test set feature vectors

In [None]:
# Same feature extraction as for the training set, applied to the test sentences.
test_features_matrix = numpy.zeros(shape=(len(test_sentences), len(fivegrams_list)))
test_doc_sentences = nlp.pipe(test_sentences)

test_features_matrix_final = modify_feature_vectors(
    doc_sentences=test_doc_sentences,
    features_matrix=test_features_matrix,
)

# Making predictions

In [None]:
def predict(i):
    """Print test example ``i`` together with the classifier's output for it.

    Prints, in order: the sentence text, its feature vector, the true author,
    a blank line, the "Prediction:" header, the predicted label, the predicted
    probabilities per class, and a closing blank line.

    Args:
        i: Index of the example in the test set.
    """
    print(test_sentences[i])

    feature_row = test_features_matrix_final[i]
    print(feature_row)
    print(test_authors[i])

    # The model is called with a batch, so wrap the single row in a list.
    single_sample = [feature_row]

    print()
    print("Prediction:")
    print(lr_common_5grams.predict(single_sample))
    print(lr_common_5grams.predict_proba(single_sample))
    print()

In [None]:
# Inspect the predictions for the first six test sentences.
for sample_index in range(6):
    predict(sample_index)

In [None]:
# Predict an author for every test sentence in one call.
test_predictions_common_5grams = lr_common_5grams.predict(test_features_matrix_final)

# Compare the first ten predictions with the true authors.
for predicted, actual in zip(test_predictions_common_5grams[:10], test_authors[:10]):
    verdict = "Correct" if predicted == actual else "Incorrect"
    print(f"{predicted}({verdict}:{actual})")

Dostoevsky(Correct:Dostoevsky)
Dostoevsky(Incorrect:Chekhov)
Gogol(Correct:Gogol)
Dostoevsky(Correct:Dostoevsky)
Dostoevsky(Incorrect:Gogol)
Dostoevsky(Correct:Dostoevsky)
Dostoevsky(Correct:Dostoevsky)
Dostoevsky(Incorrect:Chekhov)
Dostoevsky(Correct:Dostoevsky)
Dostoevsky(Correct:Dostoevsky)


# Saving the model

In [None]:
import pickle

# Serialise the fitted classifier to a file in the current working directory.
pkl_filename = "logreg_common_5grams.pkl"
with open(pkl_filename, mode='wb') as model_file:
    pickle.dump(lr_common_5grams, model_file)

# Evaluating the model

## Dummy model

In [None]:
from sklearn.metrics import accuracy_score

# Baseline: predict the same author for every test sentence.
dummy_predictions = ['Dostoevsky'] * len(test_sentences)
print(len(dummy_predictions))

# Calculate the accuracy of these "dummy predictions"
acc_dummy = accuracy_score(test_authors, dummy_predictions)
print('The accuracy is:', acc_dummy)
print()

from sklearn.metrics import classification_report
# Three of the four authors are never predicted, so their precision is
# undefined. zero_division=0 reports it as 0.0 (the value shown before) without
# emitting an undefined-metric warning for each affected average.
print(classification_report(test_authors, dummy_predictions, zero_division=0))

1000
The accuracy is: 0.25

              precision    recall  f1-score   support

     Chekhov       0.00      0.00      0.00       250
  Dostoevsky       0.25      1.00      0.40       250
       Gogol       0.00      0.00      0.00       250
     Tolstoy       0.00      0.00      0.00       250

    accuracy                           0.25      1000
   macro avg       0.06      0.25      0.10      1000
weighted avg       0.06      0.25      0.10      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Common POS Model

In [None]:
from sklearn.metrics import accuracy_score

print('Accuracy:')

# Fraction of test sentences whose author was predicted correctly.
acc = accuracy_score(test_authors, test_predictions_common_5grams)
print(acc)

# normalize=False yields the number of correct predictions instead of the fraction.
corr_count = accuracy_score(test_authors, test_predictions_common_5grams, normalize=False)
total_count = len(test_authors)

print(f"Total reviews: {total_count}")
print(f"Total correct predictions:{corr_count}")
corr_ratio = corr_count / total_count
print(f"Correct ratio:{corr_ratio}")

Accuracy:
0.272
Total reviews: 1000
Total correct predictions:272
Correct ratio:0.272


In [None]:
from sklearn.metrics import classification_report

# Per-author precision, recall and F1 of the five-gram model on the test set.
print(classification_report(y_true=test_authors, y_pred=test_predictions_common_5grams))

              precision    recall  f1-score   support

     Chekhov       0.50      0.04      0.07       250
  Dostoevsky       0.26      0.95      0.41       250
       Gogol       0.31      0.07      0.12       250
     Tolstoy       0.29      0.03      0.06       250

    accuracy                           0.27      1000
   macro avg       0.34      0.27      0.16      1000
weighted avg       0.34      0.27      0.16      1000

