# Preparing the data

## Importing libraries, downloading the model

In [None]:
import pandas
import sklearn
import numpy
import spacy
import string
import sys
from sklearn.linear_model import LogisticRegression
import pickle
from sklearn.metrics import accuracy_score, classification_report

# Report the versions of the core libraries used in this run.
for library in (pandas, sklearn, numpy):
    print(library.__version__)

1.5.3
1.2.2
1.22.4


In [None]:
# Download the spaCy Russian pipeline with a shell command, then load it as `nlp`.
# NOTE(review): the download runs every time this cell is executed; the recorded
# output shows a 513.4 MB wheel for the large model.

# Small Russian model (alternative, kept disabled):
# !python -m spacy download ru_core_news_sm
# nlp = spacy.load('ru_core_news_sm')

# Large Russian model:
!python -m spacy download ru_core_news_lg
nlp = spacy.load('ru_core_news_lg')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ru-core-news-lg==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_lg-3.5.0/ru_core_news_lg-3.5.0-py3-none-any.whl (513.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m513.4/513.4 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting pymorphy3>=1.0.0 (from ru-core-news-lg==3.5.0)
  Downloading pymorphy3-1.2.0-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.4/55.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dawg-python>=0.7.1 (from pymorphy3>=1.0.0->ru-core-news-lg==3.5.0)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting docopt>=0.6 (from pymorphy3>=1.0.0->ru-core-news-lg==3.5.0)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pymorphy3-dicts-ru (from pymorphy3>=1.0.0->ru

## Making lists and doc objects from CSV files

In [None]:
# Read the training split from the CSV file that sits next to the notebook.
train_set = pandas.read_csv(
    './train_data.csv',
    encoding='utf-8',
)

In [None]:
# Read the held-out test split from the CSV file that sits next to the notebook.
test_set = pandas.read_csv(
    './test_data.csv',
    encoding='utf-8',
)

In [None]:
# Turn the 'text' and 'author' columns of each split into parallel Python lists.
train_sentences, train_authors = (
    train_set[column].to_list() for column in ('text', 'author')
)
test_sentences, test_authors = (
    test_set[column].to_list() for column in ('text', 'author')
)

# Number of labelled examples in each split.
print(len(train_authors), len(test_authors))

10000 1000


In [None]:
# Run both splits through the spaCy pipeline.
# NOTE(review): per the spaCy docs, Language.pipe yields Doc objects lazily, so
# these are presumably single-pass iterators - confirm before iterating twice.
train_doc_sentences, test_doc_sentences = (
    nlp.pipe(sentences) for sentences in (train_sentences, test_sentences)
)

## Preparing the feature set

First we will take the Latin letters from `string.ascii_letters`.

In [None]:
# string.ascii_letters is ascii_lowercase followed by ascii_uppercase.
print(string.ascii_lowercase + string.ascii_uppercase)

abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ


Then we will check whether French, German or Ukrainian special characters also appear in the sentences, since all four writers were known to use words from these languages.

In [None]:
# Language-specific letters that are not covered by string.ascii_letters.
french_accent_marks = "éèêëàÉÈÊËÀ"
german_characters = "äöüßÄÖÜ"
ukrainian_characters = "ґєїіҐЄЇІ"

# (alphabet, message) pairs, checked in this order for every character.
alphabet_messages = (
    (ukrainian_characters, 'Ukr character found'),
    (german_characters, 'German character found'),
    (french_accent_marks, 'French character found'),
)

# Emit one line per occurrence of a special character in the training sentences.
for sentence in train_sentences:
  for character in sentence:
    for alphabet, message in alphabet_messages:
      if character in alphabet:
        print(message)

French character found
French character found
French character found
French character found
French character found
French character found
French character found
French character found
French character found
French character found
French character found
French character found
French character found
French character found
French character found
French character found
French character found
French character found
French character found
French character found
French character found
French character found
French character found
French character found
French character found
French character found
French character found
French character found
French character found
French character found
French character found
French character found
French character found


As we can see, there are no special Ukrainian or German characters, but there are special French characters. Thus we will add them to our character string.

In [None]:
# Feature alphabet: the Latin letters followed by the French accented letters.
# The position of a character in this string is its feature column.
new_string = ''.join([string.ascii_letters, french_accent_marks])
print(len(new_string), new_string)

62 abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZéèêëàÉÈÊËÀ



In [None]:
# Allocate one all-zero row per sentence and one column per character of
# new_string, for the training split and for the test split.
train_features_matrix = numpy.zeros(shape=(len(train_sentences), len(new_string)))
test_features_matrix = numpy.zeros(shape=(len(test_sentences), len(new_string)))

print(train_features_matrix.shape)
print(test_features_matrix.shape)

(10000, 62)
(1000, 62)


# Modifying the feature vectors

## Visualisation tests

In [None]:
# Look at the first 40 training sentences; for each one that contains a
# character from new_string, show the author, the sentence and the feature
# index of the FIRST such character only.
counter = 0

for counter, (sentence, author) in enumerate(zip(train_sentences, train_authors), start=1):
    first_match = next((char for char in sentence if char in new_string), None)
    if first_match is not None:
        print('Author:', author)
        print(sentence)
        char_id = new_string.index(first_match)
        print(char_id)
    if counter == 40:
        break

Author: Tolstoy
Oh, sans doute, c'est la plus charmante femme du monde, — сказала Анна Павловна с улыбкой над своей восторженностью.
40


Visualisation test 2:

In [None]:
# Same walk over the first 40 training sentences, but now every character from
# new_string is reported and its column is switched on.
# NOTE: feature_vector is a row obtained by iterating train_features_matrix,
# so the assignment below writes into that matrix.
counter = 0

for counter, (sentence, author, feature_vector) in enumerate(
        zip(train_sentences, train_authors, train_features_matrix), start=1):
    matching_chars = (char for char in sentence if char in new_string)
    for char in matching_chars:
        print('Author:', author)
        print(sentence)
        char_id = new_string.index(char)
        print(char_id)
        feature_vector[char_id] = 1
        print(feature_vector.tolist())
    if counter == 40:
        break

## Writing a function for vector modification

In [None]:
def modify_feature_vectors(sentences, features_matrix, alphabet=None):
  """Mark, for every sentence, which characters of the alphabet occur in it.

  Row ``k`` of ``features_matrix`` belongs to ``sentences[k]``; column ``j``
  is set to 1 when the ``j``-th character of ``alphabet`` appears at least
  once in that sentence. Other entries are left as they were.

  Args:
    sentences: iterable of strings.
    features_matrix: sequence of mutable rows (e.g. a 2-D numpy array) with
      at least ``len(alphabet)`` columns. It is modified IN PLACE.
    alphabet: string whose character positions define the feature columns.
      Defaults to the notebook-level ``new_string``.

  Returns:
    The same ``features_matrix`` object that was passed in.
  """
  if alphabet is None:
    alphabet = new_string

  # Build the character -> column table once instead of scanning the alphabet
  # twice per character. setdefault keeps the first position of a repeated
  # character, which is what str.index would return.
  column_of = {}
  for column, char in enumerate(alphabet):
    column_of.setdefault(char, column)

  for sentence, feature_vector in zip(sentences, features_matrix):
    # Only presence matters, so each distinct character is handled once.
    for char in set(sentence):
      column = column_of.get(char)
      if column is not None:
        feature_vector[column] = 1
  return features_matrix

In [None]:
# Start again from an all-zero matrix, then fill it for the training split.
# modify_feature_vectors returns the matrix it was given, so both names below
# refer to the same array.
train_features_matrix = numpy.zeros(shape=(len(train_sentences), len(new_string)))
train_features_matrix_final = modify_feature_vectors(
    train_sentences,
    train_features_matrix,
)

# Spot-check five rows.
print(train_features_matrix_final[35:40])

# Training

In [None]:
# Logistic-regression classifier over the character-presence features.
# max_iter is raised above the default of 100 because the recorded run stopped
# at the iteration limit before the lbfgs solver had converged.
lr_latin = LogisticRegression(max_iter=1000)

# Fit the model: it learns the relationship between the feature vectors
# (train_features_matrix_final) and the author labels (train_authors).
lr_latin.fit(train_features_matrix_final, train_authors)

# Learned label order (the column order of predict_proba) and the settings used.
print(lr_latin.classes_)
print(lr_latin.get_params())

['Chekhov' 'Dostoevsky' 'Gogol' 'Tolstoy']
{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Modifying the test set feature vectors

In [None]:
# Start from an all-zero matrix, then fill it for the test split.
# modify_feature_vectors returns the matrix it was given, so both names below
# refer to the same array.
test_features_matrix = numpy.zeros(shape=(len(test_sentences), len(new_string)))
test_features_matrix_final = modify_feature_vectors(
    test_sentences,
    test_features_matrix,
)

# Spot-check five rows.
print(test_features_matrix_final[35:40])

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


# Making predictions

In [None]:
def predict(i):
    """Print test example ``i`` together with the model's prediction for it.

    Shows, in order: the sentence, its feature vector, its true author, the
    label predicted by ``lr_latin`` and the per-class probabilities. Reads the
    notebook-level ``test_sentences``, ``test_features_matrix_final``,
    ``test_authors`` and ``lr_latin``; returns nothing.
    """
    # The example itself: text, features, gold label.
    print(test_sentences[i])
    features = test_features_matrix_final[i]
    print(features)
    print(test_authors[i])

    print()
    print("Prediction:")
    # The model is called on a one-row batch holding just this example.
    single_row_batch = [features]
    print(lr_latin.predict(single_row_batch))
    print(lr_latin.predict_proba(single_row_batch))
    print()

In [None]:
# Show the prediction report for three hand-picked test sentences.
for example_index in (0, 28, 39):
    predict(example_index)

"Фома Фомич, говорю, разве это возможное дело?
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Dostoevsky

Prediction:
['Chekhov']
[[0.25429837 0.25096826 0.25394822 0.24078515]]

Je vous dois la vie.
[1. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Tolstoy

Prediction:
['Tolstoy']
[[0.0466672  0.34852971 0.00792362 0.59687947]]

На одной скамье, в уединенной аллее, увидел я m-me M *.
[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Dostoevsky

Prediction:
['Chekhov']
[[0.37399333 0.2254538  0.09022878 0.31032409]]



# Saving the model

In [None]:
# Serialise the fitted classifier into the current working directory.
pkl_filename = "logreg_latin.pkl"
with open(pkl_filename, mode='wb') as pkl_handle:
    pkl_handle.write(pickle.dumps(lr_latin))

# Evaluating the model

## Dummy Model

In [None]:
# Constant baseline: predict the same author for every test sentence.
dummy_predictions = ['Dostoevsky'] * len(test_sentences)
print(len(dummy_predictions))

# Calculate the accuracy of these "dummy predictions"
acc_dummy = accuracy_score(test_authors, dummy_predictions)
print(f'The accuracy is: {acc_dummy}')
print()

# Three authors are never predicted, so their precision is undefined.
# zero_division=0 reports it as 0 (the value shown before) without emitting
# an undefined-metric warning for each of them.
print(classification_report(test_authors, dummy_predictions, zero_division=0))

1000
The accuracy is: 0.25

              precision    recall  f1-score   support

     Chekhov       0.00      0.00      0.00       250
  Dostoevsky       0.25      1.00      0.40       250
       Gogol       0.00      0.00      0.00       250
     Tolstoy       0.00      0.00      0.00       250

    accuracy                           0.25      1000
   macro avg       0.06      0.25      0.10      1000
weighted avg       0.06      0.25      0.10      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Latin characters model

In [None]:
# Predict an author for every test sentence. The name was used by this cell
# and by the classification report below without ever being assigned, which
# raises NameError on a fresh kernel.
test_predictions_latin = lr_latin.predict(test_features_matrix_final)

print('Accuracy:')

# Fraction of test sentences whose author was predicted correctly.
acc = accuracy_score(test_authors, test_predictions_latin)
print(acc)
# normalize=False returns the number of correct predictions instead.
corr_count = accuracy_score(test_authors, test_predictions_latin, normalize=False)
total_count = len(test_authors)

print(f'Total reviews: {total_count}')
print(f'Total correct predictions: {corr_count}')
corr_ratio = corr_count / total_count
print(f'Correct ratio: {corr_ratio}')

Accuracy:
0.265
Total reviews: 1000
Total correct predictions:265
Correct ratio:0.265


In [None]:
# Per-author precision / recall / F1 for the Latin-character model.
# zero_division=0 reports an undefined precision (an author that is never
# predicted) as 0 without a warning, matching the dummy-model report.
print(classification_report(test_authors, test_predictions_latin, zero_division=0))

              precision    recall  f1-score   support

     Chekhov       0.25      0.99      0.40       250
  Dostoevsky       0.50      0.01      0.02       250
       Gogol       0.00      0.00      0.00       250
     Tolstoy       0.70      0.06      0.12       250

    accuracy                           0.27      1000
   macro avg       0.36      0.27      0.13      1000
weighted avg       0.36      0.27      0.13      1000

