# Importing libraries, downloading the model

In [None]:
import pandas
import sklearn
import numpy
import spacy
import re
from statistics import mean
from collections import Counter
import string

# Print the version of each core library imported above
for library in (pandas, sklearn, numpy):
  print(library.__version__)

1.5.3
1.2.2
1.22.4


In [None]:
# Download the ru_core_news_lg spaCy pipeline (shell command run from the notebook)
!python -m spacy download ru_core_news_lg
# Load the downloaded pipeline; `nlp` is used below to turn raw sentences into Doc objects
nlp = spacy.load('ru_core_news_lg')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ru-core-news-lg==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_lg-3.5.0/ru_core_news_lg-3.5.0-py3-none-any.whl (513.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m513.4/513.4 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting pymorphy3>=1.0.0 (from ru-core-news-lg==3.5.0)
  Downloading pymorphy3-1.2.0-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.4/55.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dawg-python>=0.7.1 (from pymorphy3>=1.0.0->ru-core-news-lg==3.5.0)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting docopt>=0.6 (from pymorphy3>=1.0.0->ru-core-news-lg==3.5.0)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pymorphy3-dicts-ru (from pymorphy3>=1.0.0->ru

# Making lists and doc objects from csv files

In [None]:
# Load the training data from a csv file; later cells read its 'text' and 'author' columns
train_set = pandas.read_csv('./train_data.csv', encoding='utf-8')
train_set

In [None]:
# Load the test data from a csv file; later cells read its 'text' and 'author' columns
test_set = pandas.read_csv('./test_data.csv', encoding='utf-8')
test_set

Let's get lists of authors and sentences for both the train set and the test set:

In [None]:
# Pull the sentence texts and their author labels out of each DataFrame
train_sentences, train_authors = (train_set[column].to_list() for column in ('text', 'author'))
test_sentences, test_authors = (test_set[column].to_list() for column in ('text', 'author'))

# Show the number of labelled examples in each split
print(len(train_authors), len(test_authors))

10000 1000


Now let's create doc objects out of the lists above:

In [None]:
# nlp.pipe returns a one-shot generator. Several analysis cells below iterate
# over these docs, so materialise them into lists; otherwise only the first
# consumer would see any data and every later one would silently get nothing.
train_doc_sentences = list(nlp.pipe(train_sentences))
test_doc_sentences = list(nlp.pipe(test_sentences))

# Data per class (author)

It would also be useful to extract the data per author:

In [None]:
def data_extract(author_name):
  """Return the training sentences written by one author.

  Reads the notebook-level `train_set` DataFrame, keeps the rows whose
  'author' column equals `author_name`, and returns their 'text' values
  as a plain list.
  """
  is_requested_author = train_set['author'] == author_name
  author_rows = train_set[is_requested_author]
  return author_rows['text'].to_list()

In [None]:
# Extract the training sentences of each of the four authors.
# The keys are the author labels exactly as passed to data_extract.
dostoyevsky_data, tolstoy_data, chekhov_data, gogol_data = (
  data_extract(label) for label in ('Dostoevsky', 'Tolstoy', 'Chekhov', 'Gogol')
)

Number of sentences by Dostoyevsky: 2500
Number of sentences by Tolstoy: 2500
Number of sentences by Chekhov: 2500
Number of sentences by Gogol: 2500


In [None]:
# Report how many training sentences were extracted for each author
print(
  f'Number of sentences by Dostoyevsky: {len(dostoyevsky_data)}',
  f'Number of sentences by Tolstoy: {len(tolstoy_data)}',
  f'Number of sentences by Chekhov: {len(chekhov_data)}',
  f'Number of sentences by Gogol: {len(gogol_data)}',
  sep='\n',
)

Now let's process the data for each of the four authors with spaCy, thus creating Doc objects for each list that we made above:

In [None]:
# nlp.pipe returns a one-shot generator. Each of these per-author collections
# is analysed by several cells below, so store the Doc objects in lists;
# a bare generator would be empty after its first use.
dostoyevsky_data_doc = list(nlp.pipe(dostoyevsky_data))
tolstoy_data_doc = list(nlp.pipe(tolstoy_data))
chekhov_data_doc = list(nlp.pipe(chekhov_data))
gogol_data_doc = list(nlp.pipe(gogol_data))

# Functions

Let's write all the functions necessary for our data analysis:

In [None]:
def get_sentence_length(sentences):
  """Print the number of sentences and their mean length.

  The length of every sentence is taken with len(); the input is iterated
  exactly once. Nothing is returned.
  """
  lengths = list(map(len, sentences))
  sentence_total = len(lengths)
  mean_length = mean(lengths)
  print(f'Nr of sentences: {sentence_total}')
  print(f'Average nr of words per sentence: {mean_length}')

def get_avg_word_length(sentences):
  """Print the number of sentences and the average token length.

  For every sentence the mean length of its tokens' `.text` is computed; the
  reported figure is the mean of those per-sentence means, so each sentence
  weighs the same regardless of how many tokens it contains.

  Args:
    sentences: iterable of token sequences; each token must expose `.text`.

  Raises:
    statistics.StatisticsError: if `sentences` is empty or a sentence has
      no tokens (mean of an empty sequence).
  """
  per_sentence_means = []
  for sentence in sentences:
    token_lengths = [len(token.text) for token in sentence]
    per_sentence_means.append(mean(token_lengths))
  avg_word_length = mean(per_sentence_means)
  print(f'Nr of sentences: {len(per_sentence_means)}')
  # Print the average itself: it is a number, so calling len() on it raises TypeError.
  print(f'Average token length: {avg_word_length}')

def ner_counter(sentences):
  """Print a Counter of the entity labels found across all given docs.

  Every doc's `.ents` is walked and the `label_` of each entity is tallied.
  Nothing is returned.
  """
  label_counts = Counter(
      entity.label_
      for doc in sentences
      for entity in doc.ents
  )
  print(label_counts)

def punctuation_mark_counter_greedy(sentences):
  """Print a Counter of every punctuation character in the given sentences.

  A character counts as punctuation when it occurs in string.punctuation
  extended with a few extra dashes, quotation marks and the ellipsis
  character. Nothing is returned.
  """
  punctuation_chars = "—«»–‹›…" + string.punctuation
  mark_counts = Counter(
      char
      for sentence in sentences
      for char in sentence
      if char in punctuation_chars
  )
  print(mark_counts)

def replace_ellipsis(sentences):
  """Return a new list where each "..." in a sentence is replaced by the single character "…"."""
  updated_sentences = []
  for sentence in sentences:
    updated_sentences.append(sentence.replace("...", "…"))
  return updated_sentences

def count_sentences_with_latin_chars(sentences):
  """Print how many sentences contain at least one character matching [a-zA-Z].

  Nothing is returned.
  """
  latin_pattern = '[a-zA-Z]'
  matching_total = sum(
      1 for sentence in sentences if re.search(latin_pattern, sentence)
  )
  print(matching_total)

def fivegram_pos_extractor_unique(list_of_doc_sentences):
    """Return the distinct POS-tag 5-grams found in the given docs.

    Every window of five consecutive tokens in each doc is turned into a
    tuple of the tokens' `pos_` values. Docs with fewer than five tokens
    contribute nothing. The result is a list without duplicates; its
    order is not defined because it comes from a set.
    """
    window = 5
    distinct_patterns = {
        tuple(token.pos_ for token in doc[start:start + window])
        for doc in list_of_doc_sentences
        for start in range(len(doc) - window + 1)
    }
    return list(distinct_patterns)

def fivegram_pos_count(list_of_doc_sentences):
  """Return the five most frequent POS-tag 5-grams with their counts.

  Every window of five consecutive tokens in each doc is turned into a
  tuple of the tokens' `pos_` values and tallied. Docs with fewer than
  five tokens contribute nothing. The result is a list of
  (pos_tuple, count) pairs as produced by Counter.most_common(5).
  """
  window = 5
  pattern_counts = Counter()
  for doc in list_of_doc_sentences:
    last_start = len(doc) - window
    for start in range(last_start + 1):
      pattern = tuple(token.pos_ for token in doc[start:start + window])
      pattern_counts[pattern] += 1
  return pattern_counts.most_common(5)

# Analyzing the data

## Sentence and word length

Initially, I considered using sentence and word length as features for one of the models. However, since we had already done some exercises on them, I thought I might not be allowed to use them, so I focused on other features instead.

In [1]:
# Report sentence count and mean sentence length for each author in turn
for author_docs in (dostoyevsky_data_doc, tolstoy_data_doc, chekhov_data_doc, gogol_data_doc):
  get_sentence_length(author_docs)

NameError: name 'get_sentence_length' is not defined

In [None]:
# Report the average token length for each author in turn
for author_docs in (dostoyevsky_data_doc, tolstoy_data_doc, chekhov_data_doc, gogol_data_doc):
  get_avg_word_length(author_docs)

Nr of sentences: 2500
Average token length: 3.7622330273529023
Nr of sentences: 2500
Average token length: 3.8788479197579444
Nr of sentences: 2500
Average token length: 3.7922887875675118
Nr of sentences: 2500
Average token length: 4.026964518720382


## Named Entity Counter

In [None]:
# Entity-label counts: train set, test set, then each author
for docs in (
  train_doc_sentences,
  test_doc_sentences,
  dostoyevsky_data_doc,
  tolstoy_data_doc,
  chekhov_data_doc,
  gogol_data_doc,
):
  ner_counter(docs)

Counter({'PER': 3819, 'LOC': 591, 'ORG': 118})
Counter({'PER': 387, 'LOC': 52, 'ORG': 9})
Counter({'PER': 819, 'LOC': 98, 'ORG': 25})
Counter({'PER': 1196, 'LOC': 236, 'ORG': 55})
Counter({'PER': 995, 'LOC': 112, 'ORG': 19})
Counter({'PER': 809, 'LOC': 145, 'ORG': 19})


## Punctuation mark counter

Counting all punctuation marks.

In [None]:
# Punctuation counts: train set, test set, then each author
for sentence_list in (
  train_sentences,
  test_sentences,
  dostoyevsky_data,
  tolstoy_data,
  chekhov_data,
  gogol_data,
):
  punctuation_mark_counter_greedy(sentence_list)

Counter({',': 21220, '.': 15248, '-': 2112, '!': 1891, '–': 1848, '—': 1820, '?': 1248, ';': 1025, ':': 819, '…': 525, '«': 514, '»': 512, ')': 355, '(': 354, '"': 239, '[': 98, ']': 98, '&': 98, '#': 97, '*': 95, "'": 43, '{': 17, '}': 17, '<': 14, '>': 14, '/': 2, '%': 1})
Counter({',': 2101, '.': 1459, '-': 209, '—': 179, '–': 175, '!': 170, '?': 123, ';': 96, ':': 82, '«': 56, '…': 50, '»': 39, ')': 32, '(': 30, '"': 29, '[': 10, ']': 10, '&': 7, '#': 7, '*': 6, '{': 3, '}': 3, "'": 3, '<': 1, '>': 1})
Counter({',': 5706, '.': 3261, '–': 838, '-': 727, '!': 427, '?': 368, ';': 298, '…': 191, ':': 190, '«': 165, '»': 161, '"': 94, '—': 89, '(': 85, ')': 83, '[': 16, ']': 16, '*': 7, "'": 2, '<': 1, '>': 1})
Counter({',': 5937, '.': 4260, '-': 698, '—': 469, ';': 287, '?': 271, '–': 251, '!': 216, ':': 193, '"': 112, '«': 111, '»': 110, ')': 105, '(': 101, '…': 99, '&': 97, '#': 97, '[': 66, ']': 65, "'": 38, '*': 31, '{': 17, '}': 17, '/': 2, '<': 2, '>': 2})
Counter({'.': 5054, ','

## Sentences with Latin characters counter

In [None]:
# `re` is already imported in the notebook's first cell, so it is not re-imported here.
# Count sentences containing Latin letters: train set, test set, then each author
for sentence_list in (
  train_sentences,
  test_sentences,
  dostoyevsky_data,
  tolstoy_data,
  chekhov_data,
  gogol_data,
):
  count_sentences_with_latin_chars(sentence_list)

277
30
58
163
25
31


## 5-gram POS sequence counter

In [None]:
# Number of distinct POS 5-grams: train set, test set, then each author
for docs in (
  train_doc_sentences,
  test_doc_sentences,
  dostoyevsky_data_doc,
  tolstoy_data_doc,
  chekhov_data_doc,
  gogol_data_doc,
):
  print(len(fivegram_pos_extractor_unique(docs)))

49912
10750
23180
21998
17207
19514


In [None]:
# `Counter` is already imported in the notebook's first cell, so it is not re-imported here.
# Five most frequent POS 5-grams: train set, test set, then each author
for docs in (
  train_doc_sentences,
  test_doc_sentences,
  dostoyevsky_data_doc,
  tolstoy_data_doc,
  chekhov_data_doc,
  gogol_data_doc,
):
  print(fivegram_pos_count(docs))

[(('X', 'X', 'X', 'X', 'X'), 307), (('NOUN', 'PUNCT', 'VERB', 'ADP', 'NOUN'), 254), (('PUNCT', 'VERB', 'ADP', 'NOUN', 'PUNCT'), 248), (('VERB', 'ADP', 'DET', 'NOUN', 'PUNCT'), 210), (('VERB', 'ADP', 'ADJ', 'NOUN', 'PUNCT'), 181)]
[(('NOUN', 'PUNCT', 'VERB', 'ADP', 'NOUN'), 31), (('PUNCT', 'VERB', 'ADP', 'NOUN', 'PUNCT'), 26), (('NOUN', 'ADP', 'ADJ', 'NOUN', 'PUNCT'), 20), (('NOUN', 'PUNCT', 'VERB', 'NOUN', 'PUNCT'), 20), (('VERB', 'ADP', 'NOUN', 'NOUN', 'PUNCT'), 20)]
[(('VERB', 'ADP', 'DET', 'NOUN', 'PUNCT'), 60), (('PUNCT', 'SPACE', 'PUNCT', 'VERB', 'PRON'), 59), (('NOUN', 'PUNCT', 'SPACE', 'PUNCT', 'SPACE'), 56), (('VERB', 'PUNCT', 'SPACE', 'PUNCT', 'SPACE'), 42), (('VERB', 'PRON', 'ADP', 'NOUN', 'PUNCT'), 40)]
[(('X', 'X', 'X', 'X', 'X'), 297), (('PUNCT', 'VERB', 'ADP', 'NOUN', 'PUNCT'), 87), (('X', 'X', 'X', 'X', 'PUNCT'), 87), (('NOUN', 'PUNCT', 'VERB', 'ADP', 'NOUN'), 84), (('NOUN', 'ADP', 'ADJ', 'NOUN', 'PUNCT'), 59)]
[(('NOUN', 'PUNCT', 'VERB', 'ADP', 'NOUN'), 62), (('PUNCT', 