# Importing libraries, downloading the model

In [59]:
# Import necessary libraries
import pandas
import sklearn
import numpy
import spacy
import re
import json
from statistics import mean
from collections import Counter
import string
from typing import List

# Report the versions of the core libraries so the results can be reproduced
for _label, _library in (("Pandas", pandas), ("Scikit-learn", sklearn), ("NumPy", numpy)):
    print(f"{_label} version: {_library.__version__}")

Pandas version: 2.0.3
Scikit-learn version: 1.2.2
NumPy version: 1.25.2


In [16]:
# Download and load the small Russian model for quick test purposes.
# The download runs as a shell command; its log output warns that the kernel may
# need a restart before the freshly installed package can be loaded.
!python -m spacy download ru_core_news_sm
nlp = spacy.load('ru_core_news_sm')

# Large Russian model:
# !python -m spacy download ru_core_news_lg
# nlp = spacy.load('ru_core_news_lg')

Collecting ru-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.7.0/ru_core_news_sm-3.7.0-py3-none-any.whl (15.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('ru_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


# Making lists and doc objects from csv files

Let's load the training data from a csv file and display the first few rows of the training set:

In [55]:
# Load the training split; 'utf-8-sig' also accepts files that start with a byte-order mark
train_set = pandas.read_csv(filepath_or_buffer='./train_data.csv', encoding='utf-8-sig')
# Preview the first five rows (the default of head(), spelled out)
train_set.head(n=5)

Unnamed: 0,text,author
0,"Но каково же было мое изумление, когда Наташа ...",Dostoevsky
1,— закричали в толпе.. — Давай совет кошевой!,Gogol
2,"И всё, бывало, извиняется.",Chekhov
3,Живу-ут!.,Chekhov
4,Там воду освятим: они скорее выздоровеют; и я ...,Tolstoy


Let's do the same thing on the test set:

In [56]:
# Load the test split; 'utf-8-sig' also accepts files that start with a byte-order mark
test_set = pandas.read_csv(filepath_or_buffer='./test_data.csv', encoding='utf-8-sig')
# Preview the first five rows (the default of head(), spelled out)
test_set.head(n=5)

Unnamed: 0,text,author
0,"""Фома Фомич, говорю, разве это возможное дело?",Dostoevsky
1,Пора бы уже домой.,Chekhov
2,"А казаки все до одного прощались, зная, что мн...",Gogol
3,"Вдруг слезы градом у обоих из глаз, дрогнули р...",Dostoevsky
4,Но художник видел в этом нежном личике одну то...,Gogol


Let's extract the lists of authors and sentences for both the training set and the test set:

In [57]:
# Pull the sentence texts and their author labels out of each split as plain Python lists
train_sentences, train_authors = (train_set[column].to_list() for column in ('text', 'author'))
test_sentences, test_authors = (test_set[column].to_list() for column in ('text', 'author'))

# Report how many author labels each split holds
print(f"Number of training authors: {len(train_authors)}")
print(f"Number of test authors: {len(test_authors)}")

Number of training authors: 10000
Number of test authors: 1000


# Data per class (author)

It would also be useful to extract the data per author. First we will create a function data_extract that takes in the name of the author (string) and returns a list of sentences for the specified author.

In [60]:
def data_extract(author_name: str, dataset=None) -> List[str]:
  """
  Extracts all sentences for a given author from a dataset.

  Args:
  author_name (str): The name of the author whose data is to be extracted.
    It is compared for exact equality against the 'author' column.
  dataset (pandas.DataFrame, optional): Frame with 'text' and 'author' columns.
    Defaults to the notebook-level train_set, so one-argument calls behave as before.

  Returns:
  list: A list of text sentences for the specified author
    (empty if the author does not occur in the dataset).
  """
  if dataset is None:
    # Resolved at call time so the function always sees the current train_set
    dataset = train_set
  author_mask = dataset['author'] == author_name
  return dataset.loc[author_mask, 'text'].to_list()

Now let's use the function for all four classes (authors).

In [61]:
# One sentence list per author; the labels are passed exactly as given in the calls below
dostoyevsky_data, tolstoy_data, chekhov_data, gogol_data = (
    data_extract(label) for label in ('Dostoevsky', 'Tolstoy', 'Chekhov', 'Gogol')
)

Let's create a dictionary that maps a label to its list of sentences: one entry for the full training set, one for the test set, and one per author.

In [80]:
# Label -> list of sentences. 'Training' and 'Test' are the complete splits; the
# remaining entries are the per-author subsets of the training data.
# Note: the 'Dostoyevsky' key is spelled differently from the 'Dostoevsky' label
# used when extracting the data.
author_data = dict(
    Training=train_sentences,
    Test=test_sentences,
    Dostoyevsky=dostoyevsky_data,
    Tolstoy=tolstoy_data,
    Chekhov=chekhov_data,
    Gogol=gogol_data,
)

Let's process each of these sentence lists (the full training and test sets plus the four authors) with spaCy, thus creating Doc objects. Let's also print the number of sentences in each list.

In [119]:
# Parse every sentence list with spaCy, keyed by the same labels as author_data.
# nlp.pipe() yields Docs lazily and can be iterated only once, so the result is
# materialised into a list; otherwise the first analysis that loops over the Docs
# would leave nothing for the analyses that follow.
author_doc_dict = {}
for author, data in author_data.items():
  author_doc_dict[author] = list(nlp.pipe(data))
  print(f'Number of sentences by {author}: {len(data)}')

Number of sentences by Training: 10000
Number of sentences by Test: 1000
Number of sentences by Dostoyevsky: 2500
Number of sentences by Tolstoy: 2500
Number of sentences by Chekhov: 2500
Number of sentences by Gogol: 2500


# Functions

Let's write all the functions necessary for our data analysis:

In [162]:
def get_sentence_and_word_length(sentences):
  """
  Print the average sentence length (in tokens) and the average token length.

  The token length is first averaged within each sentence and then across sentences,
  so every sentence carries the same weight regardless of how many tokens it has.

  Args:
  sentences (Iterable[spacy.tokens.doc.Doc]): spaCy Doc objects, one per sentence.
    The iterable is traversed exactly once.

  Returns:
  None. The two averages are printed, followed by a blank line. If there are no
  tokens at all, a short notice is printed instead of raising StatisticsError.
  """
  sentence_lengths = []
  mean_token_length_per_sentence = []
  for sentence in sentences:
    sentence_lengths.append(len(sentence))
    token_lengths = [len(token.text) for token in sentence]
    # A sentence without tokens has no token length; mean([]) would raise
    if token_lengths:
      mean_token_length_per_sentence.append(mean(token_lengths))
  if not mean_token_length_per_sentence:
    # Either no sentences were given or all of them were empty
    print('No tokens to analyze.')
    print()
    return
  avg_word_length = mean(mean_token_length_per_sentence)
  avg_sent_length = mean(sentence_lengths)
  print(f'Average nr of words per sentence: {avg_sent_length}')
  print(f'Average token length: {round(avg_word_length, 2)}')
  print()

def ner_counter(list_of_doc_sentences):
  """
  Tally the named-entity labels found across spaCy Doc objects and print the tally.

  Args:
  list_of_doc_sentences (List[spacy.tokens.doc.Doc]): spaCy Doc objects whose
    entities (doc.ents) are inspected; only each entity's label_ is counted.
  """
  label_tally = Counter()
  for parsed in list_of_doc_sentences:
    # update() adds one count per label yielded for this Doc
    label_tally.update(ent.label_ for ent in parsed.ents)
  print(label_tally)

def punctuation_mark_counter(sentences):
    """
    Print how often each punctuation mark occurs across a list of sentences.

    The sentences are joined with single spaces and scanned with one regex
    character class; every matching character is counted individually.

    Args:
    sentences (List[str]): Sentences (strings) to scan
    """
    mark_finder = re.compile(r'[,.\-!?;:…«»()"\[\]&#{}<>/\'%—–]')
    joined_text = ' '.join(sentences)
    mark_tally = Counter(mark_finder.findall(joined_text))
    print(mark_tally)

def replace_ellipsis(sentences):
  """
  Swap every three-dot sequence '...' for the single ellipsis character '…'.

  Args:
  sentences (List[str]): Sentences (strings) to process

  Returns:
  List[str]: A new list with the replacement applied to each sentence;
    the input list is left untouched.
  """
  normalized = []
  for text in sentences:
    normalized.append(text.replace("...", "…"))
  return normalized

def count_sentences_with_latin_chars(sentences):
  """
  Count how many sentences contain at least one ASCII Latin letter (a-z or A-Z).

  Args:
  sentences (List[str]): Sentences to inspect

  Returns:
  int: Number of sentences with at least one match (not the number of letters).
  """
  latin_letter = re.compile('[a-zA-Z]')
  return sum(1 for text in sentences if latin_letter.search(text))

def fivegram_pos_extractor_unique_and_count(list_of_doc_sentences):
    """
    Report statistics on POS-tag 5-grams taken from spaCy Doc objects.

    Prints the number of distinct POS-tag 5-grams, then the five most frequent
    ones together with their counts, then a blank line. Windows never cross
    Doc boundaries, and Docs shorter than five tokens contribute nothing.

    Args:
    list_of_doc_sentences (List[spacy.tokens.doc.Doc]): List of spaCy Doc objects to analyze
    """
    window = 5
    pos_window_tally = Counter()
    for doc in list_of_doc_sentences:
        # Read the tags once per Doc, then slide a fixed-size window over them
        pos_sequence = [token.pos_ for token in doc]
        window_starts = range(len(pos_sequence) - window + 1)
        pos_window_tally.update(
            tuple(pos_sequence[start : start + window]) for start in window_starts
        )
    # Each distinct 5-gram is one key of the tally
    print(f"{len(pos_window_tally)} unique fivegram POS-tag combinations.")
    print("Most common 5-grams:")
    for ranked_entry in pos_window_tally.most_common(5):
        print(ranked_entry)
    print()

# Analyzing the data

## Sentence and word length

Initially I considered using sentence and word length as features for one of the models. However, since we had already done some exercises on this, I thought I might not be allowed to use it, so I focused on other features instead.

In [110]:
# Print the length statistics for every entry of author_doc_dict.
# Despite its name, `doc` holds the whole collection of parsed sentences for one label.
for author, doc in author_doc_dict.items():
  # ANSI escape codes: \033[1;4m switches on bold + underline, \033[0m resets the style
  print(f'\033[1;4m{author}:\033[0m')
  get_sentence_and_word_length(doc)

[1;4mTraining:[0m
Average nr of words per sentence: 19.9926
Average token length: 3.87

[1;4mTest:[0m
Average nr of words per sentence: 19.708
Average token length: 3.85

[1;4mDostoyevsky:[0m
Average nr of words per sentence: 21.0744
Average token length: 3.76

[1;4mTolstoy:[0m
Average nr of words per sentence: 22.2136
Average token length: 3.88

[1;4mChekhov:[0m
Average nr of words per sentence: 17.61
Average token length: 3.79

[1;4mGogol:[0m
Average nr of words per sentence: 19.0724
Average token length: 4.03



## Named Entity Counter

In [105]:
# Print the named-entity label counts for every entry of author_doc_dict.
# Despite its name, `doc` holds the whole collection of parsed sentences for one label.
for author, doc in author_doc_dict.items():
  # ANSI escape codes: \033[1;4m switches on bold + underline, \033[0m resets the style
  print(f'\033[1;4m{author}:\033[0m')
  ner_counter(doc)
  # Blank line between the entries
  print()

[1;4mTraining:[0m
Counter({'PER': 3802, 'LOC': 617, 'ORG': 145})

[1;4mTest:[0m
Counter({'PER': 374, 'LOC': 57, 'ORG': 11})

[1;4mDostoyevsky:[0m
Counter({'PER': 819, 'LOC': 116, 'ORG': 31})

[1;4mTolstoy:[0m
Counter({'PER': 1205, 'LOC': 256, 'ORG': 69})

[1;4mChekhov:[0m
Counter({'PER': 995, 'LOC': 121, 'ORG': 22})

[1;4mGogol:[0m
Counter({'PER': 783, 'LOC': 124, 'ORG': 23})



## Punctuation mark counter

In [163]:
# Print the punctuation counts for every entry of author_data (raw strings, not spaCy Docs)
for author, sentence_list in author_data.items():
  # ANSI escape codes: \033[1;4m switches on bold + underline, \033[0m resets the style
  print(f'\033[1;4m{author}:\033[0m')
  punctuation_mark_counter(sentence_list)

[1;4mTraining:[0m
Counter({',': 21220, '.': 15248, '-': 2112, '!': 1891, '–': 1848, '—': 1820, '?': 1248, ';': 1025, ':': 819, '…': 525, '«': 514, '»': 512, ')': 355, '(': 354, '"': 239, '[': 98, ']': 98, '&': 98, '#': 97, "'": 43, '{': 17, '}': 17, '<': 14, '>': 14, '/': 2, '%': 1})
[1;4mTest:[0m
Counter({',': 2101, '.': 1459, '-': 209, '—': 179, '–': 175, '!': 170, '?': 123, ';': 96, ':': 82, '«': 56, '…': 50, '»': 39, ')': 32, '(': 30, '"': 29, '[': 10, ']': 10, '&': 7, '#': 7, '{': 3, '}': 3, "'": 3, '<': 1, '>': 1})
[1;4mDostoyevsky:[0m
Counter({',': 5706, '.': 3261, '–': 838, '-': 727, '!': 427, '?': 368, ';': 298, '…': 191, ':': 190, '«': 165, '»': 161, '"': 94, '—': 89, '(': 85, ')': 83, '[': 16, ']': 16, "'": 2, '<': 1, '>': 1})
[1;4mTolstoy:[0m
Counter({',': 5937, '.': 4260, '-': 698, '—': 469, ';': 287, '?': 271, '–': 251, '!': 216, ':': 193, '"': 112, '«': 111, '»': 110, ')': 105, '(': 101, '…': 99, '&': 97, '#': 97, '[': 66, ']': 65, "'": 38, '{': 17, '}': 17, '/':

## Sentences with Latin characters counter

In [144]:
# For every entry of author_data, report how many sentences contain Latin letters
for author, sentence_list in author_data.items():
  # The returned value is a number of sentences, not a number of characters
  latin_chars = count_sentences_with_latin_chars(sentence_list)
  # ANSI escape codes: \033[1;4m switches on bold + underline, \033[0m resets the style
  print(f'Latin characters in \033[1;4m{author}\033[0m sentences: {latin_chars}')

Latin characters in [1;4mTraining[0m sentences: 277
Latin characters in [1;4mTest[0m sentences: 30
Latin characters in [1;4mDostoyevsky[0m sentences: 58
Latin characters in [1;4mTolstoy[0m sentences: 163
Latin characters in [1;4mChekhov[0m sentences: 25
Latin characters in [1;4mGogol[0m sentences: 31


## 5-gram POS sequence counter

In [145]:
# Print the POS-tag 5-gram statistics for every entry of author_doc_dict.
# Despite its name, `doc` holds the whole collection of parsed sentences for one label.
for author, doc in author_doc_dict.items():
  # ANSI escape codes: \033[1;4m switches on bold + underline, \033[0m resets the style
  print(f'\033[1;4m{author}:\033[0m')
  fivegram_pos_extractor_unique_and_count(doc)

[1;4mTraining:[0m
49929 unique fivegram POS-tag combinations.
Most common 5-grams:
(('X', 'X', 'X', 'X', 'X'), 306)
(('PUNCT', 'VERB', 'ADP', 'NOUN', 'PUNCT'), 247)
(('NOUN', 'PUNCT', 'VERB', 'ADP', 'NOUN'), 247)
(('VERB', 'ADP', 'DET', 'NOUN', 'PUNCT'), 210)
(('NOUN', 'ADP', 'ADJ', 'NOUN', 'PUNCT'), 183)

[1;4mTest:[0m
10782 unique fivegram POS-tag combinations.
Most common 5-grams:
(('NOUN', 'PUNCT', 'VERB', 'ADP', 'NOUN'), 30)
(('PUNCT', 'VERB', 'ADP', 'NOUN', 'PUNCT'), 26)
(('VERB', 'ADP', 'NOUN', 'NOUN', 'PUNCT'), 20)
(('NOUN', 'PUNCT', 'VERB', 'NOUN', 'PUNCT'), 19)
(('VERB', 'ADP', 'DET', 'NOUN', 'PUNCT'), 19)

[1;4mDostoyevsky:[0m
23205 unique fivegram POS-tag combinations.
Most common 5-grams:
(('VERB', 'ADP', 'DET', 'NOUN', 'PUNCT'), 61)
(('PUNCT', 'SPACE', 'PUNCT', 'VERB', 'PRON'), 59)
(('NOUN', 'PUNCT', 'SPACE', 'PUNCT', 'SPACE'), 55)
(('ADJ', 'NOUN', 'PUNCT', 'SPACE', 'PUNCT'), 41)
(('VERB', 'PRON', 'ADP', 'NOUN', 'PUNCT'), 40)

[1;4mTolstoy:[0m
21973 unique fivegra