### Imports

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

In [2]:
from typing import List, Tuple

In [3]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import spacy
from datasets import load_dataset

### Load Data & Normalize text

In [4]:
# Function for load dataset Davlan/sib200 (rus_Cyrl)

def load_sib200_ru() -> Tuple[Tuple[List[str], List[int]], Tuple[List[str], List[int]], Tuple[List[str], List[int]], List[str]]:
    # Load dataset
    dataset = load_dataset("Davlan/sib200", "rus_Cyrl")

    print(dataset)

    # Need to encode categories
    train = dataset['train'].class_encode_column('category')
    valid = dataset['validation'].class_encode_column('category')
    test = dataset['test'].class_encode_column('category')

    # Check categories
    categories = train.features['category'].names
    if  categories != valid.features['category'].names or \
        categories != test.features['category'].names:
        raise RuntimeError("Categories don't match!")

    X_train, y_train = train['text'], train['category']
    X_val, y_val = valid['text'], valid['category']
    X_test, y_test = test['text'], test['category']

    return (X_train, y_train), (X_val, y_val), (X_test, y_test), categories

In [5]:
# Normalize text
def normalize_text(s: str, nlp_pipeline: spacy.Language) -> str:
    # String to lower
    s = s.lower()

    # Processing text with the NLP Pipeline
    doc = nlp_pipeline(s)

    # Lemmatization and replacement of numbers by <NUM>
    lemmas = []
    for token in doc:
        if token.is_punct:
            continue                        # Skip punctuation
        if token.like_num:
            lemmas.append('<NUM>')          # Replace numeric tokens with <NUM>
        else:
            lemmas.append(token.lemma_)     # Adding a token lemma

    return ' '.join(lemmas)

In [6]:
# Function for unpacking Tuple for normalize_text
def process_tuple(data: Tuple[List[str], List[int]], nlp_pipeline: spacy.Language) -> Tuple[List[str], List[int]]:
    texts, categories = data
    normalized_texts = [normalize_text(text, nlp_pipeline) for text in texts]
    return (normalized_texts, categories)

In [7]:
train_data, val_data, test_data, classes_list = load_sib200_ru()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/47.9k [00:00<?, ?B/s]

data/rus_Cyrl/train.tsv:   0%|          | 0.00/195k [00:00<?, ?B/s]

data/rus_Cyrl/dev.tsv:   0%|          | 0.00/25.3k [00:00<?, ?B/s]

data/rus_Cyrl/test.tsv:   0%|          | 0.00/57.4k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['index_id', 'category', 'text'],
        num_rows: 701
    })
    validation: Dataset({
        features: ['index_id', 'category', 'text'],
        num_rows: 99
    })
    test: Dataset({
        features: ['index_id', 'category', 'text'],
        num_rows: 204
    })
})


Casting to class labels:   0%|          | 0/701 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/99 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/204 [00:00<?, ? examples/s]

In [8]:
# Print category list
print(classes_list)

['entertainment', 'geography', 'health', 'politics', 'science/technology', 'sports', 'travel']


In [9]:
# Check the dimensionality of all obtained samples
print('Train shape: {}'.format(len(train_data[0])))
print('Validation shape: {}'.format(len(val_data[0])))
print('Test shape: {}'.format(len(test_data[1])))

Train shape: 701
Validation shape: 99
Test shape: 204


In [10]:
!python -m spacy download ru_core_news_sm

Collecting ru-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.7.0/ru_core_news_sm-3.7.0-py3-none-any.whl (15.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
Collecting pymorphy3>=1.0.0 (from ru-core-news-sm==3.7.0)
  Downloading pymorphy3-2.0.2-py3-none-any.whl.metadata (1.8 kB)
Collecting dawg-python>=0.7.1 (from pymorphy3>=1.0.0->ru-core-news-sm==3.7.0)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl.metadata (7.0 kB)
Collecting pymorphy3-dicts-ru (from pymorphy3>=1.0.0->ru-core-news-sm==3.7.0)
  Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl.metadata (2.0 kB)
Downloading pymorphy3-2.0.2-py3-none-any.whl (53 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.8/53.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Downloading pymorphy3_d

In [11]:
# Load pipeline ru_core_news_sm
nlp = spacy.load('ru_core_news_sm')

In [12]:
# Apply normalize_text and nlp to all samples
train_data = process_tuple(train_data, nlp)
val_data = process_tuple(val_data, nlp)
test_data = process_tuple(test_data, nlp)

### Init & Fit model (LogReg)

In [13]:
# Create Pipeline for text vectorization and logistic regression
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('estimator', LogisticRegression())
])

In [14]:
# Parameterize using GridSearchCV
param_grid = {"estimator__C": np.linspace(0.1, 1000, 5),
              "estimator__tol": [1e-1, 1e-2, 1e-4, 1e-6],
              "estimator__solver": ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']}

LR_search = GridSearchCV( estimator=pipeline,
                          param_grid=param_grid,
                          scoring='f1_macro',
                          n_jobs = -1 )

LR_search.fit(train_data[0], train_data[1])

In [15]:
# Print the best parameters
LR_search.best_params_

{'estimator__C': 500.05,
 'estimator__solver': 'newton-cholesky',
 'estimator__tol': 0.01}

### Score & Results

In [16]:
# Print the best F1 score
LR_search.best_score_

0.6539082145218936

In [17]:
# Print the size of vectorized text dictionary
LR_search.best_estimator_.named_steps['tfidf'].vocabulary_.__len__()

4330

In [18]:
# Get predictions on the validation sample and output classification_report
val_pred = LR_search.predict(val_data[0])
print(classification_report(val_data[1], val_pred))

              precision    recall  f1-score   support

           0       0.83      0.56      0.67         9
           1       0.67      0.75      0.71         8
           2       1.00      0.55      0.71        11
           3       0.90      0.64      0.75        14
           4       0.62      0.80      0.70        25
           5       0.77      0.83      0.80        12
           6       0.57      0.65      0.60        20

    accuracy                           0.70        99
   macro avg       0.77      0.68      0.70        99
weighted avg       0.73      0.70      0.70        99



In [19]:
# Get predictions on the test sample and output classification_report
test_pred = LR_search.predict(test_data[0])
print(classification_report(test_data[1], test_pred))

              precision    recall  f1-score   support

           0       0.78      0.37      0.50        19
           1       0.67      0.59      0.62        17
           2       0.48      0.50      0.49        22
           3       0.78      0.83      0.81        30
           4       0.66      0.80      0.73        51
           5       0.90      0.76      0.83        25
           6       0.60      0.62      0.61        40

    accuracy                           0.68       204
   macro avg       0.70      0.64      0.65       204
weighted avg       0.69      0.68      0.67       204

