## Алгоритмы мультиклассовой классификации

In [1]:
import os
import warnings
import pandas as pd
import numpy as np
import textwrap

warnings.filterwarnings("ignore")
data_dir = "bbc"

In [2]:
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

regex = re.compile("[A-Za-z]+")
mystopwords = stopwords.words('english')

def get_dataset(path):
    data = []
    for root, dirs, files in os.walk(path):
        for name in files:
            file_path = os.path.join(root, name)
            category = file_path.split('\\')[-2]
            text = " ".join(open(file_path, encoding='utf-8', errors = 'ignore').read().splitlines())
            data.append((category,text))
            
    df = pd.DataFrame(data, columns=["category","text"])
    shuffle_df = df.sample(frac = 1, random_state = 42, ignore_index = True)
    return shuffle_df

def words_only(text, regex=regex):
    return " ".join(regex.findall(text))

def lemmatize(text, mystopwords = mystopwords):
    lemmatizer = WordNetLemmatizer()
    return " ".join([lemmatizer.lemmatize(token) for token in text.split() if not token in mystopwords])

### Препроцессинг

In [3]:
df = get_dataset(data_dir)
df.text = df.text.apply(words_only)
df.text = df.text.apply(lemmatize)

In [4]:
labels = df.category.unique()
num_labels = len(labels)
id2label = {i:l for i,l in enumerate(labels)}
label2id = {l:i for i,l in enumerate(labels)}
df["labels"] = df.category.map(lambda x: label2id[x.strip()])
df.head()

Unnamed: 0,category,text,labels
0,business,UK house price dip November UK house price dip...,0
1,business,LSE set date takeover deal The London Stock Ex...,0
2,sport,Harinordoquy suffers France axe Number eight I...,1
3,business,Barclays share merger talk Shares UK banking g...,0
4,politics,Campaign cold call questioned Labour Conservat...,2


### Разбиение выборки

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_rem, y_train, y_rem = train_test_split(df.text, df.labels, stratify = df.labels, train_size=0.7)
X_test, X_valid, y_test, y_valid = train_test_split(X_rem, y_rem, stratify = y_rem, test_size=0.5)

print('Train Shape: ', X_train.shape, y_train.shape)
print('Valid Shape: ', X_valid.shape, y_valid.shape)
print('Test Shape: ', X_test.shape, y_test.shape)

Train Shape:  (1557,) (1557,)
Valid Shape:  (334,) (334,)
Test Shape:  (334,) (334,)


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features = 5000)
vectorizer.fit(df.text)

X_train = vectorizer.transform(X_train)
X_valid = vectorizer.transform(X_valid)

### Перебор гиперпараметров алгоритмов многоклассовой классификации

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV

lr = LogisticRegression()
forest = RandomForestClassifier() 
xgb = XGBClassifier()

lr_params = {'penalty': ['l1','l2'], 'tol': [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1]}
forest_params = {'n_estimators': range(10,30,5), 'max_depth': range(5,10)} 
xgb_params = {'n_estimators': range(10,30,10), 'eta': [0.1, 0.5, 0.9], 'max_depth': range(5,8)}

lr_grid = GridSearchCV(lr, lr_params, cv=5, verbose=True, n_jobs=-1)
forest_grid = GridSearchCV(forest, forest_params,cv=5, verbose=True, n_jobs=-1)
xgb_grid = GridSearchCV(xgb, xgb_params, cv=5, verbose=True, n_jobs=-1)

lr_grid.fit(X_train, y_train)
print("Logistic Regression - Params: {}, Score: {}".format(lr_grid.best_params_, round(lr_grid.best_score_,3)))
forest_grid.fit(X_train, y_train)
print("Random Forest - Params: {}, Score: {}".format(forest_grid.best_params_, round(forest_grid.best_score_,3)))
xgb_grid.fit(X_train, y_train)
print("XGBoost - Params: {}, Score: {}".format(xgb_grid.best_params_, round(xgb_grid.best_score_,3)))

models = [
    LogisticRegression(penalty=lr_grid.best_params_['penalty'], tol=lr_grid.best_params_['tol']),
    RandomForestClassifier(n_estimators=forest_grid.best_params_['n_estimators'], max_depth=forest_grid.best_params_['max_depth']),
    XGBClassifier(eta=xgb_grid.best_params_['eta'], max_depth=xgb_grid.best_params_['max_depth'], n_estimators=xgb_grid.best_params_['n_estimators']) 
]
names  = ["Logistic Regression","Random Forest","XGB"]
for model,name in zip(models, names):
    model.fit(X_train,y_train)
        
    predicted = model.predict(X_valid)
    accuracy = accuracy_score(y_valid, predicted)
    f1 = f1_score(y_valid, predicted, average = 'micro')

    print("Model: {}, Accuracy: {}, F1: {}".format(name, round(accuracy,3),round(f1,3)))

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Logistic Regression - Params: {'penalty': 'l2', 'tol': 0.01}, Score: 0.974
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Random Forest - Params: {'max_depth': 9, 'n_estimators': 20}, Score: 0.92
Fitting 5 folds for each of 18 candidates, totalling 90 fits
XGBoost - Params: {'eta': 0.5, 'max_depth': 5, 'n_estimators': 20}, Score: 0.951
Model: Logistic Regression, Accuracy: 0.982, F1: 0.982
Model: Random Forest, Accuracy: 0.925, F1: 0.925
Model: XGB, Accuracy: 0.952, F1: 0.952


### Проверка на валидации

In [8]:
X_test = vectorizer.transform(X_test)

names  = ["Logistic Regression","Random Forest","XGB"]
for model,name in zip(models, names):
    model.fit(X_train,y_train)
        
    predicted = model.predict(X_test)
    accuracy = accuracy_score(y_test, predicted)
    f1 = f1_score(y_test, predicted, average = 'micro')

    print("Model: {}, Accuracy: {}, F1: {}".format(name, round(accuracy,3),round(f1,3)))

Model: Logistic Regression, Accuracy: 0.964, F1: 0.964
Model: Random Forest, Accuracy: 0.91, F1: 0.91
Model: XGB, Accuracy: 0.925, F1: 0.925


## DistilBert

In [9]:
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

torch.cuda.is_available = lambda : False
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [10]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels, id2label=id2label, label2id=label2id)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classi

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_rem, y_train, y_rem = train_test_split(df.text, df.labels, stratify = df.labels, train_size=0.7)
X_test, X_valid, y_test, y_valid = train_test_split(X_rem, y_rem, stratify = y_rem, test_size=0.5)

print('Train Shape: ', X_train.shape, y_train.shape)
print('Valid Shape: ', X_valid.shape, y_valid.shape)
print('Test Shape: ', X_test.shape, y_test.shape)

Train Shape:  (1557,) (1557,)
Valid Shape:  (334,) (334,)
Test Shape:  (334,) (334,)


In [15]:
train_values, valid_values, test_values = list(X_train.values), list(X_valid.values), list(X_test.values)
train_labels, valid_labels, test_labels = list(y_train.values), list(y_valid.values), list(y_test.values)

train_encodings = tokenizer(train_values, truncation=True, padding=True)
val_encodings  = tokenizer(valid_values, truncation=True, padding=True)
test_encodings = tokenizer(test_values, truncation=True, padding=True)

In [16]:
from torch.utils.data import Dataset
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'Accuracy': acc,
        'F1': f1,
        'Precision': precision,
        'Recall': recall
    }

class MyDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)
    
train_dataset = MyDataset(train_encodings, train_labels)
val_dataset = MyDataset(val_encodings, valid_labels)
test_dataset = MyDataset(test_encodings, test_labels)

In [17]:
training_args = TrainingArguments(
    output_dir='outputs/TBERT/',
    do_train=True,
    do_eval=True,
    num_train_epochs=3,
    auto_find_batch_size = True,
    warmup_steps=100,
    weight_decay=0.01,
    logging_strategy='steps',
    logging_dir='outputs/logs/',
    logging_steps=50,
    evaluation_strategy="epoch",
    eval_steps=50,
    save_strategy="epoch",
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics= compute_metrics
)

In [15]:
trainer.train()

***** Running training *****
  Num examples = 1557
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 585


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1785,0.187078,0.952096,0.950956,0.957567,0.948924
2,0.0604,0.14956,0.97006,0.970024,0.970505,0.971508
3,0.0031,0.133563,0.973054,0.973426,0.973352,0.97387


***** Running Evaluation *****
  Num examples = 334
  Batch size = 8
Saving model checkpoint to outputs/TBERT/checkpoint-195
Configuration saved in outputs/TBERT/checkpoint-195\config.json
Model weights saved in outputs/TBERT/checkpoint-195\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 334
  Batch size = 8
Saving model checkpoint to outputs/TBERT/checkpoint-390
Configuration saved in outputs/TBERT/checkpoint-390\config.json
Model weights saved in outputs/TBERT/checkpoint-390\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 334
  Batch size = 8
Saving model checkpoint to outputs/TBERT/checkpoint-585
Configuration saved in outputs/TBERT/checkpoint-585\config.json
Model weights saved in outputs/TBERT/checkpoint-585\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from outputs/TBERT/checkpoint-585 (score: 0.13356250524520874).


TrainOutput(global_step=585, training_loss=0.21884407808638026, metrics={'train_runtime': 4827.5851, 'train_samples_per_second': 0.968, 'train_steps_per_second': 0.121, 'total_flos': 618788322984960.0, 'train_loss': 0.21884407808638026, 'epoch': 3.0})

In [16]:
q = [trainer.evaluate(eval_dataset=data) for data in [train_dataset, val_dataset, test_dataset]]
pd.DataFrame(q, index=["train","val","test"]).iloc[:,:5]

***** Running Evaluation *****
  Num examples = 1557
  Batch size = 8


***** Running Evaluation *****
  Num examples = 334
  Batch size = 8
***** Running Evaluation *****
  Num examples = 334
  Batch size = 8


Unnamed: 0,eval_loss,eval_Accuracy,eval_F1,eval_Precision,eval_Recall
train,0.01119,0.996146,0.996167,0.996377,0.995972
val,0.133563,0.973054,0.973426,0.973352,0.97387
test,0.071997,0.98503,0.985583,0.985209,0.986385


In [17]:
def predict(text):
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to("cpu")
    outputs = model(**inputs)
    probs = outputs[0].softmax(1)
    return probs, probs.argmax(),model.config.id2label[probs.argmax().item()]

### Сегодняшняя  [новость](https://www.bbc.com/news/business-63128436) из категории business

In [18]:
text = "The pound has risen to its highest level for two weeks on hopes Kwasi Kwarteng will bring forward details of how he will cut debt."
predict(text)

(tensor([[0.9614, 0.0122, 0.0122, 0.0090, 0.0052]], grad_fn=<SoftmaxBackward0>),
 tensor(0),
 'business')

In [19]:
model_path = "outputs/TBERT"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

Saving model checkpoint to outputs/TBERT
Configuration saved in outputs/TBERT\config.json
Model weights saved in outputs/TBERT\pytorch_model.bin
tokenizer config file saved in outputs/TBERT\tokenizer_config.json
Special tokens file saved in outputs/TBERT\special_tokens_map.json


('outputs/TBERT\\tokenizer_config.json',
 'outputs/TBERT\\special_tokens_map.json',
 'outputs/TBERT\\vocab.txt',
 'outputs/TBERT\\added_tokens.json',
 'outputs/TBERT\\tokenizer.json')

In [20]:
from transformers import pipeline, DistilBertForSequenceClassification, DistilBertTokenizerFast

model = DistilBertForSequenceClassification.from_pretrained(model_path)
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

loading configuration file outputs/TBERT\config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "business",
    "1": "sport",
    "2": "politics",
    "3": "entertainment",
    "4": "tech"
  },
  "initializer_range": 0.02,
  "label2id": {
    "business": 0,
    "entertainment": 3,
    "politics": 2,
    "sport": 1,
    "tech": 4
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.22.1",
  "vocab_size": 30522
}

loading weights file outputs/TBERT\pytorch_model.bin
All model ch

In [39]:
nlp("The pound has risen to its highest level for two weeks on hopes Kwasi Kwarteng will bring forward details of how he will cut debt.")

[{'label': 'business', 'score': 0.961361825466156}]

По качеству на валидационной выборке видно, что DistilBert с оценкой 0.985 сработал лучше, чем логистическая регрессия с оценкой 0.964