In [None]:
import os
import torch
import time
import re
import spacy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, roc_auc_score

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer


!pip install datasets
import datasets
!pip install transformers
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification
from transformers import DistilBertConfig
from transformers import TrainingArguments
from transformers import Trainer

import warnings
warnings.filterwarnings("ignore")

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [4]:
src_path = './drive/MyDrive/Colab Notebooks/bbc/'

In [5]:
def extract_file_content(src_path):
    targets = []
    texts = []

    for folder in os.listdir(src_path):
        target = os.path.basename(folder)

        category_folder = os.path.join(src_path, folder)
        for file in os.listdir(category_folder):
            data = ''
            with open(os.path.join(category_folder, file), mode='r', encoding='utf8', errors='ignore') as f:
                file_content = f.readlines()
                data = file_content
                data = ' '.join(data)
            targets.append(target)
            texts.append(data)

    return texts, targets

In [6]:
texts, targets = extract_file_content(src_path)
data_dict = {'texts': texts, 'target': targets}
df = pd.DataFrame(data_dict)
le = sklearn.preprocessing.LabelEncoder()
df['label'] = le.fit_transform(df['target'])
df.to_csv('./bbc.csv')
df.head()

Unnamed: 0,texts,target,label
0,Elton plays Paris charity concert\n \n Sir Elt...,entertainment,1
1,DVD review: Spider-Man 2\n \n It's a universal...,entertainment,1
2,Legendary music studio to close\n \n The New Y...,entertainment,1
3,New Harry Potter tops book chart\n \n Harry Po...,entertainment,1
4,Eminem secret gig venue revealed\n \n Rapper E...,entertainment,1


In [7]:
def clean_data(text, stopwords = set(stopwords.words('english'))):
    text = text.lower()
    text = re.sub('[^\w\s]','', text)

    words = [w for w in word_tokenize(text) if (w not in stopwords)]
    return ' '.join(words)

In [8]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def lemmatize(text):
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc])

In [9]:
df['text'] = df['texts'].map(clean_data)
df['text'] = df['texts'].map(lemmatize)

In [10]:
df.head()

Unnamed: 0,texts,target,label,text
0,Elton plays Paris charity concert\n \n Sir Elt...,entertainment,1,elton play Paris charity concert \n \n Sir El...
1,DVD review: Spider-Man 2\n \n It's a universal...,entertainment,1,dvd review : Spider - Man 2 \n \n it be a uni...
2,Legendary music studio to close\n \n The New Y...,entertainment,1,legendary music studio to close \n \n the New...
3,New Harry Potter tops book chart\n \n Harry Po...,entertainment,1,New Harry Potter top book chart \n \n Harry P...
4,Eminem secret gig venue revealed\n \n Rapper E...,entertainment,1,Eminem secret gig venue reveal \n \n Rapper E...


In [11]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2))
X = tfidf.fit_transform(df['text']).toarray()
y = df['label']
X.shape, y.shape

((2225, 30966), (2225,))

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.2,
                                                    random_state = 42,
                                                    stratify=y)

In [13]:
clf = RandomForestClassifier(n_estimators=200, random_state=42)

In [None]:
clf.fit(X_train, y_train)

In [15]:
y_pred_proba = clf.predict_proba(X_test)
y_pred = clf.predict(X_test)

In [16]:
f1_score(y_test, y_pred, average='micro')

0.9393258426966292

In [17]:
roc_auc_score(y_test, y_pred_proba, multi_class='ovr')

0.9954265263245968

In [18]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [19]:
X_train, X_test, y_train, y_test = train_test_split(df['text'],
                                                    y,
                                                    test_size = 0.2,
                                                    random_state = 42,
                                                    stratify=y)

In [20]:
train_enc = tokenizer(X_train.tolist(), truncation=True, padding=True, return_tensors="pt")
test_enc = tokenizer(X_test.tolist(), truncation=True, padding=True, return_tensors="pt")

In [21]:
class BBCDataset(torch.utils.data.Dataset):

    def __init__(self, encs, labels):
        self.encs = encs
        self.labels = labels

    def __getitem__(self, idx):
        item = { key: torch.tensor(val[idx]) for key, val in self.encs.items() }
        item['labels'] = torch.tensor(self.labels[idx], device=device)
        return item

    def __len__(self):
        return len(self.labels)

In [22]:
train_dataset = BBCDataset(train_enc, y_train.tolist())
test_dataset = BBCDataset(test_enc, y_test.tolist())

In [23]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=24, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=24, shuffle=True)

In [None]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=df['target'].nunique())
model.classifier 
model.to(device)

In [25]:
training_args = TrainingArguments (
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    do_train=True,
    do_eval=True,
    logging_strategy='steps',
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='steps'
)

In [26]:
optim = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
metric = datasets.load_metric('accuracy')

In [32]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [33]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    optimizers=(optim, None),
    compute_metrics=compute_metrics
)

In [34]:
trainer.train()

***** Running training *****
  Num examples = 1780
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 112


Step,Training Loss,Validation Loss,Accuracy
10,1.6408,1.654312,0.179775
20,1.6241,1.729082,0.229213
30,1.7115,1.63226,0.229213
40,1.6195,1.624896,0.229213
50,1.6568,1.626637,0.229213
60,1.6396,1.617317,0.229213
70,1.6303,1.609732,0.229213
80,1.594,1.629173,0.173034
90,1.6061,1.637583,0.229213
100,1.6374,1.621746,0.229213


***** Running Evaluation *****
  Num examples = 445
  Batch size = 16
***** Running Evaluation *****
  Num examples = 445
  Batch size = 16
***** Running Evaluation *****
  Num examples = 445
  Batch size = 16
***** Running Evaluation *****
  Num examples = 445
  Batch size = 16
***** Running Evaluation *****
  Num examples = 445
  Batch size = 16
***** Running Evaluation *****
  Num examples = 445
  Batch size = 16
***** Running Evaluation *****
  Num examples = 445
  Batch size = 16
***** Running Evaluation *****
  Num examples = 445
  Batch size = 16
***** Running Evaluation *****
  Num examples = 445
  Batch size = 16
***** Running Evaluation *****
  Num examples = 445
  Batch size = 16
***** Running Evaluation *****
  Num examples = 445
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=112, training_loss=1.6370792772088731, metrics={'train_runtime': 9480.8422, 'train_samples_per_second': 0.188, 'train_steps_per_second': 0.012, 'total_flos': 235804584652800.0, 'train_loss': 1.6370792772088731, 'epoch': 1.0})

In [35]:
save_directory = "./models/DistilBertModel"
model.save_pretrained(save_directory)

Configuration saved in ./models/DistilBertModel/config.json
Model weights saved in ./models/DistilBertModel/pytorch_model.bin


In [36]:
model = DistilBertForSequenceClassification.from_pretrained(save_directory)

loading configuration file ./models/DistilBertModel/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.22.2",
  "vocab_size": 30522
}

loading weights file ./models/DistilBertModel/pytorch_model.

In [37]:
def compute_accuracy(model, data_loader, device):
    with torch.no_grad():
        correct_pred, num_examples = 0, 0
        for batch_idx, batch in enumerate(data_loader):
            input_idx = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_idx, attention_mask=attention_mask)
            logits = outputs['logits']
            predicted_labels = torch.argmax(logits, 1)
            num_examples += labels.size(0)
            correct_pred += (predicted_labels == labels).sum()
    return correct_pred.float()/num_examples * 100

In [39]:
model.eval()
model.to(device)
compute_accuracy(model, test_loader, device)

tensor(22.9213)

На таком относительно небольшом наборе данных классические методы машинного обучения показывают хороший результат и сравнительно быстро обучаются (в моем случае это случайный лес на 200 деревье). Берт-Трансформер, в свою очередь, довольно тяжелая модель и процесс дообучения требует больших вычислительных мощностей именно поэтому я указал лишь одну эпоху (гпу коллаб не дал, не знаю почему просто я каким-то образом "исчерпал свой лимит"), т.к. на цпу даже одна эпоха выполняется 2 часа. Вероятнее всего, из-за слабого дообучения и качество оставляет желать лучшего.