# 1. Библиотеки

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import random
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import string
from nltk.corpus import stopwords 
from typing import List, Dict
from tqdm import tqdm
from torch import save
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from collections import Counter
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertModel
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle
from transformers import DistilBertModel, DistilBertTokenizer
from sklearn.naive_bayes import MultinomialNB


# Проверка наличия GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


# 2. Чтение данных

In [2]:
base_path = "../NLP_BBCNEWS/"

df_solution = pd.read_csv(f"{base_path}BBC News Sample Solution.csv")
df_test =  pd.read_csv(f"{base_path}BBC News Test.csv")
df_train = pd.read_csv(f"{base_path}BBC News Train.csv")
print(df_train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  1490 non-null   int64 
 1   Text       1490 non-null   object
 2   Category   1490 non-null   object
dtypes: int64(1), object(2)
memory usage: 35.0+ KB
None


In [3]:
transformation_dict = {
    'business': 4,        # 'business' - 2 = business
    'tech': 3,            # 'tech' - 3 = sci/tech
    'politics': 1,        # 'politics' - 0 = world
    'sport': 2,           # 'sport' - 1 = sport
}

df_train['category_id'] = df_train.Category.map(transformation_dict)
df_train = df_train.drop(columns=["ArticleId"])
df_train = df_train.dropna().reset_index().drop(columns=["index"])

df_train["category_id"] = df_train["category_id"].astype(np.int8)
df_train["category_id"].value_counts()

category_id
2    346
4    336
1    274
3    261
Name: count, dtype: int64

# 3. Предобработка

In [4]:
def clean_text(text):
    
    # Удаление HTML-тегов
    text = re.sub(r'<.*?>', '', text)
    # Удаление ссылок
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    
    return text


# Пример очистки текста
df_train['Text'] = df_train['Text'].apply(clean_text)

# 4. BERT model

In [5]:
# Инициализация BERT
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased").to(device)

distil_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
distil_bert_model = DistilBertModel.from_pretrained("distilbert-base-uncased").to(device)

bert_model.eval()           # Устанавливаем режим оценки (чтобы заморозить веса)
distil_bert_model.eval()    # Устанавливаем режим оценки (чтобы заморозить веса)

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

# 5. Feature extraction

In [None]:
df_train

Unnamed: 0,ArticleId,Text
0,1018,qpr keeper day heads for preston queens park r...
1,1319,software watching while you work software that...
2,1138,d arcy injury adds to ireland woe gordon d arc...
3,459,india s reliance family feud heats up the ongo...
4,1020,boro suffer morrison injury blow middlesbrough...
...,...,...
730,1923,eu to probe alitalia state aid the european ...
731,373,u2 to play at grammy awards show irish rock ba...
732,1704,sport betting rules in spotlight a group of mp...
733,206,alfa romeos to get gm engines fiat is to sto...


In [10]:
# Функция для извлечения признаков
def extract_features_in_batches(texts, tokenizer, bert_model, batch_size=32, max_length=128, info="data"):
    features = []
    for i in tqdm(range(0, len(texts), batch_size), desc=f"Extracting BERT features for {info}:"):
        batch_texts = texts[i:i+batch_size]
        tokens = tokenizer(batch_texts, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
        input_ids = tokens["input_ids"].to(device)
        attention_mask = tokens["attention_mask"].to(device)

        with torch.no_grad():
            outputs = bert_model(input_ids, attention_mask=attention_mask)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]  # Извлекаем [CLS]
            features.append(cls_embeddings.cpu().numpy())
    
    return np.vstack(features)


# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(df_train["Text"], df_train["category_id"], test_size=0.15, random_state=42)

use_model = distil_bert_model
use_tokenizer = distil_tokenizer

# 6. Загрузка/Извлечение признаков

In [11]:
try:
    # Загрузка признаков
    with open("features/train_features.pkl", "rb") as f:
        X_train_features, y_train = pickle.load(f)
    with open("features/test_features.pkl", "rb") as f:
        X_test_features, y_test = pickle.load(f)

except Exception as ex:
    print(ex)
    # Извлечение признаков
    X_train_features = extract_features_in_batches(X_train.tolist(), use_tokenizer, use_model, batch_size=32, max_length=64, info="train_data")
    X_test_features = extract_features_in_batches(X_test.tolist(), use_tokenizer, use_model, batch_size=32, max_length=64, info="test_data")

    # Сохранение
    with open("features/train_features.pkl", "wb") as f:
        pickle.dump((X_train_features, y_train), f)
    with open("features/test_features.pkl", "wb") as f:
        pickle.dump((X_test_features, y_test), f)

classes = ['World', 'Sports', 'Business', 'Sci/Tech']

# 7. Классификация

## 7.1. LogisticRegression

In [14]:
# Обучение классификатора (например, логистическая регрессия)
clf = LogisticRegression(max_iter=2000)  # Также можно использовать SVM или DecisionTreeClassifier
clf.fit(X_train_features, y_train)

# Предсказание и оценка
y_pred = clf.predict(X_test_features)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
report = classification_report(y_test, y_pred, target_names=classes)
print(report)

Accuracy: 0.9105
              precision    recall  f1-score   support

       World       0.93      0.90      0.91      1900
      Sports       0.97      0.98      0.98      1900
    Business       0.86      0.87      0.87      1900
    Sci/Tech       0.88      0.89      0.88      1900

    accuracy                           0.91      7600
   macro avg       0.91      0.91      0.91      7600
weighted avg       0.91      0.91      0.91      7600



# 7.2. LGB

In [13]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
model = lgb.LGBMClassifier(n_estimators=50, random_state=42)
model.fit(X_train_features, y_train)
y_pred = model.predict(X_test_features)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
report = classification_report(y_test, y_pred, target_names=classes)
print(report)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.653342 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 120000, number of used features: 768
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
Accuracy: 0.8905
              precision    recall  f1-score   support

       World       0.91      0.88      0.89      1900
      Sports       0.96      0.98      0.97      1900
    Business       0.84      0.84      0.84      1900
    Sci/Tech       0.86      0.86      0.86      1900

    accuracy                           0.89      7600
   macro avg       0.89      0.89      0.89      7600
weighted avg       0.89      0.89      0.89      7600

