# 1. Библиотеки

In [1]:
import os
import pickle
import random
import re
import string
from collections import Counter
from typing import List, Dict

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from torch import save
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import BertTokenizer, BertModel
from transformers import DistilBertModel, DistilBertTokenizer


# Prefer the GPU when CUDA is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


# 2. Чтение данных

In [2]:
# Column names applied to both splits after loading.
colms = ["Class Index", "Title", "Description"]

# Human-readable class names, used as `target_names` in the reports below.
classes = ['World', 'Sports', 'Business', 'Sci/Tech']

# Read both CSV files with the 'latin' codec and give them the same header.
train = pd.read_csv("train.csv", encoding = 'latin')
test = pd.read_csv("test.csv", encoding = "latin")
train.columns = colms
test.columns = colms

# Print a schema / null-count summary for each split, separated by blank lines.
train.info()
print("\n")
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Class Index  120000 non-null  int64 
 1   Title        120000 non-null  object
 2   Description  120000 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.7+ MB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7600 entries, 0 to 7599
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Class Index  7600 non-null   int64 
 1   Title        7600 non-null   object
 2   Description  7600 non-null   object
dtypes: int64(1), object(2)
memory usage: 178.2+ KB


In [None]:
# Peek at the first two rows of the raw training frame.
train.head(2)

# 3. Предобработка

In [3]:
# Work on copies: a plain `train_df = train` is only an alias, so the column
# assignments made during preprocessing would also modify the raw frames and
# make the loading cell's displayed state stale.
train_df = train.copy()
test_df = test.copy()

In [4]:
def combine_text(row):
    """Join a row's 'Title' and 'Description' values with a single space."""
    title, description = row['Title'], row['Description']
    return "{} {}".format(title, description)

# Build the 'Text' column (title followed by description) for both splits.
# An alternative that was tried is to use the 'Description' column alone.
for frame in (train_df, test_df):
    frame['Text'] = frame.apply(combine_text, axis=1)

# Preview the first rows of the training frame with the new column.
train_df.head(5)

Unnamed: 0,Class Index,Title,Description,Text
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...",Wall St. Bears Claw Back Into the Black (Reute...
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...,Carlyle Looks Toward Commercial Aerospace (Reu...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,Oil and Economy Cloud Stocks' Outlook (Reuters...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,Iraq Halts Oil Exports from Main Southern Pipe...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco...","Oil prices soar to all-time record, posing new..."


In [5]:
# English stop words from NLTK, stored as a set for fast membership tests.
STOPWORDS = set(stopwords.words('english'))

# ASCII punctuation characters; both names are kept since either may be
# referenced by the cleaning code.
punctuations_list = english_punctuations = string.punctuation

def clean_text(text, aggressive=False):
    """Remove HTML tags and URLs from ``text``, optionally normalising further.

    This cell used to define ``clean_text`` twice, so the second (light)
    definition silently replaced the first (heavy) one. Both variants are now
    a single function; the default behaves exactly like the definition that
    was actually in effect.

    Args:
        text: Input string to clean.
        aggressive: When False (default) only HTML tags and links are removed.
            When True, @mentions and digits are removed as well, runs of a
            repeated character are collapsed to one, words found in
            ``STOPWORDS`` are dropped and the characters in
            ``punctuations_list`` are deleted.

    Returns:
        The cleaned string.
    """
    # Remove HTML tags.
    text = re.sub(r'<.*?>', '', text)
    # Remove links. This has to run before repeated characters are collapsed:
    # collapsing first turns "http://" into "htp:/" and "www" into "w", after
    # which the URL pattern can no longer match.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    if not aggressive:
        return text

    # Replace @mentions with a space.
    text = re.sub(r'@[^\s]+', ' ', text)
    # Drop digits.
    text = re.sub(r'[0-9]+', '', text)
    # Collapse any run of the same character into a single occurrence.
    text = re.sub(r'(.)\1+', r'\1', text)
    # Drop stop words (this also normalises whitespace to single spaces).
    text = " ".join(word for word in text.split() if word not in STOPWORDS)
    # Delete punctuation characters.
    return text.translate(str.maketrans('', '', punctuations_list))


# Clean the combined text of both splits, overwriting the 'Text' column.
for frame in (train_df, test_df):
    frame['Text'] = frame['Text'].apply(clean_text)

# 4. BERT model

In [6]:
# Select the compute device: CUDA when available, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Full-size BERT: tokenizer plus encoder moved to the selected device.
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")
bert_model = bert_model.to(device)

# DistilBERT: tokenizer plus encoder moved to the selected device.
distil_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
distil_bert_model = DistilBertModel.from_pretrained("distilbert-base-uncased")
distil_bert_model = distil_bert_model.to(device)

# Switch both encoders to evaluation mode (inference behaviour for layers such
# as dropout). Note that eval() does not by itself freeze the weights;
# gradients are disabled separately with torch.no_grad() during extraction.
bert_model.eval()
# Last expression of the cell, so the DistilBERT module repr is displayed.
distil_bert_model.eval()

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

# 5. Feature extraction

In [16]:
# Feature extraction helper.
def extract_features_in_batches(texts, tokenizer, bert_model, batch_size=32, max_length=128, info="data"):
    """Encode texts batch by batch and return the first-token embeddings.

    Each batch is tokenized (padded and truncated to ``max_length``), run
    through ``bert_model`` under ``torch.no_grad()``, and the last hidden
    state of the first token (the [CLS] position) is collected.

    Args:
        texts: Sequence of strings to encode; it is materialised with
            ``list(texts)`` so any iterable of strings is accepted.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors (``return_tensors="pt"``).
        bert_model: Encoder whose output exposes ``last_hidden_state``.
        batch_size: Number of texts per forward pass.
        max_length: Token length every text is padded/truncated to.
        info: Label shown in the progress-bar description.

    Returns:
        np.ndarray: One row per input text, stacked in input order.

    Raises:
        ValueError: If ``texts`` is empty.
    """
    texts = list(texts)
    if not texts:
        # np.vstack([]) would raise an opaque ValueError; fail early and clearly.
        raise ValueError("extract_features_in_batches: `texts` is empty, nothing to encode")

    # Use the device the model actually lives on rather than a notebook-level
    # global, so the function works regardless of surrounding cell state.
    model_device = next(bert_model.parameters()).device

    features = []
    # No trailing colon in `desc`: the progress bar already appends one, and
    # the previous description rendered as "...train_data::".
    for start in tqdm(range(0, len(texts), batch_size), desc=f"Extracting BERT features for {info}"):
        batch_texts = texts[start:start + batch_size]
        tokens = tokenizer(batch_texts, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
        input_ids = tokens["input_ids"].to(model_device)
        attention_mask = tokens["attention_mask"].to(model_device)

        with torch.no_grad():
            outputs = bert_model(input_ids, attention_mask=attention_mask)
            # Hidden state at position 0 of every sequence, i.e. the [CLS] token.
            cls_embeddings = outputs.last_hidden_state[:, 0, :]
            features.append(cls_embeddings.cpu().numpy())

    return np.vstack(features)


# Inputs are the cleaned texts and targets are the class indices; the
# train/test split is the one given by the two source frames.
X_train, y_train = train_df["Text"], train_df["Class Index"]
X_test, y_test = test_df["Text"], test_df["Class Index"]

# Encoder / tokenizer pair used for feature extraction below.
use_model, use_tokenizer = distil_bert_model, distil_tokenizer

# 6. Загрузка/Извлечение признаков

In [17]:

# Create the output directory before the long extraction, so a missing
# "features" folder cannot make the save step fail after all the work is done.
os.makedirs("features", exist_ok=True)

X_train_features = extract_features_in_batches(X_train.tolist(), use_tokenizer, use_model, batch_size=32, max_length=64, info="train_data")
X_test_features = extract_features_in_batches(X_test.tolist(), use_tokenizer, use_model, batch_size=32, max_length=64, info="test_data")
# Save the features together with their labels.
with open("features/train_features_full.pkl", "wb") as f:
    pickle.dump((X_train_features, y_train), f)
with open("features/test_features_full.pkl", "wb") as f:
    pickle.dump((X_test_features, y_test), f)

Extracting BERT features for train_data::   1%|          | 38/3750 [00:06<11:01,  5.61it/s]


KeyboardInterrupt: 

In [18]:
try:
    # Load cached features and labels.
    # NOTE: pickle can execute arbitrary code on load; only read files that
    # were written by this notebook.
    with open("features/train_features.pkl", "rb") as f:
        X_train_features, y_train = pickle.load(f)
    with open("features/test_features.pkl", "rb") as f:
        X_test_features, y_test = pickle.load(f)

except Exception as ex:
    # Best-effort cache: on any load failure report it and recompute.
    print(ex)

    # Ensure the cache directory exists before the long extraction, otherwise
    # a missing "features" folder makes the save below fail after the work.
    os.makedirs("features", exist_ok=True)

    # Extract the features.
    X_train_features = extract_features_in_batches(X_train.tolist(), use_tokenizer, use_model, batch_size=32, max_length=64, info="train_data")
    X_test_features = extract_features_in_batches(X_test.tolist(), use_tokenizer, use_model, batch_size=32, max_length=64, info="test_data")

    # Save them for the next run.
    with open("features/train_features.pkl", "wb") as f:
        pickle.dump((X_train_features, y_train), f)
    with open("features/test_features.pkl", "wb") as f:
        pickle.dump((X_test_features, y_test), f)

# 7. Классификация

## 7.1. LogisticRegression

In [19]:
# Fit a logistic-regression classifier on the extracted embeddings
# (an SVM or a decision tree could be swapped in here instead).
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_features, y_train)

# Predict on the test features and compute the metrics.
y_pred = clf.predict(X_test_features)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=classes, digits=4)

# Report overall accuracy followed by the per-class breakdown.
print(f"Accuracy: {accuracy:.4f}")
print(report)

Accuracy: 0.9105
              precision    recall  f1-score   support

       World     0.9283    0.9000    0.9139      1900
      Sports     0.9709    0.9837    0.9773      1900
    Business     0.8603    0.8721    0.8662      1900
    Sci/Tech     0.8831    0.8863    0.8847      1900

    accuracy                         0.9105      7600
   macro avg     0.9107    0.9105    0.9105      7600
weighted avg     0.9107    0.9105    0.9105      7600



## 7.2. LightGBM

In [15]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

# Gradient-boosted trees on the same embeddings, with a fixed seed.
model = lgb.LGBMClassifier(n_estimators=50, random_state=42)
model.fit(X_train_features, y_train)

# Predict on the test features and compute the metrics.
y_pred = model.predict(X_test_features)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=classes, digits=4)

# Report overall accuracy followed by the per-class breakdown.
print(f"Accuracy: {accuracy:.4f}")
print(report)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.822280 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 120000, number of used features: 768
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
Accuracy: 0.8691
              precision    recall  f1-score   support

       World     0.8902    0.8705    0.8803      1900
      Sports     0.9299    0.9500    0.9399      1900
    Business     0.8193    0.8211    0.8202      1900
    Sci/Tech     0.8361    0.8347    0.8354      1900

    accuracy                         0.8691      7600
   macro avg     0.8689    0.8691    0.8689      7600
weighted avg     0.8689    0.8691    0.8689      7600



In [None]:
# !pip install torchtext==0.5.0