# 1. Библиотеки

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import random
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import string
from nltk.corpus import stopwords 
from typing import List, Dict
from tqdm import tqdm
from torch import save
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from collections import Counter
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import classification_report
import torch.nn.functional as F


# Seed every random number generator this notebook relies on (Python, NumPy,
# PyTorch) so that shuffling and weight initialisation are repeatable from a
# fresh kernel.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


# 2. Чтение данных

In [2]:
# Column names applied to both splits (the same list object is assigned to each frame).
colms = ["Class Index", "Title", "Description"]

train = pd.read_csv("train.csv", encoding="latin")
train.columns = colms

test = pd.read_csv("test.csv", encoding="latin")
test.columns = colms

# Schema / non-null overview of each split, separated by blank lines.
train.info()
print("\n")
test.info()


# Display names of the four categories, in label order (label k -> classes[k - 1];
# the collate function subtracts 1 from "Class Index" before training).
classes = ['World', 'Sports', 'Business', 'Sci/Tech']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Class Index  120000 non-null  int64 
 1   Title        120000 non-null  object
 2   Description  120000 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.7+ MB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7600 entries, 0 to 7599
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Class Index  7600 non-null   int64 
 1   Title        7600 non-null   object
 2   Description  7600 non-null   object
dtypes: int64(1), object(2)
memory usage: 178.2+ KB


In [67]:
# Preview the first two rows of the renamed training frame
train.head(2)

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...


# 3. Предобработка

In [3]:
# Work on independent copies: the preprocessing cells below add and overwrite
# columns, and with plain aliases those changes would also land in the raw
# `train` / `test` frames. Re-running this cell restarts preprocessing from
# the data exactly as loaded.
train_df = train.copy()
test_df = test.copy()

In [4]:
def combine_text(row):
    """Join a record's title and description into a single string.

    `row` is anything indexable by the keys 'Title' and 'Description'
    (a DataFrame row when used through `DataFrame.apply(..., axis=1)`).
    The two parts are separated by " - ".
    """
    title, description = row['Title'], row['Description']
    return "{} - {}".format(title, description)

# Model input: title and description of every record merged by combine_text.
train_df['Text'], test_df['Text'] = (
    frame.apply(combine_text, axis=1) for frame in (train_df, test_df)
)


# Alternative input kept for reference: the description on its own.
# train_df['Text'] = train_df["Description"]
# test_df['Text'] = test_df["Description"]
train_df.head(5)

Unnamed: 0,Class Index,Title,Description,Text
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...",Wall St. Bears Claw Back Into the Black (Reute...
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...,Carlyle Looks Toward Commercial Aerospace (Reu...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,Oil and Economy Cloud Stocks' Outlook (Reuters...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,Iraq Halts Oil Exports from Main Southern Pipe...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco...","Oil prices soar to all-time record, posing new..."


In [5]:
# Step 1: text cleaning

# English stop-word list from NLTK, kept as a set for fast membership tests.
STOPWORDS = set(stopwords.words('english'))

# Punctuation characters to strip; both names refer to the same string.
punctuations_list = english_punctuations = string.punctuation

def clean_text(text):
    """Aggressive text normaliser: collapses repeats, strips mentions, links,
    digits, HTML tags, stop words and punctuation, in that order.

    NOTE(review): a second `clean_text` is defined immediately after this one
    in the same cell, so this version is replaced before anything calls it.
    Treat it as dead code unless the later definition is removed.
    """
    # Collapse every run of one repeated character into a single character.
    # NOTE(review): this also rewrites "www" -> "w" and "//" -> "/", so the
    # link patterns further down can no longer match the links they target.
    text =  re.sub(r'(.)\1+', r'\1', text)

    # Replace @-prefixed tokens with a space
    text =  re.sub(r'@[^\s]+', ' ', text)

    # Replace www... / http(s)://... tokens with a space
    text =  re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))',' ', text)

    # Drop all digit runs
    text =  re.sub(r'[0-9]+', '', text)

    # Strip HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Strip links (same targets as the substitution above, replaced by '' instead of ' ')
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Drop stop words. The comparison is case-sensitive and happens before
    # punctuation is removed, so e.g. "The" or "the," are compared as written.
    text = " ".join([word for word in str(text).split() if word not in STOPWORDS])
    
    # Delete every character listed in punctuations_list
    translator = str.maketrans('', '', punctuations_list)
    text = text.translate(translator)
    
    return text

def clean_text(text):
    """Remove HTML tags and web links from `text`; everything else is kept as is.

    Matches are deleted outright (replaced by the empty string), so
    surrounding whitespace is left untouched.
    """
    # Applied in this order: tags first, then http(s):// and www. links.
    for pattern in (r'<.*?>', r'https?://\S+|www\.\S+'):
        text = re.sub(pattern, '', text)
    return text


# Clean the combined text of both splits
train_df['Text'], test_df['Text'] = (
    train_df['Text'].apply(clean_text),
    test_df['Text'].apply(clean_text),
)

In [6]:
class NewsDataset(Dataset):
  """Map-style dataset yielding `(class_index, text)` pairs from a DataFrame.

  The frame must provide the columns 'Class Index' and 'Text'. Both columns
  are copied into plain Python lists when the dataset is created, so item
  access is a list lookup instead of a per-sample `DataFrame.iloc` call
  (which builds a new Series for every sample). Changes made to the frame
  after construction are therefore not visible through the dataset;
  re-create it after re-running preprocessing.
  """

  def __init__(self, df):
    self.n_samples = len(df)
    self.dataframe = df  # kept for callers that want the source frame
    # Column snapshots used by __getitem__
    self._labels = df['Class Index'].tolist()
    self._texts = df['Text'].tolist()

  def __getitem__(self, index):
    """Return `(class_index, text)` for position `index`.

    Raises IndexError when `index` is out of range, which is also what ends
    a plain `for label, text in dataset` loop.
    """
    return self._labels[index], self._texts[index]

  def __len__(self):
    """Number of samples captured at construction time."""
    return self.n_samples

In [7]:
# Wrap the preprocessed training and test frames as torch datasets
train_dataset, test_dataset = NewsDataset(train_df), NewsDataset(test_df)

In [None]:
# !pip install torchtext==0.5.0

Collecting torchtext==0.5.0
  Downloading torchtext-0.5.0-py3-none-any.whl (73 kB)
     -------------------------------------- 73.2/73.2 kB 669.4 kB/s eta 0:00:00
Installing collected packages: torchtext
Successfully installed torchtext-0.5.0



[notice] A new release of pip available: 22.3.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
import torch
from torchtext.data.utils import get_tokenizer
import collections
import torchtext
from torch.utils.data import Dataset, DataLoader
import pandas as pd

# torchtext's 'basic_english' tokenizer; the same callable is used to build the vocabulary and to encode texts
tokenizer = get_tokenizer('basic_english')

In [9]:
# Count token occurrences over the training texts only (the test split is not used here).
counter = collections.Counter()
for (label, line) in train_dataset:
    # ngrams=1 -> presumably plain unigram tokens; confirm against the torchtext version in use
    counter.update(torchtext.data.utils.ngrams_iterator(tokenizer(line), ngrams=1))
# Build the token <-> index vocabulary from the counts.
# NOTE(review): this is the legacy `Vocab(counter, ...)` constructor (a commented-out cell
# pins torchtext==0.5.0); which special tokens occupy the first indices depends on that version.
vocab = torchtext.vocab.Vocab(counter, min_freq=1)

In [10]:
# Number of vocabulary entries; passed to the model constructors as the embedding table size
vocab_size = len(vocab)
print(f"Vocab size if {vocab_size}")

def encode(x):
    """Tokenize the string `x` and return the vocabulary index of each token."""
    indices = []
    for token in tokenizer(x):
        indices.append(vocab.stoi[token])
    return indices

def decode(x):
    """Map an iterable of vocabulary indices back to their token strings."""
    tokens = []
    for idx in x:
        tokens.append(vocab.itos[idx])
    return tokens

Vocab size if 95131


In [11]:
def padify(b):
    """Collate a batch of `(class_index, text)` samples into two tensors.

    Args:
        b: list of samples as produced by the dataset, i.e. pairs whose first
           element is the 1-based class index and whose second is the text.

    Returns:
        A tuple `(labels, features)`:
        * labels   - LongTensor of shape [batch] with 0-based class ids
                     (class index minus 1);
        * features - LongTensor of shape [batch, longest_sequence] holding the
                     encoded texts, right-padded with 0.

    NOTE(review): 0 is used as the padding value but is also a regular
    vocabulary index; check `vocab.itos[0]` to see which token it shares.
    """
    labels = torch.tensor([sample[0] - 1 for sample in b], dtype=torch.long)
    # dtype is forced to long: without it a text that yields no tokens becomes
    # an empty *float* tensor, which cannot be batched with integer sequences
    # or fed to an embedding layer.
    sequences = [torch.tensor(encode(sample[1]), dtype=torch.long) for sample in b]
    features = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=0)
    return labels, features

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class RNNClassifierWithAttention(nn.Module):
    """LSTM text classifier that pools the LSTM outputs with learned attention.

    Pipeline: token ids -> embedding -> single-layer LSTM -> one attention
    score per time step (tanh of a linear layer) -> softmax over time ->
    weighted sum of the LSTM outputs -> linear classification layer.

    Args:
        vocab_size:    number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim:    LSTM hidden size (also the size of the pooled vector).
        num_class:     number of output classes.
        padding_idx:   optional id of the padding token. When given, padded
                       positions are excluded from the attention softmax and
                       the embedding row for that id is held at zero. The
                       default (None) keeps every position, as before.
                       Parameter names and shapes do not depend on this
                       argument, so existing checkpoints still load.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_class, padding_idx=None):
        super().__init__()

        self.padding_idx = padding_idx
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)  # token ids -> vectors
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)  # sequence encoder
        self.attention = nn.Linear(hidden_dim, 1)  # one score per time step
        self.fc = nn.Linear(hidden_dim, num_class)  # classification layer

    def forward(self, x):
        """Return class logits of shape [batch_size, num_class] for token ids `x` of shape [batch_size, seq_len]."""
        embedded = self.embedding(x)    # [batch_size, seq_len, embedding_dim]
        rnn_out, _ = self.rnn(embedded)  # [batch_size, seq_len, hidden_dim]

        attention_scores = torch.tanh(self.attention(rnn_out))  # [batch_size, seq_len, 1]
        if self.padding_idx is not None:
            # Push padded positions to the lowest finite score so they get
            # zero weight after the softmax. A finite value (rather than -inf)
            # keeps a row made only of padding from producing NaNs.
            pad_mask = (x == self.padding_idx).unsqueeze(-1)
            attention_scores = attention_scores.masked_fill(
                pad_mask, torch.finfo(attention_scores.dtype).min
            )
        attention_weights = torch.softmax(attention_scores, dim=1)  # normalised over seq_len

        # Attention-weighted sum of the LSTM outputs
        context_vector = torch.sum(attention_weights * rnn_out, dim=1)  # [batch_size, hidden_dim]

        return self.fc(context_vector)  # [batch_size, num_class]
    
    
class CNNRNNClassifier(nn.Module):
    """Text classifier: embedding -> 1D convolution + ReLU -> LSTM -> linear layer.

    The logits are computed from the LSTM's final hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, cnn_out_channels, kernel_size, hidden_dim, num_classes):
        super().__init__()

        # Token ids -> dense vectors
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Convolution along the sequence axis; a padding of kernel_size // 2
        # leaves the sequence length unchanged for odd kernel sizes.
        self.conv1d = nn.Conv1d(embedding_dim, cnn_out_channels, kernel_size, padding=kernel_size // 2)
        # Recurrent layer consuming the convolutional features
        self.rnn = nn.LSTM(cnn_out_channels, hidden_dim, batch_first=True)
        # Classification layer
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return class logits of shape [batch_size, num_classes] for token ids `x` of shape [batch_size, seq_len]."""
        embedded = self.embedding(x)              # [batch_size, seq_len, embedding_dim]
        conv_in = embedded.transpose(1, 2)        # Conv1d expects [batch_size, channels, seq_len]
        conv_out = F.relu(self.conv1d(conv_in))   # [batch_size, cnn_out_channels, seq_len']
        lstm_in = conv_out.transpose(1, 2)        # back to [batch_size, seq_len', cnn_out_channels]

        _, (final_hidden, _) = self.rnn(lstm_in)
        logits = self.fc(final_hidden[-1])        # hidden state of the last LSTM layer
        return logits




# Shuffled mini-batches of 32 training samples, collated and padded by padify.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=padify)

# Run on the GPU when one is visible, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device


device(type='cuda')

# 6. Training

In [13]:
# Alternative architectures tried earlier (RNNClassifier / RNNWithAttention are
# not defined in the visible part of this notebook):
# model = RNNClassifier(vocab_size, 64, 32, 4).to(device)
# model = RNNWithAttention(vocab_size, 64, 32, 4).to(device)
# model = CNNRNNClassifier(vocab_size, embedding_dim=300, cnn_out_channels=128, kernel_size=5, hidden_dim=256, num_classes=4).to(device)
model = RNNClassifierWithAttention(vocab_size, embedding_dim=128, hidden_dim=256, num_class=4)
model = model.to(device)

trainable_params = sum(param.numel() for param in model.parameters() if param.requires_grad)
print(f"Number of parameters: {trainable_params}")


# Optimisation settings
lr = 0.001
report_freq = 200  # NOTE(review): not read by the training loop in this notebook
num_epochs = 2
batch_size = 64    # NOTE(review): the train loader is built with batch_size=32; this value is not passed to it

criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)


def train_model(model, train_loader, criterion, optimizer, num_epochs, device):
    """Train `model` for `num_epochs` passes over `train_loader`.

    Args:
        model:        the network to optimise; moved to `device` first.
        train_loader: iterable yielding `(labels, inputs)` batches, in that order.
        criterion:    loss called as `criterion(outputs, labels)`.
        optimizer:    optimiser already bound to the model's parameters.
        num_epochs:   number of passes over `train_loader`.
        device:       device the model and every batch are moved to.

    Returns:
        A list with one dict per epoch: `{"epoch", "loss", "accuracy"}`, where
        `loss` is the mean of the per-batch losses and `accuracy` is the
        running training accuracy in percent. Both are NaN for an epoch in
        which the loader produced no batches (instead of a ZeroDivisionError).

    A summary line is printed after each epoch, and "Training complete." at
    the end.
    """
    model.to(device)  # move the model to the target device (CPU/GPU)
    history = []

    for epoch in range(num_epochs):
        model.train()  # training mode
        total_loss = 0.0
        correct = 0
        total = 0
        # Batches are counted as they are seen, so the average does not rely on
        # len(train_loader) matching what the loader actually yields.
        num_batches = 0

        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False)

        for labels, inputs in progress_bar:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()  # clear gradients left from the previous step

            outputs = model(inputs)  # forward pass
            loss = criterion(outputs, labels)

            loss.backward()  # backpropagation
            optimizer.step()  # parameter update

            batch_loss = loss.item()
            total_loss += batch_loss
            num_batches += 1

            # Running accuracy over the batches seen so far in this epoch
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

            acc = correct / total
            progress_bar.set_postfix(loss=batch_loss, accuracy=acc * 100)

        avg_loss = total_loss / num_batches if num_batches else float("nan")
        final_acc = correct / total * 100 if total else float("nan")
        history.append({"epoch": epoch + 1, "loss": avg_loss, "accuracy": final_acc})
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Accuracy: {final_acc:.2f}%")

    print("Training complete.")
    return history


# Run training
train_model(model, train_loader, criterion, optimizer, num_epochs, device)

from torch import save
# Name the checkpoint after the class of the model that was just trained, so
# the file cannot end up labelled with a different architecture when another
# model is selected in the configuration cell.
save(model.state_dict(), f"{type(model).__name__}.pth")

Number of parameters: 12573317


                                                                                       

KeyboardInterrupt: 

In [None]:
#               precision    recall  f1-score   support

#        World       0.94      0.91      0.92      1900
#       Sports       0.95      0.98      0.96      1900
#     Business       0.90      0.85      0.88      1900
#     Sci/Tech       0.86      0.91      0.89      1900

#     accuracy                           0.91      7600
#    macro avg       0.91      0.91      0.91      7600
# weighted avg       0.91      0.91      0.91      7600

In [None]:
#               precision    recall  f1-score   support

#        World       0.93      0.90      0.92      1900
#       Sports       0.96      0.97      0.97      1900
#     Business       0.85      0.91      0.88      1900
#     Sci/Tech       0.90      0.86      0.88      1900

#     accuracy                           0.91      7600
#    macro avg       0.91      0.91      0.91      7600
# weighted avg       0.91      0.91      0.91      7600



# 7. Оценка модели

In [15]:
def test_model_with_accuracy(model, test_loader, vocab, classes, device):
    model.eval()  # Переводим модель в режим оценки (без градиентов)
    
    correct = 0  # Количество правильных предсказаний
    total = 0  # Общее количество примеров
    with torch.no_grad():  # Не вычисляем градиенты во время тестирования
        for batch_idx, (target, data) in enumerate(test_loader):
            
            # Перенос данных и меток на устройство
            data, target = data.to(device), target.to(device)
            
            # Получение предсказаний
            pred = model(data)
            
            # Вычисляем количество правильных предсказаний
            _, predicted = torch.max(pred, 1)
            total += target.size(0)
            correct += (predicted == target).sum().item()

        # Вычисление точности
        accuracy = 100 * correct / total
        print(f"Accuracy of the model on the test data: {accuracy:.2f}%")




# Evaluation batches. The metrics do not depend on sample order, so the test
# split is iterated in its stored order.
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, collate_fn=padify, shuffle=False)

# Load the model (the architecture must be rebuilt before its weights can be loaded)
# model = RNNClassifierWithAttention(vocab_size, 64, 32, 4).to(device)  
# model = RNNClassifier(vocab_size, 64, 32, 4).to(device)  # 90_75
model = CNNRNNClassifier(vocab_size, embedding_dim=300, cnn_out_channels=128, kernel_size=5, hidden_dim=256, num_classes=4).to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU can also be restored on a CPU-only machine.
model.load_state_dict(torch.load("models/final/CNN+RNN_final.pth", map_location=device))
model.eval()  # evaluation mode
model.to(device)


# Collect predictions and ground-truth labels over the whole test split
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        labels, text = batch
        text, labels = text.to(device), labels.to(device)

        # Predicted class = index of the largest logit
        outputs = model(text)
        predicted_classes = torch.argmax(outputs, dim=1)

        all_preds.extend(predicted_classes.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Per-class precision / recall / F1 report
report = classification_report(all_labels, all_preds, target_names=classes, digits=4)
print(report)

# test_model_with_accuracy(model, test_loader, vocab, classes, device)

  model.load_state_dict(torch.load("models/final/CNN+RNN_final.pth"))


              precision    recall  f1-score   support

       World     0.8974    0.9158    0.9065      1900
      Sports     0.9461    0.9800    0.9628      1900
    Business     0.8960    0.8568    0.8760      1900
    Sci/Tech     0.8918    0.8805    0.8861      1900

    accuracy                         0.9083      7600
   macro avg     0.9078    0.9083    0.9078      7600
weighted avg     0.9078    0.9083    0.9078      7600



In [93]:
# Name the checkpoint after the class of the model actually held in `model`.
# When the notebook is run top to bottom, `model` at this point is whatever
# the evaluation cell loaded, so a fixed architecture name in the file name
# would mislabel the weights.
save(model.state_dict(), f"{type(model).__name__}_final.pth")

# TITLE + DESC без очистки
## RNN:
### train = 87.49%
### test = 90.12%

## RNN + Attention V1:
### train = 88.87%
### test = 90.74%

## RNN + Attention V2:
### train = 89.07%
### test = 90.53%

# TITLE + DESC без очистки
## RNN:
### train = 86.85%
### test = 90.75%

## RNN + Attention V1:
### train = 88.96%
### test = 91.31%

## RNN + Attention V2:
### train = 89.07%
### test = 91.57%