# 1. Библиотеки

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import random
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import string
from nltk.corpus import stopwords 
from typing import List, Dict
from tqdm import tqdm
from torch import save
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from collections import Counter
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import classification_report
import torch.nn.functional as F
import seaborn as sns
import matplotlib.pyplot as plt
import nltk  
from nltk.util import ngrams
from wordcloud import WordCloud



# Проверка наличия GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


# 2. Чтение данных

In [5]:
base_path = "../NLP_BBCNEWS/"

df_solution = pd.read_csv(f"{base_path}BBC News Sample Solution.csv")
df_test =  pd.read_csv(f"{base_path}BBC News Test.csv")
df_train = pd.read_csv(f"{base_path}BBC News Train.csv")
print(df_train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  1490 non-null   int64 
 1   Text       1490 non-null   object
 2   Category   1490 non-null   object
dtypes: int64(1), object(2)
memory usage: 35.0+ KB
None


In [6]:
transformation_dict = {
    'business': 4,        # 'business' - 2 = business
    'tech': 3,            # 'tech' - 3 = sci/tech
    'politics': 1,        # 'politics' - 0 = world
    'sport': 2,           # 'sport' - 1 = sport
}

df_train['category_id'] = df_train.Category.map(transformation_dict)
df_train = df_train.drop(columns=["ArticleId"])
df_train = df_train.dropna().reset_index().drop(columns=["index"])

df_train["category_id"] = df_train["category_id"].astype(np.int8)
df_train["category_id"].value_counts()

category_id
2    346
4    336
1    274
3    261
Name: count, dtype: int64

In [7]:
def clean_text(text):
    
    # Удаление HTML-тегов
    text = re.sub(r'<.*?>', '', text)
    # Удаление ссылок
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    
    return text


# Пример очистки текста
df_train['Text'] = df_train['Text'].apply(clean_text)

In [8]:
class NewsDataset(Dataset):
  def __init__(self,df):
    self.n_samples = len(df)
    self.dataframe = df 
  def __getitem__(self, index):
    row = self.dataframe.iloc[index]
    return row['category_id'], row['Text']  
  def __len__(self):
    return self.n_samples
  


# Разделение датасета (80% - train, 20% - test)
df_train, df_test = train_test_split(df_train, test_size=0.2, random_state=42)

print(f"Размер train: {len(df_train)}, Размер test: {len(df_test)}")

train_dataset = NewsDataset(df_train)
test_dataset = NewsDataset(df_test)

Размер train: 973, Размер test: 244


In [9]:
import torch
from torchtext.data.utils import get_tokenizer
import collections
import torchtext
from torch.utils.data import Dataset, DataLoader
import pandas as pd

tokenizer = get_tokenizer('basic_english')

In [10]:
counter = collections.Counter()
for (label, line) in train_dataset:
    counter.update(torchtext.data.utils.ngrams_iterator(tokenizer(line), ngrams=1))
vocab = torchtext.vocab.Vocab(counter, min_freq=1)

In [11]:
vocab_size = len(vocab)
print(f"Vocab size if {vocab_size}")

def encode(x):
    return [vocab.stoi[s] for s in tokenizer(x)]

def decode(x):
    return [vocab.itos[i] for i in x]

Vocab size if 21418


In [12]:
def padify(b):
    v = [encode(x[1]) for x in b]
    l = max(map(len,v))
    return ( # tuple of two tensors - labels and features
        torch.LongTensor([t[0]-1 for t in b]),
        torch.stack([torch.nn.functional.pad(torch.tensor(t),(0,l-len(t)),mode='constant',value=0) for t in v])
    )

In [None]:
class CNNRNNClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, cnn_out_channels, kernel_size, hidden_dim, num_classes):
        super(CNNRNNClassifier, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)  # Слой эмбеддинга
        self.conv1d = nn.Conv1d(in_channels=embedding_dim, 
                               out_channels=cnn_out_channels, 
                               kernel_size=kernel_size, 
                               padding=kernel_size // 2)  # 1D свертка
        
        self.rnn = nn.LSTM(input_size=cnn_out_channels, hidden_size=hidden_dim, batch_first=True)
        
        self.fc = nn.Linear(hidden_dim, num_classes)  # Полносвязный слой
        
    def forward(self, x):
        x = self.embedding(x)  # Преобразование в эмбеддинги [batch_size, seq_len, embedding_dim]
        x = x.permute(0, 2, 1)  # Перестановка для Conv1d [batch_size, embedding_dim, seq_len]
        
        x = F.relu(self.conv1d(x))  # Применение свертки и активации ReLU
        x = x.permute(0, 2, 1)  # Обратно в форму [batch_size, seq_len, cnn_out_channels]
        
        rnn_out, (hidden, cell) = self.rnn(x)  # Передача в LSTM
        output = self.fc(hidden[-1])  # Используем последний скрытый слой
        
        return output




device(type='cuda')

# 6. Training

In [51]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, collate_fn=padify, shuffle=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device


# Загрузка модели (определите вашу архитектуру перед загрузкой)
model = CNNRNNClassifier(vocab_size, embedding_dim=300, cnn_out_channels=128, kernel_size=5, hidden_dim=256, num_classes=4).to(device)

old_state_dict = torch.load("models/final/CNN+RNN_final.pth")
# Удаляем веса эмбеддингов, так как они несовместимы
del old_state_dict['embedding.weight']

# Загружаем остальные веса
model.load_state_dict(old_state_dict, strict=False)



optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

num_epochs = 5

for epoch in range(num_epochs):
    correct = 0
    total = 0
    epoch_loss = 0.0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False)

    for batch in progress_bar:
        labels, text = batch
        text, labels = text.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(text)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()


        # Расчет accuracy
        _, predicted = torch.max(outputs, 1)  # Получаем предсказанные классы
        correct += (predicted == labels).sum().item()
        total += labels.size(0)
    
        acc = correct / total  # Текущая точность

        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=epoch_loss / (progress_bar.n + 1), accuracy=acc * 100)
    final_acc = correct / total * 100
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss / len(train_loader):.4f}, acc: {final_acc}")


  old_state_dict = torch.load("models/final/CNN+RNN_final.pth")
                                                                                    

Epoch 1/5, Loss: 1.4390, acc: 33.19630010277492


                                                                                    

Epoch 2/5, Loss: 1.2593, acc: 45.83761562178829


                                                                                    

Epoch 3/5, Loss: 1.1799, acc: 50.668036998972255


                                                                                    

Epoch 4/5, Loss: 1.0856, acc: 53.340184994861254


                                                                                     

Epoch 5/5, Loss: 1.0636, acc: 53.23741007194245




In [53]:

test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, collate_fn=padify, shuffle=True)

model.eval()  # Устанавливаем модель в режим оценки
model.to(device)

classes = ['World', 'Sports', 'Business', 'Sci/Tech']

# Получаем прогнозы и истинные метки
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        labels, text = batch
        text, labels = text.to(device), labels.to(device)

        # Получаем выходы модели
        outputs = model(text)
        predicted_classes = torch.argmax(outputs, dim=1)

        all_preds.extend(predicted_classes.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Создаем отчет классификации
report = classification_report(all_labels, all_preds, target_names=classes)
print(report)


              precision    recall  f1-score   support

       World       0.39      0.81      0.52        70
      Sports       0.67      0.93      0.78        59
    Business       0.36      0.11      0.17        44
    Sci/Tech       0.00      0.00      0.00        71

    accuracy                           0.48       244
   macro avg       0.35      0.47      0.37       244
weighted avg       0.34      0.48      0.37       244



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
#               precision    recall  f1-score   support

#        World       0.94      0.91      0.92      1900
#       Sports       0.95      0.98      0.96      1900
#     Business       0.90      0.85      0.88      1900
#     Sci/Tech       0.86      0.91      0.89      1900

#     accuracy                           0.91      7600
#    macro avg       0.91      0.91      0.91      7600
# weighted avg       0.91      0.91      0.91      7600

In [None]:
#               precision    recall  f1-score   support

#        World       0.93      0.90      0.92      1900
#       Sports       0.96      0.97      0.97      1900
#     Business       0.85      0.91      0.88      1900
#     Sci/Tech       0.90      0.86      0.88      1900

#     accuracy                           0.91      7600
#    macro avg       0.91      0.91      0.91      7600
# weighted avg       0.91      0.91      0.91      7600

