# 1. Библеотеки

In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import nltk  
from nltk.util import ngrams
from nltk.corpus import stopwords 
from collections import Counter
from wordcloud import WordCloud
import re
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import random
from sklearn.model_selection import train_test_split
import string
from typing import List, Dict
from tqdm import tqdm
from torch import save
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import classification_report

# 2. Чтение данных

In [4]:
base_path = "../NLP_BBCNEWS/"

df_solution = pd.read_csv(f"{base_path}BBC News Sample Solution.csv")
df_test =  pd.read_csv(f"{base_path}BBC News Test.csv")
df_train = pd.read_csv(f"{base_path}BBC News Train.csv")
print(df_train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  1490 non-null   int64 
 1   Text       1490 non-null   object
 2   Category   1490 non-null   object
dtypes: int64(1), object(2)
memory usage: 35.0+ KB
None


In [5]:
transformation_dict = {
    'business': 4,        # 'business' - 2 = business
    'tech': 3,            # 'tech' - 3 = sci/tech
    'politics': 1,        # 'politics' - 0 = world
    'sport': 2,           # 'sport' - 1 = sport
}

df_train['category_id'] = df_train.Category.map(transformation_dict)
df_train = df_train.drop(columns=["ArticleId"])


In [6]:
np.sum(df_train["category_id"].isnull())

273

In [7]:
df_train = df_train.dropna().reset_index().drop(columns=["index"])

In [8]:

df_train["category_id"] = df_train["category_id"].astype(np.int8)
df_train["category_id"].value_counts()

category_id
2    346
4    336
1    274
3    261
Name: count, dtype: int64

In [9]:
df_train.head(2)

Unnamed: 0,Text,Category,category_id
0,worldcom ex-boss launches defence lawyers defe...,business,4
1,german business confidence slides german busin...,business,4


In [10]:
def clean_text(text):
    
    # Удаление HTML-тегов
    text = re.sub(r'<.*?>', '', text)
    # Удаление ссылок
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    
    return text


# Пример очистки текста
df_train['Text'] = df_train['Text'].apply(clean_text)

In [11]:
class NewsDataset(Dataset):
  def __init__(self,df):
    self.n_samples = len(df)
    self.dataframe = df 
  def __getitem__(self, index):
    row = self.dataframe.iloc[index]
    return row['category_id'], row['Text']  
  def __len__(self):
    return self.n_samples
  


# Разделение датасета (80% - train, 20% - test)
df_train, df_test = train_test_split(df_train, test_size=0.2, random_state=42)

print(f"Размер train: {len(df_train)}, Размер test: {len(df_test)}")

train_dataset = NewsDataset(df_train)
test_dataset = NewsDataset(df_test)

Размер train: 973, Размер test: 244


In [12]:
import torch
from torchtext.data.utils import get_tokenizer
import collections
import torchtext
from torch.utils.data import Dataset, DataLoader
import pandas as pd

tokenizer = get_tokenizer('basic_english')

In [13]:
counter = collections.Counter()
for (label, line) in train_dataset:
    counter.update(torchtext.data.utils.ngrams_iterator(tokenizer(line), ngrams=1))
vocab = torchtext.vocab.Vocab(counter, min_freq=1)

In [14]:
vocab_size = len(vocab)
print(f"Vocab size if {vocab_size}")

def encode(x):
    return [vocab.stoi[s] for s in tokenizer(x)]

def decode(x):
    return [vocab.itos[i] for i in x]

Vocab size if 21418


In [15]:
def padify(b):
    v = [encode(x[1]) for x in b]
    l = max(map(len,v))
    return ( # tuple of two tensors - labels and features
        torch.LongTensor([t[0]-1 for t in b]),
        torch.stack([torch.nn.functional.pad(torch.tensor(t),(0,l-len(t)),mode='constant',value=0) for t in v])
    )

In [16]:

class RNNClassifierWithAttention(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_class):
        super(RNNClassifierWithAttention, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)  # Эмбеддинг
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)  # LSTM для обработки последовательностей
        self.attention = nn.Linear(hidden_dim, 1)  # Слой внимания
        self.fc = nn.Linear(hidden_dim, num_class)  # Полносвязный слой для классификации

        
    def forward(self, x):
        batch_size = x.size(0)
        
        x = self.embedding(x)  # Эмбеддинг
        rnn_out, (hidden, cell) = self.rnn(x)  # Проходим через LSTM
        # Применяем внимание
        attention_scores = torch.tanh(self.attention(rnn_out))        # [batch_size, seq_len, 1]  
        attention_weights = torch.softmax(attention_scores, dim=1)    # [batch_size, seq_len, 1]
        
        # Получаем контекстный вектор, взвешивая выходы RNN
        context_vector = torch.sum(attention_weights * rnn_out, dim=1)  # [batch_size, hidden_dim]
        
        # Применяем полносвязный слой для классификации
        output = self.fc(context_vector)  # [batch_size, num_class]
        
        return output


In [None]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, collate_fn=padify, shuffle=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device


# Загрузка модели (определите вашу архитектуру перед загрузкой)
# model = RNNClassifierWithAttention(vocab_size, 64, 32, 4).to(device)  
model = RNNClassifierWithAttention(vocab_size, 128, 256, 4).to(device)  # vocab_size, embedding_dim, hidden_dim, num_class

# old_state_dict = torch.load("models/rnn_attention_TORCH_91_57.pth")

old_state_dict = torch.load("models/final/RNN + Attetion_final.pth")
# Удаляем веса эмбеддингов, так как они несовместимы
del old_state_dict['embedding.weight']

# Загружаем остальные веса
model.load_state_dict(old_state_dict, strict=False)



optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

num_epochs = 4

for epoch in range(num_epochs):
    correct = 0
    total = 0
    epoch_loss = 0.0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False)

    for batch in progress_bar:
        labels, text = batch
        text, labels = text.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(text)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()


        # Расчет accuracy
        _, predicted = torch.max(outputs, 1)  # Получаем предсказанные классы
        correct += (predicted == labels).sum().item()
        total += labels.size(0)
    
        acc = correct / total  # Текущая точность

        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=epoch_loss / (progress_bar.n + 1), accuracy=acc * 100)
    final_acc = correct / total * 100
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss / len(train_loader):.4f}, acc: {final_acc}")


  old_state_dict = torch.load("models/final/RNN + Attetion_final.pth")
                                                                                    

Epoch 1/4, Loss: 1.1792, acc: 53.03186022610483


                                                                                     

Epoch 2/4, Loss: 0.4636, acc: 83.96711202466598


                                                                                     

Epoch 3/4, Loss: 0.2055, acc: 93.83350462487154


                                                                                     

Epoch 4/4, Loss: 0.1054, acc: 97.73895169578623




In [18]:

test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, collate_fn=padify, shuffle=True)

model.eval()  # Устанавливаем модель в режим оценки
model.to(device)

classes = ['World', 'Sports', 'Business', 'Sci/Tech']

# Получаем прогнозы и истинные метки
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        labels, text = batch
        text, labels = text.to(device), labels.to(device)

        # Получаем выходы модели
        outputs = model(text)
        predicted_classes = torch.argmax(outputs, dim=1)

        all_preds.extend(predicted_classes.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Создаем отчет классификации
report = classification_report(all_labels, all_preds, target_names=classes)
print(report)


              precision    recall  f1-score   support

       World       1.00      0.60      0.75        70
      Sports       0.62      0.98      0.76        59
    Business       0.85      0.77      0.81        44
    Sci/Tech       0.90      0.86      0.88        71

    accuracy                           0.80       244
   macro avg       0.84      0.80      0.80       244
weighted avg       0.85      0.80      0.80       244



# epoch = 6, batch_size=16

              precision    recall  f1-score   support

       World       0.84      0.87      0.85        70
      Sports       0.85      0.95      0.90        59
    Business       0.85      0.91      0.88        44
    Sci/Tech       0.95      0.77      0.85        71

    accuracy                           0.87       244
   macro avg       0.87      0.88      0.87       244
weighted avg       0.87      0.87      0.87       244

              precision    recall  f1-score   support

       World       0.92      0.80      0.85        70
      Sports       0.90      0.92      0.91        59
    Business       0.84      0.84      0.84        44
    Sci/Tech       0.86      0.96      0.91        71

    accuracy                           0.88       244
   macro avg       0.88      0.88      0.88       244
weighted avg       0.88      0.88      0.88       244

              precision    recall  f1-score   support

       World       0.97      0.84      0.90        70
      Sports       0.95      0.98      0.97        59
    Business       0.85      0.89      0.87        44
    Sci/Tech       0.87      0.93      0.90        71

    accuracy                           0.91       244
   macro avg       0.91      0.91      0.91       244
weighted avg       0.91      0.91      0.91       244

              precision    recall  f1-score   support

       World       0.97      0.86      0.91        70
      Sports       0.98      0.98      0.98        59
    Business       0.88      0.98      0.92        44
    Sci/Tech       0.89      0.93      0.91        71

    accuracy                           0.93       244
   macro avg       0.93      0.94      0.93       244
weighted avg       0.93      0.93      0.93       244

batch_size = 32\  
epoch = 4\
model = RNNClassifierWithAttention(vocab_size, 128, 256, 4).to(device) \
old_state_dict = torch.load("models/final/RNN + Attetion_final.pth")


              precision    recall  f1-score   support

       World       0.97      0.86      0.91        70
      Sports       0.97      0.98      0.97        59
    Business       0.88      0.98      0.92        44
    Sci/Tech       0.90      0.93      0.92        71

    accuracy                           0.93       244
    macro avg       0.93      0.94     0.93       244
    weighted avg    0.93      0.93     0.93       244