In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

In [None]:
def _read_split(tsv_path):
    """Read one tab-separated split file into a DataFrame with columns ``text`` and ``intent``."""
    return pd.read_csv(tsv_path, delimiter='\t', encoding="utf-8", names=['text', 'intent'])


# Both dataset files go through the same reader so their columns match.
train_data = _read_split('../input/qa-intents-dataset-university-domain/dataset_train.tsv')
test_data = _read_split('../input/qa-intents-dataset-university-domain/dataset_test.tsv')
train_data.head()

In [None]:
# Stack the two provided files row-wise; the notebook draws its own split from this pool later.
full_data = pd.concat((train_data, test_data), axis=0)
full_data.head()

In [None]:
# Summarise the pooled frame (column dtypes, non-null counts) as a quick sanity check.
full_data.info()

In [None]:
from sklearn.model_selection import train_test_split
# Re-split the pooled data 80/20; random_state is fixed so the same rows land in each part on re-run.
train, test = train_test_split(full_data, test_size=0.2, random_state=42)

In [None]:
# Share of each intent in the training part, shown as percentages.
unique_values_normalized = train['intent'].value_counts(normalize=True)
print(unique_values_normalized.mul(100))

In [None]:
# Share of each intent in the held-out part, shown as percentages.
unique_values_normalized2 = test['intent'].value_counts(normalize=True)
print(unique_values_normalized2.mul(100))

In [None]:
# Summarise the held-out frame (row count, column dtypes, non-null counts).
test.info()

In [None]:
from transformers import BertTokenizer

In [None]:
# Создание токенизатора BERT
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
from transformers import AutoTokenizer, AutoModel
# Load the tokenizer of the cointegrated/rubert-tiny2 checkpoint (the same checkpoint name the classifier is loaded from).
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")


In [None]:
# Tokenise every training utterance in one call: special tokens added, padded to a
# common length, truncated at 64 tokens, returned as PyTorch tensors.
# Note: `input_ids` is reassigned by later cells.
input_texts = train['text'].tolist()
encoded_train_texts = tokenizer.batch_encode_plus(
    input_texts,
    add_special_tokens=True,
    padding=True,
    truncation=True,
    max_length=64,
    return_tensors='pt',
)
input_ids = encoded_train_texts['input_ids']

In [None]:
# Encode intent names as integer class ids.
# Ids follow order of first appearance: every training intent first, then any intent that
# occurs only in the held-out part. Building the vocabulary from the training part alone
# raised KeyError below whenever the random split put a rare intent exclusively in `test`.
unique_intents = pd.concat([train['intent'], test['intent']]).unique().tolist()
intent_mapping = {intent: i for i, intent in enumerate(unique_intents)}
train_labels = [intent_mapping[intent] for intent in train['intent']]
test_labels = [intent_mapping[intent] for intent in test['intent']]

In [None]:
import torch
import torch.nn as nn
from transformers import BertForSequenceClassification, AdamW

In [None]:
# Загрузка предобученной модели BERT
#model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(train_data['intent'].unique()))

#ruberttiny
# model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")

# Sequence-classification model initialised from rubert-tiny2, with one output class per entry
# of `unique_intents`.
model = BertForSequenceClassification.from_pretrained("cointegrated/rubert-tiny2", num_labels=len(unique_intents))


# Optimizer. transformers.AdamW is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are spelled out because torch's defaults (1e-8, 1e-2) differ from the
# transformers defaults (1e-6, 0.0) that the previous call relied on.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, eps=1e-6, weight_decay=0.0)

# Use the GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# # Преобразование текста в токены и создание маски внимания
# input_ids = tokenizer.encode(train_data['text'].values, add_special_tokens=True, padding=True, truncation=True, max_length=64, return_tensors='pt')
# #labels = torch.tensor(train_data['intent'].values)

# Tokenise the training texts and keep both the token ids and the attention mask.
# Note: `input_ids` / `attention_mask` are reassigned per batch by the training loop.
train_text_list = train['text'].tolist()
encoding = tokenizer.batch_encode_plus(
    train_text_list,
    add_special_tokens=True,
    padding=True,
    truncation=True,
    max_length=64,
    return_tensors='pt',
)
input_ids, attention_mask = encoding['input_ids'], encoding['attention_mask']

In [None]:
import matplotlib.pyplot as plt

def plot_loss(train_losses, val_losses, n_epoches):
    """Plot training and validation loss against the epoch number.

    Args:
        train_losses: sequence of per-epoch training losses, length ``n_epoches``.
        val_losses: sequence of per-epoch validation losses, length ``n_epoches``.
        n_epoches: number of epochs; the x axis runs from 1 to ``n_epoches`` inclusive.
    """
    epoch_axis = range(1, n_epoches + 1)
    for loss_series, legend_label in ((train_losses, 'Train Loss'), (val_losses, 'Val Loss')):
        plt.plot(epoch_axis, loss_series, label=legend_label)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

In [None]:
from torch.utils.data import Dataset, DataLoader

class IntentClassificationDataset(Dataset):
    """Dataset that tokenises one utterance per lookup.

    Args:
        texts: indexable collection of utterances; each item is passed through ``str()``.
        labels: indexable collection of class ids aligned with ``texts``; each is
            converted to a ``torch.long`` tensor on access.
        tokenizer: object exposing ``encode_plus`` (here a Hugging Face tokenizer).
        max_len: ``max_length`` handed to the tokenizer together with
            ``padding='max_length'`` and ``truncation=True``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of utterances in the dataset."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return a dict with the raw text, flattened token ids, attention mask and label."""
        text = str(self.texts[item])
        label = self.labels[item]

        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tokenizer output is flattened so each field is 1-D per sample.
        sample = {'text': text}
        for field in ('input_ids', 'attention_mask'):
            sample[field] = encoded[field].flatten()
        sample['labels'] = torch.tensor(label, dtype=torch.long)
        return sample

# DataLoader construction
def create_data_loader(texts, labels, tokenizer, max_len, batch_size, shuffle=False):
    """Wrap texts and labels in an IntentClassificationDataset and return a DataLoader over it.

    Args:
        texts: list of utterances.
        labels: list of class ids aligned with ``texts``.
        tokenizer: tokenizer forwarded to the dataset.
        max_len: token length forwarded to the dataset.
        batch_size: number of samples per batch.
        shuffle: when True the loader reshuffles the samples at every epoch.
            Defaults to False, which keeps the previous fixed-order behaviour.

    Returns:
        A ``torch.utils.data.DataLoader`` yielding the dataset's dict samples in batches.
    """
    ds = IntentClassificationDataset(
        texts=texts,
        labels=labels,
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle
    )

# Loaders used by the training and evaluation cells
batch_size = 8 # 16
max_len = 64
# Training batches are reshuffled each epoch so the model does not see the same batch
# composition and order on every pass; the evaluation loader keeps a fixed order so its
# predictions stay aligned with `test_labels`.
train_data_loader = create_data_loader(train['text'].values.tolist(), train_labels, tokenizer, max_len, batch_size, shuffle=True)
test_data_loader = create_data_loader(test['text'].values.tolist(), test_labels, tokenizer, max_len, batch_size)

In [None]:
# print(next(iter(train_data_loader)))

In [None]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score, recall_score, f1_score
from sklearn.metrics import precision_recall_fscore_support, balanced_accuracy_score

## Train BERT

In [None]:
# # Количество эпох
# epochs = 5

# # Список для хранения значений потерь на каждой эпохе
# loss_values = []
# best_loss = float('inf')
# best_acc = 0.8

# for epoch in range(epochs):
#     model.train()
#     total_loss = 0

#     # Используйте tqdm для отображения прогресса
#     for batch in tqdm(train_data_loader, desc=f"Epoch {epoch+1}"):
#         optimizer.zero_grad()
#         input_ids = batch['input_ids']
#         attention_mask = batch['attention_mask']
#         labels = batch['labels']

#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#         loss = outputs.loss
#         total_loss += loss.item()
#         loss.backward()
#         optimizer.step()

#     avg_train_loss = total_loss / len(train_data_loader)
#     loss_values.append(avg_train_loss)

#     print(f"Epoch: {epoch+1}, Train Loss: {avg_train_loss:.4f}")

#     model.eval()
#     predictions , true_labels = [], []

#     for batch in tqdm(test_data_loader, desc=f"Validation Epoch {epoch+1}"):
#         with torch.no_grad():
#             input_ids = batch['input_ids']
#             attention_mask = batch['attention_mask']
#             labels = batch['labels']

#             outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#             logits = outputs.logits
#             logits = logits.detach().cpu().numpy()
#             label_ids = labels.to('cpu').numpy()
            
#             print(logits.shape)
#             predictions.extend(np.argmax(logits, axis=1).flatten())
#             true_labels.extend(label_ids.flatten())

#     acc = accuracy_score(true_labels, predictions)
#     rec = recall_score(true_labels, predictions, average='weighted')
#     f1 = f1_score(true_labels, predictions, average='weighted')

#     print(f"Accuracy: {acc:.4f}, Recall: {rec:.4f}, F1-score: {f1:.4f}")

#     # Ранняя остановка по потерям на валидации
#     if avg_train_loss > best_loss:
#         print("Early stopping due to increase in validation loss")
#         break

#     # Ранняя остановка по точности на валидации
#     if acc >= best_acc:
#         print("Early stopping due to reaching target accuracy")
#         break

#     best_loss = avg_train_loss

# plot_loss(loss_values)

In [None]:
from transformers import BertForSequenceClassification
import torch
import numpy as np
from tqdm import tqdm
from sklearn.metrics import balanced_accuracy_score, precision_recall_fscore_support

# tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
# model = BertForSequenceClassification.from_pretrained("cointegrated/rubert-tiny2")

# Number of full passes over the training loader
epochs = 10

# Per-epoch mean losses, plotted once training has finished
train_losses, val_losses = [], []

# `loss_values` receives the same per-epoch training loss as `train_losses`;
# `best_loss` and `best_acc` are assigned but never read in this cell.
loss_values = []
best_loss = float('inf')
best_acc = 0.98

for epoch in range(epochs):
    # ---- optimisation pass over the training batches ----
    model.train()
    train_loss = 0
    val_loss = 0
    for batch in tqdm(train_data_loader, desc=f"Epoch {epoch+1}"):
        optimizer.zero_grad()
        # Move the batch tensors onto the same device as the model
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # With labels supplied, the model output carries the loss next to the logits
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_data_loader)
    loss_values.append(avg_train_loss)

    print(f"Epoch: {epoch+1}, Train Loss: {avg_train_loss:.4f}")

    # ---- evaluation pass over the held-out batches (no gradients) ----
    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in tqdm(test_data_loader, desc=f"Validation Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()
            logits = outputs.logits.detach().cpu().numpy()
            label_ids = labels.cpu().numpy()

            # Predicted class = index of the largest logit in each row
            predictions.extend(logits.argmax(axis=1).flatten())
            true_labels.extend(label_ids.flatten())

    avg_val_loss = val_loss / len(test_data_loader)
    print("Balanced acc:", balanced_accuracy_score(true_labels, predictions))
    print("Weighted precision, recall, fscore:", precision_recall_fscore_support(true_labels, predictions, average='weighted'))

    best_loss = avg_train_loss

    train_losses.append(avg_train_loss)
    val_losses.append(avg_val_loss)

# Loss curves. Note: this definition replaces the `plot_loss` defined in an earlier cell.
def plot_loss(train_losses, val_losses, n_epochs):
    """Plot training (blue) and validation (red) loss for epochs 1..n_epochs.

    Args:
        train_losses: sequence of per-epoch training losses, length ``n_epochs``.
        val_losses: sequence of per-epoch validation losses, length ``n_epochs``.
        n_epochs: number of epochs to lay out on the x axis.
    """
    import matplotlib.pyplot as plt

    epoch_axis = range(1, n_epochs + 1)
    curves = (
        (train_losses, 'b', 'Training loss'),
        (val_losses, 'r', 'Validation loss'),
    )
    for loss_series, line_format, legend_label in curves:
        plt.plot(epoch_axis, loss_series, line_format, label=legend_label)
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

plot_loss(train_losses, val_losses, epochs)


In [None]:
# from transformers import BertForSequenceClassification

# # tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
# # model = BertForSequenceClassification.from_pretrained("cointegrated/rubert-tiny2")

# # Количество эпох
# epochs = 7

# train_losses = []
# val_losses = []

# # Список для хранения значений потерь на каждой эпохе
# loss_values = []
# best_loss = float('inf')
# best_acc = 0.98

# for epoch in range(epochs):
#     model.train()
# #     total_loss = 0
#     train_loss = 0
#     val_loss = 0
#     # Используйте tqdm для отображения прогресса
#     for batch in tqdm(train_data_loader, desc=f"Epoch {epoch+1}"):
#         optimizer.zero_grad()
# #         input_ids = batch['input_ids']
# #         attention_mask = batch['attention_mask']
# #         labels = batch['labels']
#         # Move tensors to device
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         labels = batch['labels'].to(device)


#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#         loss = outputs.loss
#         # total_loss += loss.item()
#         train_loss += loss.item()
#         loss.backward()
#         optimizer.step()

#     avg_train_loss = total_loss / len(train_data_loader)
#     loss_values.append(avg_train_loss)

#     print(f"Epoch: {epoch+1}, Train Loss: {avg_train_loss:.4f}")

#     model.eval()
#     predictions , true_labels = [], []

#     for batch in tqdm(test_data_loader, desc=f"Validation Epoch {epoch+1}"):
#         with torch.no_grad():
# #             input_ids = batch['input_ids']
# #             attention_mask = batch['attention_mask']
# #             labels = batch['labels']
#             # Move tensors to device
#             input_ids = batch['input_ids'].to(device)
#             attention_mask = batch['attention_mask'].to(device)
#             labels = batch['labels'].to(device)


#             outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#             logits = outputs.logits
#             logits = logits.detach().cpu().numpy()
#             label_ids = labels.to('cpu').numpy()
            
#             val_loss += loss.item()
            
#             # print(logits.shape)
#             predictions.extend(np.argmax(logits, axis=1).flatten())
#             true_labels.extend(label_ids.flatten())

# #     acc = accuracy_score(true_labels, predictions)
# #     rec = recall_score(true_labels, predictions, average='weighted')
# #     f1 = f1_score(true_labels, predictions, average='weighted')

# #     print(f"Accuracy: {acc:.4f}, Recall: {rec:.4f}, F1-score: {f1:.4f}")
#     print("Balanced acc:", balanced_accuracy_score(true_labels, predictions))
#     print("Weighted precision, recall, fscore:", precision_recall_fscore_support(true_labels, predictions, average='weighted'))

# #     # Ранняя остановка по потерям на валидации
# #     if avg_train_loss > best_loss:
# #         print("Early stopping due to increase in validation loss")
# #         break

# #     # Ранняя остановка по точности на валидации
# #     if acc >= best_acc:
# #         print("Early stopping due to reaching target accuracy")
# #         break

#     best_loss = avg_train_loss
    
#     train_loss /= len(train_loader)
#     val_loss /= len(test_loader)
#     train_losses.append(train_loss)
#     val_losses.append(val_loss)


# plot_loss(val_loss, train_loss, n_epoches)


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import timeit

In [None]:
# Predict on the held-out data and time the whole pass
model.eval()
predictions = []
start_test = timeit.default_timer()
with torch.no_grad():
    for batch in test_data_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.max(outputs.logits, dim=1).indices

        # Copy each batch of predictions to the host inside the timed region. CUDA work is
        # queued asynchronously, so leaving the results on the device until after the timer
        # stops could under-report the time; the copy waits for the batch to finish.
        predictions.extend(preds.cpu().tolist())
end_test = timeit.default_timer()
predicted_labels = list(predictions)

# Total number of predicted answers
num_predictions = len(predictions)

# Mean time per answer; NaN instead of ZeroDivisionError when the loader yielded nothing
average_time_per_response = (end_test - start_test) / num_predictions if num_predictions else float('nan')

print(f'Time for testing: {end_test - start_test:.4f} seconds')
print(f'Average time per response: {average_time_per_response:.6f} seconds')

In [None]:
# print(balanced_accuracy_score(test_labels, predicted_labels))

In [None]:
# Metrics of the final model on the held-out part: balanced accuracy first, then the
# support-weighted (precision, recall, f-score, support) tuple.
final_balanced_accuracy = balanced_accuracy_score(test_labels, predicted_labels)
print(final_balanced_accuracy)
final_weighted_scores = precision_recall_fscore_support(test_labels, predicted_labels, average='weighted')
print(final_weighted_scores)

In [None]:
# Build the confusion matrix for the held-out part (true labels passed first, predictions second)
# and print it as plain text.
cm = confusion_matrix(test_labels, predicted_labels)
print(f'Confusion Matrix: \n{cm}')

In [None]:
# # Визуализация матрицы ошибок с использованием seaborn
# plt.figure(figsize=(20, 20))
# sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=set(test_labels), yticklabels=set(predicted_labels))
# plt.xlabel('Предсказанный класс')
# plt.ylabel('Истинный класс')
# plt.title('Матрица ошибок')
# plt.show()
