In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

/kaggle/input/qa-intents-dataset-university-domain/dataset_test.tsv
/kaggle/input/qa-intents-dataset-university-domain/labels_description.txt
/kaggle/input/qa-intents-dataset-university-domain/dataset_train.tsv


In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support, balanced_accuracy_score
from transformers import BertTokenizer

In [3]:
import inspect

# Load the data. Column names are passed explicitly, so pandas reads the first
# row of each TSV as data rather than as a header.
train_data = pd.read_csv('/kaggle/input/qa-intents-dataset-university-domain/dataset_train.tsv', delimiter='\t', encoding="utf-8", names=['text', 'intent'])
test_data = pd.read_csv('/kaggle/input/qa-intents-dataset-university-domain/dataset_test.tsv', delimiter='\t', encoding="utf-8", names=['text', 'intent'])
# ignore_index gives the merged frame a unique 0..n-1 index instead of two
# overlapping ranges; the rows selected by the split below are unaffected.
full_data = pd.concat([train_data, test_data], ignore_index=True)

# Re-split the merged data into train and test sets (80/20, fixed seed).
train, test = train_test_split(full_data, test_size=0.2, random_state=42)

# Print the normalised class distribution (disabled)
# unique_values_normalized = train['intent'].value_counts(normalize=True)
# print(unique_values_normalized * 100)

# Initialise the BERT tokenizer.
# NOTE(review): 'bert-base-uncased' ships an English vocabulary; if the texts
# are not English, a multilingual checkpoint may tokenize them better - confirm.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def bert_token_ids(text):
    """Return the BERT vocabulary ids for ``text``.

    Special tokens are included and the sequence is truncated to 512 ids.
    The ids are used as the "words" that TfidfVectorizer counts.
    """
    return tokenizer.encode(text, add_special_tokens=True, max_length=512, truncation=True)


# scikit-learn 1.2 renamed AdaBoostClassifier's `base_estimator` argument to
# `estimator`, and 1.4 removed the old name. Use whichever keyword the
# installed version accepts so the cell runs on both old and new releases.
ada_params = inspect.signature(AdaBoostClassifier).parameters
weak_learner_kw = 'estimator' if 'estimator' in ada_params else 'base_estimator'

# Build the pipeline: TF-IDF over BERT token ids, then boosted decision trees.
pipeline = Pipeline([
    # token_pattern=None: the default pattern is unused when a custom tokenizer
    # is supplied, and leaving it set makes scikit-learn emit a warning.
    ('tfidf', TfidfVectorizer(tokenizer=bert_token_ids, token_pattern=None)),
    ('AdaBoost', AdaBoostClassifier(
        n_estimators=50,
        learning_rate=1.0,
        random_state=42,
        **{weak_learner_kw: DecisionTreeClassifier(max_depth=50)},
    )),
])

# Encode the intent labels as integers. The encoder is fitted on every label in
# the merged data: the split is not stratified, so an intent may occur only in
# the test part, and an encoder fitted on the train part alone would then raise
# ValueError in transform().
le = LabelEncoder().fit(full_data['intent'])
train_labels = le.transform(train['intent'])
test_labels = le.transform(test['intent'])

# Train the pipeline.
pipeline.fit(train['text'], train_labels)

# Predict on the held-out part.
predictions = pipeline.predict(test['text'])

# Report the quality metrics. zero_division=0 yields the same values as the
# default but without the undefined-metric warning for never-predicted classes.
print(balanced_accuracy_score(test_labels, predictions))
print(precision_recall_fscore_support(test_labels, predictions, average='weighted', zero_division=0))


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



0.9110938847325247
(0.9274658775930188, 0.926673751328374, 0.9254661476284773, None)


  _warn_prf(average, modifier, msg_start, len(result))


Вместо TF-IDF используем предобученные эмбеддинги DistilBERT (`distilbert-base-multilingual-cased`) для представления текста (ячейки ниже закомментированы и не выполняются):

In [4]:
# import numpy as np
# import pandas as pd
# import torch
# from torch.utils.data import Dataset, DataLoader
# from transformers import DistilBertTokenizer, DistilBertModel
# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import precision_recall_fscore_support, balanced_accuracy_score
# from sklearn.preprocessing import LabelEncoder
# from sklearn.model_selection import train_test_split
# from tqdm import tqdm

In [5]:
# This cell is commented out and does not run.
# # Load the data
# train_data = pd.read_csv('/kaggle/input/qa-intents-dataset-university-domain/dataset_train.tsv', delimiter='\t', encoding="utf-8", names=['text', 'intent'])
# test_data = pd.read_csv('/kaggle/input/qa-intents-dataset-university-domain/dataset_test.tsv', delimiter='\t', encoding="utf-8", names=['text', 'intent'])
# full_data = pd.concat([train_data, test_data])

# # Split into training and test sets
# train, test = train_test_split(full_data, test_size=0.2, random_state=42)

# # Initialise the DistilBERT tokenizer and model
# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
# distilbert_model = DistilBertModel.from_pretrained('distilbert-base-multilingual-cased')
# distilbert_model = distilbert_model.eval()  # Put DistilBERT in evaluation mode (no training)

In [6]:
# Commented-out torch Dataset (does not run). Each item tokenizes one text with
# the multilingual DistilBERT tokenizer, padded/truncated to 512 tokens, and
# returns the input ids, attention mask and label as long tensors.
# class TextDataset(Dataset):
#     def __init__(self, texts, labels):
#         self.texts = texts
#         self.labels = labels
#         self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
#         self.max_len = 512

#     def __len__(self):
#         return len(self.texts)

#     def __getitem__(self, idx):
#         text = self.texts[idx]
#         inputs = self.tokenizer.encode_plus(
#             text,
#             None,
#             add_special_tokens=True,
#             max_length=self.max_len,
#             padding='max_length',
#             return_token_type_ids=True,
#             return_attention_mask=True,
#             truncation=True
#         )
#         input_ids = inputs['input_ids']
#         attention_mask = inputs['attention_mask']

#         return {
#             'input_ids': torch.tensor(input_ids, dtype=torch.long),
#             'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
#             'label': torch.tensor(self.labels[idx], dtype=torch.long)
#         }

In [7]:
# Commented-out helper (does not run). It feeds the texts through the global
# distilbert_model in batches of 4 under torch.no_grad() and returns the
# first-token hidden state of every text, concatenated into one NumPy array.
# def extract_distilbert_embeddings(texts):
#     dataset = TextDataset(texts, [0] * len(texts))  # fake labels
#     loader = DataLoader(dataset, batch_size=4)

#     embeddings = []
#     for batch in tqdm(loader, desc="Extracting DistilBERT embeddings"):
#         input_ids = batch['input_ids']
#         attention_mask = batch['attention_mask']
#         with torch.no_grad():
#             outputs = distilbert_model(input_ids, attention_mask=attention_mask)
#         cls_embeddings = outputs.last_hidden_state[:, 0, :].numpy()
#         embeddings.append(cls_embeddings)

#     return np.concatenate(embeddings, axis=0)

In [8]:
# This cell is commented out and does not run.
# # Encode the intent labels as integers
# le = LabelEncoder()
# train_labels = le.fit_transform(train['intent'])
# test_labels = le.transform(test['intent'])

In [9]:
# This cell is commented out and does not run.
# # Extract the DistilBERT embeddings
# print("Extracting embeddings for training data...")
# train_embeddings = extract_distilbert_embeddings(train['text'].tolist())
# print("Extracting embeddings for test data...")
# test_embeddings = extract_distilbert_embeddings(test['text'].tolist())

In [10]:
# This cell is commented out and does not run.
# NOTE(review): `base_estimator` was renamed to `estimator` in scikit-learn 1.2;
# adjust the keyword before re-enabling this cell on a newer version.
# # Build and train the model
# model = AdaBoostClassifier(base_estimator=RandomForestClassifier(n_estimators=50, max_depth=10, random_state=42), n_estimators=50, learning_rate=1.0, random_state=42)
# model.fit(train_embeddings, train_labels)

In [11]:
# This cell is commented out and does not run.
# # Predict on the test set
# predictions = model.predict(test_embeddings)

# # Print the model quality metrics
# print(balanced_accuracy_score(test_labels, predictions))
# print(precision_recall_fscore_support(test_labels, predictions, average='weighted'))