In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/qa-intents-dataset-university-domain/dataset_test.tsv
/kaggle/input/qa-intents-dataset-university-domain/labels_description.txt
/kaggle/input/qa-intents-dataset-university-domain/dataset_train.tsv


In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support, balanced_accuracy_score
from transformers import BertTokenizer
import timeit

In [3]:
# Загрузка данных
train_data = pd.read_csv('/kaggle/input/qa-intents-dataset-university-domain/dataset_train.tsv', delimiter='\t', encoding="utf-8", names=['text', 'intent'])
test_data = pd.read_csv('/kaggle/input/qa-intents-dataset-university-domain/dataset_test.tsv', delimiter='\t', encoding="utf-8", names=['text', 'intent'])
full_data = pd.concat([train_data, test_data])

# Разделение на обучающий и тестовый наборы данных
train, test = train_test_split(full_data, test_size=0.2, random_state=42)

# Вывод нормализованных значений классов
# unique_values_normalized = train['intent'].value_counts(normalize=True)
# print(unique_values_normalized * 100)

# Инициализация токенизатора BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Создание пайплайна
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512, truncation=True))),
    ('AdaBoost', AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=50), n_estimators=50, learning_rate=1.0, random_state=42)) 
])

# Преобразование меток в числовые значения
le = LabelEncoder()
train_labels = le.fit_transform(train['intent'])
test_labels = le.transform(test['intent'])

# Обучение пайплайна
pipeline.fit(train['text'], train_labels)

start_test = timeit.default_timer()
# Прогноз на тестовом наборе
predictions = pipeline.predict(test['text'])
end_test = timeit.default_timer()


# Общее количество предсказанных ответов
num_predictions = len(predictions)

# Среднее время на один ответ
average_time_per_response = (end_test - start_test) / num_predictions

print(f'Time for testing: {end_test - start_test:.4f} seconds')
print(f'Average time per response: {average_time_per_response:.6f} seconds')
# Вывод метрик оценки качества модели
print(balanced_accuracy_score(test_labels, predictions))
print(precision_recall_fscore_support(test_labels, predictions, average='weighted'))


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



Time for testing: 1.4710 seconds
Average time per response: 0.000521 seconds
0.9110938847325247
(0.9274658775930188, 0.926673751328374, 0.9254661476284773, None)


  _warn_prf(average, modifier, msg_start, len(result))


In [4]:
# Перевод меток обратно в исходные интенты
predicted_intents = le.inverse_transform(predictions)

Вместо TF-IDF используем предобученные эмбеддинги BERT для представления текста:

In [5]:
# import numpy as np
# import pandas as pd
# import torch
# from torch.utils.data import Dataset, DataLoader
# from transformers import DistilBertTokenizer, DistilBertModel
# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import precision_recall_fscore_support, balanced_accuracy_score
# from sklearn.preprocessing import LabelEncoder
# from sklearn.model_selection import train_test_split
# from tqdm import tqdm

In [6]:
# # Загрузка данных
# train_data = pd.read_csv('/kaggle/input/qa-intents-dataset-university-domain/dataset_train.tsv', delimiter='\t', encoding="utf-8", names=['text', 'intent'])
# test_data = pd.read_csv('/kaggle/input/qa-intents-dataset-university-domain/dataset_test.tsv', delimiter='\t', encoding="utf-8", names=['text', 'intent'])
# full_data = pd.concat([train_data, test_data])

# # Разделение на обучающий и тестовый наборы данных
# train, test = train_test_split(full_data, test_size=0.2, random_state=42)

# # Инициализация токенизатора и модели DistilBERT
# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
# distilbert_model = DistilBertModel.from_pretrained('distilbert-base-multilingual-cased')
# distilbert_model = distilbert_model.eval()  # Выключаем обучение DistilBERT

In [7]:
# class TextDataset(Dataset):
#     def __init__(self, texts, labels):
#         self.texts = texts
#         self.labels = labels
#         self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
#         self.max_len = 512

#     def __len__(self):
#         return len(self.texts)

#     def __getitem__(self, idx):
#         text = self.texts[idx]
#         inputs = self.tokenizer.encode_plus(
#             text,
#             None,
#             add_special_tokens=True,
#             max_length=self.max_len,
#             padding='max_length',
#             return_token_type_ids=True,
#             return_attention_mask=True,
#             truncation=True
#         )
#         input_ids = inputs['input_ids']
#         attention_mask = inputs['attention_mask']

#         return {
#             'input_ids': torch.tensor(input_ids, dtype=torch.long),
#             'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
#             'label': torch.tensor(self.labels[idx], dtype=torch.long)
#         }

In [8]:
# def extract_distilbert_embeddings(texts):
#     dataset = TextDataset(texts, [0] * len(texts))  # fake labels
#     loader = DataLoader(dataset, batch_size=4)

#     embeddings = []
#     for batch in tqdm(loader, desc="Extracting DistilBERT embeddings"):
#         input_ids = batch['input_ids']
#         attention_mask = batch['attention_mask']
#         with torch.no_grad():
#             outputs = distilbert_model(input_ids, attention_mask=attention_mask)
#         cls_embeddings = outputs.last_hidden_state[:, 0, :].numpy()
#         embeddings.append(cls_embeddings)

#     return np.concatenate(embeddings, axis=0)

In [9]:
# # Преобразование меток в числовые значения
# le = LabelEncoder()
# train_labels = le.fit_transform(train['intent'])
# test_labels = le.transform(test['intent'])

In [10]:
# # Извлечение эмбеддингов DistilBERT
# print("Extracting embeddings for training data...")
# train_embeddings = extract_distilbert_embeddings(train['text'].tolist())
# print("Extracting embeddings for test data...")
# test_embeddings = extract_distilbert_embeddings(test['text'].tolist())

In [11]:
# # Создание и обучение модели
# model = AdaBoostClassifier(base_estimator=RandomForestClassifier(n_estimators=50, max_depth=10, random_state=42), n_estimators=50, learning_rate=1.0, random_state=42)
# model.fit(train_embeddings, train_labels)

In [12]:
# # Прогноз на тестовом наборе
# predictions = model.predict(test_embeddings)

# # Вывод метрик оценки качества модели
# print(balanced_accuracy_score(test_labels, predictions))
# print(precision_recall_fscore_support(test_labels, predictions, average='weighted'))

M-USE

In [13]:
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support, balanced_accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel

In [14]:
# Загрузка данных
train_data = pd.read_csv('/kaggle/input/qa-intents-dataset-university-domain/dataset_train.tsv', delimiter='\t', encoding="utf-8", names=['text', 'intent'])
test_data = pd.read_csv('/kaggle/input/qa-intents-dataset-university-domain/dataset_test.tsv', delimiter='\t', encoding="utf-8", names=['text', 'intent'])
full_data = pd.concat([train_data, test_data])

# Разделение на обучающий и тестовый наборы данных
train, test = train_test_split(full_data, test_size=0.2, random_state=42)

# Загрузка модели и токенизатора
tokenizer = AutoTokenizer.from_pretrained("sadakmed/distiluse-base-multilingual-cased-v2")
model = AutoModel.from_pretrained("sadakmed/distiluse-base-multilingual-cased-v2")

# Токенизация и кодирование текстовых данных
train_encodings = tokenizer(train['text'].tolist(), truncation=True, padding=True, return_tensors="pt")
test_encodings = tokenizer(test['text'].tolist(), truncation=True, padding=True, return_tensors="pt")

# Извлечение эмбеддингов
with torch.no_grad():
    train_embeddings = model(**train_encodings).last_hidden_state.mean(dim=1)
    test_embeddings = model(**test_encodings).last_hidden_state.mean(dim=1)

# Преобразование меток в числовые значения
le = LabelEncoder()
train_labels = le.fit_transform(train['intent'])
test_labels = le.transform(test['intent'])

# Определение собственного Dataset для PyTorch
class TextDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.labels[idx]

# Создание DataLoader-ов
train_dataset = TextDataset(train_embeddings, torch.tensor(train_labels))
test_dataset = TextDataset(test_embeddings, torch.tensor(test_labels))

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# Создание и обучение модели
model = AdaBoostClassifier(base_estimator=RandomForestClassifier(n_estimators=50, max_depth=10, random_state=42), n_estimators=50, learning_rate=1.0, random_state=42)
model.fit(train_embeddings.numpy(), train_labels)


tokenizer_config.json:   0%|          | 0.00/584 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/539M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [15]:
start_test = timeit.default_timer()
# Прогноз на тестовом наборе
predictions = model.predict(test_embeddings.numpy())
end_test = timeit.default_timer()

# Перевод меток обратно в исходные интенты
predicted_intents = le.inverse_transform(predictions)

# Общее количество предсказанных ответов
num_predictions = len(predictions)

# Среднее время на один ответ
average_time_per_response = (end_test - start_test) / num_predictions

print(f'Time for testing: {end_test - start_test:.4f} seconds')
print(f'Average time per response: {average_time_per_response:.6f} seconds')
# Вывод метрик оценки качества модели
print(balanced_accuracy_score(test_labels, predictions))
print(precision_recall_fscore_support(test_labels, predictions, average='weighted'))

Time for testing: 3.8652 seconds
Average time per response: 0.001369 seconds
0.9135211413268872
(0.9361350074207471, 0.9341126461211477, 0.933089947627043, None)


  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# import numpy as np
# import pandas as pd
# import tensorflow as tf
# import tensorflow_hub as hub
# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import precision_recall_fscore_support, balanced_accuracy_score
# from sklearn.preprocessing import LabelEncoder
# from sklearn.model_selection import train_test_split
# from tqdm import tqdm
# import torch
# from transformers import AutoTokenizer, AutoModel

In [17]:
# # Загрузка данных
# train_data = pd.read_csv('/kaggle/input/qa-intents-dataset-university-domain/dataset_train.tsv', delimiter='\t', encoding="utf-8", names=['text', 'intent'])
# test_data = pd.read_csv('/kaggle/input/qa-intents-dataset-university-domain/dataset_test.tsv', delimiter='\t', encoding="utf-8", names=['text', 'intent'])
# full_data = pd.concat([train_data, test_data])

# # Разделение на обучающий и тестовый наборы данных
# train, test = train_test_split(full_data, test_size=0.2, random_state=42)

# tokenizer = AutoTokenizer.from_pretrained("sadakmed/distiluse-base-multilingual-cased-v2")
# model = AutoModel.from_pretrained("sadakmed/distiluse-base-multilingual-cased-v2")

In [18]:
# # Tokenize and encode the text data
# train_encodings = tokenizer(train['text'].tolist(), truncation=True, padding=True, return_tensors="pt")
# test_encodings = tokenizer(test['text'].tolist(), truncation=True, padding=True, return_tensors="pt")

# # Извлечение эмбеддингов
# with torch.no_grad():
#     train_embeddings = model(**train_encodings).last_hidden_state.mean(dim=1)
#     test_embeddings = model(**test_encodings).last_hidden_state.mean(dim=1)


In [19]:
# # Функция для извлечения эмбеддингов USE
# def extract_use_embeddings(texts):
#     embeddings = []
#     for text in tqdm(texts, desc="Extracting USE embeddings"):
#         text_embedding = use_model([text])
#         embeddings.append(text_embedding.numpy()[0])
#     return np.array(embeddings)

# # Преобразование меток в числовые значения
# le = LabelEncoder()
# train_labels = le.fit_transform(train['intent'])
# test_labels = le.transform(test['intent'])

# # Преобразование массивов numpy в тензоры PyTorch перед передачей в TextDataset
# train_dataset = TextDataset(train_embeddings, torch.tensor(train_labels))
# test_dataset = TextDataset(test_embeddings, torch.tensor(test_labels))

# train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=32)


# # # Извлечение эмбеддингов USE
# # print("Extracting embeddings for training data...")
# # train_embeddings = extract_use_embeddings(train['text'].tolist())
# # print("Extracting embeddings for test data...")
# # test_embeddings = extract_use_embeddings(test['text'].tolist())

# # Преобразование в тензоры PyTorch
# train_embeddings = torch.tensor(train_embeddings, dtype=torch.float32)
# test_embeddings = torch.tensor(test_embeddings, dtype=torch.float32)
# train_labels = torch.tensor(train_labels, dtype=torch.long)
# test_labels = torch.tensor(test_labels, dtype=torch.long)

# # Создание и обучение модели
# model = AdaBoostClassifier(base_estimator=RandomForestClassifier(n_estimators=50, max_depth=10, random_state=42), n_estimators=50, learning_rate=1.0, random_state=42)
# model.fit(train_embeddings.numpy(), train_labels.numpy())

# # Прогноз на тестовом наборе
# predictions = model.predict(test_embeddings.numpy())

# # Вывод метрик оценки качества модели
# print(balanced_accuracy_score(test_labels.numpy(), predictions))
# print(precision_recall_fscore_support(test_labels.numpy(), predictions, average='weighted'))


In [20]:

# # Загрузка данных
# train_data = pd.read_csv('/kaggle/input/qa-intents-dataset-university-domain/dataset_train.tsv', delimiter='\t', encoding="utf-8", names=['text', 'intent'])
# test_data = pd.read_csv('/kaggle/input/qa-intents-dataset-university-domain/dataset_test.tsv', delimiter='\t', encoding="utf-8", names=['text', 'intent'])
# full_data = pd.concat([train_data, test_data])

# # Разделение на обучающий и тестовый наборы данных
# train, test = train_test_split(full_data, test_size=0.2, random_state=42)

In [21]:
# # Инициализация мультиязычной модели Universal Sentence Encoder
# use_model = SentenceTransformer('xlm-r-100langs-bert-base-nli-mean-tokens')

# # Функция для извлечения эмбеддингов USE
# def extract_use_embeddings(texts):
#     embeddings = []
#     for text in tqdm(texts, desc="Extracting USE embeddings"):
#         text_embedding = use_model.encode(text)
#         embeddings.append(text_embedding)
#     return np.array(embeddings)

In [22]:
# # Преобразование меток в числовые значения
# le = LabelEncoder()
# train_labels = le.fit_transform(train['intent'])
# test_labels = le.transform(test['intent'])

In [23]:
# # Извлечение эмбеддингов USE
# print("Extracting embeddings for training data...")
# train_embeddings = extract_use_embeddings(train['text'].tolist())
# print("Extracting embeddings for test data...")
# test_embeddings = extract_use_embeddings(test['text'].tolist())

In [24]:
# # Преобразование в тензоры PyTorch
# train_embeddings = torch.tensor(train_embeddings)
# test_embeddings = torch.tensor(test_embeddings)
# train_labels = torch.tensor(train_labels)
# test_labels = torch.tensor(test_labels)

# # Создание и обучение модели
# model = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=50), n_estimators=50, learning_rate=1.0, random_state=42) # AdaBoostClassifier(base_estimator=RandomForestClassifier(n_estimators=50, max_depth=10, random_state=42), n_estimators=50, learning_rate=1.0, random_state=42)
# model.fit(train_embeddings, train_labels)

# # Прогноз на тестовом наборе
# predictions = model.predict(test_embeddings)

# # Вывод метрик оценки качества модели
# print(balanced_accuracy_score(test_labels, predictions))
# print(precision_recall_fscore_support(test_labels, predictions, average='weighted'))