In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# List the full path of every file found under /kaggle/input.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

In [None]:
# Load the train and test splits. Both files are read as tab-separated text
# without a header row; the two columns are named 'text' and 'intent' here.
tsv_options = dict(delimiter='\t', encoding="utf-8", names=['text', 'intent'])

train_data = pd.read_csv(
    '../input/qa-intents-dataset-university-domain/dataset_train.tsv', **tsv_options
)
test_data = pd.read_csv(
    '../input/qa-intents-dataset-university-domain/dataset_test.tsv', **tsv_options
)

# Preview the first rows of the training split.
train_data.head()

In [None]:
# Merge both splits into a single frame. ignore_index=True gives the result a
# fresh 0..n-1 index; without it the row labels of the two inputs are kept
# as-is, so any label present in both frames is duplicated in the merged frame
# and label-based lookups (e.g. full_data.loc[0]) become ambiguous.
full_data = pd.concat([train_data, test_data], ignore_index=True)
full_data.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the merged data; the fixed random_state makes the split repeatable.
train, test = train_test_split(
    full_data,
    test_size=0.2,
    random_state=42,
)

# Share of each intent in the training part, printed as percentages.
unique_values_normalized = train['intent'].value_counts(normalize=True)
print(unique_values_normalized.mul(100))

In [None]:
from transformers import BertTokenizer
# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support, balanced_accuracy_score
import timeit

In [None]:
def bert_token_ids(text):
    """Return the BERT vocabulary ids for ``text``, special tokens included.

    Used as the TF-IDF tokenizer, so every distinct token id becomes one
    feature. A named function is used instead of a lambda so that the fitted
    pipeline can be pickled (a lambda cannot be pickled by the standard
    pickle module).
    """
    return tokenizer.encode(text, add_special_tokens=True)


# Build the pipeline: TF-IDF over BERT token ids, followed by logistic regression.
pipeline = Pipeline([
    # token_pattern=None: the default regex is never used once a custom
    # tokenizer is supplied, and leaving it set makes scikit-learn emit a
    # warning about the unused parameter.
    ('tfidf', TfidfVectorizer(tokenizer=bert_token_ids, token_pattern=None)),
    ('lr', LogisticRegression(C=1.0, max_iter=150, penalty='l2')),
])

# Fit the pipeline.
# NOTE(review): the model is fit on train_data and evaluated on test_data (the
# two original files), not on the `train` / `test` frames produced by
# train_test_split -- confirm this is intended.
pipeline.fit(train_data['text'], train_data['intent'])

# Time only the prediction step on the test set.
start_test = timeit.default_timer()
predictions = pipeline.predict(test_data['text'])
end_test = timeit.default_timer()

# Total number of predicted answers.
num_predictions = len(predictions)

# Average wall-clock time per answer.
elapsed_test = end_test - start_test
average_time_per_response = elapsed_test / num_predictions

print(f'Time for testing: {elapsed_test:.4f} seconds')
print(f'Average time per response: {average_time_per_response:.6f} seconds')

In [None]:
# Balanced accuracy of the test-set predictions against the true intents.
balanced_accuracy_score(
    y_true=test_data['intent'],
    y_pred=predictions,
)

In [None]:
# Precision, recall and F-score on the test set, averaged across intents
# with average='weighted'.
precision_recall_fscore_support(
    y_true=test_data['intent'],
    y_pred=predictions,
    average='weighted',
)

In [None]:
# from sklearn.metrics import roc_auc_score

In [None]:
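# Disabled: when the target has more than two classes, roc_auc_score needs
# per-class scores (e.g. pipeline.predict_proba(...)) together with
# multi_class='ovr' or 'ovo'; hard label predictions are not valid input.
#roc_auc_score(test_data['intent'], predictions, average='weighted')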