In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the training set and drop the 'Unnamed: 0' / 'Unnamed: 0.1' columns in one chain.
df = (
    pd.read_csv('full_train.csv')
    .drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,Question,Category,answer_class
0,Какие документы я получу после окончания обуче...,Документы,0.0
1,Какие возможности для трудоустройства предоста...,Документы,0.0
2,Какие курсы и программы дополнительного образо...,Документы,0.0
3,Какие требования предъявляются к студентам при...,Документы,0.0
4,Какие перспективы карьерного роста ожидают мен...,Документы,0.0


Делаем лемматизацию

In [99]:
from natasha import Segmenter, MorphVocab, NewsEmbedding, NewsMorphTagger, Doc

# Morphological tagger built on top of the news embedding; used by doc.tag_morph().
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)

# Vocabulary passed to token.lemmatize() and the segmenter passed to doc.segment().
morph_vocab = MorphVocab()
segmenter = Segmenter()

def lemmatize_text(text):
    """Return `text` with every token replaced by its lemma, joined by single spaces.

    The text is segmented and morphologically tagged with the module-level
    `segmenter` and `morph_tagger`; each token is then lemmatized with
    `morph_vocab`.
    """
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)

    lemmas = []
    for token in doc.tokens:
        # lemmatize() populates token.lemma, which is read right after.
        token.lemmatize(morph_vocab)
        lemmas.append(token.lemma)

    return " ".join(lemmas)

# Replace every training question with its lemmatized form (element-wise).
df['Question'] = df['Question'].map(lemmatize_text)

Делим на обучающую и валидационную выборки

In [2]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
df_train, df_val = train_test_split(
    df,
    random_state=462,
    test_size=0.2,
)

In [3]:
# Features are the question texts; targets are the category labels.
X_train, y_train = df_train['Question'], df_train['Category']
X_val, y_val = df_val['Question'], df_val['Category']

Обучаем CatBoost

In [103]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, classification_report

# TF-IDF features feeding a CatBoost classifier; fit() returns the fitted pipeline.
ppl_boost = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(lowercase=True, stop_words=None)),
    ('classifier', CatBoostClassifier(verbose=100, random_seed=42, iterations=1000)),
]).fit(X_train, y_train)

# Predict on the training part first, then on the validation part.
y_pred_train, y_pred = (ppl_boost.predict(part) for part in (X_train, X_val))

# Micro-averaged F1 on both parts to gauge the train/validation gap.
print("Train F1-micro", f1_score(y_train, y_pred_train, average='micro'))
print("Validate F1-micro", f1_score(y_val, y_pred, average='micro'))

Learning rate set to 0.085892
0:	learn: 2.1879939	total: 40.6ms	remaining: 40.5s
100:	learn: 0.9573599	total: 4.57s	remaining: 40.7s
200:	learn: 0.7682722	total: 9.16s	remaining: 36.4s
300:	learn: 0.6837746	total: 13.5s	remaining: 31.3s
400:	learn: 0.6286321	total: 17.9s	remaining: 26.8s
500:	learn: 0.5899114	total: 22.4s	remaining: 22.3s
600:	learn: 0.5558465	total: 26.9s	remaining: 17.8s
700:	learn: 0.5284173	total: 31.3s	remaining: 13.4s
800:	learn: 0.5039806	total: 35.7s	remaining: 8.88s
900:	learn: 0.4832714	total: 40.2s	remaining: 4.42s
999:	learn: 0.4666038	total: 44.7s	remaining: 0us
Train F1-micro 0.9155324259407526
Validate F1-micro 0.8504


Обучаем LogReg

In [104]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, classification_report


# TF-IDF features feeding a logistic regression; fit() returns the fitted pipeline.
ppl_logreg = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(lowercase=True, stop_words=None)),
    ('classifier', LogisticRegression(random_state=352, C=10, max_iter=1000)),
]).fit(X_train, y_train)

# Predict on the training part first, then on the validation part.
y_pred_train, y_pred = (ppl_logreg.predict(part) for part in (X_train, X_val))

# Micro-averaged F1 on both parts to gauge the train/validation gap.
print("Train F1-micro", f1_score(y_train, y_pred_train, average='micro'))
print("Validate F1-micro", f1_score(y_val, y_pred, average='micro'))

Train F1-micro 0.9565652522017614
Validate F1-micro 0.88


Обучаем SVM

In [105]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, classification_report
from sklearn.svm import SVC

# TF-IDF features feeding an RBF-kernel SVM; fit() returns the fitted pipeline.
ppl_svm = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(lowercase=True, stop_words=None)),
    ('classifier', SVC(random_state=352, C=17, max_iter=1000, kernel='rbf', gamma=0.5)),
]).fit(X_train, y_train)

# Predict on the training part first, then on the validation part.
y_pred_train, y_pred = (ppl_svm.predict(part) for part in (X_train, X_val))

# Micro-averaged F1 on both parts to gauge the train/validation gap.
print("Train F1-micro", f1_score(y_train, y_pred_train, average='micro'))
print("Validate F1-micro", f1_score(y_val, y_pred, average='micro'))



Train F1-micro 0.9873899119295436
Validate F1-micro 0.8984


In [142]:
# Load the hold-out spreadsheet, drop rows with missing values, and rename the
# Russian column headers to the column names used by the training frame.
df_test = (
    pd.read_excel('mtest.xlsx')
    .dropna()
    .rename(columns={
        'Вопросы': 'Question',
        'Категория': 'Category',
        'Ответ': 'answer_class',
    })
)


In [143]:
# Apply the same lemmatization to the hold-out questions as to the training ones.
df_test['Question'] = df_test['Question'].map(lemmatize_text)

In [144]:
# Hold-out features (question texts) and targets (category labels).
X_test, y_test = df_test['Question'], df_test['Category']

Смотрим score на отложенных данных

In [145]:
# Score each fitted pipeline on the hold-out set with micro-averaged F1.
# The per-model predictions are kept because the voting ensemble reuses them.
y_pred_boost = ppl_boost.predict(X_test)
f1_boost = f1_score(y_test, y_pred_boost, average='micro')
print("CatBoost F1-micro", f1_boost)

y_pred_logreg = ppl_logreg.predict(X_test)
f1_logreg = f1_score(y_test, y_pred_logreg, average='micro')
print("LogReg F1-micro", f1_logreg)

y_pred_svm = ppl_svm.predict(X_test)
f1_svm = f1_score(y_test, y_pred_svm, average='micro')
print("SVM F1-micro", f1_svm)

CatBoost F1-micro 0.78
LogReg F1-micro 0.81
SVM F1-micro 0.79


Сохраняем модели

In [154]:
# joblib is imported here because the only other import sits in a later cell;
# without it a fresh "Restart & Run All" fails on this cell with NameError.
import joblib

# Persist each fitted pipeline (vectorizer + classifier) to its own file.
joblib.dump(ppl_boost, 'Main_CatBoost.joblib')
joblib.dump(ppl_logreg, 'Main_LogReg.joblib')
joblib.dump(ppl_svm, 'Main_SVM.joblib')

['Main_SVM.joblib']

In [156]:
# Distinct category labels found in the training frame.
u_caters = df['Category'].unique()
# Map each label to its position in u_caters, so u_caters[caters[label]] is label.
caters = {label: position for position, label in enumerate(u_caters)}

u_caters

array(['Документы', 'Организация уроков', 'Оценки',
       'Перевод/ запись в группу', 'Практические работы',
       'Программа обучения', 'Портал', 'Расписание', 'Требования ПО',
       'Трудоустройство'], dtype=object)

Ансамблируем путем голосования

In [147]:
# Majority vote over the three models' hold-out predictions.
# CatBoost returns one-element rows, hence the [0]; the other two return flat labels.
y_pred_ansamble = []
for boost_row, logreg_label, svm_label in zip(y_pred_boost, y_pred_logreg, y_pred_svm):
    boost_label = boost_row[0]
    if boost_label == logreg_label or boost_label == svm_label:
        # CatBoost agrees with at least one other model, so its label has the majority.
        winner = boost_label
    else:
        # Either LogReg and SVM agree (majority), or all three differ and
        # LogReg is the designated tie-breaker; both cases pick the LogReg label.
        winner = logreg_label
    y_pred_ansamble.append(winner)

In [148]:
# Micro-averaged F1 of the voting ensemble on the hold-out set.
ansamble_f1 = f1_score(y_test, y_pred_ansamble, average='micro')
print("Ansamble F1-micro", ansamble_f1)

Ansamble F1-micro 0.83


In [149]:
# Ad-hoc sanity check of all three pipelines on a single raw question.
ques = ['Мне идти в 5F?']
# Both the training and the hold-out questions go through lemmatize_text before
# reaching the pipelines, so the raw question needs the same preprocessing.
ques_lemmatized = [lemmatize_text(question) for question in ques]

# NOTE(review): these names also hold the hold-out predictions used by the voting
# cell; re-running that cell after this one would vote over a single question.
y_pred_boost = ppl_boost.predict(ques_lemmatized)
y_pred_logreg = ppl_logreg.predict(ques_lemmatized)
y_pred_svm = ppl_svm.predict(ques_lemmatized)

y_pred_boost, y_pred_logreg, y_pred_svm

(array([['Документы']], dtype=object),
 array(['Документы'], dtype=object),
 array(['Документы'], dtype=object))

In [150]:
# Categories for which a dedicated per-category model file is loaded.
categories = [
    'Документы',
    'Организация уроков',
    'Программа обучения',
    'Портал',
    'Расписание',
    'Требования ПО',
    'Трудоустройство',
]

# Categories without a per-category model: each maps directly to a fixed value
# that is used as the final prediction.
not_categories = {
    'Практические работы': 12,
    'Перевод/ запись в группу': 11,
    'Оценки': 10,
}

Считываем модели по категориям (такие же LogReg)

In [151]:
import joblib
# One model per category, each read from the file '<category>1.joblib'.
models_cats = {cat: joblib.load(f'{cat}1.joblib') for cat in categories}

Предсказываем класс ответа внутри предсказанной категории

In [152]:
# Second stage: turn each ensemble-predicted category into a final answer class.
y_pred = []

# Iterate positionally. X_test keeps df_test's index, which has gaps after dropna(),
# so a label lookup like X_test[i] can raise KeyError or return a different row
# than the one y_pred_ansamble[i] was predicted for.
for category, question in zip(y_pred_ansamble, X_test):
    if category in categories:
        # A dedicated model exists for this category: let it pick the class.
        y_pred.append(models_cats[category].predict([question])[0])
    else:
        # No dedicated model: the category maps directly to a fixed value.
        y_pred.append(not_categories[category])