In [27]:
from StatsAggregation.analysis_helpers import *
import pandas as pd
import numpy as np
from sklearn.base import TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import scipy.sparse as sp
from sklearn.ensemble import StackingClassifier
from itertools import combinations
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score

In [2]:
df = load_df('aggregated_df.csv')

In [3]:
df.head()

Unnamed: 0,author,book,text,text_no_punkt,lemmas,tags,tokens,counts,probs
0,Аверченко,Averchenko_A-T-Averchenko-Sobranie-sochineniy-...,Мы за пять лет. Материалы [к биографии]\nКак б...,Мы за пять лет Материалы к биографии Как будто...,пять год материал биография кроваво-красный ра...,4_NUMR 3_NOUN 9_NOUN 9_NOUN 15_ADJF 6_NOUN 8_V...,Мы за пять лет . Материалы [ к биографии ] Как...,243,4.5e-05
1,Аверченко,Averchenko_A-T-Averchenko-Sobranie-sochineniy-...,Сидел он за большим письменным столом перед де...,Сидел он за большим письменным столом перед де...,сидеть больший письменный стол деревянный доск...,5_VERB 7_ADJF 10_ADJF 6_NOUN 10_ADJF 6_NOUN 6_...,Сидел он за большим письменным столом перед де...,243,4.5e-05
2,Аверченко,Averchenko_A-T-Averchenko-Sobranie-sochineniy-...,"— Да уж, — качал головой сдержанный Ре-ми. — Н...",Да уж качал головой сдержанный Ре-ми Нехорошо ...,качать голова сдержать ре-ми нехорошо нехорошо...,5_VERB 7_NOUN 10_PRTF 5_None 8_ADVB 8_ADVB 4_A...,"— Да уж , — качал головой сдержанный Ре-ми . —...",243,4.5e-05
3,Аверченко,Averchenko_A-T-Averchenko-Sobranie-sochineniy-...,"Поверит ли кто-нибудь, что нами за эти пять ле...",Поверит ли кто-нибудь что нами за эти пять лет...,поверить кто-нибудь пять год совместно м.г кор...,7_VERB 10_NPRO 4_NUMR 3_NOUN 9_ADVB 3_None 11_...,"Поверит ли кто-нибудь , что нами за эти пять л...",243,4.5e-05
4,Аверченко,Averchenko_A-T-Averchenko-Sobranie-sochineniy-...,"8\nАверченко А. Избранные рассказы. М., 1985. ...",8 Аверченко А Избранные рассказы М 1985 С 7 9 ...,8 избранный рассказ м 1985 7 9 ежегодник 156 1...,1_None 9_ADJF 8_NOUN 1_NOUN 4_None 1_None 1_No...,"8 Аверченко А. Избранные рассказы . М. , 1985 ...",243,4.5e-05


### Генератор фолдов для кросс-валидации

In [4]:
def books_cross_val(df, k=5):
    df_remain = df
    while k > 0:
        if k == 1:
            train_idx = df.index.difference(df_remain.index)
            test_idx = df_remain.index
        else:
            share = (k - 1) / k
            df_remain, fold, _, _ = train_test_split(df_remain, share=share, cross_val=True)
            train_idx = df.index.difference(fold.index)
            test_idx = fold.index
        yield train_idx, test_idx
        k -= 1

### TfIdf для нескольких колонок

In [9]:
class MultiTfidf(TransformerMixin):
    def __init__(self, cols=None, tfidf_type='classic'):
        self.cols = cols
        self.tfidf_type = tfidf_type
    
    def fit(self, X, y=None):
        if self.tfidf_type == 'classic':
            get_vectorizer = get_document_vectorizer
        elif self.tfidf_type == 'class_based':
            get_vectorizer = get_author_vectorizer
        else:
            raise ValueError("Unknown vectorizer")
        self.vectorizers_ = [get_vectorizer(X, column=col) for col in self.cols]
        return self
    
    def transform(self, X, y=None):
        results = []
        for vec, col in zip(self.vectorizers_, self.cols):
            results.append(vec.transform(X[col]))
        return sp.hstack(results)

### Базовый пайплайн

In [14]:
def get_base_estimator(cols, vec_type='classic'):
    pipe = Pipeline([
        ('vectorizer', MultiTfidf(cols=cols, tfidf_type=vec_type)),
        ('model', LogisticRegression(class_weight='balanced', max_iter=500, C=1000))
    ])
    return pipe

### Самое интересное

In [20]:
encoder = get_encoder(df)

In [15]:
estimators = []
for cols in combinations(['text_no_punkt', 'lemmas', 'tags', 'tokens'], 2):
    estimators.append((';'.join(cols), get_base_estimator(cols)))

In [18]:
df_train, df_test, y_train, y_test = train_test_split(df)

In [24]:
df_train.reset_index(drop=True, inplace=True)
y_train = encoder.transform(y_train)
y_test = encoder.transform(y_test)

In [25]:
model = StackingClassifier(estimators, CatBoostClassifier(), cv=books_cross_val(df_train))

In [26]:
model.fit(df_train, y_train)



Learning rate set to 0.088866
0:	learn: 2.1505162	total: 258ms	remaining: 4m 17s
1:	learn: 1.8764210	total: 371ms	remaining: 3m 5s
2:	learn: 1.7073368	total: 483ms	remaining: 2m 40s
3:	learn: 1.5736159	total: 596ms	remaining: 2m 28s
4:	learn: 1.4686401	total: 711ms	remaining: 2m 21s
5:	learn: 1.3732043	total: 821ms	remaining: 2m 15s
6:	learn: 1.3048341	total: 935ms	remaining: 2m 12s
7:	learn: 1.2276186	total: 1.05s	remaining: 2m 9s
8:	learn: 1.1734988	total: 1.16s	remaining: 2m 7s
9:	learn: 1.1249574	total: 1.27s	remaining: 2m 5s
10:	learn: 1.0847118	total: 1.38s	remaining: 2m 4s
11:	learn: 1.0407011	total: 1.5s	remaining: 2m 3s
12:	learn: 1.0024706	total: 1.61s	remaining: 2m 2s
13:	learn: 0.9664199	total: 1.72s	remaining: 2m 1s
14:	learn: 0.9317617	total: 1.83s	remaining: 2m
15:	learn: 0.8998870	total: 1.95s	remaining: 1m 59s
16:	learn: 0.8673777	total: 2.06s	remaining: 1m 59s
17:	learn: 0.8377434	total: 2.17s	remaining: 1m 58s
18:	learn: 0.8114556	total: 2.29s	remaining: 1m 58s
19:	l

158:	learn: 0.2379881	total: 18.5s	remaining: 1m 37s
159:	learn: 0.2370723	total: 18.6s	remaining: 1m 37s
160:	learn: 0.2361263	total: 18.7s	remaining: 1m 37s
161:	learn: 0.2352913	total: 18.8s	remaining: 1m 37s
162:	learn: 0.2345968	total: 19s	remaining: 1m 37s
163:	learn: 0.2339504	total: 19.1s	remaining: 1m 37s
164:	learn: 0.2329699	total: 19.2s	remaining: 1m 37s
165:	learn: 0.2322523	total: 19.3s	remaining: 1m 37s
166:	learn: 0.2316205	total: 19.4s	remaining: 1m 36s
167:	learn: 0.2310380	total: 19.5s	remaining: 1m 36s
168:	learn: 0.2301813	total: 19.7s	remaining: 1m 36s
169:	learn: 0.2295421	total: 19.8s	remaining: 1m 36s
170:	learn: 0.2286891	total: 19.9s	remaining: 1m 36s
171:	learn: 0.2281175	total: 20s	remaining: 1m 36s
172:	learn: 0.2274400	total: 20.1s	remaining: 1m 36s
173:	learn: 0.2267663	total: 20.2s	remaining: 1m 36s
174:	learn: 0.2257920	total: 20.4s	remaining: 1m 35s
175:	learn: 0.2247877	total: 20.5s	remaining: 1m 35s
176:	learn: 0.2237381	total: 20.6s	remaining: 1m 3

314:	learn: 0.1521890	total: 36.8s	remaining: 1m 19s
315:	learn: 0.1519880	total: 36.9s	remaining: 1m 19s
316:	learn: 0.1515977	total: 37s	remaining: 1m 19s
317:	learn: 0.1513032	total: 37.1s	remaining: 1m 19s
318:	learn: 0.1507722	total: 37.3s	remaining: 1m 19s
319:	learn: 0.1504506	total: 37.4s	remaining: 1m 19s
320:	learn: 0.1501512	total: 37.5s	remaining: 1m 19s
321:	learn: 0.1498976	total: 37.6s	remaining: 1m 19s
322:	learn: 0.1495156	total: 37.7s	remaining: 1m 19s
323:	learn: 0.1491967	total: 37.8s	remaining: 1m 18s
324:	learn: 0.1489826	total: 38s	remaining: 1m 18s
325:	learn: 0.1488152	total: 38.1s	remaining: 1m 18s
326:	learn: 0.1484188	total: 38.2s	remaining: 1m 18s
327:	learn: 0.1480220	total: 38.3s	remaining: 1m 18s
328:	learn: 0.1477763	total: 38.4s	remaining: 1m 18s
329:	learn: 0.1474314	total: 38.6s	remaining: 1m 18s
330:	learn: 0.1468940	total: 38.7s	remaining: 1m 18s
331:	learn: 0.1466765	total: 38.8s	remaining: 1m 18s
332:	learn: 0.1462777	total: 38.9s	remaining: 1m 1

472:	learn: 0.1069142	total: 55.6s	remaining: 1m 1s
473:	learn: 0.1066951	total: 55.7s	remaining: 1m 1s
474:	learn: 0.1065159	total: 55.9s	remaining: 1m 1s
475:	learn: 0.1064207	total: 56s	remaining: 1m 1s
476:	learn: 0.1062915	total: 56.1s	remaining: 1m 1s
477:	learn: 0.1061308	total: 56.2s	remaining: 1m 1s
478:	learn: 0.1059028	total: 56.3s	remaining: 1m 1s
479:	learn: 0.1053718	total: 56.5s	remaining: 1m 1s
480:	learn: 0.1051202	total: 56.6s	remaining: 1m 1s
481:	learn: 0.1047724	total: 56.7s	remaining: 1m
482:	learn: 0.1046810	total: 56.8s	remaining: 1m
483:	learn: 0.1044837	total: 56.9s	remaining: 1m
484:	learn: 0.1041728	total: 57.1s	remaining: 1m
485:	learn: 0.1039492	total: 57.2s	remaining: 1m
486:	learn: 0.1037514	total: 57.3s	remaining: 1m
487:	learn: 0.1035949	total: 57.4s	remaining: 1m
488:	learn: 0.1032793	total: 57.6s	remaining: 1m
489:	learn: 0.1028752	total: 57.7s	remaining: 1m
490:	learn: 0.1026612	total: 57.8s	remaining: 59.9s
491:	learn: 0.1025460	total: 57.9s	remain

632:	learn: 0.0785284	total: 1m 14s	remaining: 43.3s
633:	learn: 0.0784457	total: 1m 14s	remaining: 43.2s
634:	learn: 0.0783582	total: 1m 14s	remaining: 43.1s
635:	learn: 0.0781162	total: 1m 15s	remaining: 43s
636:	learn: 0.0779154	total: 1m 15s	remaining: 42.8s
637:	learn: 0.0777677	total: 1m 15s	remaining: 42.7s
638:	learn: 0.0776650	total: 1m 15s	remaining: 42.6s
639:	learn: 0.0774800	total: 1m 15s	remaining: 42.5s
640:	learn: 0.0774067	total: 1m 15s	remaining: 42.4s
641:	learn: 0.0772456	total: 1m 15s	remaining: 42.3s
642:	learn: 0.0771606	total: 1m 15s	remaining: 42.1s
643:	learn: 0.0770109	total: 1m 16s	remaining: 42s
644:	learn: 0.0769596	total: 1m 16s	remaining: 41.9s
645:	learn: 0.0765382	total: 1m 16s	remaining: 41.8s
646:	learn: 0.0763872	total: 1m 16s	remaining: 41.7s
647:	learn: 0.0762611	total: 1m 16s	remaining: 41.6s
648:	learn: 0.0759876	total: 1m 16s	remaining: 41.4s
649:	learn: 0.0758913	total: 1m 16s	remaining: 41.3s
650:	learn: 0.0757111	total: 1m 16s	remaining: 41.

788:	learn: 0.0598361	total: 1m 33s	remaining: 24.9s
789:	learn: 0.0596759	total: 1m 33s	remaining: 24.8s
790:	learn: 0.0595896	total: 1m 33s	remaining: 24.7s
791:	learn: 0.0595305	total: 1m 33s	remaining: 24.6s
792:	learn: 0.0593099	total: 1m 33s	remaining: 24.5s
793:	learn: 0.0591374	total: 1m 33s	remaining: 24.4s
794:	learn: 0.0590741	total: 1m 33s	remaining: 24.2s
795:	learn: 0.0590209	total: 1m 34s	remaining: 24.1s
796:	learn: 0.0589241	total: 1m 34s	remaining: 24s
797:	learn: 0.0588886	total: 1m 34s	remaining: 23.9s
798:	learn: 0.0588453	total: 1m 34s	remaining: 23.8s
799:	learn: 0.0586471	total: 1m 34s	remaining: 23.6s
800:	learn: 0.0585802	total: 1m 34s	remaining: 23.5s
801:	learn: 0.0584947	total: 1m 34s	remaining: 23.4s
802:	learn: 0.0583953	total: 1m 34s	remaining: 23.3s
803:	learn: 0.0583458	total: 1m 35s	remaining: 23.2s
804:	learn: 0.0582646	total: 1m 35s	remaining: 23s
805:	learn: 0.0582264	total: 1m 35s	remaining: 22.9s
806:	learn: 0.0581427	total: 1m 35s	remaining: 22.

944:	learn: 0.0470579	total: 1m 51s	remaining: 6.5s
945:	learn: 0.0469664	total: 1m 51s	remaining: 6.38s
946:	learn: 0.0469297	total: 1m 51s	remaining: 6.26s
947:	learn: 0.0468847	total: 1m 51s	remaining: 6.14s
948:	learn: 0.0468533	total: 1m 52s	remaining: 6.02s
949:	learn: 0.0468071	total: 1m 52s	remaining: 5.91s
950:	learn: 0.0467657	total: 1m 52s	remaining: 5.79s
951:	learn: 0.0466674	total: 1m 52s	remaining: 5.67s
952:	learn: 0.0466034	total: 1m 52s	remaining: 5.55s
953:	learn: 0.0465333	total: 1m 52s	remaining: 5.43s
954:	learn: 0.0464935	total: 1m 52s	remaining: 5.31s
955:	learn: 0.0463674	total: 1m 52s	remaining: 5.2s
956:	learn: 0.0462873	total: 1m 53s	remaining: 5.08s
957:	learn: 0.0462706	total: 1m 53s	remaining: 4.96s
958:	learn: 0.0462210	total: 1m 53s	remaining: 4.84s
959:	learn: 0.0461812	total: 1m 53s	remaining: 4.72s
960:	learn: 0.0461409	total: 1m 53s	remaining: 4.61s
961:	learn: 0.0460557	total: 1m 53s	remaining: 4.49s
962:	learn: 0.0459924	total: 1m 53s	remaining: 4

### Скор на тесте

In [28]:
f1_score(model.predict(df_test), y_test, average='micro')

  y = column_or_1d(y, warn=True)


0.8526947605352007

### Скор на трейне

In [30]:
f1_score(model.predict(df_train), y_train, average='micro')

  y = column_or_1d(y, warn=True)


0.9451689085121675