In [None]:
!pip install catboost

In [1]:
import joblib # для сериализации и сохранения обученной модели в файл
# (см. https://machinelearningmastery.ru/save-load-machine-learning-models-python-scikit-learn/)
import numpy as np
import pandas as pd
import catboost
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, log_loss, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier, SGDRegressor, LogisticRegression, LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB, MultinomialNB, CategoricalNB
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer

In [None]:
data_frame = pd.read_csv('drive/MyDrive/bib_data_union_v2.csv.zip', compression='zip')

In [None]:
data_frame.shape

(6007277, 23)

In [None]:
data_frame.head()

Unnamed: 0,square_brackets,round_brackets,slashes,inverse_slashes,quotes,dots,commas,semicolons,colons,abstract,...,begin_ref,tirets,key,annotation,capital_letters,years,sine,et_al,etc,style_name
0,1,1,1,0,0,5,10,0,0,0,...,1,0,0,0,0.552632,2,0,1,0,iaea
1,1,1,5,0,0,11,3,0,2,0,...,3,0,0,0,0.647059,9,0,0,0,bestpapers
2,1,2,5,0,0,18,2,0,3,0,...,3,0,0,0,0.647059,2,0,0,0,bestpapers
3,1,2,5,0,0,18,2,0,4,0,...,3,0,0,0,0.454545,2,0,0,0,bestpapers
4,1,2,5,0,0,17,2,0,3,0,...,3,0,0,0,0.588235,3,0,0,0,bestpapers


In [None]:
X = data_frame.drop(['style_name'], axis=1)
y = data_frame.style_name

In [None]:
X = pd.get_dummies(X, columns=['begin_ref', 'abstract', 'key', 'annotation', 'sine', 'et_al', 'etc', ])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [2]:
def print_metrics(classifier, test_data, test_answers, _average=None):
    print(f"accuracy = {classifier.score(test_data, test_answers)}")
    precisions = precision_score(test_answers, classifier.predict(test_data), average=_average, zero_division=0)
    recalls = recall_score(test_answers, classifier.predict(test_data), average=_average, zero_division=0)
    f1_scores = f1_score(test_answers, classifier.predict(test_data), average=_average, zero_division=0)
    print("precision: ", "min = ", precisions.min(), "max = ", precisions.max(), "mean = ", np.mean(precisions), "median = ", np.median(precisions))
    print("recall: ", "min = ", recalls.min(), "max = ", recalls.max(), "mean = ", np.mean(recalls), "median = ", np.median(recalls))
    print("f1_score: ", "min = ", f1_scores.min(), "max = ", f1_scores.max(), "mean = ", np.mean(f1_scores), "median = ", np.median(f1_scores))

# Random Forest

In [None]:
clf = RandomForestClassifier(criterion='entropy', n_estimators=40, max_depth=11, n_jobs=-1)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.5389785393722284

In [None]:
print(classification_report(y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

   IEEEannot       0.30      0.01      0.03     13829
    IEEEtran       0.34      0.58      0.42     14633
   IEEEtranN       0.49      0.52      0.51     14815
   IEEEtranS       0.35      0.12      0.18     14399
  IEEEtranSA       0.76      0.87      0.81     14944
  IEEEtranSN       0.48      0.33      0.39     14934
      JHEP-2       0.28      0.01      0.03     14299
  aaai-named       0.98      0.79      0.88     13986
    abstract       0.76      0.94      0.84     14027
    acmtrans       0.57      0.54      0.56     14271
      aichej       0.65      0.68      0.66     13692
         aip       0.28      0.01      0.02     14315
    alphanum       0.62      0.88      0.73     13942
         ama       0.66      0.47      0.55     13918
    amsalpha       0.86      0.88      0.87     10774
    amsplain       0.34      0.62      0.44     10792
    annotate       0.79      0.80      0.79     14295
  annotation       1.00    

In [None]:
precision_score(y_test, clf.predict(X_test), average=None)

In [None]:
clf.feature_importances_ 

array([2.57334071e-02, 1.40270769e-01, 8.54970224e-02, 3.52857883e-06,
       6.35333558e-02, 4.53995573e-02, 2.30159064e-02, 3.46565340e-02,
       5.66727065e-02, 2.92628458e-03, 1.62570014e-02, 6.35904016e-02,
       2.84581430e-02, 3.16135388e-01, 1.95883331e-02, 9.04565353e-03,
       2.84745424e-06, 2.29252288e-02, 6.97758733e-03, 1.41364037e-03,
       3.78955668e-02, 1.13740366e-06])

In [None]:
y_list = list(y)
y_set = set(y_list)
for style in y_set:
  print(style, y_list.count(style))

In [None]:
precisions = precision_score(y_test, clf.predict(X_test), average=None)
recalls = recall_score(y_test, clf.predict(X_test), average=None)
f1_scores = f1_score(y_test, clf.predict(X_test), average=None)


In [None]:
print("precision: ", "min = ", precisions.min(), "max = ", precisions.max(), "mean = ", np.mean(precisions), "median = ", np.median(precisions))
print("recall: ", "min = ", recalls.min(), "max = ", recalls.max(), "mean = ", np.mean(recalls), "median = ", np.median(recalls))
print("f1_score: ", "min = ", f1_scores.min(), "max = ", f1_scores.max(), "mean = ", np.mean(f1_scores), "median = ", np.median(f1_scores))

precision:  min =  0.11790302527973477 max =  1.0 mean =  0.5440775699164183 median =  0.49117680116893336
recall:  min =  0.007471980074719801 max =  0.9984328395935493 mean =  0.5150078729198585 median =  0.5418823529411765
f1_score:  min =  0.014519731943410276 max =  0.9793527321617209 mean =  0.49919371625718445 median =  0.4951682969354887


clf1 = RandomForestClassifier(criterion='gini', max_depth=8, n_estimators=50) ==> 46%

clf2 = RandomForestClassifier(criterion='gini', n_estimators=20, max_depth=10, n_jobs=-1) ==> 50%

clf3 = RandomForestClassifier(criterion='entropy', n_estimators=20, max_depth=10, n_jobs=-1) ==> 51.6%

clf4 = RandomForestClassifier(criterion='entropy', n_estimators=30, max_depth=10, n_jobs=-1) ==> 52%

clf5 = RandomForestClassifier(criterion='entropy', n_estimators=20, max_depth=11, n_jobs=-1) ==> 52.8%

clf6 = RandomForestClassifier(criterion='entropy', n_estimators=30, max_depth=11, n_jobs=-1) ==> 53.6%

# Градиентный бустинг

In [None]:
X = X.drop(['abstract', 'annotation', 'inverse_slashes', 'etc'], axis=1)

In [None]:
y_list = list(y)
for style in y_list:
  if 'ugost2003' in style:
    style = 'ugost2003'
  elif 'ugost2008' in style:
    style = 'ugost2008'
  elif 'IEEE' in style:
    style = 'IEEE'
  elif 'h-physrev' in style:
    style = 'h-physrev'
y = pd.Series(y_list)

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2

In [42]:
train_dataset = catboost.Pool(data=X_train, label=y_train, cat_features=[ 'begin_ref_1',
 'begin_ref_2',
 'begin_ref_3',
 'begin_ref_4',
 'begin_ref_5',
 'begin_ref_6',
 'begin_ref_7',
 'begin_ref_8',
 'begin_ref_9',
 'begin_ref_10',
 'begin_ref_11',
 'begin_ref_12',
 'abstract_0',
 'abstract_1',
 'key_0',
 'key_1',
 'annotation_0',
 'annotation_1',
 'sine_0',
 'sine_1',
 'sine_2',
 'et_al_0',
 'et_al_1',
 'et_al_2',
 'et_al_3',
 'etc_0',
 'etc_1'])
test_dataset = catboost.Pool(data=X_test, label=y_test, cat_features=[ 'begin_ref_1',
 'begin_ref_2',
 'begin_ref_3',
 'begin_ref_4',
 'begin_ref_5',
 'begin_ref_6',
 'begin_ref_7',
 'begin_ref_8',
 'begin_ref_9',
 'begin_ref_10',
 'begin_ref_11',
 'begin_ref_12',
 'abstract_0',
 'abstract_1',
 'key_0',
 'key_1',
 'annotation_0',
 'annotation_1',
 'sine_0',
 'sine_1',
 'sine_2',
 'et_al_0',
 'et_al_1',
 'et_al_2',
 'et_al_3',
 'etc_0',
 'etc_1'])

ValueError: 'begin_ref_1' is not in list

In [None]:
boost_model = catboost.CatBoostClassifier(iterations=250,
                           learning_rate=0.05,
                           depth=11,
                           loss_function='MultiClass',
                           task_type='GPU',
                           gpu_ram_part=0.95,

                           eval_metric='Accuracy',
                           cat_features=[ 'begin_ref_1',
 'begin_ref_2',
 'begin_ref_3',
 'begin_ref_4',
 'begin_ref_5',
 'begin_ref_6',
 'begin_ref_7',
 'begin_ref_8',
 'begin_ref_9',
 'begin_ref_10',
 'begin_ref_11',
 'begin_ref_12',
 'abstract_0',
 'abstract_1',
 'key_0',
 'key_1',
 'annotation_0',
 'annotation_1',
 'sine_0',
 'sine_1',
 'sine_2',
 'et_al_0',
 'et_al_1',
 'et_al_2',
 'et_al_3',
 'etc_0',
 'etc_1'])
boost_model.fit(train_dataset,eval_set=(X_val, y_val))

0:	learn: 0.3080077	test: 0.3075002	best: 0.3075002 (0)	total: 3.6s	remaining: 14m 57s
1:	learn: 0.3493131	test: 0.3488401	best: 0.3488401 (1)	total: 7.2s	remaining: 14m 52s
2:	learn: 0.3974112	test: 0.3965921	best: 0.3965921 (2)	total: 10.9s	remaining: 14m 58s
3:	learn: 0.4142674	test: 0.4132910	best: 0.4132910 (3)	total: 14.5s	remaining: 14m 52s
4:	learn: 0.4298913	test: 0.4288663	best: 0.4288663 (4)	total: 18.1s	remaining: 14m 46s
5:	learn: 0.4329104	test: 0.4317653	best: 0.4317653 (5)	total: 21.6s	remaining: 14m 40s
6:	learn: 0.4393770	test: 0.4383831	best: 0.4383831 (6)	total: 25.2s	remaining: 14m 34s
7:	learn: 0.4436465	test: 0.4426504	best: 0.4426504 (7)	total: 28.7s	remaining: 14m 28s
8:	learn: 0.4436998	test: 0.4426962	best: 0.4426962 (8)	total: 32.3s	remaining: 14m 25s
9:	learn: 0.4533373	test: 0.4521023	best: 0.4521023 (9)	total: 35.9s	remaining: 14m 22s
10:	learn: 0.4557785	test: 0.4544628	best: 0.4544628 (10)	total: 39.6s	remaining: 14m 19s
11:	learn: 0.4588930	test: 0.457

<catboost.core.CatBoostClassifier at 0x7fa82c3649a0>

In [None]:
print(classification_report(y_test, boost_model.predict(test_dataset)))

              precision    recall  f1-score   support

   IEEEannot       0.31      0.07      0.12     13829
    IEEEtran       0.33      0.49      0.39     14633
   IEEEtranN       0.48      0.55      0.51     14815
   IEEEtranS       0.31      0.19      0.24     14399
  IEEEtranSA       0.80      0.90      0.85     14944
  IEEEtranSN       0.46      0.32      0.38     14934
      JHEP-2       0.40      0.40      0.40     14299
  aaai-named       0.94      0.88      0.91     13986
    abstract       0.80      0.88      0.84     14027
    acmtrans       0.65      0.71      0.68     14271
      aichej       0.77      0.76      0.77     13692
         aip       0.29      0.26      0.27     14315
    alphanum       0.88      0.84      0.86     13942
         ama       0.60      0.66      0.63     13918
    amsalpha       0.96      0.93      0.94     10774
    amsplain       0.70      0.83      0.76     10792
    annotate       0.79      0.88      0.83     14295
  annotation       1.00    

In [None]:
boost_model.get_best_score()

{'learn': {'Accuracy': 0.6170623674350406, 'MultiClass': 1.0738434509268624},
 'validation': {'Accuracy': 0.6065057729954322,
  'MultiClass': 1.082117239416175}}

In [None]:
boost_model.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,begin_ref_1,21.717033
1,ampersands,15.680614
2,round_brackets,15.167533
3,begin_ref_3,13.150782
4,begin_ref_5,6.249451
5,dots,4.739864
6,begin_ref_11,3.814263
7,slashes,2.868023
8,colons,2.30749
9,page_ref,2.163333


In [None]:
metrics = boost_model.eval_metrics(
    data=train_dataset,
    metrics=['Precision','Recall','F1','Accuracy'],
    ntree_start=0,
    ntree_end=0,
    eval_period=1,
    plot=True
)

In [None]:
from catboost import cv

params = {
    'loss_function': 'MultiClass',
    'iterations': 250,
    'depth': 10,
    'random_seed': 0,
    'learning_rate': 0.05,
    'task_type': 'GPU',
    'gpu_ram_part': 0.95,
    'eval_metric': 'Accuracy'
}

cv_data = cv(
    params=params,
    pool=catboost.Pool(data=X, label=y),
    fold_count=5,
    shuffle=True,
    partition_random_seed=0,
    plot=True,
    stratified=True, 
    verbose=False
)

In [None]:
cv_model = catboost.CatBoostClassifier(
                           loss_function='MultiClass',
                           task_type='GPU',
                           gpu_ram_part=0.95,
                           eval_metric='Accuracy',
                          )

parameters = {
    'iterations': range(150, 351, 50),
    'depth': range(6, 14),
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2, 0.5]
}

cv_model.grid_search(parameters, X, y)

# Наивная байесовская классификация

In [None]:
precisions = precision_score(y_test, bayes_clf.predict(X_test), average=None)
recalls = recall_score(y_test, bayes_clf.predict(X_test), average=None)
f1_scores = f1_score(y_test, bayes_clf.predict(X_test), average=None)

In [None]:
print("precision: ", "min = ", precisions.min(), "max = ", precisions.max(), "mean = ", np.mean(precisions), "median = ", np.median(precisions))
print("recall: ", "min = ", recalls.min(), "max = ", recalls.max(), "mean = ", np.mean(recalls), "median = ", np.median(recalls))
print("f1_score: ", "min = ", f1_scores.min(), "max = ", f1_scores.max(), "mean = ", np.mean(f1_scores), "median = ", np.median(f1_scores))

precision:  min =  0.09152843601895734 max =  0.9041331535636246 mean =  0.3503485530217752 median =  0.34906358513121194
recall:  min =  0.00959335624284078 max =  0.9831151104595319 mean =  0.32680261035539704 median =  0.3074704491725768
f1_score:  min =  0.018126888217522657 max =  0.9419714216517318 mean =  0.31132752247121714 median =  0.2896955711369486


In [None]:
cv_bayes_clf = MultinomialNB()
parameters = {'var_smoothing': [1e-2, 5e-2, 1e-3, 5e-3, 1e-4, 5e-4, 1e-5, 5e-5, 1e-6, 5e-6, 1e-7, 5e-7]}
grid = GridSearchCV(cv_bayes_clf, parameters)
grid.fit(X_train, y_train)

In [None]:
best_clf = grid.best_estimator_
best_clf.score(X_test, y_test)

0.3669924935522677

# Линейная классификация

In [None]:
sgd_clf_1 = SGDClassifier(loss='hinge', early_stopping=True, n_jobs=-1, random_state=0)
sgd_clf_1.fit(X_train, y_train)
sgd_clf_1.score(X_test, y_test)

In [None]:
precisions = precision_score(y_test, sgd_clf_1.predict(X_test), average=None)
recalls = recall_score(y_test, sgd_clf_1.predict(X_test), average=None)
f1_scores = f1_score(y_test, sgd_clf_1.predict(X_test), average=None)


In [None]:
print("precision: ", "min = ", precisions.min(), "max = ", precisions.max(), "mean = ", np.mean(precisions), "median = ", np.median(precisions))
print("recall: ", "min = ", recalls.min(), "max = ", recalls.max(), "mean = ", np.mean(recalls), "median = ", np.median(recalls))
print("f1_score: ", "min = ", f1_scores.min(), "max = ", f1_scores.max(), "mean = ", np.mean(f1_scores), "median = ", np.median(f1_scores))

precision:  min =  0.0 max =  0.9944520765085069 mean =  0.40912341157097065 median =  0.38881491344873503
recall:  min =  0.0 max =  0.9994787223959815 mean =  0.39585435843587163 median =  0.3600458190148912
f1_score:  min =  0.0 max =  0.9946708168270137 mean =  0.36837470035717856 median =  0.3093249349582364


# Векторизация

In [3]:
text_data_frame = pd.read_csv('./bib_data_union_v3.csv.zip', compression='zip')

In [14]:
text_data_frame.shape

(6006215, 2)

In [6]:
text_data_frame.head()

Unnamed: 0,tokenized_record,style_name
0,"[ num ] sp upword , sp caplet . sp caplet . sp...",iaea
1,[ capword sp othword sp capword ( year ) ] sp ...,bestpapers
2,[ capword ( year ) ] sp caplet . sp caplet . s...,bestpapers
3,[ capword sp othword sp capword ( year ) ] sp ...,bestpapers
4,[ capword ( year ) ] sp capword sp caplet . sp...,bestpapers


In [4]:
corpus = text_data_frame.tokenized_record

In [5]:
# ngram_range=(1, 1) => униграммы, ngram_range=(2, 2) => биграммы и т.д.
vectorizer = CountVectorizer(tokenizer=lambda txt: txt.split(), ngram_range=(3, 3))
vectorized_data = vectorizer.fit_transform(corpus).toarray()
print("SUCCESS")

SUCCESS


In [3]:
# временный код
X = pd.read_csv('2grams_bib_data.csv')
y = X.style_name
X = X.drop(['style_name'], axis=1)

In [6]:
X = pd.DataFrame(data=vectorized_data, columns=vectorizer.get_feature_names())
y = text_data_frame.style_name

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Линейная классификация на новом датасете

In [8]:
sgd_clf_1 = SGDClassifier(loss='hinge', early_stopping=False, n_jobs=-1, random_state=0)

t_start = time.time()

sgd_clf_1.fit(X_train, y_train)

t_finish = time.time()
print(t_finish - t_start)

SGDClassifier(n_jobs=-1, random_state=0)

In [9]:
print(classification_report(y_test, sgd_clf_1.predict(X_test), zero_division=0))

              precision    recall  f1-score   support

   IEEEannot       0.31      0.04      0.07     20957
    IEEEtran       0.25      0.73      0.37     21931
   IEEEtranN       0.43      0.41      0.42     21967
   IEEEtranS       0.34      0.01      0.01     21680
  IEEEtranSA       0.73      0.80      0.77     22415
  IEEEtranSN       0.36      0.57      0.44     22108
      JHEP-2       0.79      0.58      0.67     21360
  aaai-named       0.91      0.90      0.91     20922
    abstract       0.75      0.00      0.00     21111
    acmtrans       0.71      0.76      0.74     21473
      aichej       0.96      0.91      0.94     20369
         aip       0.16      0.00      0.01     21412
    alphanum       0.35      0.88      0.50     20977
         ama       0.96      0.90      0.93     20961
    amsalpha       0.97      0.92      0.94     16073
    amsplain       0.96      0.83      0.89     16271
    annotate       0.51      0.04      0.08     21166
  annotation       0.97    

In [21]:
print_metrics(sgd_clf_1, X_test, y_test, _average=None)

accuracy = 0.6365399183623635
precision:  min =  0.0 max =  1.0 mean =  0.6484155952821647 median =  0.723570564864398
recall:  min =  0.0 max =  0.986599586232838 mean =  0.6283646477473733 median =  0.7510262529832935
f1_score:  min =  0.0 max =  0.9863568634755074 mean =  0.6173944345170069 median =  0.6971282255038298


In [10]:
filename = './SGDClassifier_2grams.sav'
joblib.dump(sgd_clf_1, filename)

['./SGDClassifier_2grams.sav']

In [12]:
loaded_clf = joblib.load('./SGDClassifier_2grams.sav')
loaded_clf.score(X_test, y_test)

0.6455904523101013

# Наивный Байесовский алгоритм на новом датасете

In [28]:
nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)

GaussianNB()

In [30]:
print(classification_report(y_test, nb_clf.predict(X_test), zero_division=0))

              precision    recall  f1-score   support

   IEEEannot       0.17      0.01      0.01     20807
    IEEEtran       0.23      0.56      0.32     21899
   IEEEtranN       0.46      0.53      0.49     21766
   IEEEtranS       0.22      0.00      0.01     21727
  IEEEtranSA       0.48      0.43      0.46     22438
  IEEEtranSN       0.41      0.02      0.03     22245
      JHEP-2       0.75      0.30      0.43     21299
  aaai-named       0.60      0.26      0.37     20864
    abstract       0.39      0.10      0.15     20966
    acmtrans       0.51      0.17      0.26     21347
      aichej       0.89      0.44      0.59     20229
         aip       0.20      0.06      0.09     21472
    alphanum       0.40      0.33      0.36     20913
         ama       0.86      0.69      0.77     20797
    amsalpha       0.74      0.58      0.65     16031
    amsplain       0.64      0.68      0.66     16277
    annotate       0.28      0.08      0.12     21378
  annotation       0.88    

In [25]:
print_metrics(nb_clf, X_test, y_test, _average=None)

accuracy = 0.5512366353750142
precision:  min =  0.10247349823321555 max =  0.9970263381478335 mean =  0.5722132801623461 median =  0.5541376455116653
recall:  min =  0.04753208054981497 max =  0.9693084500767288 mean =  0.5548585606161551 median =  0.5698594245981761
f1_score:  min =  0.08279961488551217 max =  0.9730892381394668 mean =  0.5415205346974971 median =  0.582771258739105


# Градиентный бустинг на новом датасете

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2

In [14]:
train_dataset = catboost.Pool(data=X_train, label=y_train)
test_data = catboost.Pool(data=X_test, label=y_test)

In [6]:
def gradient_boosting(_iterations: int, _learning_rate: float, _depth: int):
    boost_model = catboost.CatBoostClassifier(iterations=_iterations,
                           learning_rate=_learning_rate,
                           depth=_depth,
                           loss_function='MultiClass',
#                            task_type='GPU',
#                            gpu_ram_part=0.75,
                           eval_metric='Accuracy')
    time_start = time.time()
    boost_model.fit(train_dataset,eval_set=(X_val, y_val))
    time_finish = time.time()
    print(time_finish - time_start)
    return boost_model

# boost_model_1 = gradient_boosting(250, 0.05, 6)
# print("SUCCESS")

In [None]:
boost_model_1.get_best_score()

In [None]:
print_metrics(boost_model_1, X_test, y_test, _average=None)

In [19]:
boost_model_1.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,[ num,15.665995
1,( year,11.726335
2,) .,7.452743
3,". ,",5.989705
4,num ],5.017204
...,...,...
457,— num,0.000000
458,— othword,0.000000
459,— smallet,0.000000
460,— upword,0.000000


In [None]:
metrics = boost_model_1.eval_metrics(
    data=train_dataset,
    metrics=['Precision','Recall','F1','Accuracy'],
    ntree_start=0,
    ntree_end=0,
    eval_period=1,
    plot=True
)

In [None]:
filename = './CatBoostClassifier_2grams_v2.sav'
joblib.dump(boost_model_1, filename)

In [12]:
loaded_catboost_model = joblib.load('./CatBoostClassifier_2grams.sav')
# print_metrics(loaded_catboost_model, X_test, y_test, _average=None)
fi = loaded_catboost_model.feature_importances_
f = open('catboost_2grams_feature_importances.txt', 'w')
f.write(fi)
f.close()

AttributeError: 'SGDClassifier' object has no attribute 'feature_importances_'

# Random forest на новом датасете

In [35]:

for _n_estimators in range(20, 51, 10):
    for _max_depth in range(10, 13):
        rf_clf = RandomForestClassifier(criterion='entropy', n_estimators=_n_estimators, max_depth=_max_depth, n_jobs=-1)
        rf_clf.fit(X_train, y_train)
        print(_n_estimators, _max_depth)
        print(rf_clf.score(X_test, y_test))
        print_metrics(rf_clf, X_test, y_test, _average=None)
        print('___________________________')

20 8
0.6121901870259536
accuracy = 0.6121901870259536
precision:  min =  0.0 max =  0.9922599488864549 mean =  0.5980777286744221 median =  0.6239096724171351
recall:  min =  0.0 max =  0.9895473061255861 mean =  0.5889736611021662 median =  0.6566919076726576
f1_score:  min =  0.0 max =  0.9842465505377902 mean =  0.5712248490553617 median =  0.5860415116182917
___________________________
20 9
0.6254969866607407
accuracy = 0.6254969866607407
precision:  min =  0.0 max =  0.9902464065708418 mean =  0.6207565322099907 median =  0.624644315229494
recall:  min =  0.0 max =  0.9809168519979913 mean =  0.6032118706864673 median =  0.6765921670286555
f1_score:  min =  0.0 max =  0.9801117805037382 mean =  0.5878706135994055 median =  0.6192770198212775
___________________________
20 10
0.6454280556408662
accuracy = 0.6454280556408662
precision:  min =  0.0 max =  0.9898599358039101 mean =  0.6252949321279772 median =  0.6232141723284018
recall:  min =  0.0 max =  0.9942031059901238 mean =  0

In [10]:
rf_clf_2 = RandomForestClassifier(criterion='entropy', n_estimators=200, max_depth=18, n_jobs=-1)
rf_clf_2.fit(X_train, y_train)
rf_clf_2.score(X_test, y_test)

0.6972223963884883

In [None]:
print(classification_report(y_test, rf_clf.predict(X_test)))

In [9]:
print_metrics(rf_clf, X_test, y_test, _average=None)

NameError: name 'print_metrics' is not defined

# Обработка отдельными батчами


In [4]:
chunksize = 6 * 10**5
i = 0
with pd.read_csv('2grams_bib_data.csv', chunksize=chunksize) as reader:
    for chunk in reader:        
        X = chunk.drop(['style_name'], axis=1)
        y = chunk.style_name
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)
        train_dataset = catboost.Pool(data=X_train, label=y_train)
        test_data = catboost.Pool(data=X_test, label=y_test)
        
        if i == 0:
            boost_model = catboost.CatBoostClassifier(iterations=250,
                           learning_rate=0.05,
                           depth=6,
                            task_type='GPU',
                                                      
                           loss_function='MultiClass',
                           eval_metric='Accuracy')
            time_start = time.time()
            boost_model.fit(train_dataset, eval_set=(X_val, y_val))
            time_finish = time.time()
            print(time_finish - time_start)
        else:
            time_start = time.time()
            boost_model.fit(train_dataset, eval_set=(X_val, y_val), init_model='cat_boost_model.cbm')
            time_finish = time.time()
            print(time_finish - time_start)
        print(boost_model.get_best_score())
        boost_model.save_model('cat_boost_model.cbm')
        i += 1
        del X, y, X_train, X_test, y_train, y_test, X_val, y_val, train_dataset, test_data

KeyboardInterrupt: 

# Градиентный бустинг (2-граммы) на GPU


In [5]:
df = pd.read_csv('2grams_bib_data.csv')
X = df.drop(['style_name'], axis=1)
y = df.style_name
del df
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)
train_dataset = catboost.Pool(data=X_train, label=y_train)
test_data = catboost.Pool(data=X_test, label=y_test)





0:	learn: 0.1694692	test: 0.1699597	best: 0.1699597 (0)	total: 576ms	remaining: 2m 23s
1:	learn: 0.2919545	test: 0.2919556	best: 0.2919556 (1)	total: 1.13s	remaining: 2m 20s
2:	learn: 0.3427248	test: 0.3420284	best: 0.3420284 (2)	total: 1.67s	remaining: 2m 17s
3:	learn: 0.3530561	test: 0.3522557	best: 0.3522557 (3)	total: 2.21s	remaining: 2m 15s
4:	learn: 0.3647705	test: 0.3641484	best: 0.3641484 (4)	total: 2.79s	remaining: 2m 16s
5:	learn: 0.3886175	test: 0.3883591	best: 0.3883591 (5)	total: 4.02s	remaining: 2m 43s
6:	learn: 0.4008640	test: 0.4006156	best: 0.4006156 (6)	total: 5.27s	remaining: 3m 3s
7:	learn: 0.3993489	test: 0.3991066	best: 0.4006156 (6)	total: 6.48s	remaining: 3m 16s
8:	learn: 0.3992793	test: 0.3990384	best: 0.4006156 (6)	total: 7.66s	remaining: 3m 25s
9:	learn: 0.4323349	test: 0.4325284	best: 0.4325284 (9)	total: 8.3s	remaining: 3m 19s
10:	learn: 0.4363022	test: 0.4365451	best: 0.4365451 (10)	total: 8.87s	remaining: 3m 12s
11:	learn: 0.4472045	test: 0.4472435	best: 

93:	learn: 0.6945888	test: 0.6953028	best: 0.6953028 (93)	total: 1m 58s	remaining: 3m 17s
94:	learn: 0.6963281	test: 0.6970081	best: 0.6970081 (94)	total: 2m	remaining: 3m 16s
95:	learn: 0.6967267	test: 0.6974226	best: 0.6974226 (95)	total: 2m 1s	remaining: 3m 15s
96:	learn: 0.6979935	test: 0.6987202	best: 0.6987202 (96)	total: 2m 3s	remaining: 3m 14s
97:	learn: 0.6997893	test: 0.7003115	best: 0.7003115 (97)	total: 2m 4s	remaining: 3m 12s
98:	learn: 0.7015954	test: 0.7020793	best: 0.7020793 (98)	total: 2m 5s	remaining: 3m 12s
99:	learn: 0.7022424	test: 0.7026686	best: 0.7026686 (99)	total: 2m 7s	remaining: 3m 10s
100:	learn: 0.7035336	test: 0.7039678	best: 0.7039678 (100)	total: 2m 8s	remaining: 3m 9s
101:	learn: 0.7039675	test: 0.7047635	best: 0.7047635 (101)	total: 2m 10s	remaining: 3m 8s
102:	learn: 0.7046627	test: 0.7053977	best: 0.7053977 (102)	total: 2m 11s	remaining: 3m 7s
103:	learn: 0.7052922	test: 0.7060852	best: 0.7060852 (103)	total: 2m 12s	remaining: 3m 6s
104:	learn: 0.70

183:	learn: 0.7529910	test: 0.7536128	best: 0.7536128 (183)	total: 4m 3s	remaining: 1m 27s
184:	learn: 0.7531486	test: 0.7537726	best: 0.7537726 (184)	total: 4m 4s	remaining: 1m 26s
185:	learn: 0.7538683	test: 0.7544534	best: 0.7544534 (185)	total: 4m 6s	remaining: 1m 24s
186:	learn: 0.7541593	test: 0.7547464	best: 0.7547464 (186)	total: 4m 7s	remaining: 1m 23s
187:	learn: 0.7545166	test: 0.7551767	best: 0.7551767 (187)	total: 4m 9s	remaining: 1m 22s
188:	learn: 0.7549760	test: 0.7555687	best: 0.7555687 (188)	total: 4m 10s	remaining: 1m 20s
189:	learn: 0.7558261	test: 0.7564043	best: 0.7564043 (189)	total: 4m 11s	remaining: 1m 19s
190:	learn: 0.7560722	test: 0.7566324	best: 0.7566324 (190)	total: 4m 13s	remaining: 1m 18s
191:	learn: 0.7563127	test: 0.7568305	best: 0.7568305 (191)	total: 4m 14s	remaining: 1m 16s
192:	learn: 0.7575298	test: 0.7580198	best: 0.7580198 (192)	total: 4m 15s	remaining: 1m 15s
193:	learn: 0.7576056	test: 0.7580931	best: 0.7580931 (193)	total: 4m 17s	remaining: 

In [10]:
boost_model = catboost.CatBoostClassifier(iterations=1000,
                           learning_rate=0.05,
                           depth=4,
                           task_type='GPU',                                                
                           loss_function='MultiClass',
                           eval_metric='Accuracy')
time_start = time.time()
boost_model.fit(train_dataset, eval_set=(X_val, y_val))
time_finish = time.time()
print(time_finish - time_start)



0:	learn: 0.0926543	test: 0.0922787	best: 0.0922787 (0)	total: 1.21s	remaining: 20m 13s
1:	learn: 0.2339866	test: 0.2337820	best: 0.2337820 (1)	total: 2.35s	remaining: 19m 35s
2:	learn: 0.2760943	test: 0.2755518	best: 0.2755518 (2)	total: 3.51s	remaining: 19m 27s
3:	learn: 0.2944892	test: 0.2939839	best: 0.2939839 (3)	total: 4.67s	remaining: 19m 23s
4:	learn: 0.3077829	test: 0.3071767	best: 0.3071767 (4)	total: 5.81s	remaining: 19m 16s
5:	learn: 0.3402318	test: 0.3400658	best: 0.3400658 (5)	total: 7s	remaining: 19m 20s
6:	learn: 0.3439455	test: 0.3436522	best: 0.3436522 (6)	total: 8.17s	remaining: 19m 19s
7:	learn: 0.3490816	test: 0.3488857	best: 0.3488857 (7)	total: 9.35s	remaining: 19m 19s
8:	learn: 0.3491785	test: 0.3489539	best: 0.3489539 (8)	total: 10.5s	remaining: 19m 17s
9:	learn: 0.3493557	test: 0.3491320	best: 0.3491320 (9)	total: 11.6s	remaining: 19m 9s
10:	learn: 0.3524616	test: 0.3522881	best: 0.3522881 (10)	total: 12.7s	remaining: 19m 4s
11:	learn: 0.3595975	test: 0.359427

92:	learn: 0.6283786	test: 0.6280563	best: 0.6280563 (92)	total: 1m 47s	remaining: 17m 24s
93:	learn: 0.6284022	test: 0.6280896	best: 0.6280896 (93)	total: 1m 48s	remaining: 17m 22s
94:	learn: 0.6292134	test: 0.6288553	best: 0.6288553 (94)	total: 1m 49s	remaining: 17m 21s
95:	learn: 0.6291282	test: 0.6287546	best: 0.6288553 (94)	total: 1m 50s	remaining: 17m 20s
96:	learn: 0.6305856	test: 0.6302652	best: 0.6302652 (96)	total: 1m 51s	remaining: 17m 19s
97:	learn: 0.6307448	test: 0.6303701	best: 0.6303701 (97)	total: 1m 52s	remaining: 17m 18s
98:	learn: 0.6327246	test: 0.6324292	best: 0.6324292 (98)	total: 1m 54s	remaining: 17m 18s
99:	learn: 0.6368361	test: 0.6363910	best: 0.6363910 (99)	total: 1m 55s	remaining: 17m 17s
100:	learn: 0.6368825	test: 0.6364609	best: 0.6364609 (100)	total: 1m 56s	remaining: 17m 15s
101:	learn: 0.6378626	test: 0.6375528	best: 0.6375528 (101)	total: 1m 57s	remaining: 17m 14s
102:	learn: 0.6388287	test: 0.6384917	best: 0.6384917 (102)	total: 1m 58s	remaining: 1

181:	learn: 0.7117209	test: 0.7122475	best: 0.7122475 (181)	total: 3m 30s	remaining: 15m 47s
182:	learn: 0.7124258	test: 0.7129708	best: 0.7129708 (182)	total: 3m 31s	remaining: 15m 46s
183:	learn: 0.7126864	test: 0.7131864	best: 0.7131864 (183)	total: 3m 33s	remaining: 15m 45s
184:	learn: 0.7132970	test: 0.7137407	best: 0.7137407 (184)	total: 3m 34s	remaining: 15m 44s
185:	learn: 0.7133919	test: 0.7138364	best: 0.7138364 (185)	total: 3m 35s	remaining: 15m 42s
186:	learn: 0.7135414	test: 0.7140370	best: 0.7140370 (186)	total: 3m 36s	remaining: 15m 41s
187:	learn: 0.7140524	test: 0.7145721	best: 0.7145721 (187)	total: 3m 37s	remaining: 15m 40s
188:	learn: 0.7144142	test: 0.7149433	best: 0.7149433 (188)	total: 3m 38s	remaining: 15m 39s
189:	learn: 0.7150265	test: 0.7154244	best: 0.7154244 (189)	total: 3m 40s	remaining: 15m 38s
190:	learn: 0.7154912	test: 0.7159721	best: 0.7159721 (190)	total: 3m 41s	remaining: 15m 37s
191:	learn: 0.7157412	test: 0.7162334	best: 0.7162334 (191)	total: 3m 

270:	learn: 0.7393180	test: 0.7400705	best: 0.7400946 (269)	total: 4m 59s	remaining: 13m 26s
271:	learn: 0.7393887	test: 0.7401004	best: 0.7401004 (271)	total: 5m 1s	remaining: 13m 25s
272:	learn: 0.7398365	test: 0.7406115	best: 0.7406115 (272)	total: 5m 2s	remaining: 13m 24s
273:	learn: 0.7403240	test: 0.7410850	best: 0.7410850 (273)	total: 5m 3s	remaining: 13m 23s
274:	learn: 0.7408175	test: 0.7415195	best: 0.7415195 (274)	total: 5m 4s	remaining: 13m 22s
275:	learn: 0.7411726	test: 0.7418924	best: 0.7418924 (275)	total: 5m 5s	remaining: 13m 21s
276:	learn: 0.7413716	test: 0.7420738	best: 0.7420738 (276)	total: 5m 6s	remaining: 13m 20s
277:	learn: 0.7413993	test: 0.7421487	best: 0.7421487 (277)	total: 5m 7s	remaining: 13m 19s
278:	learn: 0.7422765	test: 0.7429985	best: 0.7429985 (278)	total: 5m 9s	remaining: 13m 18s
279:	learn: 0.7426572	test: 0.7433248	best: 0.7433248 (279)	total: 5m 10s	remaining: 13m 17s
280:	learn: 0.7429796	test: 0.7435977	best: 0.7435977 (280)	total: 5m 11s	rema

359:	learn: 0.7599820	test: 0.7607747	best: 0.7607747 (359)	total: 6m 42s	remaining: 11m 55s
360:	learn: 0.7604784	test: 0.7612558	best: 0.7612558 (360)	total: 6m 43s	remaining: 11m 54s
361:	learn: 0.7604820	test: 0.7612308	best: 0.7612558 (360)	total: 6m 44s	remaining: 11m 53s
362:	learn: 0.7606326	test: 0.7613956	best: 0.7613956 (362)	total: 6m 46s	remaining: 11m 52s
363:	learn: 0.7606903	test: 0.7614472	best: 0.7614472 (363)	total: 6m 47s	remaining: 11m 51s
364:	learn: 0.7608698	test: 0.7616262	best: 0.7616262 (364)	total: 6m 48s	remaining: 11m 50s
365:	learn: 0.7611609	test: 0.7619125	best: 0.7619125 (365)	total: 6m 49s	remaining: 11m 49s
366:	learn: 0.7611778	test: 0.7618983	best: 0.7619125 (365)	total: 6m 50s	remaining: 11m 48s
367:	learn: 0.7619074	test: 0.7625891	best: 0.7625891 (367)	total: 6m 51s	remaining: 11m 47s
368:	learn: 0.7619568	test: 0.7626599	best: 0.7626599 (368)	total: 6m 52s	remaining: 11m 46s
369:	learn: 0.7623913	test: 0.7630461	best: 0.7630461 (369)	total: 6m 

448:	learn: 0.7782566	test: 0.7788514	best: 0.7788514 (448)	total: 8m 25s	remaining: 10m 20s
449:	learn: 0.7783967	test: 0.7789671	best: 0.7789671 (449)	total: 8m 26s	remaining: 10m 18s
450:	learn: 0.7786816	test: 0.7791727	best: 0.7791727 (450)	total: 8m 27s	remaining: 10m 17s
451:	learn: 0.7787620	test: 0.7792526	best: 0.7792526 (451)	total: 8m 28s	remaining: 10m 16s
452:	learn: 0.7788958	test: 0.7793825	best: 0.7793825 (452)	total: 8m 29s	remaining: 10m 15s
453:	learn: 0.7790092	test: 0.7794873	best: 0.7794873 (453)	total: 8m 31s	remaining: 10m 14s
454:	learn: 0.7790961	test: 0.7795456	best: 0.7795456 (454)	total: 8m 32s	remaining: 10m 13s
455:	learn: 0.7791382	test: 0.7796238	best: 0.7796238 (455)	total: 8m 33s	remaining: 10m 12s
456:	learn: 0.7795641	test: 0.7800583	best: 0.7800583 (456)	total: 8m 34s	remaining: 10m 11s
457:	learn: 0.7796967	test: 0.7802031	best: 0.7802031 (457)	total: 8m 35s	remaining: 10m 10s
458:	learn: 0.7799051	test: 0.7803429	best: 0.7803429 (458)	total: 8m 

538:	learn: 0.7924431	test: 0.7927750	best: 0.7927750 (538)	total: 9m 58s	remaining: 8m 31s
539:	learn: 0.7925091	test: 0.7928499	best: 0.7928499 (539)	total: 9m 59s	remaining: 8m 30s
540:	learn: 0.7927719	test: 0.7931037	best: 0.7931037 (540)	total: 10m	remaining: 8m 29s
541:	learn: 0.7928853	test: 0.7932036	best: 0.7932036 (541)	total: 10m 1s	remaining: 8m 28s
542:	learn: 0.7930451	test: 0.7933601	best: 0.7933601 (542)	total: 10m 2s	remaining: 8m 27s
543:	learn: 0.7931456	test: 0.7934816	best: 0.7934816 (543)	total: 10m 3s	remaining: 8m 26s
544:	learn: 0.7933617	test: 0.7937005	best: 0.7937005 (544)	total: 10m 4s	remaining: 8m 24s
545:	learn: 0.7934707	test: 0.7938062	best: 0.7938062 (545)	total: 10m 4s	remaining: 8m 22s
546:	learn: 0.7935096	test: 0.7938736	best: 0.7938736 (546)	total: 10m 5s	remaining: 8m 21s
547:	learn: 0.7935287	test: 0.7938861	best: 0.7938861 (547)	total: 10m 5s	remaining: 8m 19s
548:	learn: 0.7934999	test: 0.7938636	best: 0.7938861 (547)	total: 10m 6s	remaining

627:	learn: 0.8020581	test: 0.8023090	best: 0.8023090 (627)	total: 11m 36s	remaining: 6m 52s
628:	learn: 0.8022934	test: 0.8024837	best: 0.8024837 (628)	total: 11m 38s	remaining: 6m 51s
629:	learn: 0.8023361	test: 0.8025353	best: 0.8025353 (629)	total: 11m 39s	remaining: 6m 50s
630:	learn: 0.8023802	test: 0.8025944	best: 0.8025944 (630)	total: 11m 40s	remaining: 6m 49s
631:	learn: 0.8024057	test: 0.8026411	best: 0.8026411 (631)	total: 11m 41s	remaining: 6m 48s
632:	learn: 0.8023966	test: 0.8026560	best: 0.8026560 (632)	total: 11m 42s	remaining: 6m 47s
633:	learn: 0.8025286	test: 0.8027584	best: 0.8027584 (633)	total: 11m 43s	remaining: 6m 46s
634:	learn: 0.8025447	test: 0.8027967	best: 0.8027967 (634)	total: 11m 44s	remaining: 6m 45s
635:	learn: 0.8027056	test: 0.8029473	best: 0.8029473 (635)	total: 11m 46s	remaining: 6m 44s
636:	learn: 0.8027991	test: 0.8029998	best: 0.8029998 (636)	total: 11m 47s	remaining: 6m 43s
637:	learn: 0.8028973	test: 0.8030947	best: 0.8030947 (637)	total: 11m

716:	learn: 0.8123603	test: 0.8124023	best: 0.8124023 (716)	total: 13m 19s	remaining: 5m 15s
717:	learn: 0.8123770	test: 0.8123806	best: 0.8124023 (716)	total: 13m 21s	remaining: 5m 14s
718:	learn: 0.8125110	test: 0.8125820	best: 0.8125820 (718)	total: 13m 22s	remaining: 5m 13s
719:	learn: 0.8126128	test: 0.8127110	best: 0.8127110 (719)	total: 13m 23s	remaining: 5m 12s
720:	learn: 0.8126269	test: 0.8127219	best: 0.8127219 (720)	total: 13m 24s	remaining: 5m 11s
721:	learn: 0.8130764	test: 0.8131480	best: 0.8131480 (721)	total: 13m 25s	remaining: 5m 10s
722:	learn: 0.8131785	test: 0.8132262	best: 0.8132262 (722)	total: 13m 26s	remaining: 5m 9s
723:	learn: 0.8131543	test: 0.8132088	best: 0.8132262 (722)	total: 13m 28s	remaining: 5m 8s
724:	learn: 0.8132858	test: 0.8133519	best: 0.8133519 (724)	total: 13m 29s	remaining: 5m 6s
725:	learn: 0.8134700	test: 0.8135334	best: 0.8135334 (725)	total: 13m 30s	remaining: 5m 5s
726:	learn: 0.8136196	test: 0.8136665	best: 0.8136665 (726)	total: 13m 31s

805:	learn: 0.8221701	test: 0.8222866	best: 0.8222866 (805)	total: 14m 59s	remaining: 3m 36s
806:	learn: 0.8221620	test: 0.8222767	best: 0.8222866 (805)	total: 14m 59s	remaining: 3m 35s
807:	learn: 0.8224273	test: 0.8224980	best: 0.8224980 (807)	total: 15m	remaining: 3m 33s
808:	learn: 0.8225646	test: 0.8226670	best: 0.8226670 (808)	total: 15m	remaining: 3m 32s
809:	learn: 0.8227563	test: 0.8228068	best: 0.8228068 (809)	total: 15m 1s	remaining: 3m 31s
810:	learn: 0.8227657	test: 0.8228102	best: 0.8228102 (810)	total: 15m 3s	remaining: 3m 30s
811:	learn: 0.8228465	test: 0.8228709	best: 0.8228709 (811)	total: 15m 4s	remaining: 3m 29s
812:	learn: 0.8229111	test: 0.8229333	best: 0.8229333 (812)	total: 15m 4s	remaining: 3m 28s
813:	learn: 0.8230237	test: 0.8230041	best: 0.8230041 (813)	total: 15m 6s	remaining: 3m 27s
814:	learn: 0.8230559	test: 0.8230532	best: 0.8230532 (814)	total: 15m 6s	remaining: 3m 25s
815:	learn: 0.8231713	test: 0.8231739	best: 0.8231739 (815)	total: 15m 7s	remaining:

894:	learn: 0.8296314	test: 0.8295418	best: 0.8295418 (894)	total: 16m 31s	remaining: 1m 56s
895:	learn: 0.8296683	test: 0.8295659	best: 0.8295659 (895)	total: 16m 32s	remaining: 1m 55s
896:	learn: 0.8298045	test: 0.8297332	best: 0.8297332 (896)	total: 16m 33s	remaining: 1m 54s
897:	learn: 0.8300012	test: 0.8298622	best: 0.8298622 (897)	total: 16m 35s	remaining: 1m 53s
898:	learn: 0.8300184	test: 0.8298747	best: 0.8298747 (898)	total: 16m 36s	remaining: 1m 51s
899:	learn: 0.8300400	test: 0.8299122	best: 0.8299122 (899)	total: 16m 37s	remaining: 1m 50s
900:	learn: 0.8301471	test: 0.8299729	best: 0.8299729 (900)	total: 16m 38s	remaining: 1m 49s
901:	learn: 0.8302830	test: 0.8301402	best: 0.8301402 (901)	total: 16m 39s	remaining: 1m 48s
902:	learn: 0.8304279	test: 0.8303000	best: 0.8303000 (902)	total: 16m 40s	remaining: 1m 47s
903:	learn: 0.8304503	test: 0.8303258	best: 0.8303258 (903)	total: 16m 41s	remaining: 1m 46s
904:	learn: 0.8304903	test: 0.8303466	best: 0.8303466 (904)	total: 16m

983:	learn: 0.8366352	test: 0.8364657	best: 0.8365165 (982)	total: 18m 13s	remaining: 17.8s
984:	learn: 0.8368080	test: 0.8366654	best: 0.8366654 (984)	total: 18m 14s	remaining: 16.7s
985:	learn: 0.8368854	test: 0.8367270	best: 0.8367270 (985)	total: 18m 16s	remaining: 15.6s
986:	learn: 0.8371626	test: 0.8370350	best: 0.8370350 (986)	total: 18m 17s	remaining: 14.4s
987:	learn: 0.8372802	test: 0.8371948	best: 0.8371948 (987)	total: 18m 18s	remaining: 13.3s
988:	learn: 0.8377715	test: 0.8377782	best: 0.8377782 (988)	total: 18m 19s	remaining: 12.2s
989:	learn: 0.8378059	test: 0.8378107	best: 0.8378107 (989)	total: 18m 20s	remaining: 11.1s
990:	learn: 0.8377773	test: 0.8377907	best: 0.8378107 (989)	total: 18m 21s	remaining: 10s
991:	learn: 0.8378353	test: 0.8378573	best: 0.8378573 (991)	total: 18m 22s	remaining: 8.89s
992:	learn: 0.8378470	test: 0.8378789	best: 0.8378789 (992)	total: 18m 23s	remaining: 7.78s
993:	learn: 0.8378634	test: 0.8378906	best: 0.8378906 (993)	total: 18m 24s	remaini

In [11]:
joblib.dump(boost_model, 'CatBoostModel_2grams_v2.sav')
boost_model.save_model('CatBoostModel_2grams_v2.cbm')

In [14]:
print(classification_report(y_test, boost_model.predict(X_test)))

              precision    recall  f1-score   support

   IEEEannot       0.68      0.57      0.62     13766
    IEEEtran       0.70      0.64      0.67     14624
   IEEEtranN       0.77      0.84      0.81     14765
   IEEEtranS       0.63      0.68      0.65     14383
  IEEEtranSA       0.92      0.92      0.92     14937
  IEEEtranSN       0.85      0.73      0.78     14727
      JHEP-2       0.82      0.83      0.83     14364
  aaai-named       0.95      0.97      0.96     14087
    abstract       0.99      0.96      0.98     13930
    acmtrans       0.88      0.88      0.88     14318
      aichej       0.98      0.97      0.98     13565
         aip       0.59      0.59      0.59     14162
    alphanum       0.71      0.77      0.74     14081
         ama       0.95      0.96      0.95     13835
    amsalpha       0.98      0.97      0.98     10881
    amsplain       0.95      0.95      0.95     10898
    annotate       0.75      0.69      0.72     14175
  annotation       0.99    