In [1]:
!pip install catboost



In [1]:
import joblib # для сериализации и сохранения обученной модели в файл
# (см. https://machinelearningmastery.ru/save-load-machine-learning-models-python-scikit-learn/)
import numpy as np
import pandas as pd
import catboost
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, log_loss, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier, SGDRegressor, LogisticRegression, LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB, MultinomialNB, CategoricalNB
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer

In [None]:
data_frame = pd.read_csv('drive/MyDrive/bib_data_union_v2.csv.zip', compression='zip')

In [None]:
data_frame.shape

(6007277, 23)

In [None]:
data_frame.head()

Unnamed: 0,square_brackets,round_brackets,slashes,inverse_slashes,quotes,dots,commas,semicolons,colons,abstract,...,begin_ref,tirets,key,annotation,capital_letters,years,sine,et_al,etc,style_name
0,1,1,1,0,0,5,10,0,0,0,...,1,0,0,0,0.552632,2,0,1,0,iaea
1,1,1,5,0,0,11,3,0,2,0,...,3,0,0,0,0.647059,9,0,0,0,bestpapers
2,1,2,5,0,0,18,2,0,3,0,...,3,0,0,0,0.647059,2,0,0,0,bestpapers
3,1,2,5,0,0,18,2,0,4,0,...,3,0,0,0,0.454545,2,0,0,0,bestpapers
4,1,2,5,0,0,17,2,0,3,0,...,3,0,0,0,0.588235,3,0,0,0,bestpapers


In [None]:
X = data_frame.drop(['style_name'], axis=1)
y = data_frame.style_name

In [None]:
X = pd.get_dummies(X, columns=['begin_ref', 'abstract', 'key', 'annotation', 'sine', 'et_al', 'etc', ])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [2]:
def print_metrics(classifier, test_data, test_answers, _average=None):
    print(f"accuracy = {classifier.score(test_data, test_answers)}")
    precisions = precision_score(test_answers, classifier.predict(test_data), average=_average, zero_division=0)
    recalls = recall_score(test_answers, classifier.predict(test_data), average=_average, zero_division=0)
    f1_scores = f1_score(test_answers, classifier.predict(test_data), average=_average, zero_division=0)
    print("precision: ", "min = ", precisions.min(), "max = ", precisions.max(), "mean = ", np.mean(precisions), "median = ", np.median(precisions))
    print("recall: ", "min = ", recalls.min(), "max = ", recalls.max(), "mean = ", np.mean(recalls), "median = ", np.median(recalls))
    print("f1_score: ", "min = ", f1_scores.min(), "max = ", f1_scores.max(), "mean = ", np.mean(f1_scores), "median = ", np.median(f1_scores))

# Random Forest

In [None]:
clf = RandomForestClassifier(criterion='entropy', n_estimators=40, max_depth=11, n_jobs=-1)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.5389785393722284

In [None]:
print(classification_report(y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

   IEEEannot       0.30      0.01      0.03     13829
    IEEEtran       0.34      0.58      0.42     14633
   IEEEtranN       0.49      0.52      0.51     14815
   IEEEtranS       0.35      0.12      0.18     14399
  IEEEtranSA       0.76      0.87      0.81     14944
  IEEEtranSN       0.48      0.33      0.39     14934
      JHEP-2       0.28      0.01      0.03     14299
  aaai-named       0.98      0.79      0.88     13986
    abstract       0.76      0.94      0.84     14027
    acmtrans       0.57      0.54      0.56     14271
      aichej       0.65      0.68      0.66     13692
         aip       0.28      0.01      0.02     14315
    alphanum       0.62      0.88      0.73     13942
         ama       0.66      0.47      0.55     13918
    amsalpha       0.86      0.88      0.87     10774
    amsplain       0.34      0.62      0.44     10792
    annotate       0.79      0.80      0.79     14295
  annotation       1.00    

In [None]:
precision_score(y_test, clf.predict(X_test), average=None)

In [None]:
clf.feature_importances_ 

array([2.57334071e-02, 1.40270769e-01, 8.54970224e-02, 3.52857883e-06,
       6.35333558e-02, 4.53995573e-02, 2.30159064e-02, 3.46565340e-02,
       5.66727065e-02, 2.92628458e-03, 1.62570014e-02, 6.35904016e-02,
       2.84581430e-02, 3.16135388e-01, 1.95883331e-02, 9.04565353e-03,
       2.84745424e-06, 2.29252288e-02, 6.97758733e-03, 1.41364037e-03,
       3.78955668e-02, 1.13740366e-06])

In [None]:
y_list = list(y)
y_set = set(y_list)
for style in y_set:
  print(style, y_list.count(style))

In [None]:
precisions = precision_score(y_test, clf.predict(X_test), average=None)
recalls = recall_score(y_test, clf.predict(X_test), average=None)
f1_scores = f1_score(y_test, clf.predict(X_test), average=None)


In [None]:
print("precision: ", "min = ", precisions.min(), "max = ", precisions.max(), "mean = ", np.mean(precisions), "median = ", np.median(precisions))
print("recall: ", "min = ", recalls.min(), "max = ", recalls.max(), "mean = ", np.mean(recalls), "median = ", np.median(recalls))
print("f1_score: ", "min = ", f1_scores.min(), "max = ", f1_scores.max(), "mean = ", np.mean(f1_scores), "median = ", np.median(f1_scores))

precision:  min =  0.11790302527973477 max =  1.0 mean =  0.5440775699164183 median =  0.49117680116893336
recall:  min =  0.007471980074719801 max =  0.9984328395935493 mean =  0.5150078729198585 median =  0.5418823529411765
f1_score:  min =  0.014519731943410276 max =  0.9793527321617209 mean =  0.49919371625718445 median =  0.4951682969354887


clf1 = RandomForestClassifier(criterion='gini', max_depth=8, n_estimators=50) ==> 46%

clf2 = RandomForestClassifier(criterion='gini', n_estimators=20, max_depth=10, n_jobs=-1) ==> 50%

clf3 = RandomForestClassifier(criterion='entropy', n_estimators=20, max_depth=10, n_jobs=-1) ==> 51.6%

clf4 = RandomForestClassifier(criterion='entropy', n_estimators=30, max_depth=10, n_jobs=-1) ==> 52%

clf5 = RandomForestClassifier(criterion='entropy', n_estimators=20, max_depth=11, n_jobs=-1) ==> 52.8%

clf6 = RandomForestClassifier(criterion='entropy', n_estimators=30, max_depth=11, n_jobs=-1) ==> 53.6%

# Градиентный бустинг

In [None]:
X = X.drop(['abstract', 'annotation', 'inverse_slashes', 'etc'], axis=1)

In [None]:
y_list = list(y)
for style in y_list:
  if 'ugost2003' in style:
    style = 'ugost2003'
  elif 'ugost2008' in style:
    style = 'ugost2008'
  elif 'IEEE' in style:
    style = 'IEEE'
  elif 'h-physrev' in style:
    style = 'h-physrev'
y = pd.Series(y_list)

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2

In [42]:
train_dataset = catboost.Pool(data=X_train, label=y_train, cat_features=[ 'begin_ref_1',
 'begin_ref_2',
 'begin_ref_3',
 'begin_ref_4',
 'begin_ref_5',
 'begin_ref_6',
 'begin_ref_7',
 'begin_ref_8',
 'begin_ref_9',
 'begin_ref_10',
 'begin_ref_11',
 'begin_ref_12',
 'abstract_0',
 'abstract_1',
 'key_0',
 'key_1',
 'annotation_0',
 'annotation_1',
 'sine_0',
 'sine_1',
 'sine_2',
 'et_al_0',
 'et_al_1',
 'et_al_2',
 'et_al_3',
 'etc_0',
 'etc_1'])
test_dataset = catboost.Pool(data=X_test, label=y_test, cat_features=[ 'begin_ref_1',
 'begin_ref_2',
 'begin_ref_3',
 'begin_ref_4',
 'begin_ref_5',
 'begin_ref_6',
 'begin_ref_7',
 'begin_ref_8',
 'begin_ref_9',
 'begin_ref_10',
 'begin_ref_11',
 'begin_ref_12',
 'abstract_0',
 'abstract_1',
 'key_0',
 'key_1',
 'annotation_0',
 'annotation_1',
 'sine_0',
 'sine_1',
 'sine_2',
 'et_al_0',
 'et_al_1',
 'et_al_2',
 'et_al_3',
 'etc_0',
 'etc_1'])

ValueError: 'begin_ref_1' is not in list

In [None]:
boost_model = catboost.CatBoostClassifier(iterations=250,
                           learning_rate=0.05,
                           depth=11,
                           loss_function='MultiClass',
                           task_type='GPU',
                           gpu_ram_part=0.95,

                           eval_metric='Accuracy',
                           cat_features=[ 'begin_ref_1',
 'begin_ref_2',
 'begin_ref_3',
 'begin_ref_4',
 'begin_ref_5',
 'begin_ref_6',
 'begin_ref_7',
 'begin_ref_8',
 'begin_ref_9',
 'begin_ref_10',
 'begin_ref_11',
 'begin_ref_12',
 'abstract_0',
 'abstract_1',
 'key_0',
 'key_1',
 'annotation_0',
 'annotation_1',
 'sine_0',
 'sine_1',
 'sine_2',
 'et_al_0',
 'et_al_1',
 'et_al_2',
 'et_al_3',
 'etc_0',
 'etc_1'])
boost_model.fit(train_dataset,eval_set=(X_val, y_val))

0:	learn: 0.3080077	test: 0.3075002	best: 0.3075002 (0)	total: 3.6s	remaining: 14m 57s
1:	learn: 0.3493131	test: 0.3488401	best: 0.3488401 (1)	total: 7.2s	remaining: 14m 52s
2:	learn: 0.3974112	test: 0.3965921	best: 0.3965921 (2)	total: 10.9s	remaining: 14m 58s
3:	learn: 0.4142674	test: 0.4132910	best: 0.4132910 (3)	total: 14.5s	remaining: 14m 52s
4:	learn: 0.4298913	test: 0.4288663	best: 0.4288663 (4)	total: 18.1s	remaining: 14m 46s
5:	learn: 0.4329104	test: 0.4317653	best: 0.4317653 (5)	total: 21.6s	remaining: 14m 40s
6:	learn: 0.4393770	test: 0.4383831	best: 0.4383831 (6)	total: 25.2s	remaining: 14m 34s
7:	learn: 0.4436465	test: 0.4426504	best: 0.4426504 (7)	total: 28.7s	remaining: 14m 28s
8:	learn: 0.4436998	test: 0.4426962	best: 0.4426962 (8)	total: 32.3s	remaining: 14m 25s
9:	learn: 0.4533373	test: 0.4521023	best: 0.4521023 (9)	total: 35.9s	remaining: 14m 22s
10:	learn: 0.4557785	test: 0.4544628	best: 0.4544628 (10)	total: 39.6s	remaining: 14m 19s
11:	learn: 0.4588930	test: 0.457

<catboost.core.CatBoostClassifier at 0x7fa82c3649a0>

In [None]:
print(classification_report(y_test, boost_model.predict(test_dataset)))

              precision    recall  f1-score   support

   IEEEannot       0.31      0.07      0.12     13829
    IEEEtran       0.33      0.49      0.39     14633
   IEEEtranN       0.48      0.55      0.51     14815
   IEEEtranS       0.31      0.19      0.24     14399
  IEEEtranSA       0.80      0.90      0.85     14944
  IEEEtranSN       0.46      0.32      0.38     14934
      JHEP-2       0.40      0.40      0.40     14299
  aaai-named       0.94      0.88      0.91     13986
    abstract       0.80      0.88      0.84     14027
    acmtrans       0.65      0.71      0.68     14271
      aichej       0.77      0.76      0.77     13692
         aip       0.29      0.26      0.27     14315
    alphanum       0.88      0.84      0.86     13942
         ama       0.60      0.66      0.63     13918
    amsalpha       0.96      0.93      0.94     10774
    amsplain       0.70      0.83      0.76     10792
    annotate       0.79      0.88      0.83     14295
  annotation       1.00    

In [None]:
boost_model.get_best_score()

{'learn': {'Accuracy': 0.6170623674350406, 'MultiClass': 1.0738434509268624},
 'validation': {'Accuracy': 0.6065057729954322,
  'MultiClass': 1.082117239416175}}

In [None]:
boost_model.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,begin_ref_1,21.717033
1,ampersands,15.680614
2,round_brackets,15.167533
3,begin_ref_3,13.150782
4,begin_ref_5,6.249451
5,dots,4.739864
6,begin_ref_11,3.814263
7,slashes,2.868023
8,colons,2.30749
9,page_ref,2.163333


In [None]:
metrics = boost_model.eval_metrics(
    data=train_dataset,
    metrics=['Precision','Recall','F1','Accuracy'],
    ntree_start=0,
    ntree_end=0,
    eval_period=1,
    plot=True
)

In [None]:
from catboost import cv

params = {
    'loss_function': 'MultiClass',
    'iterations': 250,
    'depth': 10,
    'random_seed': 0,
    'learning_rate': 0.05,
    'task_type': 'GPU',
    'gpu_ram_part': 0.95,
    'eval_metric': 'Accuracy'
}

cv_data = cv(
    params=params,
    pool=catboost.Pool(data=X, label=y),
    fold_count=5,
    shuffle=True,
    partition_random_seed=0,
    plot=True,
    stratified=True, 
    verbose=False
)

In [None]:
cv_model = catboost.CatBoostClassifier(
                           loss_function='MultiClass',
                           task_type='GPU',
                           gpu_ram_part=0.95,
                           eval_metric='Accuracy',
                          )

parameters = {
    'iterations': range(150, 351, 50),
    'depth': range(6, 14),
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2, 0.5]
}

cv_model.grid_search(parameters, X, y)

# Наивная байесовская классификация

In [None]:
precisions = precision_score(y_test, bayes_clf.predict(X_test), average=None)
recalls = recall_score(y_test, bayes_clf.predict(X_test), average=None)
f1_scores = f1_score(y_test, bayes_clf.predict(X_test), average=None)

In [None]:
print("precision: ", "min = ", precisions.min(), "max = ", precisions.max(), "mean = ", np.mean(precisions), "median = ", np.median(precisions))
print("recall: ", "min = ", recalls.min(), "max = ", recalls.max(), "mean = ", np.mean(recalls), "median = ", np.median(recalls))
print("f1_score: ", "min = ", f1_scores.min(), "max = ", f1_scores.max(), "mean = ", np.mean(f1_scores), "median = ", np.median(f1_scores))

precision:  min =  0.09152843601895734 max =  0.9041331535636246 mean =  0.3503485530217752 median =  0.34906358513121194
recall:  min =  0.00959335624284078 max =  0.9831151104595319 mean =  0.32680261035539704 median =  0.3074704491725768
f1_score:  min =  0.018126888217522657 max =  0.9419714216517318 mean =  0.31132752247121714 median =  0.2896955711369486


In [None]:
cv_bayes_clf = MultinomialNB()
parameters = {'var_smoothing': [1e-2, 5e-2, 1e-3, 5e-3, 1e-4, 5e-4, 1e-5, 5e-5, 1e-6, 5e-6, 1e-7, 5e-7]}
grid = GridSearchCV(cv_bayes_clf, parameters)
grid.fit(X_train, y_train)

In [None]:
best_clf = grid.best_estimator_
best_clf.score(X_test, y_test)

0.3669924935522677

# Линейная классификация

In [None]:
sgd_clf_1 = SGDClassifier(loss='hinge', early_stopping=True, n_jobs=-1, random_state=0)
sgd_clf_1.fit(X_train, y_train)
sgd_clf_1.score(X_test, y_test)

In [None]:
precisions = precision_score(y_test, sgd_clf_1.predict(X_test), average=None)
recalls = recall_score(y_test, sgd_clf_1.predict(X_test), average=None)
f1_scores = f1_score(y_test, sgd_clf_1.predict(X_test), average=None)


In [None]:
print("precision: ", "min = ", precisions.min(), "max = ", precisions.max(), "mean = ", np.mean(precisions), "median = ", np.median(precisions))
print("recall: ", "min = ", recalls.min(), "max = ", recalls.max(), "mean = ", np.mean(recalls), "median = ", np.median(recalls))
print("f1_score: ", "min = ", f1_scores.min(), "max = ", f1_scores.max(), "mean = ", np.mean(f1_scores), "median = ", np.median(f1_scores))

precision:  min =  0.0 max =  0.9944520765085069 mean =  0.40912341157097065 median =  0.38881491344873503
recall:  min =  0.0 max =  0.9994787223959815 mean =  0.39585435843587163 median =  0.3600458190148912
f1_score:  min =  0.0 max =  0.9946708168270137 mean =  0.36837470035717856 median =  0.3093249349582364


# Векторизация

In [2]:
text_data_frame = pd.read_csv('./bib_data_union_v3.csv.zip', compression='zip')

In [14]:
text_data_frame.shape

(6006215, 2)

In [6]:
text_data_frame.head()

Unnamed: 0,tokenized_record,style_name
0,"[ num ] sp upword , sp caplet . sp caplet . sp...",iaea
1,[ capword sp othword sp capword ( year ) ] sp ...,bestpapers
2,[ capword ( year ) ] sp caplet . sp caplet . s...,bestpapers
3,[ capword sp othword sp capword ( year ) ] sp ...,bestpapers
4,[ capword ( year ) ] sp capword sp caplet . sp...,bestpapers


In [3]:
corpus = text_data_frame.tokenized_record

In [4]:
# ngram_range=(1, 1) => униграммы, ngram_range=(2, 2) => биграммы и т.д.
vectorizer = CountVectorizer(tokenizer=lambda txt: txt.split(), ngram_range=(1, 2))
vectorized_data = vectorizer.fit_transform(corpus).toarray()
print("SUCCESS")



SUCCESS


In [2]:
# временный код
X = pd.read_csv('2grams_bib_data.csv')
y = X.style_name
X = X.drop(['style_name'], axis=1)

In [9]:
X = pd.DataFrame(data=vectorized_data, columns=vectorizer.get_feature_names_out())
y = text_data_frame.style_name

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
del X, y

# Линейная классификация на новом датасете

In [8]:
sgd_clf_1 = SGDClassifier(loss='hinge', early_stopping=False, n_jobs=-1, random_state=0)

t_start = time.time()

sgd_clf_1.fit(X_train, y_train)

t_finish = time.time()
print(t_finish - t_start)

SGDClassifier(n_jobs=-1, random_state=0)

In [9]:
print(classification_report(y_test, sgd_clf_1.predict(X_test), zero_division=0))

              precision    recall  f1-score   support

   IEEEannot       0.31      0.04      0.07     20957
    IEEEtran       0.25      0.73      0.37     21931
   IEEEtranN       0.43      0.41      0.42     21967
   IEEEtranS       0.34      0.01      0.01     21680
  IEEEtranSA       0.73      0.80      0.77     22415
  IEEEtranSN       0.36      0.57      0.44     22108
      JHEP-2       0.79      0.58      0.67     21360
  aaai-named       0.91      0.90      0.91     20922
    abstract       0.75      0.00      0.00     21111
    acmtrans       0.71      0.76      0.74     21473
      aichej       0.96      0.91      0.94     20369
         aip       0.16      0.00      0.01     21412
    alphanum       0.35      0.88      0.50     20977
         ama       0.96      0.90      0.93     20961
    amsalpha       0.97      0.92      0.94     16073
    amsplain       0.96      0.83      0.89     16271
    annotate       0.51      0.04      0.08     21166
  annotation       0.97    

In [21]:
print_metrics(sgd_clf_1, X_test, y_test, _average=None)

accuracy = 0.6365399183623635
precision:  min =  0.0 max =  1.0 mean =  0.6484155952821647 median =  0.723570564864398
recall:  min =  0.0 max =  0.986599586232838 mean =  0.6283646477473733 median =  0.7510262529832935
f1_score:  min =  0.0 max =  0.9863568634755074 mean =  0.6173944345170069 median =  0.6971282255038298


In [10]:
filename = './SGDClassifier_2grams.sav'
joblib.dump(sgd_clf_1, filename)

['./SGDClassifier_2grams.sav']

In [12]:
loaded_clf = joblib.load('./SGDClassifier_2grams.sav')
loaded_clf.score(X_test, y_test)

0.6455904523101013

# Наивный Байесовский алгоритм на новом датасете

In [9]:
nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)

GaussianNB()

In [10]:
# print(classification_report(y_test, nb_clf.predict(X_test), zero_division=0))
nb_clf.score(X_test, y_test)

0.42989296080371225

In [None]:
joblib.dump(nb_clf, 'GaussianNB_2grams.sav')

In [25]:
print_metrics(nb_clf, X_test, y_test, _average=None)

accuracy = 0.5512366353750142
precision:  min =  0.10247349823321555 max =  0.9970263381478335 mean =  0.5722132801623461 median =  0.5541376455116653
recall:  min =  0.04753208054981497 max =  0.9693084500767288 mean =  0.5548585606161551 median =  0.5698594245981761
f1_score:  min =  0.08279961488551217 max =  0.9730892381394668 mean =  0.5415205346974971 median =  0.582771258739105


# Градиентный бустинг на новом датасете

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2

In [14]:
train_dataset = catboost.Pool(data=X_train, label=y_train)
test_data = catboost.Pool(data=X_test, label=y_test)

In [6]:
def gradient_boosting(_iterations: int, _learning_rate: float, _depth: int):
    boost_model = catboost.CatBoostClassifier(iterations=_iterations,
                           learning_rate=_learning_rate,
                           depth=_depth,
                           loss_function='MultiClass',
#                            task_type='GPU',
#                            gpu_ram_part=0.75,
                           eval_metric='Accuracy')
    time_start = time.time()
    boost_model.fit(train_dataset,eval_set=(X_val, y_val))
    time_finish = time.time()
    print(time_finish - time_start)
    return boost_model

# boost_model_1 = gradient_boosting(250, 0.05, 6)
# print("SUCCESS")

In [None]:
boost_model_1.get_best_score()

In [None]:
print_metrics(boost_model_1, X_test, y_test, _average=None)

In [19]:
boost_model_1.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,[ num,15.665995
1,( year,11.726335
2,) .,7.452743
3,". ,",5.989705
4,num ],5.017204
...,...,...
457,— num,0.000000
458,— othword,0.000000
459,— smallet,0.000000
460,— upword,0.000000


In [None]:
metrics = boost_model_1.eval_metrics(
    data=train_dataset,
    metrics=['Precision','Recall','F1','Accuracy'],
    ntree_start=0,
    ntree_end=0,
    eval_period=1,
    plot=True
)

In [None]:
filename = './CatBoostClassifier_2grams_v2.sav'
joblib.dump(boost_model_1, filename)

In [12]:
loaded_catboost_model = joblib.load('./CatBoostClassifier_2grams.sav')
# print_metrics(loaded_catboost_model, X_test, y_test, _average=None)
fi = loaded_catboost_model.feature_importances_
f = open('catboost_2grams_feature_importances.txt', 'w')
f.write(fi)
f.close()

AttributeError: 'SGDClassifier' object has no attribute 'feature_importances_'

# Random forest на новом датасете

In [35]:

for _n_estimators in range(20, 51, 10):
    for _max_depth in range(10, 13):
        rf_clf = RandomForestClassifier(criterion='entropy', n_estimators=_n_estimators, max_depth=_max_depth, n_jobs=-1)
        rf_clf.fit(X_train, y_train)
        print(_n_estimators, _max_depth)
        print(rf_clf.score(X_test, y_test))
        print_metrics(rf_clf, X_test, y_test, _average=None)
        print('___________________________')

20 8
0.6121901870259536
accuracy = 0.6121901870259536
precision:  min =  0.0 max =  0.9922599488864549 mean =  0.5980777286744221 median =  0.6239096724171351
recall:  min =  0.0 max =  0.9895473061255861 mean =  0.5889736611021662 median =  0.6566919076726576
f1_score:  min =  0.0 max =  0.9842465505377902 mean =  0.5712248490553617 median =  0.5860415116182917
___________________________
20 9
0.6254969866607407
accuracy = 0.6254969866607407
precision:  min =  0.0 max =  0.9902464065708418 mean =  0.6207565322099907 median =  0.624644315229494
recall:  min =  0.0 max =  0.9809168519979913 mean =  0.6032118706864673 median =  0.6765921670286555
f1_score:  min =  0.0 max =  0.9801117805037382 mean =  0.5878706135994055 median =  0.6192770198212775
___________________________
20 10
0.6454280556408662
accuracy = 0.6454280556408662
precision:  min =  0.0 max =  0.9898599358039101 mean =  0.6252949321279772 median =  0.6232141723284018
recall:  min =  0.0 max =  0.9942031059901238 mean =  0

In [4]:
rf_clf_2 = RandomForestClassifier(criterion='entropy', n_estimators=40, max_depth=12, n_jobs=-1)
rf_clf_2.fit(X_train, y_train)
rf_clf_2.score(X_test, y_test)

0.7158403694963537

In [7]:
joblib.dump(rf_clf_2, 'RandomForestClassifier_v2.sav')

['RandomForestClassifier_v2.sav']

In [6]:
print(classification_report(y_test, rf_clf_2.predict(X_test)))

              precision    recall  f1-score   support

   IEEEannot       0.64      0.27      0.38     20957
    IEEEtran       0.54      0.44      0.48     21931
   IEEEtranN       0.60      0.70      0.64     21967
   IEEEtranS       0.46      0.47      0.47     21680
  IEEEtranSA       0.79      0.86      0.82     22415
  IEEEtranSN       0.73      0.37      0.49     22108
      JHEP-2       0.78      0.71      0.74     21360
  aaai-named       0.85      0.92      0.89     20922
    abstract       0.85      0.94      0.90     21111
    acmtrans       0.71      0.72      0.71     21473
      aichej       0.94      0.94      0.94     20369
         aip       0.29      0.39      0.33     21412
    alphanum       0.45      0.71      0.55     20977
         ama       0.47      0.96      0.63     20961
    amsalpha       0.97      0.94      0.95     16073
    amsplain       0.83      0.94      0.88     16271
    annotate       0.72      0.37      0.49     21166
  annotation       0.81    

In [9]:
print_metrics(rf_clf, X_test, y_test, _average=None)

NameError: name 'print_metrics' is not defined

# Обработка отдельными батчами


In [None]:
chunksize = 6 * 10**4
i = 0
with pd.read_csv('2grams_bib_data.csv', chunksize=chunksize) as reader:
    for chunk in reader:        
        X = chunk.drop(['style_name'], axis=1)
        y = chunk.style_name
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)
        train_dataset = catboost.Pool(data=X_train, label=y_train)
        test_data = catboost.Pool(data=X_test, label=y_test)
        
        if i == 0:
            boost_model = catboost.CatBoostClassifier(iterations=50,
                           learning_rate=0.05,
                           depth=4,
                           task_type='GPU',                     
                           loss_function='MultiClass',
                           eval_metric='Accuracy')
            boost_model.fit(train_dataset, eval_set=(X_val, y_val))
        else:
            boost_model.fit(train_dataset, eval_set=(X_val, y_val), init_model='cat_boost_model.cbm')
        print(boost_model.get_best_score())
        boost_model.save_model('cat_boost_model.cbm')
        i += 1
        del X, y, X_train, X_test, y_train, y_test, X_val, y_val, train_dataset, test_data

# Градиентный бустинг (2-граммы) на GPU


In [3]:
df = pd.read_csv('2grams_bib_data.csv')
X = df.drop(['style_name'], axis=1)
y = df.style_name
del df
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
del X, y
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)
train_dataset = catboost.Pool(data=X_train, label=y_train)
test_data = catboost.Pool(data=X_test, label=y_test)



In [None]:
boost_model = catboost.CatBoostClassifier(iterations=250,
                           learning_rate=0.05,
                           depth=4,
                           task_type='GPU',                                                
                           loss_function='MultiClass',
                           eval_metric='Accuracy')
time_start = time.time()
boost_model.fit(train_dataset, eval_set=(X_val, y_val))
time_finish = time.time()
print(time_finish - time_start)

In [11]:
joblib.dump(boost_model, 'CatBoostModel_2grams_v2.sav')
boost_model.save_model('CatBoostModel_2grams_v2.cbm')

In [14]:
print(classification_report(y_test, boost_model.predict(X_test)))

              precision    recall  f1-score   support

   IEEEannot       0.68      0.57      0.62     13766
    IEEEtran       0.70      0.64      0.67     14624
   IEEEtranN       0.77      0.84      0.81     14765
   IEEEtranS       0.63      0.68      0.65     14383
  IEEEtranSA       0.92      0.92      0.92     14937
  IEEEtranSN       0.85      0.73      0.78     14727
      JHEP-2       0.82      0.83      0.83     14364
  aaai-named       0.95      0.97      0.96     14087
    abstract       0.99      0.96      0.98     13930
    acmtrans       0.88      0.88      0.88     14318
      aichej       0.98      0.97      0.98     13565
         aip       0.59      0.59      0.59     14162
    alphanum       0.71      0.77      0.74     14081
         ama       0.95      0.96      0.95     13835
    amsalpha       0.98      0.97      0.98     10881
    amsplain       0.95      0.95      0.95     10898
    annotate       0.75      0.69      0.72     14175
  annotation       0.99    

# Замеры времени

In [15]:
sgd_clf = joblib.load('SGDClassifier_2grams.sav')
sgd_clf.get_params()

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


{'alpha': 0.0001,
 'average': False,
 'class_weight': None,
 'early_stopping': False,
 'epsilon': 0.1,
 'eta0': 0.0,
 'fit_intercept': True,
 'l1_ratio': 0.15,
 'learning_rate': 'optimal',
 'loss': 'hinge',
 'max_iter': 1000,
 'n_iter_no_change': 5,
 'n_jobs': -1,
 'penalty': 'l2',
 'power_t': 0.5,
 'random_state': 0,
 'shuffle': True,
 'tol': 0.001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [None]:
import time

sgd_clf = SGDClassifier(alpha=0.0001, epsilon=0.1, l1_ratio=0.15, loss='hinge', max_iter=1000, n_iter_no_change=5, n_jobs=8, random_state=0)
start = time.time()
sgd_clf.fit(X_train, y_train)
finish = time.time()
print(finish - start)