# IMPORT, CONFIG

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt

import gensim
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

import catboost
from catboost import CatBoostClassifier

import lightgbm
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

import time
import math
import random
import pickle
import re
from collections import Counter

import warnings
warnings.filterwarnings('ignore')



In [2]:
%matplotlib inline
%config InlineBackend.figure_format='retina'
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 8, 6

In [3]:
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# LOAD

In [4]:
X = list()
with open('..\\datasets\\WOS\\WOS46985\\X.txt','r') as X_file:
    lines = X_file.readlines()
    for line in lines:
        X.append(line.strip())

In [5]:
Y = list()
with open('..\\datasets\\WOS\\WOS46985\\Y.txt','r') as Y_file:
    lines = Y_file.readlines()
    for line in lines:
        Y.append(line.strip())

In [6]:
YL1 = list()
with open('..\\datasets\\WOS\\WOS46985\\YL1.txt','r') as YL1_file:
    lines = YL1_file.readlines()
    for line in lines:
        YL1.append(line.strip())

In [7]:
YL2 = list()
with open('..\\datasets\\WOS\\WOS46985\\YL2.txt','r') as YL2_file:
    lines = YL2_file.readlines()
    for line in lines:
        YL2.append(line.strip())

In [8]:
source = pd.DataFrame({'texts':pd.Series(X, dtype='object').apply(lambda x: re.sub('[^A-Za-z0-9 ]+', '', x)),
 'Y':pd.Series(Y, dtype='float'),
 'YL1':pd.Series(YL1, dtype='float'),
 'YL2':pd.Series(YL2, dtype='float')})
source = source.astype({'Y': int, 'YL1': int, 'YL2':int})

In [9]:
source

Unnamed: 0,texts,Y,YL1,YL2
0,2 1dimensional nonlinear optical waves throug...,12,0,12
1,betaamyloid A beta and tau pathology become in...,74,5,2
2,Decreasing of energy consumption and environme...,68,4,7
3,Hybrid electric vehicles are assumed to play a...,26,1,10
4,L34Dihydroxyphenylalanine LDOPA remains the pr...,115,5,43
...,...,...,...,...
46980,Zusammenfassung Hintergrund Karate erfreut sic...,122,5,50
46981,ZWave is an implementation of home automation ...,15,0,15
46982,Zwitterionic peptides were anchored to a condu...,110,5,38
46983,ZY3 has been acquiring high quality imagery si...,10,0,10


In [10]:
source.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46985 entries, 0 to 46984
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   texts   46985 non-null  object
 1   Y       46985 non-null  int32 
 2   YL1     46985 non-null  int32 
 3   YL2     46985 non-null  int32 
dtypes: int32(3), object(1)
memory usage: 917.8+ KB


In [11]:
del X, Y, YL1, YL2 #,tokens

In [12]:
target_column = 'YL1'
data = source[['texts',target_column]] # source source[source['YL1']==0]

In [13]:
data

Unnamed: 0,texts,YL1
0,2 1dimensional nonlinear optical waves throug...,0
1,betaamyloid A beta and tau pathology become in...,5
2,Decreasing of energy consumption and environme...,4
3,Hybrid electric vehicles are assumed to play a...,1
4,L34Dihydroxyphenylalanine LDOPA remains the pr...,5
...,...,...
46980,Zusammenfassung Hintergrund Karate erfreut sic...,5
46981,ZWave is an implementation of home automation ...,0
46982,Zwitterionic peptides were anchored to a condu...,5
46983,ZY3 has been acquiring high quality imagery si...,0


In [14]:
train_df, test_df = train_test_split(data, test_size=0.33, random_state=RANDOM_SEED,stratify=data[target_column].to_list())

In [15]:
len(train_df), len(test_df)

(31479, 15506)

# PROCESSING

## PROBLEMS PROCESSING

---

проблема ",background", 45 символов в скобках, (2 + 1)-dimensional, (D)ecreasing, (L)-3,4-Dihydroxyphenylalanine ((L)-DOPA)

\s*(\(.{0,30}\))\s* для убирания отдельных слов в скобках; они являются group match [надо доработать с пред и пост условиями]

in tokens .replace("[,.]", "")

\s*(\(.{0,5}\))\S+\s* для проблем типо (D)ecreasing (D) является group match [надо доработать с пред и пост условиями]

---

In [16]:
EMBEDDING_SIZE = 300
EMBEDDING_SIZE

300

In [17]:
class MyTokenizer():
    class Encoding:
        def __init__(self):
            super().__init__()
            tokens = list()
            ids = list()

    def __init__(self):
        self.index2token = list()
        self.token2index = dict()
        self.token2count = Counter()
        self.encoding = MyTokenizer.Encoding()
        self.reset()

    def add_sentence(self, sentence):
        for token in self.encode(sentence).tokens:
            self.add_token(token)

    def add_token(self, token):
        if token not in self.token2index:
            self.token2index[token] = len(self.index2token)
            self.token2count[token] += 1
            self.index2token.append(token)
        else:
            self.token2count[token] += 1
    
    def has_token(self, token) -> bool:
        return token in self.token2index

    def add_texts(self):
        sents_texts = train_df.texts.to_list()
        for sent in sents_texts:
            self.add_sentence(sent)

    def token_to_id(self, token):
        if token in self.token2index:
            return self.token2index[token]
        else:    # убрал паддинг - если нет слова в словаре то пропуск
            pass

    def id_to_token(self, id_):
        if id_ in self.index2token:
            return self.index2token[id_]
        else:    # убрал отсутсвие id в словаре - если нет id в словаре то пропуск
            pass
    
    def is_empty(self):
        empty_size = 4
        return self.size() <= empty_size

    def shrink(self, n):
        # для сокращения словаря (выбирает n самых частых слов) (подаётся n)
        best_tokens = self.token2count.most_common(n)
        self.reset()
        for token, count in best_tokens:
            self.add_token(token)
            self.token2count[token] = count

    def reset(self):
        self.token2count = Counter()
        self.index2token = []
        self.token2index = {token: index for index, token in enumerate(self.index2token)}

    def get_vocab(self):
        return self.token2index
    
    def get_vocab_size(self):
        return len(self.index2token)
        
    def encode(self, text: str): # sentence: str, max_length: int
        text = text.lower().strip()
        self.encoding.tokens = text.split(' ')
        self.encoding.tokens = [token for token in self.encoding.tokens]
        return self.encoding

    def decode(self, ids):
        return print([self.index2token[id_] for id_ in ids].join(' '))

In [18]:
tokenizer = MyTokenizer()

In [19]:
tokenizer.add_texts()

In [20]:
tokenizer.token2index

{'the': 0,
 'article': 1,
 'acquaints': 2,
 'with': 3,
 'means': 4,
 'of': 5,
 'dramatherapeutic': 6,
 'intervention': 7,
 'in': 8,
 'two': 9,
 'addictions': 10,
 'departments': 11,
 '': 12,
 'centre': 13,
 'for': 14,
 'secondary': 15,
 'prevention': 16,
 'and': 17,
 'addiction': 18,
 'treatment': 19,
 'at': 20,
 'military': 21,
 'hospital': 22,
 'olomouc': 23,
 'toxicorehabilitation': 24,
 'department': 25,
 'psychiatric': 26,
 'kromeriz': 27,
 'first': 28,
 'runs': 29,
 'a': 30,
 'detoxification': 31,
 'shortterm': 32,
 'one': 33,
 'month': 34,
 'programme': 35,
 'while': 36,
 'mandala': 37,
 'mediumterm': 38,
 'as': 39,
 'long': 40,
 'half': 41,
 'year': 42,
 'is': 43,
 'being': 44,
 'implemented': 45,
 'second': 46,
 'part': 47,
 'text': 48,
 'presents': 49,
 'results': 50,
 'an': 51,
 'enquiry': 52,
 'focused': 53,
 'on': 54,
 'tracing': 55,
 'differences': 56,
 'connections': 57,
 'between': 58,
 'clients': 59,
 'perception': 60,
 'benefits': 61,
 'overall': 62,
 'concept': 63,
 

In [21]:
tokenizer.index2token

['the',
 'article',
 'acquaints',
 'with',
 'means',
 'of',
 'dramatherapeutic',
 'intervention',
 'in',
 'two',
 'addictions',
 'departments',
 '',
 'centre',
 'for',
 'secondary',
 'prevention',
 'and',
 'addiction',
 'treatment',
 'at',
 'military',
 'hospital',
 'olomouc',
 'toxicorehabilitation',
 'department',
 'psychiatric',
 'kromeriz',
 'first',
 'runs',
 'a',
 'detoxification',
 'shortterm',
 'one',
 'month',
 'programme',
 'while',
 'mandala',
 'mediumterm',
 'as',
 'long',
 'half',
 'year',
 'is',
 'being',
 'implemented',
 'second',
 'part',
 'text',
 'presents',
 'results',
 'an',
 'enquiry',
 'focused',
 'on',
 'tracing',
 'differences',
 'connections',
 'between',
 'clients',
 'perception',
 'benefits',
 'overall',
 'concept',
 'above',
 'said',
 'implementation',
 'questionnaire',
 'method',
 'did',
 'not',
 'reveal',
 'evaluations',
 'both',
 'key',
 'factors',
 'dambreak',
 'modeling',
 'are',
 'accuracy',
 'speed',
 'therefore',
 'highperformance',
 'calculations',


In [22]:
tokenizer.token2count

Counter({'the': 346316,
         'article': 1921,
         'acquaints': 1,
         'with': 69287,
         'means': 1028,
         'of': 252243,
         'dramatherapeutic': 3,
         'intervention': 2043,
         'in': 154465,
         'two': 8032,
         'addictions': 38,
         'departments': 96,
         '': 23548,
         'centre': 168,
         'for': 65614,
         'secondary': 1059,
         'prevention': 1053,
         'and': 229808,
         'addiction': 391,
         'treatment': 8493,
         'at': 17828,
         'military': 159,
         'hospital': 1005,
         'olomouc': 4,
         'toxicorehabilitation': 1,
         'department': 389,
         'psychiatric': 881,
         'kromeriz': 1,
         'first': 4622,
         'runs': 90,
         'a': 109553,
         'detoxification': 44,
         'shortterm': 275,
         'one': 6540,
         'month': 347,
         'programme': 252,
         'while': 4388,
         'mandala': 1,
         'mediumterm': 18,
  

In [23]:
VOCAB_SIZE = tokenizer.get_vocab_size()
VOCAB_SIZE

158635

In [24]:
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_SIZE))
embedding_matrix.shape

(158635, 300)

In [25]:
Word2Vec = gensim.models.KeyedVectors.load_word2vec_format('./test_models/GoogleNews-vectors-negative300.bin', binary=True)  

### для незнакомых слов либо !нулевой вектор! либо сумма контекстных

In [26]:
for index, word in zip(tokenizer.token2index.values(),tokenizer.token2index.keys()):
    try:
        embedding_matrix[index] = Word2Vec.get_vector(word).copy()
    except:
        embedding_matrix[index] = np.zeros(EMBEDDING_SIZE)

In [27]:
del Word2Vec

In [28]:
def vectorize(text):
    vector = np.zeros(EMBEDDING_SIZE)
    for token in tokenizer.encode(text).tokens:
        try:
            vector += embedding_matrix[tokenizer.token2index[token]]
        except:
            pass
    return vector

In [29]:
train_X = np.array([vectorize(text) for text in train_df.texts.to_list()])
test_X = np.array([vectorize(text) for text in test_df.texts.to_list()])

# MODEL

In [30]:
rfc = RandomForestClassifier(random_state=RANDOM_SEED,n_jobs=-1)

In [31]:
svc = SVC(random_state=RANDOM_SEED)

In [32]:
lgbm = lightgbm.LGBMClassifier(objective='multiclass', seed=RANDOM_SEED)

In [33]:
cbc = CatBoostClassifier(random_seed=RANDOM_SEED,task_type='GPU')

# TRAIN & PREDICT

## Random Forest

In [37]:
rfc.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [34]:
%%time
rfc.fit(train_X,train_df[target_column])

CPU times: total: 1min 53s
Wall time: 10.2 s


RandomForestClassifier(n_jobs=-1, random_state=42)

In [35]:
%time
accuracy_score(rfc.predict(test_X),test_df[target_column])

CPU times: total: 0 ns
Wall time: 0 ns


0.7097252676383335

In [49]:
print(classification_report(train_df[target_column],rfc.predict(train_X)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4364
           1       1.00      1.00      1.00      3674
           2       1.00      1.00      1.00      4785
           3       1.00      1.00      1.00      2209
           4       1.00      1.00      1.00      2839
           5       1.00      1.00      1.00      9798
           6       1.00      1.00      1.00      3810

    accuracy                           1.00     31479
   macro avg       1.00      1.00      1.00     31479
weighted avg       1.00      1.00      1.00     31479



In [36]:
print(classification_report(test_df[target_column],rfc.predict(test_X)))

              precision    recall  f1-score   support

           0       0.68      0.80      0.73      2150
           1       0.72      0.73      0.72      1809
           2       0.73      0.62      0.67      2357
           3       0.66      0.37      0.47      1088
           4       0.69      0.60      0.64      1398
           5       0.73      0.85      0.79      4827
           6       0.68      0.64      0.66      1877

    accuracy                           0.71     15506
   macro avg       0.70      0.66      0.67     15506
weighted avg       0.71      0.71      0.70     15506



## LightGBM

In [38]:
lgbm.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': 'multiclass',
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': 'warn',
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'seed': 42}

In [51]:
%%time
lgbm.fit(train_X, train_df[target_column])

CPU times: total: 2min 48s
Wall time: 15.2 s


LGBMClassifier(objective='multiclass', seed=42)

In [52]:
%%time
accuracy_score(lgbm.predict(test_X),test_df[target_column])

CPU times: total: 2.06 s
Wall time: 199 ms


0.7519669805236683

In [53]:
print(classification_report(train_df[target_column],lgbm.predict(train_X)))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      4364
           1       0.98      0.99      0.98      3674
           2       0.96      0.92      0.94      4785
           3       0.99      0.98      0.99      2209
           4       0.98      0.98      0.98      2839
           5       0.95      0.97      0.96      9798
           6       0.96      0.95      0.96      3810

    accuracy                           0.97     31479
   macro avg       0.97      0.97      0.97     31479
weighted avg       0.97      0.97      0.97     31479



In [54]:
print(classification_report(test_df[target_column],lgbm.predict(test_X)))

              precision    recall  f1-score   support

           0       0.76      0.79      0.77      2150
           1       0.78      0.75      0.76      1809
           2       0.75      0.70      0.72      2357
           3       0.68      0.58      0.63      1088
           4       0.74      0.71      0.72      1398
           5       0.78      0.83      0.80      4827
           6       0.70      0.71      0.70      1877

    accuracy                           0.75     15506
   macro avg       0.74      0.72      0.73     15506
weighted avg       0.75      0.75      0.75     15506



## CatBoost

In [47]:
cbc.get_all_params()

{'nan_mode': 'Min',
 'gpu_ram_part': 0.95,
 'eval_metric': 'MultiClass',
 'iterations': 1000,
 'leaf_estimation_method': 'Newton',
 'observations_to_bootstrap': 'TestOnly',
 'grow_policy': 'SymmetricTree',
 'penalties_coefficient': 1,
 'boosting_type': 'Plain',
 'feature_border_type': 'GreedyLogSum',
 'bayesian_matrix_reg': 0.10000000149011612,
 'devices': '-1',
 'eval_fraction': 0,
 'pinned_memory_bytes': '104857600',
 'force_unit_auto_pair_weights': False,
 'l2_leaf_reg': 3,
 'random_strength': 1,
 'rsm': 1,
 'boost_from_average': False,
 'gpu_cat_features_storage': 'GpuRam',
 'fold_size_loss_normalization': False,
 'model_size_reg': 0.5,
 'pool_metainfo_options': {'tags': {}},
 'use_best_model': False,
 'meta_l2_frequency': 0,
 'class_names': [0, 1, 2, 3, 4, 5, 6],
 'random_seed': 42,
 'depth': 6,
 'border_count': 128,
 'min_fold_size': 100,
 'data_partition': 'DocParallel',
 'bagging_temperature': 1,
 'classes_count': 0,
 'auto_class_weights': 'None',
 'leaf_estimation_backtracking

In [40]:
%%time
cbc.fit(train_X, train_df[target_column])

Learning rate set to 0.131184
0:	learn: 1.7293913	total: 44.7ms	remaining: 44.7s
1:	learn: 1.5951490	total: 68.9ms	remaining: 34.4s
2:	learn: 1.4931727	total: 94.6ms	remaining: 31.4s
3:	learn: 1.4141670	total: 121ms	remaining: 30.1s
4:	learn: 1.3470368	total: 145ms	remaining: 28.9s
5:	learn: 1.2904635	total: 167ms	remaining: 27.7s
6:	learn: 1.2426206	total: 190ms	remaining: 27s
7:	learn: 1.2030093	total: 213ms	remaining: 26.4s
8:	learn: 1.1684087	total: 236ms	remaining: 26s
9:	learn: 1.1365850	total: 259ms	remaining: 25.7s
10:	learn: 1.1081118	total: 281ms	remaining: 25.2s
11:	learn: 1.0834571	total: 301ms	remaining: 24.8s
12:	learn: 1.0618205	total: 323ms	remaining: 24.5s
13:	learn: 1.0413361	total: 344ms	remaining: 24.2s
14:	learn: 1.0235205	total: 366ms	remaining: 24s
15:	learn: 1.0072728	total: 387ms	remaining: 23.8s
16:	learn: 0.9927364	total: 407ms	remaining: 23.6s
17:	learn: 0.9783739	total: 429ms	remaining: 23.4s
18:	learn: 0.9658506	total: 452ms	remaining: 23.3s
19:	learn: 0.9

164:	learn: 0.5782084	total: 3.51s	remaining: 17.8s
165:	learn: 0.5774911	total: 3.54s	remaining: 17.8s
166:	learn: 0.5765476	total: 3.56s	remaining: 17.8s
167:	learn: 0.5756012	total: 3.58s	remaining: 17.7s
168:	learn: 0.5744853	total: 3.6s	remaining: 17.7s
169:	learn: 0.5733695	total: 3.62s	remaining: 17.7s
170:	learn: 0.5725572	total: 3.64s	remaining: 17.7s
171:	learn: 0.5715322	total: 3.66s	remaining: 17.6s
172:	learn: 0.5707574	total: 3.68s	remaining: 17.6s
173:	learn: 0.5696788	total: 3.7s	remaining: 17.6s
174:	learn: 0.5687518	total: 3.73s	remaining: 17.6s
175:	learn: 0.5678111	total: 3.75s	remaining: 17.5s
176:	learn: 0.5667029	total: 3.77s	remaining: 17.5s
177:	learn: 0.5654817	total: 3.79s	remaining: 17.5s
178:	learn: 0.5648621	total: 3.81s	remaining: 17.5s
179:	learn: 0.5640882	total: 3.83s	remaining: 17.4s
180:	learn: 0.5631322	total: 3.85s	remaining: 17.4s
181:	learn: 0.5617487	total: 3.87s	remaining: 17.4s
182:	learn: 0.5604675	total: 3.89s	remaining: 17.4s
183:	learn: 0.

328:	learn: 0.4423370	total: 6.98s	remaining: 14.2s
329:	learn: 0.4413082	total: 7s	remaining: 14.2s
330:	learn: 0.4405871	total: 7.02s	remaining: 14.2s
331:	learn: 0.4402182	total: 7.04s	remaining: 14.2s
332:	learn: 0.4395163	total: 7.06s	remaining: 14.1s
333:	learn: 0.4389259	total: 7.08s	remaining: 14.1s
334:	learn: 0.4383750	total: 7.1s	remaining: 14.1s
335:	learn: 0.4380088	total: 7.12s	remaining: 14.1s
336:	learn: 0.4374646	total: 7.14s	remaining: 14s
337:	learn: 0.4367813	total: 7.16s	remaining: 14s
338:	learn: 0.4362966	total: 7.18s	remaining: 14s
339:	learn: 0.4355715	total: 7.2s	remaining: 14s
340:	learn: 0.4350620	total: 7.22s	remaining: 14s
341:	learn: 0.4343742	total: 7.25s	remaining: 13.9s
342:	learn: 0.4339910	total: 7.27s	remaining: 13.9s
343:	learn: 0.4331110	total: 7.29s	remaining: 13.9s
344:	learn: 0.4320253	total: 7.31s	remaining: 13.9s
345:	learn: 0.4313426	total: 7.33s	remaining: 13.9s
346:	learn: 0.4307761	total: 7.36s	remaining: 13.8s
347:	learn: 0.4300645	total

496:	learn: 0.3525422	total: 10.5s	remaining: 10.6s
497:	learn: 0.3521228	total: 10.5s	remaining: 10.6s
498:	learn: 0.3514814	total: 10.5s	remaining: 10.6s
499:	learn: 0.3511542	total: 10.6s	remaining: 10.6s
500:	learn: 0.3505968	total: 10.6s	remaining: 10.5s
501:	learn: 0.3501497	total: 10.6s	remaining: 10.5s
502:	learn: 0.3497823	total: 10.6s	remaining: 10.5s
503:	learn: 0.3495686	total: 10.6s	remaining: 10.5s
504:	learn: 0.3492404	total: 10.7s	remaining: 10.4s
505:	learn: 0.3486959	total: 10.7s	remaining: 10.4s
506:	learn: 0.3482848	total: 10.7s	remaining: 10.4s
507:	learn: 0.3478732	total: 10.7s	remaining: 10.4s
508:	learn: 0.3474768	total: 10.7s	remaining: 10.4s
509:	learn: 0.3468815	total: 10.8s	remaining: 10.3s
510:	learn: 0.3463870	total: 10.8s	remaining: 10.3s
511:	learn: 0.3461151	total: 10.8s	remaining: 10.3s
512:	learn: 0.3456941	total: 10.8s	remaining: 10.3s
513:	learn: 0.3453717	total: 10.8s	remaining: 10.2s
514:	learn: 0.3448234	total: 10.9s	remaining: 10.2s
515:	learn: 

661:	learn: 0.2860175	total: 13.9s	remaining: 7.12s
662:	learn: 0.2854339	total: 14s	remaining: 7.1s
663:	learn: 0.2851891	total: 14s	remaining: 7.08s
664:	learn: 0.2845482	total: 14s	remaining: 7.06s
665:	learn: 0.2840586	total: 14s	remaining: 7.04s
666:	learn: 0.2836283	total: 14.1s	remaining: 7.02s
667:	learn: 0.2832875	total: 14.1s	remaining: 7s
668:	learn: 0.2828431	total: 14.1s	remaining: 6.98s
669:	learn: 0.2824250	total: 14.1s	remaining: 6.96s
670:	learn: 0.2819138	total: 14.1s	remaining: 6.93s
671:	learn: 0.2816205	total: 14.2s	remaining: 6.91s
672:	learn: 0.2813297	total: 14.2s	remaining: 6.89s
673:	learn: 0.2807758	total: 14.2s	remaining: 6.87s
674:	learn: 0.2803070	total: 14.2s	remaining: 6.85s
675:	learn: 0.2799902	total: 14.2s	remaining: 6.83s
676:	learn: 0.2796086	total: 14.3s	remaining: 6.81s
677:	learn: 0.2792962	total: 14.3s	remaining: 6.79s
678:	learn: 0.2786523	total: 14.3s	remaining: 6.77s
679:	learn: 0.2783480	total: 14.3s	remaining: 6.75s
680:	learn: 0.2781875	to

824:	learn: 0.2346247	total: 17.4s	remaining: 3.69s
825:	learn: 0.2343227	total: 17.4s	remaining: 3.67s
826:	learn: 0.2341333	total: 17.4s	remaining: 3.64s
827:	learn: 0.2339046	total: 17.4s	remaining: 3.62s
828:	learn: 0.2337734	total: 17.5s	remaining: 3.6s
829:	learn: 0.2336157	total: 17.5s	remaining: 3.58s
830:	learn: 0.2334447	total: 17.5s	remaining: 3.56s
831:	learn: 0.2332486	total: 17.5s	remaining: 3.54s
832:	learn: 0.2329613	total: 17.5s	remaining: 3.52s
833:	learn: 0.2326693	total: 17.6s	remaining: 3.5s
834:	learn: 0.2324598	total: 17.6s	remaining: 3.47s
835:	learn: 0.2320302	total: 17.6s	remaining: 3.45s
836:	learn: 0.2318297	total: 17.6s	remaining: 3.43s
837:	learn: 0.2315837	total: 17.6s	remaining: 3.41s
838:	learn: 0.2313830	total: 17.7s	remaining: 3.39s
839:	learn: 0.2311143	total: 17.7s	remaining: 3.37s
840:	learn: 0.2307071	total: 17.7s	remaining: 3.35s
841:	learn: 0.2305551	total: 17.7s	remaining: 3.33s
842:	learn: 0.2303407	total: 17.7s	remaining: 3.31s
843:	learn: 0.

986:	learn: 0.1946993	total: 20.8s	remaining: 274ms
987:	learn: 0.1945091	total: 20.8s	remaining: 253ms
988:	learn: 0.1943798	total: 20.8s	remaining: 232ms
989:	learn: 0.1940794	total: 20.8s	remaining: 211ms
990:	learn: 0.1938522	total: 20.9s	remaining: 190ms
991:	learn: 0.1936071	total: 20.9s	remaining: 168ms
992:	learn: 0.1934340	total: 20.9s	remaining: 147ms
993:	learn: 0.1932877	total: 20.9s	remaining: 126ms
994:	learn: 0.1931067	total: 20.9s	remaining: 105ms
995:	learn: 0.1928500	total: 21s	remaining: 84.2ms
996:	learn: 0.1926803	total: 21s	remaining: 63.2ms
997:	learn: 0.1925113	total: 21s	remaining: 42.1ms
998:	learn: 0.1922256	total: 21s	remaining: 21.1ms
999:	learn: 0.1920431	total: 21.1s	remaining: 0us
CPU times: total: 48 s
Wall time: 26.9 s


<catboost.core.CatBoostClassifier at 0x1e27868ed90>

In [47]:
%time
accuracy_score(cbc.predict(test_X),test_df[target_column])

CPU times: total: 0 ns
Wall time: 0 ns


0.7642847929833613

In [55]:
print(classification_report(train_df[target_column],cbc.predict(train_X)))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      4364
           1       0.97      0.98      0.97      3674
           2       0.99      0.96      0.97      4785
           3       0.97      0.95      0.96      2209
           4       0.98      0.97      0.98      2839
           5       0.98      0.99      0.98      9798
           6       0.98      0.97      0.97      3810

    accuracy                           0.98     31479
   macro avg       0.98      0.97      0.97     31479
weighted avg       0.98      0.98      0.98     31479



In [56]:
print(classification_report(test_df[target_column],cbc.predict(test_X)))

              precision    recall  f1-score   support

           0       0.77      0.80      0.78      2150
           1       0.79      0.76      0.77      1809
           2       0.77      0.70      0.73      2357
           3       0.70      0.63      0.67      1088
           4       0.74      0.72      0.73      1398
           5       0.79      0.84      0.81      4827
           6       0.72      0.72      0.72      1877

    accuracy                           0.76     15506
   macro avg       0.75      0.74      0.75     15506
weighted avg       0.76      0.76      0.76     15506



## Support Vector Machine

In [35]:
svc.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': 42,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [34]:
%%time
svc.fit(train_X,train_df[target_column])

CPU times: total: 2min 34s
Wall time: 2min 34s


SVC(random_state=42)

In [40]:
%%time
print(accuracy_score(svc.predict(test_X),test_df[target_column]))

0.7691216303366438
CPU times: total: 2min 3s
Wall time: 2min 3s


In [38]:
print(classification_report(train_df[target_column],svc.predict(train_X)))

              precision    recall  f1-score   support

           0       0.77      0.84      0.81      4364
           1       0.81      0.80      0.81      3674
           2       0.81      0.75      0.78      4785
           3       0.74      0.66      0.70      2209
           4       0.78      0.73      0.75      2839
           5       0.83      0.86      0.84      9798
           6       0.76      0.76      0.76      3810

    accuracy                           0.80     31479
   macro avg       0.79      0.77      0.78     31479
weighted avg       0.80      0.80      0.80     31479



In [39]:
print(classification_report(test_df[target_column],svc.predict(test_X)))

              precision    recall  f1-score   support

           0       0.75      0.82      0.78      2150
           1       0.79      0.75      0.77      1809
           2       0.78      0.71      0.74      2357
           3       0.71      0.64      0.67      1088
           4       0.76      0.71      0.74      1398
           5       0.80      0.84      0.82      4827
           6       0.72      0.73      0.72      1877

    accuracy                           0.77     15506
   macro avg       0.76      0.74      0.75     15506
weighted avg       0.77      0.77      0.77     15506

