# IMPORT, CONFIG

In [31]:
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt

import gensim
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

import catboost
from catboost import CatBoostClassifier

import lightgbm
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

import time
import math
import random
import pickle
import re
from collections import Counter

import warnings
warnings.filterwarnings('ignore')

In [2]:
%matplotlib inline
%config InlineBackend.figure_format='retina'
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 8, 6

In [3]:
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# LOAD

In [4]:
with open('ohsumed_dataset.pkl','rb') as ohsumed_file:
     data = pickle.load(ohsumed_file)

In [5]:
data

Unnamed: 0,texts,Y
0,Haemophilus influenzae meningitis with prolong...,0
1,Augmentation mentoplasty using Mersilene mesh....,0
2,Multiple intracranial mucoceles associated wit...,0
3,Replacement of an aortic valve cusp after neon...,0
4,Mucosal intussusception to avoid ascending cho...,0
...,...,...
56979,Ionized calcium in blood: studies on patients ...,22
56980,Effects of immediate postoperative enteral nut...,22
56981,Effects of enteral fat emulsion on fat absorpt...,22
56982,Inhibition of early atherogenesis in transgeni...,22


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56984 entries, 0 to 56983
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   texts   56984 non-null  object
 1   Y       56984 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 890.5+ KB


In [7]:
target_column = 'Y'

In [8]:
train_df, test_df = train_test_split(data, test_size=0.20, random_state=RANDOM_SEED,stratify=data[target_column])

In [9]:
len(train_df), len(test_df)

(45587, 11397)

# PROCESSING (на всех данных)

## В токенайзере preprocess_token возвращал None

In [10]:
EMBEDDING_SIZE = 300
EMBEDDING_SIZE

300

In [11]:
class MyTokenizer():
    class Encoding:
        def __init__(self):
            super().__init__()
            tokens = list()
            ids = list()        
            
    def __init__(self):
        self.index2token = list()
        self.token2index = dict()
        self.token2count = Counter()
        self.encoding = MyTokenizer.Encoding()
        self.reset()

    def preprocess_token(self,token):
        token = re.sub(r'\d+[\,\.]?\d+','',token)
        token = re.sub(r'[\.\,\"\'\;\:\)\]\(\[\?\!\-\+]{1,}$','',token)
        token = re.sub(r'^[\.\,\(\[\"\'\;\:\)\]\?\!\-\+]{1,}','',token)
        token = token.replace('\n','')
        return token
        
    def add_sentence(self, sentence):
        for token in self.encode(sentence).tokens:
            self.add_token(token)

    def add_token(self, token):
        if token not in self.token2index:
            self.token2index[token] = len(self.index2token)
            self.token2count[token] += 1
            self.index2token.append(token)
        else:
            self.token2count[token] += 1
    
    def has_token(self, token) -> bool:
        return token in self.token2index

    def add_texts(self):
        sents_texts = train_df.texts.to_list() # data, но должно быть train_df
        for sent in sents_texts:
            self.add_sentence(sent)

    def token_to_id(self, token):
        if token in self.token2index:
            return self.token2index[token]
        else:    # убрал паддинг - если нет слова в словаре то пропуск
            pass

    def id_to_token(self, id_):
        if id_ in self.index2token:
            return self.index2token[id_]
        else:    # убрал отсутсвие id в словаре - если нет id в словаре то пропуск
            pass
    
    def is_empty(self):
        empty_size = 4
        return self.size() <= empty_size

    def shrink(self, n):
        # для сокращения словаря (выбирает n самых частых слов) (подаётся n)
        best_tokens = self.token2count.most_common(n)
        self.reset()
        for token, count in best_tokens:
            self.add_token(token)
            self.token2count[token] = count

    def reset(self):
        self.token2count = Counter()
        self.index2token = []
        self.token2index = {token: index for index, token in enumerate(self.index2token)}

    def get_vocab(self):
        return self.token2index
    
    def get_vocab_size(self):
        return len(self.index2token)
        
    def encode(self, text: str): # sentence: str
        text = text.lower().strip()
        tokens = text.split(' ')
        self.encoding.tokens = []
        for token in tokens:
            preproc_token = self.preprocess_token(token)
            if preproc_token is not None or preproc_token != '':
                self.encoding.tokens.append(preproc_token)
        return self.encoding

    def decode(self, ids):
        return print([self.index2token[id_] for id_ in ids].join(' '))

In [12]:
tokenizer = MyTokenizer()

In [13]:
tokenizer.add_texts()

In [14]:
tokenizer.token2index

{'correlation': 0,
 'of': 1,
 'the': 2,
 'blind': 3,
 'spot': 4,
 'size': 5,
 'to': 6,
 'area': 7,
 'optic': 8,
 'disk': 9,
 'and': 10,
 'parapapillary': 11,
 'atrophy': 12,
 'we': 13,
 'evaluated': 14,
 'relationship': 15,
 'between': 16,
 'using': 17,
 'kinetic': 18,
 'goldmann': 19,
 'perimetry': 20,
 'in': 21,
 '': 22,
 'patients': 23,
 'with': 24,
 'open-angle': 25,
 'glaucoma': 26,
 'normal': 27,
 'subjects': 28,
 'was': 29,
 'correlated': 30,
 'significantly': 31,
 'total': 32,
 'peripapillary': 33,
 'scleral': 34,
 'ring': 35,
 'chorioretinal': 36,
 'zone': 37,
 'beta': 38,
 'a': 39,
 'visible': 40,
 'sclera': 41,
 'attributed': 42,
 'an': 43,
 'absolute': 44,
 'scotoma': 45,
 'alpha': 46,
 'irregular': 47,
 'pigmentation': 48,
 'relative': 49,
 'larger': 50,
 'glaucomatous': 51,
 'eyes': 52,
 'than': 53,
 'which': 54,
 'corresponded': 55,
 'intrapapillary': 56,
 'region': 57,
 'nerve': 58,
 'head': 59,
 'included': 60,
 'significant': 61,
 'difference': 62,
 'results': 63,
 'm

In [15]:
tokenizer.index2token

['correlation',
 'of',
 'the',
 'blind',
 'spot',
 'size',
 'to',
 'area',
 'optic',
 'disk',
 'and',
 'parapapillary',
 'atrophy',
 'we',
 'evaluated',
 'relationship',
 'between',
 'using',
 'kinetic',
 'goldmann',
 'perimetry',
 'in',
 '',
 'patients',
 'with',
 'open-angle',
 'glaucoma',
 'normal',
 'subjects',
 'was',
 'correlated',
 'significantly',
 'total',
 'peripapillary',
 'scleral',
 'ring',
 'chorioretinal',
 'zone',
 'beta',
 'a',
 'visible',
 'sclera',
 'attributed',
 'an',
 'absolute',
 'scotoma',
 'alpha',
 'irregular',
 'pigmentation',
 'relative',
 'larger',
 'glaucomatous',
 'eyes',
 'than',
 'which',
 'corresponded',
 'intrapapillary',
 'region',
 'nerve',
 'head',
 'included',
 'significant',
 'difference',
 'results',
 'multicenter',
 'studies',
 'digoxin-specific',
 'antibody',
 'fragments',
 'managing',
 'digitalis',
 'intoxication',
 'pediatric',
 'population',
 'toxicity',
 'continues',
 'be',
 'problem',
 'for',
 'undergoing',
 'therapy',
 'cardiac',
 'glyco

In [16]:
tokenizer.token2count

Counter({'correlation': 2349,
         'of': 380822,
         'the': 376680,
         'blind': 334,
         'spot': 68,
         'size': 1969,
         'to': 135973,
         'area': 2409,
         'optic': 623,
         'disk': 147,
         'and': 255414,
         'parapapillary': 12,
         'atrophy': 551,
         'we': 20212,
         'evaluated': 3360,
         'relationship': 2200,
         'between': 13483,
         'using': 6960,
         'kinetic': 113,
         'goldmann': 15,
         'perimetry': 64,
         'in': 248178,
         '': 212626,
         'patients': 99795,
         'with': 137073,
         'open-angle': 67,
         'glaucoma': 443,
         'normal': 11689,
         'subjects': 7085,
         'was': 85019,
         'correlated': 2364,
         'significantly': 10602,
         'total': 6174,
         'peripapillary': 26,
         'scleral': 134,
         'ring': 220,
         'chorioretinal': 52,
         'zone': 524,
         'beta': 1780,
         'a': 

In [17]:
VOCAB_SIZE = tokenizer.get_vocab_size()
VOCAB_SIZE

85323

In [18]:
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_SIZE))
embedding_matrix.shape

(85323, 300)

In [19]:
Word2Vec = gensim.models.KeyedVectors.load_word2vec_format('./test_models/GoogleNews-vectors-negative300.bin', binary=True)  

### для незнакомых слов либо !нулевой вектор! либо сумма контекстных

In [20]:
for index, word in zip(tokenizer.token2index.values(),tokenizer.token2index.keys()):
    try:
        embedding_matrix[index] = Word2Vec.get_vector(word).copy()
    except:
        embedding_matrix[index] = np.zeros(EMBEDDING_SIZE)

In [21]:
del Word2Vec

In [22]:
def vectorize(text):
    vector = np.zeros(EMBEDDING_SIZE)
    for token in tokenizer.encode(text).tokens:
        try:
            vector += embedding_matrix[tokenizer.token2index[token]]
        except:
            pass
    return vector

In [23]:
train_X = np.array([vectorize(text) for text in train_df.texts.to_list()])
test_X = np.array([vectorize(text) for text in test_df.texts.to_list()])

# MODEL

In [24]:
rfc = RandomForestClassifier(random_state=RANDOM_SEED,n_jobs=-1)

In [25]:
svc = SVC(random_state=RANDOM_SEED)

In [37]:
lgbm = lightgbm.LGBMClassifier(objective='multiclass', seed=RANDOM_SEED)

In [27]:
cbc = CatBoostClassifier(random_seed=RANDOM_SEED,task_type='GPU')

# TRAIN & PREDICT

## Random Forest

In [29]:
rfc.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [28]:
%%time
rfc.fit(train_X,train_df[target_column])

CPU times: total: 3min 23s
Wall time: 18.6 s


RandomForestClassifier(n_jobs=-1, random_state=42)

In [33]:
%%time
print(accuracy_score(rfc.predict(test_X),test_df[target_column]))

0.16627182591910153
CPU times: total: 1.64 s
Wall time: 189 ms


In [34]:
print(classification_report(train_df[target_column],rfc.predict(train_X)))

              precision    recall  f1-score   support

           0       0.60      0.62      0.61      2032
           1       0.59      0.55      0.57       937
           2       0.69      0.73      0.71       342
           3       0.69      0.76      0.73      5061
           4       0.64      0.63      0.64      1342
           5       0.66      0.63      0.64      2391
           6       0.65      0.56      0.60       421
           7       0.63      0.58      0.60      2071
           8       0.63      0.56      0.59       572
           9       0.67      0.67      0.67      3081
          10       0.69      0.64      0.66       798
          11       0.69      0.63      0.65      2014
          12       0.67      0.60      0.63      1298
          13       0.73      0.79      0.76      4882
          14       0.60      0.57      0.58      1022
          15       0.66      0.62      0.64       869
          16       0.68      0.65      0.66      1294
          17       0.73    

In [35]:
print(classification_report(test_df[target_column],rfc.predict(test_X)))

              precision    recall  f1-score   support

           0       0.10      0.09      0.09       508
           1       0.05      0.03      0.04       234
           2       0.00      0.00      0.00        85
           3       0.30      0.39      0.34      1266
           4       0.02      0.01      0.01       336
           5       0.05      0.04      0.04       598
           6       0.01      0.01      0.01       105
           7       0.01      0.01      0.01       518
           8       0.00      0.00      0.00       143
           9       0.17      0.15      0.16       770
          10       0.06      0.04      0.05       200
          11       0.02      0.01      0.01       504
          12       0.10      0.07      0.08       325
          13       0.36      0.43      0.39      1220
          14       0.00      0.00      0.00       255
          15       0.00      0.00      0.00       217
          16       0.05      0.03      0.04       323
          17       0.18    

## LightGBM

In [38]:
lgbm.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': 'multiclass',
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': 'warn',
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'seed': 42}

In [42]:
%%time
lgbm.fit(train_X, train_df[target_column])

CPU times: total: 11min 42s
Wall time: 1min 3s


LGBMClassifier(objective='multiclass', seed=42)

In [46]:
%%time
accuracy_score(lgbm.predict(test_X),test_df[target_column])

CPU times: total: 4.7 s
Wall time: 408 ms


0.23479863121874178

In [47]:
print(classification_report(train_df[target_column],lgbm.predict(train_X)))

              precision    recall  f1-score   support

           0       0.58      0.63      0.60      2032
           1       0.55      0.70      0.61       937
           2       0.63      0.90      0.74       342
           3       0.64      0.74      0.69      5061
           4       0.64      0.67      0.65      1342
           5       0.64      0.60      0.62      2391
           6       0.53      0.86      0.66       421
           7       0.63      0.55      0.59      2071
           8       0.55      0.83      0.66       572
           9       0.67      0.57      0.62      3081
          10       0.61      0.84      0.70       798
          11       0.70      0.60      0.65      2014
          12       0.61      0.73      0.66      1298
          13       0.70      0.77      0.73      4882
          14       0.58      0.63      0.60      1022
          15       0.63      0.74      0.68       869
          16       0.67      0.68      0.67      1294
          17       0.73    

In [49]:
print(classification_report(test_df[target_column],lgbm.predict(test_X)))

              precision    recall  f1-score   support

           0       0.15      0.15      0.15       508
           1       0.06      0.06      0.06       234
           2       0.04      0.04      0.04        85
           3       0.38      0.47      0.42      1266
           4       0.13      0.13      0.13       336
           5       0.21      0.16      0.18       598
           6       0.06      0.07      0.06       105
           7       0.16      0.13      0.14       518
           8       0.03      0.03      0.03       143
           9       0.29      0.24      0.26       770
          10       0.14      0.13      0.13       200
          11       0.20      0.16      0.17       504
          12       0.20      0.20      0.20       325
          13       0.45      0.51      0.48      1220
          14       0.04      0.04      0.04       255
          15       0.09      0.07      0.08       217
          16       0.18      0.14      0.15       323
          17       0.23    

## CatBoost

In [56]:
cbc.get_all_params()

{'nan_mode': 'Min',
 'gpu_ram_part': 0.95,
 'eval_metric': 'MultiClass',
 'iterations': 1000,
 'leaf_estimation_method': 'Newton',
 'observations_to_bootstrap': 'TestOnly',
 'grow_policy': 'SymmetricTree',
 'penalties_coefficient': 1,
 'boosting_type': 'Plain',
 'feature_border_type': 'GreedyLogSum',
 'bayesian_matrix_reg': 0.10000000149011612,
 'devices': '-1',
 'eval_fraction': 0,
 'pinned_memory_bytes': '104857600',
 'force_unit_auto_pair_weights': False,
 'l2_leaf_reg': 3,
 'random_strength': 1,
 'rsm': 1,
 'boost_from_average': False,
 'gpu_cat_features_storage': 'GpuRam',
 'fold_size_loss_normalization': False,
 'model_size_reg': 0.5,
 'pool_metainfo_options': {'tags': {}},
 'use_best_model': False,
 'meta_l2_frequency': 0,
 'class_names': [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22],
 'random_seed': 42,
 'depth': 6,
 'border_count': 128,
 'min_fold_size': 100,
 'data_partition': 'DocParallel',
 'ba

In [50]:
%%time
cbc.fit(train_X, train_df[target_column])

Learning rate set to 0.141477
0:	learn: 2.9317380	total: 69.7ms	remaining: 1m 9s
1:	learn: 2.8296290	total: 120ms	remaining: 1m
2:	learn: 2.7543459	total: 170ms	remaining: 56.4s
3:	learn: 2.6954004	total: 219ms	remaining: 54.5s
4:	learn: 2.6467885	total: 270ms	remaining: 53.7s
5:	learn: 2.6014741	total: 321ms	remaining: 53.2s
6:	learn: 2.5628638	total: 373ms	remaining: 52.9s
7:	learn: 2.5269937	total: 422ms	remaining: 52.3s
8:	learn: 2.4973404	total: 471ms	remaining: 51.8s
9:	learn: 2.4717158	total: 520ms	remaining: 51.4s
10:	learn: 2.4475049	total: 570ms	remaining: 51.2s
11:	learn: 2.4267998	total: 618ms	remaining: 50.9s
12:	learn: 2.4075125	total: 669ms	remaining: 50.8s
13:	learn: 2.3875202	total: 719ms	remaining: 50.6s
14:	learn: 2.3680389	total: 767ms	remaining: 50.4s
15:	learn: 2.3504199	total: 818ms	remaining: 50.3s
16:	learn: 2.3350374	total: 866ms	remaining: 50.1s
17:	learn: 2.3201164	total: 917ms	remaining: 50s
18:	learn: 2.3062702	total: 965ms	remaining: 49.8s
19:	learn: 2.29

164:	learn: 1.6616925	total: 7.93s	remaining: 40.2s
165:	learn: 1.6591804	total: 7.98s	remaining: 40.1s
166:	learn: 1.6573807	total: 8.03s	remaining: 40s
167:	learn: 1.6549847	total: 8.08s	remaining: 40s
168:	learn: 1.6522258	total: 8.13s	remaining: 40s
169:	learn: 1.6501391	total: 8.17s	remaining: 39.9s
170:	learn: 1.6482368	total: 8.22s	remaining: 39.9s
171:	learn: 1.6458408	total: 8.27s	remaining: 39.8s
172:	learn: 1.6441063	total: 8.32s	remaining: 39.8s
173:	learn: 1.6428596	total: 8.36s	remaining: 39.7s
174:	learn: 1.6403527	total: 8.41s	remaining: 39.6s
175:	learn: 1.6372611	total: 8.46s	remaining: 39.6s
176:	learn: 1.6349041	total: 8.51s	remaining: 39.6s
177:	learn: 1.6326651	total: 8.56s	remaining: 39.5s
178:	learn: 1.6309760	total: 8.6s	remaining: 39.5s
179:	learn: 1.6292419	total: 8.65s	remaining: 39.4s
180:	learn: 1.6276399	total: 8.7s	remaining: 39.4s
181:	learn: 1.6248521	total: 8.74s	remaining: 39.3s
182:	learn: 1.6235356	total: 8.79s	remaining: 39.2s
183:	learn: 1.621612

324:	learn: 1.4188201	total: 15.2s	remaining: 31.6s
325:	learn: 1.4177579	total: 15.3s	remaining: 31.6s
326:	learn: 1.4165219	total: 15.3s	remaining: 31.5s
327:	learn: 1.4158914	total: 15.4s	remaining: 31.5s
328:	learn: 1.4148498	total: 15.4s	remaining: 31.4s
329:	learn: 1.4142546	total: 15.4s	remaining: 31.4s
330:	learn: 1.4130819	total: 15.5s	remaining: 31.3s
331:	learn: 1.4120204	total: 15.5s	remaining: 31.3s
332:	learn: 1.4112519	total: 15.6s	remaining: 31.2s
333:	learn: 1.4103576	total: 15.6s	remaining: 31.1s
334:	learn: 1.4089077	total: 15.7s	remaining: 31.1s
335:	learn: 1.4070351	total: 15.7s	remaining: 31.1s
336:	learn: 1.4060512	total: 15.8s	remaining: 31s
337:	learn: 1.4045060	total: 15.8s	remaining: 31s
338:	learn: 1.4033436	total: 15.9s	remaining: 30.9s
339:	learn: 1.4022671	total: 15.9s	remaining: 30.9s
340:	learn: 1.4011052	total: 15.9s	remaining: 30.8s
341:	learn: 1.3998928	total: 16s	remaining: 30.8s
342:	learn: 1.3985111	total: 16s	remaining: 30.7s
343:	learn: 1.397282

484:	learn: 1.2550753	total: 22.5s	remaining: 23.9s
485:	learn: 1.2539899	total: 22.5s	remaining: 23.8s
486:	learn: 1.2532010	total: 22.6s	remaining: 23.8s
487:	learn: 1.2526148	total: 22.6s	remaining: 23.7s
488:	learn: 1.2522112	total: 22.7s	remaining: 23.7s
489:	learn: 1.2512385	total: 22.7s	remaining: 23.6s
490:	learn: 1.2505248	total: 22.8s	remaining: 23.6s
491:	learn: 1.2499695	total: 22.8s	remaining: 23.5s
492:	learn: 1.2492712	total: 22.9s	remaining: 23.5s
493:	learn: 1.2483979	total: 22.9s	remaining: 23.5s
494:	learn: 1.2472528	total: 22.9s	remaining: 23.4s
495:	learn: 1.2463486	total: 23s	remaining: 23.4s
496:	learn: 1.2456354	total: 23s	remaining: 23.3s
497:	learn: 1.2448953	total: 23.1s	remaining: 23.3s
498:	learn: 1.2439594	total: 23.1s	remaining: 23.2s
499:	learn: 1.2423709	total: 23.2s	remaining: 23.2s
500:	learn: 1.2417471	total: 23.2s	remaining: 23.1s
501:	learn: 1.2405397	total: 23.3s	remaining: 23.1s
502:	learn: 1.2392271	total: 23.3s	remaining: 23.1s
503:	learn: 1.23

644:	learn: 1.1357203	total: 29.8s	remaining: 16.4s
645:	learn: 1.1350118	total: 29.8s	remaining: 16.3s
646:	learn: 1.1343950	total: 29.9s	remaining: 16.3s
647:	learn: 1.1337402	total: 29.9s	remaining: 16.3s
648:	learn: 1.1328198	total: 30s	remaining: 16.2s
649:	learn: 1.1323261	total: 30s	remaining: 16.2s
650:	learn: 1.1319533	total: 30.1s	remaining: 16.1s
651:	learn: 1.1313954	total: 30.1s	remaining: 16.1s
652:	learn: 1.1307982	total: 30.2s	remaining: 16s
653:	learn: 1.1298905	total: 30.2s	remaining: 16s
654:	learn: 1.1292340	total: 30.3s	remaining: 15.9s
655:	learn: 1.1286560	total: 30.3s	remaining: 15.9s
656:	learn: 1.1280973	total: 30.4s	remaining: 15.8s
657:	learn: 1.1278042	total: 30.4s	remaining: 15.8s
658:	learn: 1.1273490	total: 30.4s	remaining: 15.8s
659:	learn: 1.1269252	total: 30.5s	remaining: 15.7s
660:	learn: 1.1256785	total: 30.5s	remaining: 15.7s
661:	learn: 1.1250697	total: 30.6s	remaining: 15.6s
662:	learn: 1.1243560	total: 30.6s	remaining: 15.6s
663:	learn: 1.123778

804:	learn: 1.0382802	total: 37.2s	remaining: 9.01s
805:	learn: 1.0377538	total: 37.3s	remaining: 8.97s
806:	learn: 1.0374006	total: 37.3s	remaining: 8.92s
807:	learn: 1.0369943	total: 37.3s	remaining: 8.87s
808:	learn: 1.0366226	total: 37.4s	remaining: 8.83s
809:	learn: 1.0361224	total: 37.4s	remaining: 8.78s
810:	learn: 1.0356314	total: 37.5s	remaining: 8.73s
811:	learn: 1.0352682	total: 37.5s	remaining: 8.69s
812:	learn: 1.0348824	total: 37.6s	remaining: 8.64s
813:	learn: 1.0341528	total: 37.6s	remaining: 8.6s
814:	learn: 1.0336096	total: 37.7s	remaining: 8.55s
815:	learn: 1.0328751	total: 37.7s	remaining: 8.5s
816:	learn: 1.0324855	total: 37.8s	remaining: 8.46s
817:	learn: 1.0318715	total: 37.8s	remaining: 8.41s
818:	learn: 1.0314787	total: 37.8s	remaining: 8.36s
819:	learn: 1.0308567	total: 37.9s	remaining: 8.32s
820:	learn: 1.0304628	total: 37.9s	remaining: 8.27s
821:	learn: 1.0299728	total: 38s	remaining: 8.22s
822:	learn: 1.0291477	total: 38s	remaining: 8.18s
823:	learn: 1.0287

964:	learn: 0.9651810	total: 44.5s	remaining: 1.61s
965:	learn: 0.9649503	total: 44.6s	remaining: 1.57s
966:	learn: 0.9643859	total: 44.6s	remaining: 1.52s
967:	learn: 0.9640929	total: 44.7s	remaining: 1.48s
968:	learn: 0.9636516	total: 44.7s	remaining: 1.43s
969:	learn: 0.9630797	total: 44.8s	remaining: 1.38s
970:	learn: 0.9625198	total: 44.8s	remaining: 1.34s
971:	learn: 0.9621421	total: 44.9s	remaining: 1.29s
972:	learn: 0.9612210	total: 44.9s	remaining: 1.25s
973:	learn: 0.9608161	total: 45s	remaining: 1.2s
974:	learn: 0.9602934	total: 45s	remaining: 1.15s
975:	learn: 0.9599784	total: 45s	remaining: 1.11s
976:	learn: 0.9595356	total: 45.1s	remaining: 1.06s
977:	learn: 0.9590985	total: 45.1s	remaining: 1.01s
978:	learn: 0.9585625	total: 45.2s	remaining: 969ms
979:	learn: 0.9583568	total: 45.2s	remaining: 923ms
980:	learn: 0.9578821	total: 45.3s	remaining: 877ms
981:	learn: 0.9576132	total: 45.3s	remaining: 831ms
982:	learn: 0.9571326	total: 45.4s	remaining: 785ms
983:	learn: 0.95674

<catboost.core.CatBoostClassifier at 0x1f6c1835370>

In [51]:
%time
accuracy_score(cbc.predict(test_X),test_df[target_column])

CPU times: total: 0 ns
Wall time: 0 ns


0.2471703079757831

In [52]:
print(classification_report(train_df[target_column],cbc.predict(train_X)))

              precision    recall  f1-score   support

           0       0.56      0.63      0.59      2032
           1       0.60      0.56      0.58       937
           2       0.68      0.79      0.73       342
           3       0.63      0.77      0.70      5061
           4       0.62      0.63      0.62      1342
           5       0.62      0.62      0.62      2391
           6       0.61      0.65      0.63       421
           7       0.60      0.57      0.58      2071
           8       0.62      0.62      0.62       572
           9       0.65      0.62      0.64      3081
          10       0.64      0.73      0.68       798
          11       0.67      0.62      0.64      2014
          12       0.61      0.66      0.63      1298
          13       0.70      0.79      0.74      4882
          14       0.61      0.51      0.56      1022
          15       0.67      0.61      0.64       869
          16       0.67      0.64      0.65      1294
          17       0.71    

In [53]:
print(classification_report(test_df[target_column],cbc.predict(test_X)))

              precision    recall  f1-score   support

           0       0.17      0.17      0.17       508
           1       0.12      0.10      0.11       234
           2       0.12      0.08      0.10        85
           3       0.38      0.48      0.42      1266
           4       0.20      0.19      0.19       336
           5       0.21      0.18      0.19       598
           6       0.08      0.07      0.07       105
           7       0.18      0.14      0.16       518
           8       0.09      0.07      0.08       143
           9       0.27      0.24      0.26       770
          10       0.22      0.20      0.21       200
          11       0.23      0.18      0.20       504
          12       0.22      0.22      0.22       325
          13       0.43      0.49      0.46      1220
          14       0.12      0.10      0.11       255
          15       0.12      0.09      0.10       217
          16       0.23      0.20      0.21       323
          17       0.29    

## Support Vector Machine

In [57]:
svc.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': 42,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [54]:
%%time
svc.fit(train_X,train_df[target_column])

CPU times: total: 10min 42s
Wall time: 10min 42s


SVC(random_state=42)

In [61]:
%%time
print(accuracy_score(svc.predict(test_X),test_df[target_column]))

0.3937000965166272
CPU times: total: 3min 24s
Wall time: 3min 24s


In [59]:
print(classification_report(train_df[target_column],svc.predict(train_X)))

              precision    recall  f1-score   support

           0       0.38      0.48      0.42      2032
           1       0.38      0.18      0.25       937
           2       0.76      0.16      0.26       342
           3       0.50      0.72      0.59      5061
           4       0.44      0.25      0.32      1342
           5       0.49      0.32      0.39      2391
           6       0.80      0.02      0.04       421
           7       0.45      0.32      0.38      2071
           8       0.45      0.02      0.03       572
           9       0.48      0.35      0.40      3081
          10       0.47      0.41      0.44       798
          11       0.54      0.31      0.39      2014
          12       0.44      0.48      0.46      1298
          13       0.58      0.69      0.63      4882
          14       0.56      0.09      0.15      1022
          15       0.55      0.06      0.12       869
          16       0.51      0.27      0.35      1294
          17       0.51    

In [60]:
print(classification_report(test_df[target_column],svc.predict(test_X)))

              precision    recall  f1-score   support

           0       0.36      0.44      0.40       508
           1       0.32      0.16      0.22       234
           2       0.64      0.08      0.15        85
           3       0.48      0.70      0.57      1266
           4       0.39      0.22      0.28       336
           5       0.47      0.29      0.36       598
           6       1.00      0.01      0.02       105
           7       0.41      0.25      0.31       518
           8       0.44      0.03      0.05       143
           9       0.49      0.32      0.39       770
          10       0.47      0.39      0.43       200
          11       0.44      0.24      0.31       504
          12       0.40      0.44      0.42       325
          13       0.55      0.69      0.61      1220
          14       0.44      0.09      0.15       255
          15       0.41      0.06      0.10       217
          16       0.47      0.24      0.32       323
          17       0.46    