In [2]:
import pandas as pd
import numpy as np
import re
import pymorphy2 as pm
from sklearn.neighbors import LocalOutlierFactor
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, HashingVectorizer
from multiprocessing import Pool, Lock, Value
from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score
from sklearn import svm

In [3]:
docs_titles = pd.read_csv('anomaly-detection-competition-ml1-ts-fall-2019/docs_titles.tsv', sep='\t', encoding='utf-8', lineterminator='\n')

In [4]:
docs_titles.shape

(27950, 2)

In [6]:
from bs4 import BeautifulSoup
import os
import codecs
from lxml import etree, html as lhtml

In [8]:
%%time

texts_titles = {}

mutex = Lock()
n_processed = Value('i', 0)
def text_wrapper(filename):
    text = get_page_text(filename)
    with mutex:
        global n_processed
        n_processed.value += 1
        if n_processed.value % 10 == 0:
            print(f"\r{n_processed.value} objects are processed...", end='', flush=True)
    return text


def get_page_text(filename):
    with codecs.open(path + str(filename) + '.dat', 'r', 'utf-8') as f:
        url = f.readline().strip()
        soup = BeautifulSoup(f, 'lxml')
        text = ''.join(map(lambda x: ' ' + x.text, soup.find_all(re.compile('^h[1-6]$'))))
        text = re.sub('[^0-9A-Za-zА-Яа-я]+', ' ', text)
        return text

path = 'anomaly-detection-competition-ml1-ts-fall-2019/content/content/'
filenames = os.listdir(path)
filenames = list(range(1, 28027))
texts = []

with Pool(10) as pool:
    texts = texts + pool.map(text_wrapper, filenames)

28020 objects are processed...CPU times: user 2.43 s, sys: 1.63 s, total: 4.05 s
Wall time: 7min 45s


In [9]:
texts[15730]

' ВАЗ 21213 16 13 Замена подшипников ступицы'

In [None]:
texts[5345]

In [11]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/azelentsov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation

mystem = Mystem() 
russian_stopwords = stopwords.words("russian")

n_processed = Value('i', 0)

def wrapper(filename):
    text = preprocess_text(filename)
    with mutex:
        global n_processed
        n_processed.value += 1
        if n_processed.value % 100 == 0:
            print(f"\r{n_processed.value} objects are processed...", end='', flush=True)
    return text

def preprocess_text(text):
    tokens = mystem.lemmatize(text)
    tokens = [token for token in tokens if token not in russian_stopwords\
              and token != " " \
              and token.strip() not in punctuation]
    
    text = " ".join(tokens)
    
    return text

In [19]:
docs_titles['title'] = docs_titles['title'].astype(str)
docs_titles['title'] = [text.lower() for text in docs_titles['title']]
with Pool(10) as pool:
    docs_titles['title'] = list(pool.map(preprocess_text, docs_titles['title']))

In [20]:
with Pool(10) as pool:
    texts = list(pool.map(wrapper, texts))

28000 objects are processed...

In [21]:
docs_to_texts = {'doc_id': range(1, 28027), 'target': texts}
res = pd.DataFrame(docs_to_texts)
res.to_csv('docs_texts.csv', index=False)

In [22]:
docs_titles['title']

0                  ваз 21213 замена подшипник ступица нива
1        ваз 2107 опт сочи сравнивать цена купить потре...
2        купить ступица лад калина2 трансмиссия переход...
3                                     классика 21010 21074
4          ступица нива    —    замена подшипник свой рука
                               ...                        
27945    ответ mail ru полезный кушать творог утро худе...
27946    творог полезный свойство лечение творог женски...
27947              творог полезный опасный свойство творог
27948                        ответ mail ru полезный творог
27949    творог польза вред    ::    выбирать продукт  ...
Name: title, Length: 27950, dtype: object

In [170]:
test_groups = pd.read_csv('anomaly-detection-competition-ml1-ts-fall-2019/test_groups.csv')

In [171]:
train_groups = pd.read_csv('anomaly-detection-competition-ml1-ts-fall-2019/train_groups.csv')

In [172]:
train_groups.shape

(11690, 4)

In [173]:
test_groups.shape

(16627, 3)

In [174]:
%%time
groups_train = []
target_groups = []
for group_id in range(1, 130):
    part = train_groups[train_groups.group_id == group_id]
    arr = []
    arr_t = []
    for doc_id in part['doc_id']:
        arr.append(docs_titles[docs_titles.doc_id == doc_id].iloc[0]['title'])
        arr_t.append(part[part.doc_id == doc_id].iloc[0]['target'])
    groups_train.append(arr)
    target_groups.append(arr_t)
groups_train[0]

CPU times: user 15.9 s, sys: 25.2 ms, total: 15.9 s
Wall time: 15.9 s


['ваз 21213 замена подшипник ступица нива',
 'ваз 2107 опт сочи сравнивать цена купить потребительский товар tiu ru',
 'купить ступица лад калина2 трансмиссия переходный ступица цена замена тюнинг',
 'классика 21010 21074',
 'ступица нива     —     замена подшипник свой рука',
 'ваз 2110',
 'обзор подшипник полуось ваз 2101 07 2121 2123',
 'купить подшипник ступица fag страница 23',
 'horsepowers     —     автомобильный интернет портал     »     отзыв владелец ваз 2121 нива 2007 год',
 'новость сообщение официальный группа вконтакте торговый компания 33 sport магазин тольятти',
 'инструкция замена подшипник передний ступица ивеко дейли dorognoekam ru',
 'ступица olx ua страница 80',
 'маааленькая проблемка     —     бортжурнал авток 2160 ╬ 1994 год drive2',
 'разгружать полуось нива 24 шлиц 765 мм',
 'прошивка нива м 7.9 7 скачать файлообменник emqraty6 foxkirov ru',
 'страница 6 раздел каталог подвеска',
 'продавать нива 2121',
 'рекомендация проведение независимый экспертиза осаго',


In [175]:
%%time
groups_test = []
for group_id in range(130, 310):
    part = test_groups[test_groups.group_id == group_id]
    arr = []
    for doc_id in part['doc_id']:
        if len(docs_titles[docs_titles.doc_id == doc_id]) == 0:
            arr.append('')
        else:
            arr.append(docs_titles[docs_titles.doc_id == doc_id].iloc[0]['title'])
    groups_test.append(arr)
groups_test[0]

CPU times: user 22 s, sys: 34.7 ms, total: 22 s
Wall time: 22 s


['прописывать админк кс 1.6 друг youtube',
 'скачать sgl rp доработка слива мода mysql    ]  |     rp role play готовый сервер samp 0.3 7 0.3 z 0.3 x 0.3 e v sampe ru samp gta',
 'прописывать админк кс 1.6 counter strike каталог статья игровой сообщество dream x ru counter strike портал',
 'прописывать простой админк кс 1 6',
 'подбор админ сервер код 4 архив    ]  -     форум ozone',
 'каталог статья the best original portal in ukraine the best original portal in ukrain',
 'eugene kirian     »     блог',
 'файл htaccess основной параметр увеличивать безопасность блог',
 'давать админк cs 1.6 хостинг my arena ответ игра вокруг свет игра',
 'joomla 2.5 убирать хлебный крошка выбирать страница',
 'sa mp беседка архив    ]  -     страница 3 форум ozone',
 'чужой wifi     ->     форум исходник ру',
 'спасибо комментарий плагин',
 'cs amxmodmenu прописывать админк',
 'пароль admin hack 3 7 c s 1 6',
 'пользоваться сервер кс видео',
 'сделать свой сервер кс 1.6 админка паутинка граб    ) -  

In [244]:
cv = CountVectorizer(min_df=2, max_df=0.6)
tf = TfidfTransformer()
matrix_train = []
for group in groups_train:
    m = cv.fit_transform(group).toarray()
    matrix_train.append(tf.fit_transform(m).toarray())

In [245]:
matrix_test = []
for group in groups_test:
    m = cv.fit_transform(group).toarray()
    matrix_test.append(tf.fit_transform(m).toarray())

In [246]:
np.any(matrix_train[0] == np.nan)

False

In [247]:
len(matrix_train)

129

In [248]:
len(matrix_test)

180

In [249]:
from scipy.spatial.distance import pdist, squareform

In [250]:
dists = []
for i in range(len(matrix_train)):
    dist = pdist(matrix_train[i], metric='cosine')
    dist = squareform(dist)
    dists.append(dist)
    print("\r {} done".format(i))

 0 done
 1 done
 2 done
 3 done
 4 done
 5 done
 6 done
 7 done
 8 done
 9 done
 10 done
 11 done
 12 done
 13 done
 14 done
 15 done
 16 done
 17 done
 18 done
 19 done
 20 done
 21 done
 22 done
 23 done
 24 done
 25 done
 26 done
 27 done
 28 done
 29 done
 30 done
 31 done
 32 done
 33 done
 34 done
 35 done
 36 done
 37 done
 38 done
 39 done
 40 done
 41 done
 42 done
 43 done
 44 done
 45 done
 46 done
 47 done
 48 done
 49 done
 50 done
 51 done
 52 done
 53 done
 54 done
 55 done
 56 done
 57 done
 58 done
 59 done
 60 done
 61 done
 62 done
 63 done
 64 done
 65 done
 66 done
 67 done
 68 done
 69 done
 70 done
 71 done
 72 done
 73 done
 74 done
 75 done
 76 done
 77 done
 78 done
 79 done
 80 done
 81 done
 82 done
 83 done
 84 done
 85 done
 86 done
 87 done
 88 done
 89 done
 90 done
 91 done
 92 done
 93 done
 94 done
 95 done
 96 done
 97 done
 98 done
 99 done
 100 done

In [251]:
np.any(dists[1] is np.nan)

False

In [252]:
dists[0].shape

(102, 102)

In [253]:
for i in range(129):
    dists[i] = np.sort(dists[i], axis=1)
    dists[i] = dists[i][:, 1:21]

In [254]:
dists[0].shape

(102, 20)

In [255]:
X = dists[0]
for i in range(1, 129):
    X = np.vstack((X, dists[i]))

In [282]:
X[np.isnan(X)] = 1
X

array([[0.        , 0.15246438, 0.36250281, ..., 0.74378225, 0.74633308,
        0.75779982],
       [0.32427187, 0.34722545, 0.35323427, ..., 0.86263384, 0.86642534,
        0.86681234],
       [0.44874844, 0.53692262, 0.53964096, ..., 0.75550835, 0.75718391,
        0.76228723],
       ...,
       [1.        , 1.        , 1.        , ..., 1.        , 1.        ,
        1.        ],
       [0.        , 1.        , 1.        , ..., 1.        , 1.        ,
        1.        ],
       [0.45285815, 0.53357501, 0.66450756, ..., 1.        , 1.        ,
        1.        ]])

In [256]:
dists = []
for i in range(len(matrix_test)):
    dist = pdist(matrix_test[i], metric='cosine')
    dist = squareform(dist)
    dists.append(dist)
    print("\r {} done".format(i))

 0 done
 1 done
 2 done
 3 done
 4 done
 5 done
 6 done
 7 done
 8 done
 9 done
 10 done
 11 done
 12 done
 13 done
 14 done
 15 done
 16 done
 17 done
 18 done
 19 done
 20 done
 21 done
 22 done
 23 done
 24 done
 25 done
 26 done
 27 done
 28 done
 29 done
 30 done
 31 done
 32 done
 33 done
 34 done
 35 done
 36 done
 37 done
 38 done
 39 done
 40 done
 41 done
 42 done
 43 done
 44 done
 45 done
 46 done
 47 done
 48 done
 49 done
 50 done
 51 done
 52 done
 53 done
 54 done
 55 done
 56 done
 57 done
 58 done
 59 done
 60 done
 61 done
 62 done
 63 done
 64 done
 65 done
 66 done
 67 done
 68 done
 69 done
 70 done
 71 done
 72 done
 73 done
 74 done
 75 done
 76 done
 77 done
 78 done
 79 done
 80 done
 81 done
 82 done
 83 done
 84 done
 85 done
 86 done
 87 done
 88 done
 89 done
 90 done
 91 done
 92 done
 93 done
 94 done
 95 done
 96 done
 97 done
 98 done
 99 done
 100 done

In [257]:
for i in range(180):
    dists[i] = np.sort(dists[i], axis=1)
    dists[i] = dists[i][:, 1:21]

In [258]:
X_test = dists[0]
for i in range(1, 180):
    X_test = np.vstack((X_test, dists[i]))

In [283]:
X_test[np.isnan(X_test)] = 1

In [260]:
y_train = []
for gr in target_groups:
    y_train = y_train + gr

In [261]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split

In [262]:
params = {
    'objective' :'binary',
    'learning_rate' : 0.02,
    'num_leaves' : 76,
    'feature_fraction': 0.64, 
    'bagging_fraction': 0.8, 
    'bagging_freq':1,
    'boosting_type' : 'gbdt',
    'metric': 'f1_score'
}
X_train, X_valid, Y_train, Y_valid = train_test_split(X,  y_train, random_state=34523, test_size=0.33)
    
    
d_train = lgbm.Dataset(X_train, Y_train)
d_valid = lgbm.Dataset(X_valid, Y_valid)

bst = lgbm.train(params, d_train, 5000, valid_sets=[d_valid], verbose_eval=50)
pred = bst.predict(X_test)

In [263]:
lg = LGBMClassifier(num_leaves=4095)
X_train, X_val, y, y_val = train_test_split(X,  y_train, random_state=345433, test_size=0.33)

In [264]:
lg.fit(X_train, y)
res = lg.predict(X_val)
print(f1_score(res, y_val))

0.5415384615384615


In [269]:
for i in range(pred.size):
    if pred[i] >= 0.5:
        pred[i] = 1
    else:
        pred[i] = 0
data = {'pair_id' : range(11691, 28318), 'target': pred.astype(int)}
res = pd.DataFrame(data)
res.to_csv('sec_pred.csv', index=False)

In [266]:
from sklearn.linear_model import SGDClassifier

In [None]:
j = 0
for loss in ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']:
    for penalty in ['none', 'l2', 'l1', 'elasticnet']:
        for alpha in [0.0001, 0.001, 0.01, 0.1]:
            j += 1
            score = 0.0
            for i in range(3):
                X_train, X_val, y, y_val = train_test_split(X,  y_train, random_state=j+i, test_size=0.33)
                clf = SGDClassifier(loss=loss, penalty=penalty, alpha=alpha, max_iter=100000).fit(X_train, y)
                pred = clf.predict(X_val)
                score += f1_score(pred, y_val)
            score /= 3
            print('DONE for loss: {}, penalty: {}, alpha: {}, score = {}'.format(loss, penalty, alpha, score))

DONE for loss: hinge, penalty: none, alpha: 0.0001, score = 0.4037499848424218
DONE for loss: hinge, penalty: none, alpha: 0.001, score = 0.4390788267906209
DONE for loss: hinge, penalty: none, alpha: 0.01, score = 0.415224848709044
DONE for loss: hinge, penalty: none, alpha: 0.1, score = 0.2524193089976482
DONE for loss: hinge, penalty: l2, alpha: 0.0001, score = 0.4169921895553936
DONE for loss: hinge, penalty: l2, alpha: 0.001, score = 0.3820414240634715
DONE for loss: hinge, penalty: l2, alpha: 0.01, score = 0.3739837799160552
DONE for loss: hinge, penalty: l2, alpha: 0.1, score = 0.11671024724715999
DONE for loss: hinge, penalty: l1, alpha: 0.0001, score = 0.3726942995295805
DONE for loss: hinge, penalty: l1, alpha: 0.001, score = 0.3784639957679909
DONE for loss: hinge, penalty: l1, alpha: 0.01, score = 0.14419693506144843
DONE for loss: hinge, penalty: l1, alpha: 0.1, score = 0.0


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


DONE for loss: hinge, penalty: elasticnet, alpha: 0.0001, score = 0.4656167398474811
DONE for loss: hinge, penalty: elasticnet, alpha: 0.001, score = 0.43721278413820186
DONE for loss: hinge, penalty: elasticnet, alpha: 0.01, score = 0.36881446341826796
DONE for loss: hinge, penalty: elasticnet, alpha: 0.1, score = 0.001777777777777778


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


DONE for loss: log, penalty: none, alpha: 0.0001, score = 0.5228751953248173
DONE for loss: log, penalty: none, alpha: 0.001, score = 0.4679057393379635
DONE for loss: log, penalty: none, alpha: 0.01, score = 0.45617998436949847
DONE for loss: log, penalty: none, alpha: 0.1, score = 0.35562082828124675
DONE for loss: log, penalty: l2, alpha: 0.0001, score = 0.4160808093809658
DONE for loss: log, penalty: l2, alpha: 0.001, score = 0.42841492953181043
DONE for loss: log, penalty: l2, alpha: 0.01, score = 0.43482153033365084
DONE for loss: log, penalty: l2, alpha: 0.1, score = 0.15162304614314195
DONE for loss: log, penalty: l1, alpha: 0.0001, score = 0.44280682063150456
DONE for loss: log, penalty: l1, alpha: 0.001, score = 0.44544730291661166
DONE for loss: log, penalty: l1, alpha: 0.01, score = 0.3509036500217937
DONE for loss: log, penalty: l1, alpha: 0.1, score = 0.0


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


DONE for loss: log, penalty: elasticnet, alpha: 0.0001, score = 0.5490332401793477
DONE for loss: log, penalty: elasticnet, alpha: 0.001, score = 0.43756109889421224
DONE for loss: log, penalty: elasticnet, alpha: 0.01, score = 0.4319804462846741
DONE for loss: log, penalty: elasticnet, alpha: 0.1, score = 0.012507329751660091
DONE for loss: modified_huber, penalty: none, alpha: 0.0001, score = 0.48064646630305646
DONE for loss: modified_huber, penalty: none, alpha: 0.001, score = 0.4029266081243053
DONE for loss: modified_huber, penalty: none, alpha: 0.01, score = 0.4554532261574447
DONE for loss: modified_huber, penalty: none, alpha: 0.1, score = 0.4502102526441676
DONE for loss: modified_huber, penalty: l2, alpha: 0.0001, score = 0.46779974189376183
DONE for loss: modified_huber, penalty: l2, alpha: 0.001, score = 0.426727518395196
DONE for loss: modified_huber, penalty: l2, alpha: 0.01, score = 0.44108296867255065
DONE for loss: modified_huber, penalty: l2, alpha: 0.1, score = 0.42

  'recall', 'true', average, warn_for)


DONE for loss: modified_huber, penalty: elasticnet, alpha: 0.0001, score = 0.48418027343150616
DONE for loss: modified_huber, penalty: elasticnet, alpha: 0.001, score = 0.45671443152469154
DONE for loss: modified_huber, penalty: elasticnet, alpha: 0.01, score = 0.4895941310084893
DONE for loss: modified_huber, penalty: elasticnet, alpha: 0.1, score = 0.3840559589731864
DONE for loss: squared_hinge, penalty: none, alpha: 0.0001, score = 0.43982718762257994
DONE for loss: squared_hinge, penalty: none, alpha: 0.001, score = 0.46654333500967343


In [21]:
docs_titles['title'] = [text.lower() for text in docs_titles['title']]
docs_titles['title'] = [re.sub('[^0-9a-zA-ZА-Яа-я]+', ' ', text) for text in docs_titles['title']]
def norm(text):
    res = ''
    for word in text.split():
        if word != '':
            t = ma.parse(word)[0].normal_form
            res = res + t
            res = res + ' '
    return res

with Pool(10) as pool:
    docs_titles['title'] = list(pool.map(norm, docs_titles['title']))

In [22]:
test_groups = pd.read_csv('anomaly-detection-competition-ml1-ts-fall-2019/test_groups.csv')

In [23]:
train_groups = pd.read_csv('anomaly-detection-competition-ml1-ts-fall-2019/train_groups.csv')

Переведем заголовки в нужный нам формат, чтобы потом сделать из них матрицу - отдельную для каждой группы

In [24]:
%%time
groups_train = []
target_groups = []
for group_id in range(1, 130):
    part = train_groups[train_groups.group_id == group_id]
    arr = []
    arr_t = []
    for doc_id in part['doc_id']:
        arr.append(docs_titles[docs_titles.doc_id == doc_id].iloc[0]['title'])
        arr_t.append(part[part.doc_id == doc_id].iloc[0]['target'])
    groups_train.append(arr)
    target_groups.append(arr_t)
groups_train[0]

CPU times: user 16.3 s, sys: 35.5 ms, total: 16.3 s
Wall time: 16.4 s


['ваза 21213 замена подшипник ступица нива ',
 'ваза 2107 оптом в сочи сравнить цена купить потребительский товар на tiu ru ',
 'купить ступица лада калина2 трансмиссия переходный ступица цена замена тюнинг ',
 'классика 21010 21074 ',
 'ступица нива замена подшипник свой рука ',
 'ваза 2110 ',
 'обзор подшипник полуось ваза 2101 07 2121 2123 ',
 'купить подшипник и ступица fag страница 23 ',
 'horsepowers автомобильный интернет портал отзыв владелец ваза 2121 нива 2007 год ',
 'новость и сообщение из официальный группа вконтакте торговый компания 33 sport магазин тольятти ',
 'инструкция по замена подшипник передний ступица ивеко дейли через dorognoekam ru ',
 'ступица olx ua страница 80 ',
 'маааленький проблёмкий бортжурнал автокам 2160 1994 год на drive2 ',
 'разгрузить полуось для нива 24 шлиц 765 мм ',
 'прошивка для нива м7 9 7 скачать файлообменник emqraty6 foxkirov ru ',
 'страница 6 раздел каталог подвеска ',
 'продать нива 2121 ',
 'рекомендация по проведение независимый экс

In [25]:
%%time
groups_test = []
for group_id in range(130, 310):
    part = test_groups[test_groups.group_id == group_id]
    arr = []
    for doc_id in part['doc_id']:
        if len(docs_titles[docs_titles.doc_id == doc_id]) == 0:
            arr.append('')
        else:
            arr.append(docs_titles[docs_titles.doc_id == doc_id].iloc[0]['title'])
    groups_test.append(arr)
groups_test[0]

CPU times: user 21.9 s, sys: 24.3 ms, total: 21.9 s
Wall time: 21.9 s


['как прописать админк в кс 1 6 себя или друг youtube ',
 'скачать sgl rp доработка слив мода mysql rp role play готовый сервер для samp 0 3 7 0 3z 0 3x 0 3e v sampe ru вс для samp и gta ',
 'как прописать админк в кс 1 6 counter strike каталог стать игровой сообщество dream x ru counter strike портал ',
 'как прописать простой админк в кс 1 6 ',
 'подбор админовый для сервер по код 4 архив форум ozone ',
 'каталог стать the best original portal in ukraine the best original portal in ukrain ',
 'eugene kirian блог ',
 'файл htaccess основной параметр увеличивать безопасность блог ',
 'как дать себя админк в cs 1 6 на хостинг my arena ответ на игра вокруг свет и к другой игра ',
 'joomla 2 5 убирать хлебный крошка с выбрать страница ',
 'sa mp беседка архив страница 3 форум ozone ',
 'чужой wifi форум на исходник ру ',
 'спасибо за комментарий с плагин и без он ',
 'cs amxmodmenu как прописать админк ',
 'пароль на admin hack 3 7 в c s 1 6 ',
 'как пользоваться сервер в кс видео ',
 'ка

Переведём в матрицу и сразу применим tf-idf

In [28]:
magic_forest = IsolationForest(n_estimators=300, behaviour='new', contamination='auto')
all_targ = np.array(train_groups['target'])
n, c = np.unique(all_targ, return_counts=True)
n, c

(array([0, 1]), array([8329, 3361]))

Подобный подбор параметров для Isolation forest я решил убрать, так как score получался в лучшем случае 0.3

In [29]:
scores =[]
preds = []
for k in ['rbf', 'poly', 'linear', 'sigmoid']:
    for degree in range(1, 10):
        for nu in [0.3, 0.4, 0.5, 0.6, 0.7]:
            y_pred = []
            magic_forest = svm.OneClassSVM(kernel=k, degree=degree, gamma='scale', nu=nu)
            for m in matrix_train:
                y = magic_forest.fit_predict(m)
                for i in range(y.size):
                    if y[i] == -1:
                        y[i] = 0
                y =list(y)
                y_pred += y
            score = f1_score(y_pred, all_targ)
            scores.append(score)
            preds.append(y_pred)
            print('DONE FOR KERNEL = {} DEGREE = {} NU = {}, SCORE = {}'.format(k, degree, nu, score))

DONE FOR KERNEL = rbf DEGREE = 1 NU = 0.3, SCORE = 0.4117306579201234
DONE FOR KERNEL = rbf DEGREE = 1 NU = 0.4, SCORE = 0.4025044722719141
DONE FOR KERNEL = rbf DEGREE = 1 NU = 0.5, SCORE = 0.3947564744751146
DONE FOR KERNEL = rbf DEGREE = 1 NU = 0.6, SCORE = 0.3715834118755891
DONE FOR KERNEL = rbf DEGREE = 1 NU = 0.7, SCORE = 0.34838709677419355
DONE FOR KERNEL = rbf DEGREE = 2 NU = 0.3, SCORE = 0.4117306579201234
DONE FOR KERNEL = rbf DEGREE = 2 NU = 0.4, SCORE = 0.4025044722719141
DONE FOR KERNEL = rbf DEGREE = 2 NU = 0.5, SCORE = 0.3947564744751146
DONE FOR KERNEL = rbf DEGREE = 2 NU = 0.6, SCORE = 0.3715834118755891
DONE FOR KERNEL = rbf DEGREE = 2 NU = 0.7, SCORE = 0.34838709677419355
DONE FOR KERNEL = rbf DEGREE = 3 NU = 0.3, SCORE = 0.4117306579201234
DONE FOR KERNEL = rbf DEGREE = 3 NU = 0.4, SCORE = 0.4025044722719141
DONE FOR KERNEL = rbf DEGREE = 3 NU = 0.5, SCORE = 0.3947564744751146
DONE FOR KERNEL = rbf DEGREE = 3 NU = 0.6, SCORE = 0.3715834118755891
DONE FOR KERNEL = 

DONE FOR KERNEL = linear DEGREE = 6 NU = 0.3, SCORE = 0.4644619038918327
DONE FOR KERNEL = linear DEGREE = 6 NU = 0.4, SCORE = 0.44494163424124517
DONE FOR KERNEL = linear DEGREE = 6 NU = 0.5, SCORE = 0.4393151278716949
DONE FOR KERNEL = linear DEGREE = 6 NU = 0.6, SCORE = 0.42106570612346295
DONE FOR KERNEL = linear DEGREE = 6 NU = 0.7, SCORE = 0.3927648578811369
DONE FOR KERNEL = linear DEGREE = 7 NU = 0.3, SCORE = 0.4644619038918327
DONE FOR KERNEL = linear DEGREE = 7 NU = 0.4, SCORE = 0.44494163424124517
DONE FOR KERNEL = linear DEGREE = 7 NU = 0.5, SCORE = 0.4393151278716949
DONE FOR KERNEL = linear DEGREE = 7 NU = 0.6, SCORE = 0.42106570612346295
DONE FOR KERNEL = linear DEGREE = 7 NU = 0.7, SCORE = 0.3927648578811369
DONE FOR KERNEL = linear DEGREE = 8 NU = 0.3, SCORE = 0.4644619038918327
DONE FOR KERNEL = linear DEGREE = 8 NU = 0.4, SCORE = 0.44494163424124517
DONE FOR KERNEL = linear DEGREE = 8 NU = 0.5, SCORE = 0.4393151278716949
DONE FOR KERNEL = linear DEGREE = 8 NU = 0.6, 

In [30]:
y_pred = preds[np.argmax(scores)]
print(np.max(scores))
u,c = np.unique(y_pred, return_counts=True)

0.4644619038918327


In [31]:
c, u

(array([4105, 7585]), array([0, 1]))