# Task 2 JULEBINO TEAM
Participants:
- Соколанов Алексей
- Ешеров Амин
- Белянинов Илья

## Описание:
Данное решение использует матрицу отношений (корреляций) между исходными данными в качестве признаков, на которых обучаются модели. Но перед преобразованием исходных данных мы применяем oversampling, который дублирует случайные наблюдения минорного класса из датасета bnu (и только его!). Из 20 предоставленных наблюдений датасета ihb 10 отходят на тест и 10 — на обучение. \
С помощью CatBoost feature importance мы отбираем топ-40 лучших фичей и работаем с ними. GridSearch помогает нам подобрать гиперпараметры \
### Дополнительное решение:
Catboost выдает лучший результат, но также неплохо себя показал ансамбль из трех моделей: *Кэтбуста*, который хорошо определяет отрицательные классы, *Логистической регрессии*, которая хорошо определяет положительный класс, и *Перцептрона*, который делает ошибки вообще в других местах

### Достоинства решения:
- Высокая скорость обучения
- Малый вес модели
- Возможность доработать до лучшего результата дополнительное решение при наличии большего количества данных

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from scripts.data_utils import get_connectome
from scripts.classification_models import CLModels


In [None]:
# Load the HCPex atlas description, use the HCPex_ID column as the index
# and order the rows by that index. The last line displays the table.
atlas_description_path = './data/atlas/HCPex_Atlas_Description.xlsx'
hcp_table = pd.read_excel(atlas_description_path, index_col='HCPex_ID')
hcp_table = hcp_table.sort_index()
hcp_table

Unnamed: 0_level_0,NEW_ID,Label,Short_label,Cortical Division,Cortical_Division_Number,X,Y,Z,ColeAnticevic_functional_network,ColeAnticevic_functional_network_label
HCPex_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1,Primary_Visual_Cortex_L,L_V1,Primary_Visual,1,-10.078,-84.159,1.759,1,Visual1
2,3,Second_Visual_Area_L,L_V2,Early_Visual,2,-12.044,-82.059,4.029,2,Visual2
3,5,Third_Visual_Area_L,L_V3,Early_Visual,2,-16.607,-84.793,7.259,2,Visual2
4,7,Fourth_Visual_Area_L,L_V4,Early_Visual,2,-29.556,-86.009,-1.542,2,Visual2
5,9,IntraParietal_Sulcus_Area_1_L,L_IPS1,Dorsal_Stream_Visual,3,-24.456,-74.776,38.047,2,Visual2
...,...,...,...,...,...,...,...,...,...,...
422,418,Substantia_nigra_pars_reticulata_R,R_SNpr,Subcortical,24,10.038,-15.978,-13.499,14,Subcortical
423,420,Ventral_tegmenta_area_R,R_VTA,Subcortical,24,3.738,-20.690,-16.786,14,Subcortical
424,422,Mammillary_bodies_R,R_MB,Subcortical,24,2.396,-7.792,-14.783,14,Subcortical
425,424,Septal_nucleus_R,R_Septum,Subcortical,24,1.183,6.383,-3.046,14,Subcortical


## Рукописный оверсэмплинг входных данных:

In [None]:
from sklearn.utils import resample
import random

def random_sampler(X, y, sampling_strategy=1,  seed=42):
    """Oversample the minority class (label 1) by duplicating random rows.

    Rows of ``X`` labelled 0 are treated as the majority class and rows
    labelled 1 as the minority class. Minority rows are drawn with
    replacement until the minority count reaches
    ``int(len(majority) * sampling_strategy)``. If the minority class is
    already at or above that size, the data is returned without duplicates.

    Args:
        X: NumPy array of observations, indexed along its first axis.
        y: 0/1 labels, one per observation; any shape that flattens to
            ``len(X)`` entries (e.g. ``(n,)`` or ``(n, 1)``).
        sampling_strategy: Desired minority/majority size ratio after
            resampling.
        seed: Seed of the resampling draw. The default (42) reproduces the
            value that used to be hard-coded as ``random_state``.

    Returns:
        Tuple ``(X_res, y_res)``. ``X_res`` holds the majority rows, then the
        original minority rows, then the duplicated minority rows. ``y_res``
        is the matching integer label column of shape ``(len(X_res), 1)``.

    Raises:
        ValueError: If duplicates are required but ``X`` has no row with
            label 1 to draw from.
    """
    # Kept for backward compatibility: earlier versions seeded Python's
    # global RNG here. It has no influence on sklearn's ``resample``.
    random.seed(seed)

    labels = np.asarray(y).ravel()
    X_majority = X[labels == 0]
    X_minority = X[labels == 1]

    n_extra = int(len(X_majority) * sampling_strategy - len(X_minority))

    parts = [X_majority, X_minority]
    if n_extra > 0:
        if len(X_minority) == 0:
            raise ValueError(
                "cannot oversample: X contains no samples with label 1"
            )
        # The seed is forwarded so that the ``seed`` argument actually
        # controls which minority rows get duplicated.
        parts.append(resample(X_minority,
                              replace=True,
                              n_samples=n_extra,
                              random_state=seed))

    X_res = np.concatenate(parts)
    n_positive = len(X_res) - len(X_majority)
    # Explicit integer dtype: concatenating with an empty Python list would
    # silently turn the labels into floats.
    y_res = np.concatenate([np.zeros(len(X_majority), dtype=int),
                            np.ones(n_positive, dtype=int)]).reshape(-1, 1)
    return X_res, y_res

# Начало обучения

In [None]:
# Locations of the time series (.npy) and of the label tables (.csv).
bnu_series_path = './data/ts_cut/HCPex/bnu{}.npy'
bnu_labels_path = './data/ts_cut/HCPex/bnu.csv'
ihb_series_path = './data/ts_cut/HCPex/ihb.npy'
ihb_labels_path = './data/ts_cut/HCPex/ihb.csv'

# The BNU series are stored in two files (bnu1.npy, bnu2.npy); join them
# along the first axis.
X_bnu = np.concatenate(
    tuple(np.load(bnu_series_path.format(part)) for part in (1, 2)),
    axis=0,
)
Y_bnu = pd.read_csv(bnu_labels_path).to_numpy()

X_ihb = np.load(ihb_series_path)
Y_ihb = pd.read_csv(ihb_labels_path)  # stays a DataFrame, unlike Y_bnu

# Enlarge the minority class (BNU only): random_sampler duplicates label-1
# rows until their count reaches 0.75 of the label-0 count.
sampling_strategy = 0.75
X_bnu, Y_bnu = random_sampler(X_bnu, Y_bnu, sampling_strategy=sampling_strategy)

# Convert the series of both datasets with get_connectome and report shapes.
X_bnu = get_connectome(X_bnu)
X_ihb = get_connectome(X_ihb)
print(f'X_bhu: {X_bnu.shape}')
print(f'X_ihb: {X_ihb.shape}')

# Stratified 50/50 split of IHB into a train half and a validation half.
ihb_split = train_test_split(
    X_ihb,
    Y_ihb,
    test_size=0.5,
    random_state=10,
    stratify=Y_ihb,
)
x_ihb_train, x_validate, y_ihb_train, y_validate = ihb_split

# NOTE(review): x_validate / y_validate are appended to the training data
# below, so no IHB sample is actually held out and every metric later
# computed on IHB is measured on data the model was fitted on. Presumably
# intentional (fit on all labelled data for the final submission) - confirm,
# or drop the validation half from these two lines.
x_train = np.concatenate((X_bnu, x_ihb_train, x_validate))
y_train = np.concatenate((Y_bnu, y_ihb_train, y_validate))


X_bhu: (166, 419, 419)
X_ihb: (20, 419, 419)


In [None]:
# Display both shapes side by side; the first dimensions (row counts) of
# features and labels are expected to match.
(x_train.shape, y_train.shape)

((186, 419, 419), (186, 1))

In [None]:
# Single CatBoost model inside the project's CLModels wrapper, with the
# wrapper's PCA step switched on and ensembling switched off.
cat_boost = CLModels(pca=True, model="catboost", is_ensemble=False)

# Hyperparameters of the wrapped estimator and of the PCA step.
cat_boost.model.set_params(
    l2_leaf_reg=0.001,
    n_estimators=40,
    early_stopping_rounds=5,
)
cat_boost.pca.set_params(n_components=10)

# Labels are passed as a flat 1-D vector; y_train itself is a column.
flat_train_labels = y_train.flatten()
acc = cat_boost.model_training(x_train, flat_train_labels)

Preprocesses result shape: (186, 40)
0:	learn: 0.6704864	total: 73.1ms	remaining: 2.85s
1:	learn: 0.6471557	total: 74.7ms	remaining: 1.42s
2:	learn: 0.6254643	total: 76.6ms	remaining: 944ms
3:	learn: 0.6040238	total: 77.9ms	remaining: 701ms
4:	learn: 0.5850882	total: 84.1ms	remaining: 588ms
5:	learn: 0.5662706	total: 86ms	remaining: 487ms
6:	learn: 0.5478272	total: 86.8ms	remaining: 409ms
7:	learn: 0.5290870	total: 87.6ms	remaining: 350ms
8:	learn: 0.5131130	total: 91.9ms	remaining: 317ms
9:	learn: 0.4976376	total: 95.2ms	remaining: 286ms
10:	learn: 0.4832730	total: 97ms	remaining: 256ms
11:	learn: 0.4698738	total: 98.8ms	remaining: 231ms
12:	learn: 0.4570436	total: 99.8ms	remaining: 207ms
13:	learn: 0.4450321	total: 100ms	remaining: 187ms
14:	learn: 0.4323920	total: 101ms	remaining: 168ms
15:	learn: 0.4209369	total: 102ms	remaining: 153ms
16:	learn: 0.4100084	total: 103ms	remaining: 140ms
17:	learn: 0.3982143	total: 104ms	remaining: 127ms
18:	learn: 0.3885854	total: 105ms	remaining: 1

In [None]:
# NOTE(review): X_ihb / Y_ihb is the complete IHB set, and both halves of it
# (x_ihb_train and x_validate) were concatenated into x_train before
# fitting. The numbers reported here are therefore measured on training
# data, not on a held-out test set - confirm this is what is wanted.
test_report = cat_boost.model_testing(X_ihb, Y_ihb)
conf_mat, acc, f1 = test_report
conf_mat

Preprocesses result shape: (20, 40)
Accuracy on test: 0.85
F1 score on test: 0.84211


array([[9, 1],
       [2, 8]])

### Код для нахождения важных фичей:

In [None]:
# importances = cat_boost.model.get_feature_importance(type='PredictionValuesChange')
# feature_importances = pd.Series(importances, index=range(logreg.processes_x.shape[-1])).sort_values()
# feature_025 = feature_importances[feature_importances > 0.25]
# plt.figure(figsize=(10, 15))
# plt.barh(feature_025.index, feature_025.values, height=3, color='yellow', linewidth=5)
# feature_025 = feature_importances[feature_importances > 0.3]
# plt.title('CatBoost Feature Importance')
# plt.xlabel('Importance')
# plt.ylabel('Features')
# plt.show()

### Еще одна модель (иногда работает лучше)
Это ансамбль из трех моделей: Кэтбуста, который хорошо определяет отрицательные классы, Логистической регрессии, которая хорошо определяет положительный класс, и Перцептрона, который делает ошибки вообще в других местах

In [None]:
# Same configuration as the single CatBoost model, but with the wrapper's
# ensemble mode switched on.
ensemble = CLModels(pca=True, model="catboost", is_ensemble=True)
ensemble.model.set_params(
    l2_leaf_reg=0.001,
    n_estimators=40,
    early_stopping_rounds=5,
)
ensemble.pca.set_params(n_components=10)

# The training result is stored in acc and then overwritten by the
# testing result unpacked below.
acc = ensemble.model_training(x_train, y_train.flatten())

# NOTE(review): X_ihb is fully contained in x_train (both IHB halves were
# concatenated into it), so this confusion matrix is computed on data the
# ensemble was fitted on - confirm this is intended.
ensemble_report = ensemble.model_testing(X_ihb, Y_ihb)
conf_mat, acc, f1 = ensemble_report
conf_mat

Preprocesses result shape: (186, 40)
0:	learn: 0.6696089	total: 1.72ms	remaining: 67ms
1:	learn: 0.6467509	total: 3.04ms	remaining: 57.8ms
2:	learn: 0.6251433	total: 4.32ms	remaining: 53.3ms
3:	learn: 0.6053225	total: 5.23ms	remaining: 47ms
4:	learn: 0.5856346	total: 6.58ms	remaining: 46.1ms
5:	learn: 0.5672214	total: 7.92ms	remaining: 44.9ms
6:	learn: 0.5501062	total: 9.23ms	remaining: 43.5ms
7:	learn: 0.5314700	total: 78.8ms	remaining: 315ms
8:	learn: 0.5173419	total: 85.7ms	remaining: 295ms
9:	learn: 0.5022413	total: 90.3ms	remaining: 271ms
10:	learn: 0.4867570	total: 91.7ms	remaining: 242ms
11:	learn: 0.4728938	total: 93ms	remaining: 217ms
12:	learn: 0.4602388	total: 126ms	remaining: 261ms
13:	learn: 0.4485570	total: 129ms	remaining: 239ms
14:	learn: 0.4369593	total: 133ms	remaining: 221ms
15:	learn: 0.4252493	total: 145ms	remaining: 217ms
16:	learn: 0.4146526	total: 146ms	remaining: 197ms
17:	learn: 0.4047742	total: 147ms	remaining: 180ms
18:	learn: 0.3944579	total: 148ms	remainin

array([[ 9,  1],
       [ 0, 10]])

In [None]:
import pickle

# Persist the fitted cat_boost wrapper object (the whole CLModels instance,
# not only its inner estimator) as ./model.pkl; the inference script reads
# this file back with pickle.load.
pkl_filename = "./model.pkl"
with open(pkl_filename, mode='wb') as model_file:
    pickle.dump(cat_boost, model_file)

In [None]:
# create local environment same as Yandex Contest
import os
import shutil

# Create the data directory if it is missing. exist_ok=True replaces the
# separate exists() check, which could race with another process creating
# the directory between the check and the makedirs() call.
os.makedirs('./data/ts_cut/HCPex/', exist_ok=True)

# Emulate the contest input locally: the two raw BNU series files, joined
# along the first axis (no oversampling, no connectome transform), are
# stored as predict.npy for the inference script to read.
predict_series = np.concatenate(
    [np.load(bnu_series_path.format(i)) for i in (1, 2)],
    axis=0,
)
np.save('./data/ts_cut/HCPex/predict.npy', predict_series)


In [None]:
# create script, which loads model, does all preprocessing and outputs solution.csv

import numpy as np
import pandas as pd
import pickle

from scripts.data_utils import get_connectome
from scripts.classification_models import CLModels

# Read the series to classify, print their shape as a quick check, then
# apply the same get_connectome transform that was used before training.
X = np.load('./data/ts_cut/HCPex/predict.npy')
print(X.shape)
X = get_connectome(X)

# Restore the pickled model object.
# NOTE(review): pickle.load can execute arbitrary code - only ever load a
# model.pkl that was produced by the training notebook itself.
with open('model.pkl', mode='rb') as model_file:
    model = pickle.load(model_file)

y_pred = model.model_predict(X)
print(y_pred)

# Write the predictions as a single 'prediction' column, without the index.
solution = pd.DataFrame(y_pred, columns=['prediction'])
solution.to_csv('./solution.csv', index=False)

(142, 240, 419)
Preprocesses result shape: (142, 40)
[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 1. 1. 0. 1.
 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 1. 0. 1. 0. 1. 1. 0. 1. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0.
 0. 1. 1. 0. 1. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 0. 0.
 0. 0. 1. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0.]


In [None]:
# build the .zip to submit
import zipfile
import datetime

# save source from previous cell into file
# will produce the correct result only in case of running previous cell just before
# Text of every generated file, keyed by file name.
#
# run.py is taken from IPython's input history: _ih[-1] is this cell, so
# _ih[-2] is the cell executed right before it. The result is only correct
# when the inference cell was the last one run before this cell.
#
# Makefile recipe lines must start with a TAB character. They are spelled
# with an explicit "\t" so that an editor converting tabs to spaces cannot
# silently break the generated Makefile.
generated_files = {
    'run.py': _ih[-2],
    'run.sh': 'export PATH=/usr/conda/bin:$PATH\npython run.py',
    # Placeholder training step: it only prints the numbers 0..99.
    'train.py': 'print("\\n".join(map(str, range(100))))',
    'train.sh': 'export PATH=/usr/conda/bin:$PATH\npython train.py',
    'Makefile': (
        "all: build\n"
        "\n"
        "build:\n"
        "\t@echo 'starting....'\n"
        "\tbash train.sh\n"
        "run:\n"
        "\tbash run.sh\n"
        "train:\n"
        "\tbash train.sh\n"
    ),
}

for generated_name, generated_text in generated_files.items():
    # Explicit encoding so the written sources do not depend on the
    # platform's default locale.
    with open(generated_name, 'w', encoding='utf-8') as generated_file:
        generated_file.write(generated_text)

# Archive members, in the order they are added to the archive.
archive_members = (
    './Makefile',
    'run.py',
    'run.sh',
    'train.py',
    'train.sh',
    'model.pkl',
    'scripts',
    'scripts/__init__.py',
    'scripts/classification_models.py',
    'scripts/data_utils.py',
)

# Timestamped archive name; ':' and ' ' are replaced with '-'.
archive_name = f"submission-{datetime.datetime.now()}.zip".replace(':', '-').replace(' ', '-')

# The context manager closes (and thereby finalizes) the archive even when
# one of the members is missing and write() raises.
with zipfile.ZipFile(archive_name, "w") as submission_zip:
    for member in archive_members:
        submission_zip.write(member, compress_type=zipfile.ZIP_DEFLATED)
