In [1]:
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

from itertools import combinations

from nets_algo import *

## Функция потерь

Создание функции потерь в torch
https://neptune.ai/blog/pytorch-loss-functions. Для задач бинарной классификации рекомендуют использовать CrossEntropy функцию потерь

In [2]:
loss_fn = nn.BCELoss()

# Загрузка и подговтовка данных

In [3]:
data = pd.read_csv(
    "/home/dranik/KFA/university/proc_2.csv", 
    index_col = 0
)

save_ind = data[data['Y'] == 0].sample(
    sum(data['Y']), random_state = 0
).index.union(data[data['Y'] == 1].index)
data = data.loc[save_ind]

Нужно провести One Hot Encoding

In [4]:
Y = np.array(data[['Y']])
X = data.drop('Y', axis = 1)

X = np.concatenate([
    OneHotEncoder(sparse = False).\
    fit_transform(X.loc[:,X.dtypes == "O"]),
    X.loc[:,X.dtypes != "O"].to_numpy()
], axis = 1)

Y.shape

(13926, 1)

Разбивка на Train/Test

In [5]:
X_train, X_test, y_train, y_test = \
    train_test_split(
        X,Y, random_state = 0, stratify = Y
)

## Создание набора данных и загрузчика данных

In [6]:
train_data = My_data_set(
    torch.tensor(X_train.astype('float32')), 
    torch.tensor(y_train.astype('float32'))
)

In [7]:
train_data_loader =\
torch.utils.data.DataLoader(
    train_data, batch_size=1000
)

# Различные варианты модели

## Модели с одним скрытым слоем

Опишем вещи общие для модлей этой группы

In [8]:
import os

sfn = "fiting_results2/"
auc_full = pd.DataFrame()

In [15]:
def save_folder_porecessor(result_name):
    if not result_name in os.listdir("fiting_results2"):
        os.mkdir("fiting_results2/" + result_name)

def model_info_save(
    hlr, epochs, file_name, 
    lc_plot_param = {},
    fit_params = {}
):
    learning_info, auc_info, nets = \
    model_fit_get_perfomance(
        hlr, epochs, loss_fn, 
        train_data_loader, X_test, y_test,
        **fit_params
    )
    
    save_folder_porecessor(file_name)
    learning_info.to_excel(sfn + file_name + '/lc.xlsx')
    
    global auc_full
    if any(auc_full.columns == file_name):
        auc_full.drop(
            file_name, inplace = True, axis = 1
        )
        
    auc_full = auc_full.join(
        pd.DataFrame(auc_info, columns = [file_name]),
        how = 'outer'
    )
    
    for col in learning_info:

        fig = plt.figure(figsize = [14, 7])
        plot_learning_curve(
            learning_info[col],
            **lc_plot_param
        )
        fig.savefig(
            sfn + file_name + "/" +\
            str(col) + '.png'
        )
        plt.close()
    
    return nets

### Обобщенный построитель модели с одним слоем

Базовый вариант

In [16]:
torch.manual_seed(0)

nets = model_info_save(
    [[i] for i in range(2, 20)],
    50, "one_hiden_initial"
)

[2] AUC = 0.5009034730513635
[3] AUC = 0.5440255473212362
[4] AUC = 0.6294614033739119
[5] AUC = 0.4919797590364625
[6] AUC = 0.5645355567865062
[7] AUC = 0.49350099848865797
[8] AUC = 0.4967485197525239
[9] AUC = 0.49711340607525817
[10] AUC = 0.5073533171828796
[11] AUC = 0.49234085133323724
[12] AUC = 0.5168548778472102
[13] AUC = 0.5593926721192869
[14] AUC = 0.5466059798467939
[15] AUC = 0.4800138960324716
[16] AUC = 0.49760184567815907
[17] AUC = 0.518184436509615
[18] AUC = 0.507833343945609
[19] AUC = 0.4830571997251145


Понижение learning rate

In [20]:
torch.manual_seed(0)

nets = model_info_save(
    [[i] for i in range(2, 20)],
    50, "one_hiden_lr_0.01",
    fit_params = {'lr':0.01}
)

[2] AUC = 0.644856076099583
[3] AUC = 0.6276041451878057
[4] AUC = 0.6499595358883513
[5] AUC = 0.6597085330283157
[6] AUC = 0.6600552740095035
[7] AUC = 0.6471428840073888
[8] AUC = 0.6593113150060985
[9] AUC = 0.680565448432424
[10] AUC = 0.6797301028906849
[11] AUC = 0.6697729291958875
[12] AUC = 0.6693465136695456
[13] AUC = 0.6737625949290039
[14] AUC = 0.6768702321053116
[15] AUC = 0.6781775214849091
[16] AUC = 0.6773253502628269
[17] AUC = 0.6209433202213996
[18] AUC = 0.6843243384125993
[19] AUC = 0.6834515474842144


In [22]:
torch.manual_seed(0)

nets = model_info_save(
    [[i] for i in range(2, 20)],
    50, "one_hiden_lr_0.001",
    fit_params = {'lr':0.001}
)

[2] AUC = 0.6582315022264335
[3] AUC = 0.6030587767202527
[4] AUC = 0.6416803114136508
[5] AUC = 0.6060250451901482
[6] AUC = 0.6271297269851912
[7] AUC = 0.6378419118459718
[8] AUC = 0.552448944782406
[9] AUC = 0.5719091637603877
[10] AUC = 0.6317140650480803
[11] AUC = 0.6187427851647647
[12] AUC = 0.6423035214169467
[13] AUC = 0.5814760476542857
[14] AUC = 0.6074327937788531
[15] AUC = 0.5864147147502822
[16] AUC = 0.6317201684811459
[17] AUC = 0.6143603552659926
[18] AUC = 0.6388336372403113
[19] AUC = 0.6269646043771183


## Модели с двумя скрытыми слоями

In [53]:
auc_full = pd.DataFrame()

Варианты числа нейронов в крытых слоях

In [51]:
neurs = [
    list(comb) for comb in 
    list(combinations([2,2,3,3,4,4,5,5,6,6],2))
]

Базовые модели с двумя слоями

In [52]:
nets = model_info_save(
    neurs,
    50, "two_hiden_initial"
)

[2, 2] AUC = 0.6523354209273853
[2, 3] AUC = 0.6508971551733523
[2, 3] AUC = 0.6485463436971826
[2, 4] AUC = 0.6516813638434606
[2, 4] AUC = 0.6501139362491467
[2, 5] AUC = 0.6500911720933885
[2, 5] AUC = 0.633563075351665
[2, 6] AUC = 0.6686040392849943
[2, 6] AUC = 0.6360031289167132
[2, 3] AUC = 0.6470988403147259
[2, 3] AUC = 0.6685492733450541
[2, 4] AUC = 0.6522220950215452
[2, 4] AUC = 0.6399838869367067
[2, 5] AUC = 0.672958591340845
[2, 5] AUC = 0.620668995648747
[2, 6] AUC = 0.6481616624563976
[2, 6] AUC = 0.6689501204355806
[3, 3] AUC = 0.6790534466086521
[3, 4] AUC = 0.6737583060300929
[3, 4] AUC = 0.6553694869916047
[3, 5] AUC = 0.49856272399186957
[3, 5] AUC = 0.6287197537776128
[3, 6] AUC = 0.6667076861357383
[3, 6] AUC = 0.6767826395929374
[3, 4] AUC = 0.649994506910241
[3, 4] AUC = 0.6692242800505827
[3, 5] AUC = 0.6763563890242459
[3, 5] AUC = 0.6686703522604642
[3, 6] AUC = 0.6501076678584307
[3, 6] AUC = 0.6240436992610887
[4, 4] AUC = 0.6730465137685202
[4, 5] AUC 

In [None]:
nets = model_info_save(
    neurs,
    200, "100ep_lr_0.5__lr_decr_1.5_two"
)