### Import

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler

In [2]:
import torch
from torch import Tensor
from torch import nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, ConcatDataset
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from torchmetrics import MetricCollection, AUROC, Recall, Precision, F1Score, Accuracy

In [4]:
import gc
import tqdm
import random
import pickle
import joblib

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder, TargetEncoder, LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_validate
# from scikeras.wrappers import KerasClassifier

In [6]:
SEED = 13
random.seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
PATH = './'
EVAL_SIZE = 0.05
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [7]:
df_train = pd.read_csv('train.csv', index_col='id')
df_test = pd.read_csv('test.csv', index_col='id')

In [8]:
display(df_train.head())
display(df_train.info())
display(df_train.describe())

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,Male,21,1,35.0,0,1-2 Year,Yes,65101.0,124.0,187,0
1,Male,43,1,28.0,0,> 2 Years,Yes,58911.0,26.0,288,1
2,Female,25,1,14.0,1,< 1 Year,No,38043.0,152.0,254,0
3,Female,35,1,1.0,0,1-2 Year,Yes,2630.0,156.0,76,0
4,Female,36,1,15.0,1,1-2 Year,No,31951.0,152.0,294,0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 11504798 entries, 0 to 11504797
Data columns (total 11 columns):
 #   Column                Dtype  
---  ------                -----  
 0   Gender                object 
 1   Age                   int64  
 2   Driving_License       int64  
 3   Region_Code           float64
 4   Previously_Insured    int64  
 5   Vehicle_Age           object 
 6   Vehicle_Damage        object 
 7   Annual_Premium        float64
 8   Policy_Sales_Channel  float64
 9   Vintage               int64  
 10  Response              int64  
dtypes: float64(3), int64(5), object(3)
memory usage: 1.0+ GB


None

Unnamed: 0,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Response
count,11504800.0,11504800.0,11504800.0,11504800.0,11504800.0,11504800.0,11504800.0,11504800.0
mean,38.38356,0.998022,26.41869,0.4629966,30461.37,112.4254,163.8977,0.1229973
std,14.99346,0.0444312,12.99159,0.4986289,16454.75,54.03571,79.97953,0.3284341
min,20.0,0.0,0.0,0.0,2630.0,1.0,10.0,0.0
25%,24.0,1.0,15.0,0.0,25277.0,29.0,99.0,0.0
50%,36.0,1.0,28.0,0.0,31824.0,151.0,166.0,0.0
75%,49.0,1.0,35.0,1.0,39451.0,152.0,232.0,0.0
max,85.0,1.0,52.0,1.0,540165.0,163.0,299.0,1.0


In [9]:
display(df_test.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7669866 entries, 11504798 to 19174663
Data columns (total 10 columns):
 #   Column                Dtype  
---  ------                -----  
 0   Gender                object 
 1   Age                   int64  
 2   Driving_License       int64  
 3   Region_Code           float64
 4   Previously_Insured    int64  
 5   Vehicle_Age           object 
 6   Vehicle_Damage        object 
 7   Annual_Premium        float64
 8   Policy_Sales_Channel  float64
 9   Vintage               int64  
dtypes: float64(3), int64(4), object(3)
memory usage: 643.7+ MB


None

### Data Preprocess

In [10]:
display(df_train.duplicated().sum())
display(df_test.duplicated().sum())

0

0

In [11]:
display(df_train.isna().sum())
display(df_test.isna().sum())

Gender                  0
Age                     0
Driving_License         0
Region_Code             0
Previously_Insured      0
Vehicle_Age             0
Vehicle_Damage          0
Annual_Premium          0
Policy_Sales_Channel    0
Vintage                 0
Response                0
dtype: int64

Gender                  0
Age                     0
Driving_License         0
Region_Code             0
Previously_Insured      0
Vehicle_Age             0
Vehicle_Damage          0
Annual_Premium          0
Policy_Sales_Channel    0
Vintage                 0
dtype: int64

In [12]:
df = pd.concat([df_train, df_test], axis=0)

polsalchan_vc = df['Policy_Sales_Channel'].value_counts()
sparce_polsalchan = polsalchan_vc[polsalchan_vc < 100].index.to_list()
df.loc[df['Policy_Sales_Channel'].isin(sparce_polsalchan), 'Policy_Sales_Channel'] = -1

df.loc[df['Region_Code']==39.2, 'Region_Code'] = 39.0


In [13]:
df_train.loc[df_train['Policy_Sales_Channel'].isin(sparce_polsalchan), 'Policy_Sales_Channel'] = -1
df_test.loc[df_test['Policy_Sales_Channel'].isin(sparce_polsalchan), 'Policy_Sales_Channel'] = -1

df_train.loc[df_train['Region_Code']==39.2, 'Region_Code'] = 39.0
df_test.loc[df_test['Region_Code']==39.2, 'Region_Code'] = 39.0


In [14]:
bin_cols = ['Gender', 'Vehicle_Damage']
cat_cols = ['Region_Code', 'Vehicle_Age', 'Policy_Sales_Channel', 'Vintage']
num_cols = ['Age', 'Driving_License', 'Previously_Insured', 'Annual_Premium']
target   = ['Response']

In [15]:
def to_float32(x):
    return(x.astype(np.float32))

pipeline = make_pipeline(
    ColumnTransformer([('bin_encode',
                        make_pipeline(OrdinalEncoder(),
                                      # FunctionTransformer(lambda x: x.astype(np.float32)),
                                      FunctionTransformer(func=to_float32),
                                      StandardScaler()
                                     ), bin_cols),

                       ('num_encode',
                        make_pipeline(StandardScaler(),
                                      # FunctionTransformer(lambda x: x.astype(np.float32))
                                      FunctionTransformer(func=to_float32),
                                     ), num_cols),
                      
                       ('cat_encode',
                       make_pipeline(OrdinalEncoder(),
                                     # FunctionTransformer(lambda x: x.astype(np.int32))
                                     FunctionTransformer(func=to_float32),
                                    ), cat_cols)],
                       
                       remainder='drop')
)


### Embeddings training (Non-Linear)

##### Dataset

In [16]:
class CosEmb_Dataset(Dataset):
    def __init__(self, df, is_eval=False, is_test=False):
        
        self.df = df.copy()
        self.df.loc[:, target] = self.df.loc[:, target].where(self.df[target]==1, -1)
        self.is_eval = is_eval
        self.is_test = is_test
        self.length = len(self.df)

        if self.is_test:
            self.X = self.df
        else:
            self.X, self.y = self.df.drop(target, axis=1), self.df[target].values
        
        if self.is_test or self.is_eval:
            self.X = pipeline.transform(self.X)
        else:
            self.X = pipeline.fit_transform(self.X)
        gc.collect()

    def __getitem__(self, index):
        contr_index = random.randint(0, self.length-1) 
        # print(contr_index)
        # if self.is_test: return self.X[index], None
        return self.X[index], self.X[contr_index], self.y[index]*self.y[contr_index]

    def __len__(self):
        return self.X.shape[0]


##### CosEmb Model Structure

In [17]:
class CosEmb(nn.Module):
    def __init__(self,
                 input_dim: int = 6+8+2+12+18,
                 emb_szs = [(53, 8), (3, 2), (123, 12), (290, 18)],
                 add_num = False,
                 use_fc = False
                ):
        super(CosEmb, self).__init__()
    
        self.embeddings = nn.ModuleList([nn.Embedding(in_sz, out_sz) for in_sz, out_sz in emb_szs])
        self.add_num = add_num
        self.use_fc = use_fc
        if not add_num: input_dim = sum([i[1] for i in emb_szs])
        self.fc = nn.Sequential(
                                nn.Linear(input_dim, 256),
                                nn.LazyBatchNorm1d(256),
                                nn.ReLU(),
                                nn.Linear(256, 256),
                                # nn.LazyBatchNorm1d(256),
                                # nn.ReLU(),
                                # nn.Linear(256, 256)
        )
    

    def forward(self, x1, x2):
        x1_num = x1[:, :6]
        x1_cat = x1[:, 6:].long()
        x1_cat = [emb_layer(x1_cat[:, i]) for i, emb_layer in enumerate(self.embeddings)]
        x1_cat = torch.cat(x1_cat, dim=-1)
        
        x2_num = x2[:, :6]
        x2_cat = x2[:, 6:].long()
        x2_cat = [emb_layer(x2_cat[:, i]) for i, emb_layer in enumerate(self.embeddings)]
        x2_cat = torch.cat(x2_cat, dim=-1)

        if self.add_num:
            x1 = torch.cat([x1_num, x1_cat], dim=-1).float()
            x2 = torch.cat([x2_num, x2_cat], dim=-1).float()
        else:
            x1 = x1_cat.float()
            x2 = x2_cat.float()

        if self.use_fc:
            x1 = self.fc(x1)
            x2 = self.fc(x2)
        
        return x1, x2

    def make_embs(self, x):
        x_num = x[:, :6]
        x_cat = x[:, 6:].long()
        x_cat = [emb_layer(x_cat[:, i]) for i, emb_layer in enumerate(self.embeddings)]
        x_cat = torch.cat(x_cat, dim=-1)
        
        if self.add_num:
            x = torch.cat([x_num, x_cat], dim=-1).float()

        if self.use_fc:
            x = self.fc(x)

        return x        
    

In [18]:
def train_epoch_cosemb(model, optimizer, loss_fn, train_dataloader):
    model.train()
    losses = 0

    for X1, X2, y in tqdm.tqdm(train_dataloader):
        X1, X2, y = X1.to(DEVICE), X2.to(DEVICE), y.to(DEVICE).squeeze_()

        preds1, preds2 = model(X1, X2)
        optimizer.zero_grad()
        loss = loss_fn(preds1, preds2, y.float())
        
        loss.backward()
        optimizer.step()
        losses += loss.item()

    return losses / len(train_dataloader)
    
def evaluate_cosemb(model, loss_fn, test_dataloader, altmetric=None):
    model.eval()
    losses = 0

    for X1, X2, y in test_dataloader:
        X1, X2, y = X1.to(DEVICE), X2.to(DEVICE), y.to(DEVICE).squeeze_()
        
        preds1, preds2 = model(X1, X2)
        loss = loss_fn(preds1, preds2, y.float())

        losses += loss.item()

    return losses / len(test_dataloader)


##### CosEmb Model Training

In [19]:
# df_train_train, df_train_eval = train_test_split(df_train, test_size=EVAL_SIZE, random_state=SEED,
#                                                  shuffle=True, stratify=df_train[target])

# dataset_train = CosEmb_Dataset(df_train_train)
# dataset_eval = CosEmb_Dataset(df_train_eval, is_eval=True)
# # dataset_train = Base_Dataset(df_train)

# display(len(dataset_train))
# display(len(dataset_eval))


In [20]:
# DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# EMB_0, EMB_1, EMB_2, EMB_3 = 25, 2, 50, 50
# # EMB_0, EMB_1, EMB_2, EMB_3 = 15, 2, 25, 25
# INPUT_DIM = 6+EMB_0+EMB_1+EMB_2+EMB_3
# EMB_SZS = [(53, EMB_0), (3, EMB_1), (123, EMB_2), (290, EMB_3)]
# ADD_NUM = True
# USE_FC = True

# NUM_EPOCHS = 128
# BATCH_SIZE = 1024*4
# LR = 0.001 #0.001
# WEIGHT_DECAY = 1e-4

# cosemb = CosEmb(input_dim=INPUT_DIM,
#               emb_szs=EMB_SZS,
#               add_num=ADD_NUM,
#               use_fc=USE_FC
#               )


# cosemb.to(DEVICE)

# sampler_weights = torch.Tensor(np.where(df_train_train['Response']==0, 1, 8))
# num_samples = len(df_train_train)
# sampler = torch.utils.data.WeightedRandomSampler(num_samples=num_samples, weights=sampler_weights)
# trainloader = torch.utils.data.DataLoader(dataset_train, sampler=sampler,
#                                           batch_size=BATCH_SIZE, shuffle=False,
#                                           num_workers=8, drop_last=False)

# evalloader = torch.utils.data.DataLoader(dataset_eval,
#                                          batch_size=BATCH_SIZE, shuffle=True,
#                                          num_workers=8, drop_last=False)

# loss_fn = nn.CosineEmbeddingLoss()
# optimizer = torch.optim.AdamW(cosemb.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)


In [21]:
# %%time

# for epoch in range(1, NUM_EPOCHS+1):
#     train_loss = train_epoch_cosemb(cosemb, optimizer, loss_fn, trainloader)
#     eval_loss = evaluate_cosemb(cosemb, loss_fn, evalloader)
#     print((f"Epoch: {epoch}, Train loss: {train_loss:.5f}, Val loss: {eval_loss:.5f}"))
    

In [22]:
# sns.heatmap(cosemb.embeddings[2].weight.cpu().detach())


In [23]:
# sns.heatmap(cosemb.embeddings[1].weight.cpu().detach())


In [24]:
# with open(PATH+'pipeline', 'wb') as fp:
#     pickle.dump(pipeline, fp)

In [25]:
# with open(PATH+'embeddings_cosemb-wnum-128.pth', 'wb') as fp:
#     pickle.dump(cosemb.embeddings, fp)

In [26]:
# with open(PATH+'cosemb256-wnum-128.pth', 'wb') as fp:
#     torch.save(cosemb, fp)

##### CosEmb optuna

In [27]:
# df_train_train, df_train_eval = train_test_split(df_train, test_size=EVAL_SIZE, random_state=SEED,
#                                                  shuffle=True, stratify=df_train[target])

# dataset_train = CosEmb_Dataset(df_train_train)
# dataset_eval = CosEmb_Dataset(df_train_eval, is_eval=True)
# # dataset_train = Base_Dataset(df_train)

# display(len(dataset_train))
# display(len(dataset_eval))

In [28]:
def objective(trial):

    # model's params
    EMB_0, EMB_1, EMB_2, EMB_3 = 25, 2, 50, 50
    # EMB_0, EMB_1, EMB_2, EMB_3 = trial.suggest_int('EMB_0', 7, 15, step=1),\
    #                              trial.suggest_int('EMB_1', 2, 3, step=1),\
    #                              trial.suggest_int('EMB_2', 23, 27, step=1),\
    #                              trial.suggest_int('EMB_3', 17, 23, step=1)
    INPUT_DIM = 6+EMB_0+EMB_1+EMB_2+EMB_3
    EMB_SZS = [(53, EMB_0), (3, EMB_1), (123, EMB_2), (290, EMB_3)]
    
    # learning params
    NUM_EPOCHS = trial.suggest_int('NUM_EPOCHS', 16, 64, step=8)
    BATCH_SIZE = 1024*4 #trial.suggest_int('BATCH_SIZE', 2048, 8192, step=2048) #2048
    LR = trial.suggest_float('LR', 1e-5, 1e-2, log=True)
    WEIGHT_DECAY = trial.suggest_float('WEIGHT_DECAY', 1e-6, 1e-3, log=True)
    
    cosemb = CosEmb(input_dim=INPUT_DIM,
                  emb_szs=EMB_SZS,
                  add_num=False
                  )
    cosemb.to(DEVICE)
    
    sampler_weights = torch.Tensor(np.where(df_train_train['Response']==0, 1, 8))
    num_samples = len(df_train_train)
    sampler = torch.utils.data.WeightedRandomSampler(num_samples=num_samples, weights=sampler_weights)
    trainloader = torch.utils.data.DataLoader(dataset_train, sampler=sampler,
                                              batch_size=BATCH_SIZE, shuffle=False,
                                              num_workers=8, drop_last=False)
    
    evalloader = torch.utils.data.DataLoader(dataset_eval,
                                             batch_size=BATCH_SIZE, shuffle=True,
                                             num_workers=8, drop_last=False)
    
    loss_fn = nn.CosineEmbeddingLoss()
    optimizer = torch.optim.AdamW(cosemb.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

    for epoch in range(1, NUM_EPOCHS+1):
        train_loss = train_epoch_cosemb(cosemb, optimizer, loss_fn, trainloader)
        
        if ((epoch)%4==0):
            eval_loss = evaluate_cosemb(cosemb, loss_fn, evalloader)
            trial.report(eval_loss, epoch)
            if trial.should_prune():
                raise optuna.TrialPruned()

    eval_loss = evaluate_cosemb(cosemb, loss_fn, evalloader)

    torch.cuda.empty_cache()
    gc.collect()
    return eval_loss

In [29]:
# sampler = optuna.samplers.TPESampler(seed=SEED)
# # storage = optuna.storages.InMemoryStorage()

# study = optuna.create_study(direction='minimize', sampler=sampler,
#                             study_name='cosemb-study_cosdist', storage='sqlite:///cosemb-study_cosdist.db', load_if_exists=True,
#                             pruner=optuna.pruners.MedianPruner(n_startup_trials=16,
#                                                                n_warmup_steps=16)
#                            )
# study.optimize(objective, n_trials=100)


### Embeddings training (Linear)

##### DataSet

In [30]:
class Base_Dataset(Dataset):
    def __init__(self, df, is_eval=False, is_test=False):
        
        self.df = df
        self.is_eval = is_eval
        self.is_test = is_test

        if self.is_test:
            self.X = self.df
        else:
            self.X, self.y = self.df.drop(target, axis=1), self.df[target].values
        
        if self.is_test or self.is_eval:
            self.X = pipeline.transform(self.X)
        else:
            self.X = pipeline.fit_transform(self.X)
        gc.collect()

    def __getitem__(self, index):
        # if self.is_test: return self.X[index], None
        return self.X[index], self.y[index]

    def __len__(self):
        return self.X.shape[0]


##### FCNNet Model Structure

In [31]:
class FCNNet(nn.Module):
    def __init__(self,
                 input_dim: int = 6+8+2+12+18,
                 layers_num: int = 2,
                 layers_dim: int = 32,
                 activation = nn.ReLU,
                 emb_szs = [(53, 8), (3, 2), (123, 12), (290, 18)],
                 dropout: float = 0.,
                ):
        super(FCNNet, self).__init__()
    
        self.embeddings = nn.ModuleList([nn.Embedding(in_sz, out_sz) for in_sz, out_sz in emb_szs])
    
        fc_layers = []
        fc_layers.append(nn.Linear(input_dim, layers_dim))
        fc_layers.append(nn.LazyBatchNorm1d())
        fc_layers.append(activation())
        fc_layers.append(nn.Dropout(p=dropout))
        for i in range(layers_num):
            fc_layers.append(nn.Linear(layers_dim, layers_dim))
            fc_layers.append(nn.LazyBatchNorm1d())
            fc_layers.append(activation())
            fc_layers.append(nn.Dropout(p=dropout))
        fc_layers.append(nn.Linear(layers_dim, 1))
    
        self.fc_layers = nn.Sequential(*fc_layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x_num = x[:, :6]
        x_cat = x[:, 6:].long()
        x_cat = [emb_layer(x_cat[:, i]) for i, emb_layer in enumerate(self.embeddings)]
        x_cat = torch.cat(x_cat, dim=-1)

        x = torch.cat([x_num, x_cat], dim=-1).float()
        x = self.fc_layers(x)
        return x
    

In [32]:
def train_epoch_fcnn(model, optimizer, loss_fn, train_dataloader, altmetric=None):
    model.train()
    losses = 0

    for X, y in tqdm.tqdm(train_dataloader):
        X, y = X.to(DEVICE), y.to(DEVICE)

        preds = model(X)
        optimizer.zero_grad()
        loss = loss_fn(preds, y.float())
        if altmetric: altmetric.update(preds,y.float())
        
        loss.backward()
        optimizer.step()
        losses += loss.item()

    return losses / len(train_dataloader)
    
def evaluate_fcnn(model, loss_fn, test_dataloader, altmetric=None):
    model.eval()
    losses = 0

    for X, y in test_dataloader:
        X, y = X.to(DEVICE), y.to(DEVICE)
        
        preds = model(X)
        loss = loss_fn(preds, y.float())
        if altmetric: altmetric.update(preds,y.float())
        losses += loss.item()

    return losses / len(test_dataloader)


##### FCNNet Model training

In [33]:
# cosemb = torch.load('cosemb256-wnum-128.pth')

# with open(PATH+'pipeline', 'rb') as fp:
#     pipeline = pickle.load(fp)

In [34]:
# df_train_train, df_train_eval = train_test_split(df_train, test_size=EVAL_SIZE, random_state=SEED,
#                                                  shuffle=True, stratify=df_train[target])

# dataset_train = Base_Dataset(df_train_train, is_eval=True)
# dataset_eval = Base_Dataset(df_train_eval, is_eval=True)
# # dataset_train = Base_Dataset(df_train)

# display(len(dataset_train))
# display(len(dataset_eval))


10929558

575240

In [35]:
# DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# EMB_0, EMB_1, EMB_2, EMB_3 = 25, 2, 50, 50
# INPUT_DIM = 6+EMB_0+EMB_1+EMB_2+EMB_3
# LAYERS_NUM = 2
# LAYERS_DIM = 192
# ACTIVATION = nn.ReLU
# EMB_SZS = [(53, EMB_0), (3, EMB_1), (123, EMB_2), (290, EMB_3)]
# DROPOUT = 0.1

# NUM_EPOCHS = 8
# BATCH_SIZE = 1024*4
# LR = 0.0002 #0.001
# WEIGHT_DECAY = 3e-4

# fcnn = FCNNet(input_dim=INPUT_DIM,
#               layers_num=LAYERS_NUM,
#               layers_dim=LAYERS_DIM,
#               activation=ACTIVATION,
#               emb_szs=EMB_SZS,
#               dropout=DROPOUT
#               )

# altmetric_train = MetricCollection([AUROC(task='binary'),
#                                     Recall(task='binary'),
#                                     Precision(task='binary'),
#                                     F1Score(task='binary'),
#                                     Accuracy(task='binary')
#                                    ])
# altmetric_eval = MetricCollection([AUROC(task='binary'),
#                                    Recall(task='binary'),
#                                    Precision(task='binary'),
#                                    F1Score(task='binary'),
#                                    Accuracy(task='binary')
#                                   ])

# fcnn.to(DEVICE)
# altmetric_train.to(DEVICE)
# altmetric_eval.to(DEVICE)

# trainloader = torch.utils.data.DataLoader(dataset_train,
#                                           batch_size=BATCH_SIZE, shuffle=True,
#                                           num_workers=8, drop_last=False)
# evalloader = torch.utils.data.DataLoader(dataset_eval,
#                                          batch_size=BATCH_SIZE, shuffle=True,
#                                          num_workers=8, drop_last=False)

# loss_fn = nn.BCEWithLogitsLoss(
#                                # weight=torch.Tensor([8]).to(DEVICE),
#                                # pos_weight=torch.Tensor([8]).to(DEVICE)
#                               )
# optimizer = torch.optim.AdamW(fcnn.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)




In [47]:
# fcnn.embeddings = cosemb.embeddings
# for p in fcnn.embeddings.parameters():
#     p.requires_grad = False

In [48]:
# %%time
# PRINT_EVERY = 4

# for epoch in range(1, NUM_EPOCHS+1):
#     train_loss = train_epoch_fcnn(fcnn, optimizer, loss_fn, trainloader, altmetric=altmetric_train)
#     eval_loss = evaluate_fcnn(fcnn, loss_fn, evalloader, altmetric=altmetric_eval)
#     print((f"Epoch: {epoch}, Train loss: {train_loss:.5f}, Val loss: {eval_loss:.5f}"))

#     if ((epoch)%PRINT_EVERY==0):
#         print('Train')
#         for j in [(i, round(altmetric_train[i].compute().item(), 5))
#                   for i in altmetric_train.keys()]: print(j)
#         print()
#         print('Test')
#         for j in [(i, round(altmetric_eval[i].compute().item(), 5))
#                   for i in altmetric_eval.keys()]: print(j)
#         print()

#     altmetric_train.reset()
#     altmetric_eval.reset()
    

100%|██████████| 2669/2669 [00:21<00:00, 124.33it/s]


Epoch: 1, Train loss: 0.34209, Val loss: 0.26878


100%|██████████| 2669/2669 [00:22<00:00, 117.03it/s]


Epoch: 2, Train loss: 0.26567, Val loss: 0.25878


100%|██████████| 2669/2669 [00:22<00:00, 120.47it/s]


Epoch: 3, Train loss: 0.25962, Val loss: 0.25611


100%|██████████| 2669/2669 [00:22<00:00, 116.80it/s]


Epoch: 4, Train loss: 0.25754, Val loss: 0.25511
Train
('BinaryAUROC', 0.87184)
('BinaryRecall', 0.0816)
('BinaryPrecision', 0.54992)
('BinaryF1Score', 0.14211)
('BinaryAccuracy', 0.87882)

Test
('BinaryAUROC', 0.87517)
('BinaryRecall', 0.11823)
('BinaryPrecision', 0.541)
('BinaryF1Score', 0.19405)
('BinaryAccuracy', 0.87921)



100%|██████████| 2669/2669 [00:22<00:00, 120.08it/s]


Epoch: 5, Train loss: 0.25650, Val loss: 0.25454


100%|██████████| 2669/2669 [00:21<00:00, 125.72it/s]


Epoch: 6, Train loss: 0.25590, Val loss: 0.25439


100%|██████████| 2669/2669 [00:22<00:00, 119.27it/s]


Epoch: 7, Train loss: 0.25545, Val loss: 0.25402


100%|██████████| 2669/2669 [00:22<00:00, 116.92it/s]


Epoch: 8, Train loss: 0.25509, Val loss: 0.25372
Train
('BinaryAUROC', 0.87495)
('BinaryRecall', 0.09721)
('BinaryPrecision', 0.55691)
('BinaryF1Score', 0.16553)
('BinaryAccuracy', 0.87945)

Test
('BinaryAUROC', 0.87683)
('BinaryRecall', 0.1211)
('BinaryPrecision', 0.54895)
('BinaryF1Score', 0.19842)
('BinaryAccuracy', 0.87966)



100%|██████████| 2669/2669 [00:22<00:00, 118.50it/s]


Epoch: 9, Train loss: 0.25477, Val loss: 0.25360


100%|██████████| 2669/2669 [00:23<00:00, 114.45it/s]


Epoch: 10, Train loss: 0.25456, Val loss: 0.25336


100%|██████████| 2669/2669 [00:22<00:00, 119.24it/s]


Epoch: 11, Train loss: 0.25432, Val loss: 0.25310


100%|██████████| 2669/2669 [00:23<00:00, 115.48it/s]


Epoch: 12, Train loss: 0.25415, Val loss: 0.25308
Train
('BinaryAUROC', 0.87625)
('BinaryRecall', 0.10585)
('BinaryPrecision', 0.55774)
('BinaryF1Score', 0.17793)
('BinaryAccuracy', 0.8797)

Test
('BinaryAUROC', 0.87769)
('BinaryRecall', 0.13228)
('BinaryPrecision', 0.54808)
('BinaryF1Score', 0.21312)
('BinaryAccuracy', 0.87986)



100%|██████████| 2669/2669 [00:23<00:00, 113.65it/s]


Epoch: 13, Train loss: 0.25401, Val loss: 0.25301


100%|██████████| 2669/2669 [00:22<00:00, 120.95it/s]


Epoch: 14, Train loss: 0.25385, Val loss: 0.25286


 57%|█████▋    | 1514/2669 [00:12<00:09, 125.88it/s]

KeyboardInterrupt



In [23]:
# with open(PATH+'pipeline', 'wb') as fp:
#     pickle.dump(pipeline, fp)

In [24]:
# with open(PATH+'embeddings', 'wb') as fp:
#     pickle.dump(fcnn.embeddings, fp)

##### FCNNet optuna

In [24]:
# df_train_train, df_train_eval = train_test_split(df_train, test_size=EVAL_SIZE, random_state=SEED,
#                                                  shuffle=True, stratify=df_train[target])

# dataset_train = Base_Dataset(df_train_train)
# dataset_eval = Base_Dataset(df_train_eval, is_eval=True)

In [25]:
ACTIVATIONS = {'ReLU': nn.ReLU,
               'SELU': nn.SELU,
               'GELU': nn.GELU,
               'RReLU': nn.RReLU,
               'SiLU': nn.SiLU,
               'LeakyReLU': nn.LeakyReLU,
               'IDENTITY': nn.Identity,
              }

def objective(trial):

    # model's params
    EMB_0, EMB_1, EMB_2, EMB_3 = trial.suggest_int('EMB_0', 7, 15, step=1),\
                                 trial.suggest_int('EMB_1', 2, 3, step=1),\
                                 trial.suggest_int('EMB_2', 23, 27, step=1),\
                                 trial.suggest_int('EMB_3', 17, 23, step=1)
    INPUT_DIM = 6+EMB_0+EMB_1+EMB_2+EMB_3
    LAYERS_NUM = 0 #trial.suggest_int('LAYERS_NUM', 0, 4, step=1) #4
    LAYERS_DIM = 192 #trial.suggest_int('LAYERS_DIM', 128, 384, step=64) #64
    ACTIVATION_OPTIONS = 'ReLU' #trial.suggest_categorical('ACTIVATION', ['ReLU', 'SELU', 'GELU', 'RReLU'])
    ACTIVATION = ACTIVATIONS[ACTIVATION_OPTIONS]
    EMB_SZS = [(53, EMB_0), (3, EMB_1), (123, EMB_2), (290, EMB_3)]
    DROPOUT = 0.15 #trial.suggest_float('DROPOUT', 0, 0.2)
    
    # learning params
    NUM_EPOCHS = trial.suggest_int('NUM_EPOCHS', 16, 64, step=8)
    BATCH_SIZE = 6144 #trial.suggest_int('BATCH_SIZE', 2048, 8192, step=2048) #2048
    LR = trial.suggest_float('LR', 7e-4, 3e-3, log=True)
    WEIGHT_DECAY = trial.suggest_float('WEIGHT_DECAY', 1e-4, 1e-3, log=True)
    # POS_WEIGHT = trial.suggest_float('POS_WEIGHT', 1/4, 8, log=True)
    
    fcnn = FCNNet(input_dim=INPUT_DIM,
                  layers_num=LAYERS_NUM,
                  layers_dim=LAYERS_DIM,
                  activation=ACTIVATION,
                  emb_szs=EMB_SZS,
                  dropout=DROPOUT
                  )
    # altmetric_train = MetricCollection([AUROC(task='binary'),
    #                                     # Recall(task='binary'),
    #                                     # Precision(task='binary'),
    #                                     # F1Score(task='binary'),
    #                                     # Accuracy(task='binary')
    #                                    ])
    altmetric_eval = MetricCollection([AUROC(task='binary'),
                                       # Recall(task='binary'),
                                       # Precision(task='binary'),
                                       # F1Score(task='binary'),
                                       # Accuracy(task='binary')
                                      ])
    
    fcnn.to(DEVICE)
    # altmetric_train.to(DEVICE)
    altmetric_eval.to(DEVICE)
    
    trainloader = torch.utils.data.DataLoader(dataset_train,
                                              batch_size=BATCH_SIZE, shuffle=True,
                                              num_workers=8, drop_last=False)
    evalloader = torch.utils.data.DataLoader(dataset_eval,
                                             batch_size=BATCH_SIZE, shuffle=True,
                                             num_workers=8, drop_last=False)
    
    loss_fn = nn.BCEWithLogitsLoss(
        # pos_weight=torch.Tensor([POS_WEIGHT]).to(DEVICE)
    )
    optimizer = torch.optim.AdamW(fcnn.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

    for epoch in range(1, NUM_EPOCHS+1):
        train_loss = train_epoch_fcnn(fcnn, optimizer, loss_fn, trainloader)
        
        if ((epoch)%4==0):
            eval_loss = evaluate_fcnn(fcnn, loss_fn, evalloader, altmetric=altmetric_eval)
            intermid_value = altmetric_eval['BinaryAUROC'].compute().item()
            # trial.report(eval_loss, epoch)
            trial.report(intermid_value, epoch)
            altmetric_eval.reset()
            if trial.should_prune():
                raise optuna.TrialPruned()

    eval_loss = evaluate_fcnn(fcnn, loss_fn, evalloader, altmetric=altmetric_eval)

    torch.cuda.empty_cache()
    gc.collect()
    # return eval_loss
    return altmetric_eval['BinaryAUROC'].compute().item()

In [26]:
# sampler = optuna.samplers.TPESampler(seed=SEED)
# # storage = optuna.storages.InMemoryStorage()

# study = optuna.create_study(direction='maximize', sampler=sampler,
#                             study_name='fcnn-study_auc3', storage='sqlite:///fcnn-study_bce.db', load_if_exists=True,
#                             pruner=optuna.pruners.MedianPruner(n_startup_trials=16,
#                                                                n_warmup_steps=16)
#                            )
# study.optimize(objective, n_trials=100)


### DAE training

##### DAE Model Structure

In [41]:
class DAE(nn.Module):
    def __init__(self,
                 input_dim: int = 6+8+2+12+18,
                 layers_num: int = 3,
                 layers_dim: int = 64,
                 activation = nn.ReLU,
                 emb_szs = [(53, 8), (3, 2), (123, 12), (290, 18)],
                 emb_weights = False,
                 dropout: float = 0.,
                 swapnoise_ratio = 0.15,
                 return_obfuscation_mask = False,
                ):
        super(DAE, self).__init__()
    
        self.embeddings = nn.ModuleList([nn.Embedding(in_sz, out_sz) for in_sz, out_sz in emb_szs])
        if (emb_weights==True):
            for i, emb in enumerate(embeddings):
                emb.weight = emb_weights[0].weight
                for p in emb.parameters():
                    p.requires_grad = False
        self.swapnoise_ratio = swapnoise_ratio
        self.return_obfuscation_mask = return_obfuscation_mask
    
        dae_layers = []
        dae_layers.append(nn.Linear(input_dim, layers_dim))
        dae_layers.append(activation())
        dae_layers.append(nn.Dropout(p=dropout))
        for i in range(layers_num):
            dae_layers.append(nn.Linear(layers_dim, layers_dim))
            dae_layers.append(activation())
            dae_layers.append(nn.Dropout(p=dropout))
        dae_layers.append(nn.Linear(layers_dim, input_dim))
    
        self.dae = nn.Sequential(*dae_layers)

        ###########################
        # distinctions for make_denoise
        self.dae_layers = []
        self.dae_layers.append([list(self.dae.children())[0]])
        for i in range(1, len(self.dae)-3, 3):
            self.dae_layers.append(list(self.dae.children())[:i+2+1])
        self.dae_layers = [nn.Sequential(*i) for i in self.dae_layers]

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x_num = x[:, :6]
        x_cat = x[:, 6:].long()
        x_cat = [emb_layer(x_cat[:, i]) for i, emb_layer in enumerate(self.embeddings)]
        x_cat = torch.cat(x_cat, dim=-1)

        x = torch.cat([x_num, x_cat], dim=-1).float()
        x_orig = x.clone().detach() # for backforwarding
        x, _ = self.add_swapnoise(x, ratio=self.swapnoise_ratio)
        x = self.dae(x)

        if (self.return_obfuscation_mask==True): return x, x_orig, _
        return x, x_orig, None

    def make_denoise(self, x: torch.Tensor):
        if len(x.shape) == 1: x = x.unsqueeze(0)
        x_num = x[:, :6]
        x_cat = x[:, 6:].long()
        x_cat = [emb_layer(x_cat[:, i]) for i, emb_layer in enumerate(self.embeddings)]
        x_cat = torch.cat(x_cat, dim=-1)
        x = torch.cat([x_num, x_cat], dim=-1).float()
        return torch.cat([i(x) for i in self.dae_layers][1:], dim=-1) #dropping first output

    # https://www.kaggle.com/code/ryanzhang/pytorch-dae-starter-code
    def add_swapnoise(self, x, ratio=0.15):
        obfuscation_mask = torch.bernoulli(ratio * torch.ones(x.shape)).to(DEVICE)
        obfuscated_x = torch.where(obfuscation_mask == 1, x[torch.randperm(x.shape[0])], x)
        return obfuscated_x, obfuscation_mask

class BottleDAE(nn.Module):
    def __init__(self,
                 input_dim: int = 6+8+2+12+18,
                 layers_num: int = 2,
                 layers_dim: int = 1024,
                 gist_dim: int = 128,
                 activation = nn.ReLU,
                 emb_szs = [(53, 8), (3, 2), (123, 12), (290, 18)],
                 emb_weights = False,
                 dropout: float = 0.,
                 swapnoise_ratio = 0.15,
                 return_obfuscation_mask = False,
                ):
        super(BottleDAE, self).__init__()
    
        self.embeddings = nn.ModuleList([nn.Embedding(in_sz, out_sz) for in_sz, out_sz in emb_szs])
        if (emb_weights==True):
            for i, emb in enumerate(embeddings):
                emb.weight = emb_weights[0].weight
                for p in emb.parameters():
                    p.requires_grad = False
        self.swapnoise_ratio = swapnoise_ratio
        self.return_obfuscation_mask = return_obfuscation_mask
    
        dae_layers = []
        dae_layers.append(nn.Linear(input_dim, layers_dim))
        dae_layers.append(activation())
        dae_layers.append(nn.Dropout(p=dropout))
        for i in range(layers_num-1):
            dae_layers.append(nn.Linear(layers_dim, layers_dim))
            dae_layers.append(activation())
            dae_layers.append(nn.Dropout(p=dropout))
        dae_layers.append(nn.Linear(layers_dim, gist_dim))
        dae_layers.append(activation())
        dae_layers.append(nn.Dropout(p=dropout))
        dae_layers.append(nn.Linear(gist_dim, layers_dim))
        dae_layers.append(activation())
        dae_layers.append(nn.Dropout(p=dropout))
        for i in range(layers_num-1):
            dae_layers.append(nn.Linear(layers_dim, layers_dim))
            dae_layers.append(activation())
            dae_layers.append(nn.Dropout(p=dropout))
        dae_layers.append(nn.Linear(layers_dim, input_dim))

        self.dae = nn.Sequential(*dae_layers)

        ###########################
        # distinctions for make_denoise
        self.dae_layers = nn.Sequential(*list(self.dae.children())[:3+(layers_num-1)*3+1])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x_num = x[:, :6]
        x_cat = x[:, 6:].long()
        x_cat = [emb_layer(x_cat[:, i]) for i, emb_layer in enumerate(self.embeddings)]
        x_cat = torch.cat(x_cat, dim=-1)

        x = torch.cat([x_num, x_cat], dim=-1).float()
        x_orig = x.clone().detach() # for backforwarding
        x, _ = self.add_swapnoise(x, ratio=self.swapnoise_ratio)
        x = self.dae(x)

        if (self.return_obfuscation_mask==True): return x, x_orig, _
        return x, x_orig, None

    def make_denoise(self, x: torch.Tensor):
        if len(x.shape) == 1: x = x.unsqueeze(0)
        x_num = x[:, :6]
        x_cat = x[:, 6:].long()
        x_cat = [emb_layer(x_cat[:, i]) for i, emb_layer in enumerate(self.embeddings)]
        x_cat = torch.cat(x_cat, dim=-1)
        x = torch.cat([x_num, x_cat], dim=-1).float()
        return self.dae_layers(x)

    # https://www.kaggle.com/code/ryanzhang/pytorch-dae-starter-code
    def add_swapnoise(self, x, ratio=0.15):
        obfuscation_mask = torch.bernoulli(ratio * torch.ones(x.shape)).to(DEVICE)
        obfuscated_x = torch.where(obfuscation_mask == 1, x[torch.randperm(x.shape[0])], x)
        return obfuscated_x, obfuscation_mask
    

In [42]:
def train_epoch_dae(model, optimizer, loss_fn, train_dataloader):
    model.train()
    losses = 0

    for X, y in tqdm.tqdm(train_dataloader):
    # for X, y in train_dataloader:
        X = X.to(DEVICE)

        preds, orig, mask = model(X)
        optimizer.zero_grad()
        loss = loss_fn(preds, orig, mask)
        
        loss.backward()
        optimizer.step()
        losses += loss.item()

    return losses / len(train_dataloader)
    
def evaluate_dae(model, loss_fn, test_dataloader):
    model.eval()
    losses = 0

    for X, y in test_dataloader:
        X = X.to(DEVICE)
        
        preds, orig, mask = model(X)
        loss = loss_fn(preds, orig, mask)
        losses += loss.item()

    return losses / len(test_dataloader)


In [43]:
class MSE_Weighted(nn.Module):
    # when no mask and emphasis = 1 - equvivalent to MSE
    def __init__(self, emphasis=1):
        self.emphasis = emphasis
        # emphasis between 0 and 1
        super().__init__()

    def forward(self, pred, actual, mask=None):
        if (mask is None): mask = torch.ones(pred.shape).to(DEVICE)
        loss_weights = mask * self.emphasis + (1 - mask) * (1 - self.emphasis)
        unweighted_loss = nn.functional.mse_loss(pred, actual, reduction='none')
        weighted_loss = loss_weights * unweighted_loss
        return weighted_loss.mean()


##### DAE Training

In [23]:
# with open(PATH+'pipeline', 'rb') as fp:
#     pipeline = pickle.load(fp)

In [44]:
# with open(PATH+'embeddings', 'rb') as fp:
#     embeddings = pickle.load(fp)

In [51]:
# df_train_, df_eval_ = train_test_split(df, test_size=EVAL_SIZE, random_state=SEED,
#                                      shuffle=True, stratify=df['Response'].fillna(2)) # without stratify because of  NaNs in test

# dataset_train = Base_Dataset(df_train_, is_eval=True) #eval to not to override the pipeline
# dataset_eval = Base_Dataset(df_eval_, is_eval=True) #eval to not to override the pipeline

# display(len(dataset_train))
# display(len(dataset_eval))


18215930

958734

In [52]:
# DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# EMB_0, EMB_1, EMB_2, EMB_3 = 25, 2, 50, 50
# INPUT_DIM = 6+EMB_0+EMB_1+EMB_2+EMB_3
# LAYERS_NUM = 4
# LAYERS_DIM = 1024
# # GIST = 128
# ACTIVATION = nn.ReLU
# EMB_SZS = [(53, EMB_0), (3, EMB_1), (123, EMB_2), (290, EMB_3)]
# EMB_WEIGHTS = embeddings
# DROPOUT = 0.
# SWAPNOISE_RATIO = 0.2

# NUM_EPOCHS = 32
# BATCH_SIZE = 1024*4 #96
# LR = 1e-4 #2e-4 
# WEIGHT_DECAY = 5e-5 #6e-5

# dae = DAE(input_dim = INPUT_DIM,
#               layers_num = LAYERS_NUM,
#               layers_dim = LAYERS_DIM,
#               # gist_dim = GIST,
#               activation = ACTIVATION,
#               emb_szs = EMB_SZS,
#               emb_weights = EMB_WEIGHTS,
#               dropout = DROPOUT,
#               swapnoise_ratio = SWAPNOISE_RATIO,
#               return_obfuscation_mask=True)
# dae.to(DEVICE)

# trainloader = torch.utils.data.DataLoader(dataset_train,
#                                           batch_size=BATCH_SIZE, shuffle=True,
#                                           num_workers=8, drop_last=False)
# evalloader = torch.utils.data.DataLoader(dataset_eval,
#                                          batch_size=BATCH_SIZE, shuffle=True,
#                                          num_workers=8, drop_last=False)

# loss_fn = MSE_Weighted(emphasis=4/5)
# # loss_fn = MSE_Weighted(emphasis=1)
# # loss_fn = nn.MSELoss()
# optimizer = torch.optim.AdamW(dae.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)


In [53]:
# %%time
# for epoch in range(1, NUM_EPOCHS+1):
#     train_loss = train_epoch_dae(dae, optimizer, loss_fn, trainloader)
#     eval_loss = evaluate_dae(dae, loss_fn, evalloader)
#     print((f"Epoch: {epoch}, Train loss: {train_loss:.5f}, Val loss: {eval_loss:.5f}"))


100%|██████████| 4448/4448 [01:20<00:00, 55.53it/s]


Epoch: 1, Train loss: 0.06353, Val loss: 0.02430


100%|██████████| 4448/4448 [01:20<00:00, 55.10it/s]


Epoch: 2, Train loss: 0.01864, Val loss: 0.01535


100%|██████████| 4448/4448 [01:19<00:00, 55.61it/s]


Epoch: 3, Train loss: 0.01380, Val loss: 0.01267


100%|██████████| 4448/4448 [01:20<00:00, 55.54it/s]


Epoch: 4, Train loss: 0.01191, Val loss: 0.01137


100%|██████████| 4448/4448 [01:20<00:00, 55.39it/s]


Epoch: 5, Train loss: 0.01086, Val loss: 0.01055


100%|██████████| 4448/4448 [01:20<00:00, 55.28it/s]


Epoch: 6, Train loss: 0.01020, Val loss: 0.00990


100%|██████████| 4448/4448 [01:20<00:00, 55.31it/s]


Epoch: 7, Train loss: 0.00968, Val loss: 0.00941


100%|██████████| 4448/4448 [01:20<00:00, 55.37it/s]


Epoch: 8, Train loss: 0.00930, Val loss: 0.00919


100%|██████████| 4448/4448 [01:20<00:00, 55.42it/s]


Epoch: 9, Train loss: 0.00900, Val loss: 0.00892


100%|██████████| 4448/4448 [01:19<00:00, 55.67it/s]


Epoch: 10, Train loss: 0.00878, Val loss: 0.00864


100%|██████████| 4448/4448 [01:18<00:00, 56.40it/s]


Epoch: 11, Train loss: 0.00858, Val loss: 0.00847


100%|██████████| 4448/4448 [01:20<00:00, 55.26it/s]


Epoch: 12, Train loss: 0.00841, Val loss: 0.00836


100%|██████████| 4448/4448 [01:19<00:00, 55.66it/s]


Epoch: 13, Train loss: 0.00828, Val loss: 0.00819


100%|██████████| 4448/4448 [01:19<00:00, 55.73it/s]


Epoch: 14, Train loss: 0.00819, Val loss: 0.00805


100%|██████████| 4448/4448 [01:19<00:00, 55.95it/s]


Epoch: 15, Train loss: 0.00810, Val loss: 0.00806


100%|██████████| 4448/4448 [01:19<00:00, 55.71it/s]


Epoch: 16, Train loss: 0.00802, Val loss: 0.00790


100%|██████████| 4448/4448 [01:20<00:00, 55.37it/s]


Epoch: 17, Train loss: 0.00794, Val loss: 0.00795


100%|██████████| 4448/4448 [01:20<00:00, 55.24it/s]


Epoch: 18, Train loss: 0.00786, Val loss: 0.00785


100%|██████████| 4448/4448 [01:20<00:00, 55.34it/s]


Epoch: 19, Train loss: 0.00782, Val loss: 0.00777


100%|██████████| 4448/4448 [01:20<00:00, 55.16it/s]


Epoch: 20, Train loss: 0.00777, Val loss: 0.00784


100%|██████████| 4448/4448 [01:20<00:00, 55.35it/s]


Epoch: 21, Train loss: 0.00773, Val loss: 0.00771


100%|██████████| 4448/4448 [01:20<00:00, 55.26it/s]


Epoch: 22, Train loss: 0.00769, Val loss: 0.00764


100%|██████████| 4448/4448 [01:20<00:00, 55.33it/s]


Epoch: 23, Train loss: 0.00766, Val loss: 0.00763


100%|██████████| 4448/4448 [01:19<00:00, 55.74it/s]


Epoch: 24, Train loss: 0.00764, Val loss: 0.00762


100%|██████████| 4448/4448 [01:19<00:00, 55.69it/s]


Epoch: 25, Train loss: 0.00759, Val loss: 0.00756


100%|██████████| 4448/4448 [01:20<00:00, 55.57it/s]


Epoch: 26, Train loss: 0.00756, Val loss: 0.00751


100%|██████████| 4448/4448 [01:20<00:00, 55.58it/s]


Epoch: 27, Train loss: 0.00754, Val loss: 0.00750


100%|██████████| 4448/4448 [01:20<00:00, 55.38it/s]


Epoch: 28, Train loss: 0.00752, Val loss: 0.00756


100%|██████████| 4448/4448 [01:20<00:00, 55.33it/s]


Epoch: 29, Train loss: 0.00752, Val loss: 0.00759


100%|██████████| 4448/4448 [01:19<00:00, 55.60it/s]


Epoch: 30, Train loss: 0.00749, Val loss: 0.00743


100%|██████████| 4448/4448 [01:20<00:00, 55.34it/s]


Epoch: 31, Train loss: 0.00749, Val loss: 0.00740


100%|██████████| 4448/4448 [01:20<00:00, 55.52it/s]


Epoch: 32, Train loss: 0.00747, Val loss: 0.00751
CPU times: user 1h 4min 7s, sys: 2min 51s, total: 1h 6min 59s
Wall time: 44min 43s


In [55]:
# torch.save(dae, 'bottle-dae-model_in-17_ln-2_ld-1024_gist-128_sr-0.2_epch-32_btch-4096_lr-5e4_wd-5e5_unweight.pth')
# torch.save(dae, 'bottle-dae-model_in-133_test.pth')
# torch.save(dae, 'dae-model_in-133_test.pth')

##### DAE optuna

In [36]:
# df_train, df_eval = train_test_split(df, test_size=EVAL_SIZE, random_state=SEED,
#                                      shuffle=True, stratify=df['Response'].fillna(2)) #2 for random in NaNs

# dataset_train = Base_Dataset(df_train, is_eval=True) #eval to not to override the pipeline
# dataset_eval = Base_Dataset(df_eval, is_eval=True) #eval to not to override the pipeline

# display(len(dataset_train))
# display(len(dataset_eval))

In [37]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
EMPHASIS = 0.8
ACTIVATIONS = {'ReLU': nn.ReLU,
               'SELU': nn.SELU,
               'GELU': nn.GELU,
               'RReLU': nn.RReLU,
               'SiLU': nn.SiLU,
               'LeakyReLU': nn.LeakyReLU,
               'IDENTITY': nn.Identity,
              }

def objective(trial):

    # model's params
    INPUT_DIM = 6+25+2+50+50
    LAYERS_NUM = trial.suggest_int('LAYERS_NUM', 2, 5, step=1) #4
    LAYERS_DIM = trial.suggest_int('LAYERS_DIM', 256, 2048, step=256) #64
    ACTIVATION_OPTIONS = trial.suggest_categorical('ACTIVATION', ['ReLU', 'SELU', 'GELU', 'RReLU'])
    ACTIVATION = ACTIVATIONS[ACTIVATION_OPTIONS]
    EMB_SZS = [(53, 25), (3, 2), (123, 50), (290, 50)]
    EMB_WEIGHTS = embeddings
    DROPOUT = trial.suggest_float('DROPOUT', 0, 0.2)
    
    # learning params
    NUM_EPOCHS = trial.suggest_int('NUM_EPOCHS', 16, 64, step=8)
    BATCH_SIZE = trial.suggest_int('BATCH_SIZE', 1024, 4096, step=1024) #2048
    LR = trial.suggest_float('LR', 1e-5, 1e-3, log=True)
    WEIGHT_DECAY = trial.suggest_float('WEIGHT_DECAY', 1e-6, 1e-3, log=True)
    
    dae = DAE(input_dim = INPUT_DIM,
              layers_num = LAYERS_NUM,
              layers_dim = LAYERS_DIM,
              activation = ACTIVATION,
              emb_szs = EMB_SZS,
              emb_weights = EMB_WEIGHTS,
              dropout = DROPOUT,
              return_obfuscation_mask=True)
    dae.to(DEVICE)
    
    trainloader = torch.utils.data.DataLoader(dataset_train,
                                              batch_size=BATCH_SIZE, shuffle=True,
                                              num_workers=4, drop_last=False)
    evalloader = torch.utils.data.DataLoader(dataset_eval,
                                             batch_size=BATCH_SIZE, shuffle=True,
                                             num_workers=4, drop_last=False)
    
    loss_fn = MSE_Weighted(emphasis=EMPHASIS)
    # loss_fn = nn.MSELoss()
    optimizer = torch.optim.AdamW(dae.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

    for epoch in range(1, NUM_EPOCHS+1):
        train_loss = train_epoch_dae(dae, optimizer, loss_fn, trainloader)
        
        if ((epoch)%4==0):
            eval_loss = evaluate_dae(dae, loss_fn, evalloader)
            trial.report(eval_loss, epoch)
            if trial.should_prune():
                raise optuna.TrialPruned()

    eval_loss = evaluate_dae(dae, loss_fn, evalloader)

    torch.cuda.empty_cache()
    gc.collect()
    return eval_loss

In [38]:
# sampler = optuna.samplers.TPESampler(seed=42)
# # storage = optuna.storages.InMemoryStorage()

# study = optuna.create_study(direction='minimize', sampler=sampler,
#                             study_name='dae-study_mse0', storage='sqlite:///dae-study_mse.db', load_if_exists=True,
#                             pruner=optuna.pruners.MedianPruner(n_startup_trials=16,
#                                                                n_warmup_steps=16) #8
#                            )
# study.optimize(objective, n_trials=100)

### Сompresser training

##### Сompresser Model

In [61]:
class FCHead(nn.Module):
    def __init__(self,
                 dae_model,
                 dae_out_dim: int = 1024*4,
                 layers_num: int = 2,
                 layers_dim: int = 32,
                 activation = nn.ReLU,
                 dropout: float = 0.,
                 feature_dim = 256,
                ):
        super(FCHead, self).__init__()

        for p in dae_model.parameters():
            p.requires_grad = False
        
        fc_layers = []
        fc_layers.append(nn.Linear(dae_out_dim, dae_out_dim//4))
        fc_layers.append(nn.LazyBatchNorm1d())
        fc_layers.append(activation())
        fc_layers.append(nn.Dropout(p=dropout))
        fc_layers.append(nn.Linear(dae_out_dim//4, feature_dim))
        fc_layers.append(nn.LazyBatchNorm1d())
        fc_layers.append(activation())
        fc_layers.append(nn.Dropout(p=dropout))
        fc_layers.append(nn.Linear(feature_dim, 1))
        
        self.fc_layers = nn.Sequential(*fc_layers)
        self.feature_extractor = nn.Sequential(*list(self.fc_layers.children())[0:5])

        # self.fc_layers = nn.Sequential(nn.Linear(dae_out_dim, dae_out_dim),
        #                                nn.LazyBatchNorm1d(),
        #                                activation(),
        #                                nn.Linear(dae_out_dim, dae_out_dim),
        #                                nn.LazyBatchNorm1d(),
        #                                activation(),
        #                                nn.Linear(dae_out_dim, 1))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = dae.make_denoise(x)
        x = self.fc_layers(x)
        return x

    def feature_maker(self, x: torch.Tensor):
        x = dae.make_denoise(x)
        x = self.feature_extractor(x)
        return x
    

##### Сompresser Model Training

In [40]:
# dae = torch.load('dae_model.pth')
# dae = torch.load('bottle-dae-model_ln-2_ld-1024_sr-0.2_epch-64_btch-2048_lr-7e5_wd-2e5_unweight.pth')

In [56]:
# df_train_train, df_train_eval = train_test_split(df_train, test_size=EVAL_SIZE, random_state=SEED,
#                                                  shuffle=True, stratify=df_train[target])

# dataset_train = Base_Dataset(df_train_train, is_eval=True)
# dataset_eval = Base_Dataset(df_train_eval, is_eval=True)

In [62]:
# DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# ACTIVATION = nn.SELU
# DAE_OUT_DIM = 1024*4 #128 #1024*4
# FEATURE_DIM = 1024
# DROPOUT = 0.1

# NUM_EPOCHS = 16
# BATCH_SIZE = 1024*4 #96
# LR = 0.001
# WEIGHT_DECAY = 0.001 #5e-4

# compresser = FCHead(dae_model=dae,
#                     dae_out_dim = DAE_OUT_DIM,
#                     activation=ACTIVATION,
#                     dropout=DROPOUT,
#                     feature_dim=FEATURE_DIM,
#                     )

# altmetric_train = MetricCollection([AUROC(task='binary'),
#                                     Recall(task='binary'),
#                                     Precision(task='binary'),
#                                     F1Score(task='binary'),
#                                     Accuracy(task='binary')
#                                    ])
# altmetric_eval = MetricCollection([AUROC(task='binary'),
#                                    Recall(task='binary'),
#                                    Precision(task='binary'),
#                                    F1Score(task='binary'),
#                                    Accuracy(task='binary')
#                                   ])

# compresser.to(DEVICE)
# altmetric_train.to(DEVICE)
# altmetric_eval.to(DEVICE)

# trainloader = torch.utils.data.DataLoader(dataset_train,
#                                           batch_size=BATCH_SIZE, shuffle=True,
#                                           num_workers=8, drop_last=False)
# evalloader = torch.utils.data.DataLoader(dataset_eval,
#                                          batch_size=BATCH_SIZE, shuffle=True,
#                                          num_workers=8, drop_last=False)

# loss_fn = nn.BCEWithLogitsLoss(
#                                # weight=torch.Tensor([8]).to(DEVICE),
#                                # pos_weight=torch.Tensor([8]).to(DEVICE)
#                               )
# optimizer = torch.optim.AdamW(compresser.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)




In [67]:
# %%time
# PRINT_EVERY = 4

# for epoch in range(1, NUM_EPOCHS+1):
#     train_loss = train_epoch_fcnn(compresser, optimizer, loss_fn, trainloader, altmetric=altmetric_train)
#     eval_loss = evaluate_fcnn(compresser, loss_fn, evalloader, altmetric=altmetric_eval)
#     print((f"Epoch: {epoch}, Train loss: {train_loss:.5f}, Val loss: {eval_loss:.5f}"))

#     if ((epoch)%PRINT_EVERY==0):
#         print('Train')
#         for j in [(i, round(altmetric_train[i].compute().item(), 5))
#                   for i in altmetric_train.keys()]: print(j)
#         print()
#         print('Test')
#         for j in [(i, round(altmetric_eval[i].compute().item(), 5))
#                   for i in altmetric_eval.keys()]: print(j)
#         print()

#     altmetric_train.reset()
#     altmetric_eval.reset()
    

100%|██████████| 2669/2669 [00:41<00:00, 64.51it/s]


Epoch: 1, Train loss: 0.25105, Val loss: 0.25159


100%|██████████| 2669/2669 [00:41<00:00, 64.90it/s]


Epoch: 2, Train loss: 0.25097, Val loss: 0.25165


100%|██████████| 2669/2669 [00:41<00:00, 64.49it/s]


Epoch: 3, Train loss: 0.25092, Val loss: 0.25153


100%|██████████| 2669/2669 [00:41<00:00, 64.50it/s]


Epoch: 4, Train loss: 0.25089, Val loss: 0.25156
Train
('BinaryAUROC', 0.88083)
('BinaryRecall', 0.13293)
('BinaryPrecision', 0.56902)
('BinaryF1Score', 0.21551)
('BinaryAccuracy', 0.88097)

Test
('BinaryAUROC', 0.87979)
('BinaryRecall', 0.12503)
('BinaryPrecision', 0.56251)
('BinaryF1Score', 0.20458)
('BinaryAccuracy', 0.88042)



100%|██████████| 2669/2669 [00:41<00:00, 64.11it/s]


Epoch: 5, Train loss: 0.25085, Val loss: 0.25146


100%|██████████| 2669/2669 [00:41<00:00, 64.28it/s]


Epoch: 6, Train loss: 0.25085, Val loss: 0.25157


100%|██████████| 2669/2669 [00:41<00:00, 64.24it/s]


Epoch: 7, Train loss: 0.25081, Val loss: 0.25157


100%|██████████| 2669/2669 [00:41<00:00, 64.42it/s]


Epoch: 8, Train loss: 0.25079, Val loss: 0.25157
Train
('BinaryAUROC', 0.88096)
('BinaryRecall', 0.13386)
('BinaryPrecision', 0.56962)
('BinaryF1Score', 0.21677)
('BinaryAccuracy', 0.88103)

Test
('BinaryAUROC', 0.87976)
('BinaryRecall', 0.12439)
('BinaryPrecision', 0.56269)
('BinaryF1Score', 0.20374)
('BinaryAccuracy', 0.88041)



100%|██████████| 2669/2669 [00:41<00:00, 64.54it/s]


Epoch: 9, Train loss: 0.25077, Val loss: 0.25159


100%|██████████| 2669/2669 [00:41<00:00, 64.60it/s]


Epoch: 10, Train loss: 0.25075, Val loss: 0.25159


100%|██████████| 2669/2669 [00:41<00:00, 64.50it/s]


Epoch: 11, Train loss: 0.25074, Val loss: 0.25163


100%|██████████| 2669/2669 [00:41<00:00, 64.30it/s]


Epoch: 12, Train loss: 0.25073, Val loss: 0.25166
Train
('BinaryAUROC', 0.88105)
('BinaryRecall', 0.13397)
('BinaryPrecision', 0.56931)
('BinaryF1Score', 0.2169)
('BinaryAccuracy', 0.88101)

Test
('BinaryAUROC', 0.87975)
('BinaryRecall', 0.13598)
('BinaryPrecision', 0.55696)
('BinaryF1Score', 0.21859)
('BinaryAccuracy', 0.88042)



100%|██████████| 2669/2669 [00:42<00:00, 63.13it/s]


Epoch: 13, Train loss: 0.25070, Val loss: 0.25159


100%|██████████| 2669/2669 [00:41<00:00, 64.35it/s]


Epoch: 14, Train loss: 0.25068, Val loss: 0.25158


100%|██████████| 2669/2669 [00:41<00:00, 64.01it/s]


Epoch: 15, Train loss: 0.25067, Val loss: 0.25158


100%|██████████| 2669/2669 [00:41<00:00, 63.98it/s]


Epoch: 16, Train loss: 0.25068, Val loss: 0.25171
Train
('BinaryAUROC', 0.88114)
('BinaryRecall', 0.13494)
('BinaryPrecision', 0.57056)
('BinaryF1Score', 0.21825)
('BinaryAccuracy', 0.88111)

Test
('BinaryAUROC', 0.87969)
('BinaryRecall', 0.13003)
('BinaryPrecision', 0.55849)
('BinaryF1Score', 0.21095)
('BinaryAccuracy', 0.88035)

CPU times: user 10min 14s, sys: 56 s, total: 11min 10s
Wall time: 11min 45s


In [44]:
# torch.save(compresser, 'compresser_model-128.pth')

### GBM HEAD training (from Compressed)

##### LGBM Dataset

In [46]:
class Denoised_Compressed_Dataset():
    def __init__(self, df, compresser,
                 is_eval=False, is_test=False):
        
        self.df = df
        # self.dae_model = dae_model
        # self.dae_model.to(DEVICE)
        # self.dae_model.eval()
        # for p in dae_model.parameters():
        #     p.requires_grad = False
        self.compresser = compresser
        self.compresser.to(DEVICE)
        self.compresser.eval()
        for p in compresser.parameters():
            p.requires_grad = False
        self.is_eval = is_eval
        self.is_test = is_test
        self.dataset = Base_Dataset(df, is_eval=self.is_eval, is_test=self.is_test)
        gc.collect()

    def __getitem__(self, index):
        X, y = self.dataset[index]
        X, y = torch.Tensor(X).to(DEVICE),\
               torch.Tensor(y).to(DEVICE)
        X = compresser.feature_maker(X)
        X, y = X.detach().cpu().numpy(), y.detach().cpu().numpy()

        return X, y

    def __len__(self):
        return len(self.dataset)


##### LGBM HEAD Data Preparing

In [47]:
# # df_train_train, df_train_eval = train_test_split(df_train, test_size=EVAL_SIZE, random_state=SEED,
# #                                                  shuffle=True, stratify=df_train[target])

# # dataset_train = Denoised_Compressed_Dataset(df_train_train, compresser=compresser, is_eval=True)
# # dataset_eval = Denoised_Compressed_Dataset(df_train_eval, compresser=compresser, is_eval=True)
# dataset_train = Denoised_Compressed_Dataset(df_train, compresser=compresser, is_eval=True)

In [48]:
# BATCH_SIZE = 1024*4

# X_train, y_train = [], []
# batches = (-(-len(dataset_train)//BATCH_SIZE))
# index_start = 0
# index_end = BATCH_SIZE

# for batch in range(0, batches):
#     X, y = dataset_train[index_start:index_end]
#     X_train.append(X)
#     y_train.append(y)
#     index_start = index_end
#     index_end += BATCH_SIZE

# # X_eval, y_eval = [], []
# # batches = (-(-len(dataset_eval)//BATCH_SIZE))
# # index_start = 0
# # index_end = BATCH_SIZE

# # for batch in range(0, batches):
# #     X, y = dataset_eval[index_start:index_end]
# #     X_eval.append(X)
# #     y_eval.append(y)
# #     index_start = index_end
# #     index_end += BATCH_SIZE

# X_train, y_train = np.concatenate(X_train, axis=0), np.concatenate(y_train, axis=0)
# y_train = y_train.reshape(-1)

# # X_eval, y_eval = np.concatenate(X_eval, axis=0), np.concatenate(y_eval, axis=0)
# # y_eval = y_eval.reshape(-1)

In [49]:
%%time
# np.save('X_train.npy', X_train)
# np.save('y_train.npy', y_train)
# X_train = np.load('X_train.npy')
# y_train = np.load('y_train.npy')

CPU times: user 3 μs, sys: 1e+03 ns, total: 4 μs
Wall time: 7.87 μs


##### LGBM HEAD Training

In [50]:
# from lightgbm import LGBMClassifier
# from xgboost import XGBClassifier
# from catboost import CatBoostClassifier


In [51]:
# del dataset_train
# del dataset_eval
# gc.collect()

In [52]:
# %%time
# # gbm = XGBClassifier(n_esimators=300, eval_metric='roc_auc', verbose=0, scale_pos_weight=5, device='cuda')
# # gbm = CatBoostClassifier(iterations=128, devices='cuda')
# gbm = LGBMClassifier(device='gpu')
# # gbm.fit(X_train, y_train)
# cv_results = cross_validate(gbm, X_train, y_train, scoring='roc_auc', cv=5)

##### LGBM HEAD optuna

In [53]:
# def objective(trial):

#     params = {
#         'objective': 'binary',
#         'boosting': 'gbdt', #trial.suggest_categorical('boosting', ['gbdt', 'dart']),
#         'num_leaves': trial.suggest_int('num_leaves', 16, 64, step=4)-1,
#         'tree_learner': trial.suggest_categorical('tree_learner', ['serial', 'feature', 'data', 'voting']),
#         'max_depth': trial.suggest_int('max_depth', 8, 64, step=8),
#         'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 4, 32, step=4),
#         'reg_alpha': trial.suggest_float('reg_alpha', 0.0001, 10, log=True),
#         'reg_lambda': trial.suggest_float('reg_lambda', 0.0001, 10, log=True),
#         'learning_rate': trial.suggest_float('learning_rate', 1e-2, 1, log=True),
#         'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 8),
#         'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1, log=True),
#         'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1, log=True),
#         # 'max_bin': trial.suggest_int('max_bin', 64, 512, step=64)-1,
#         'n_estimators': trial.suggest_int('n_estimators', 128, 1024, step=128),
#         # 'n_jobs': 8,
#         'device': 'gpu',
#         'random_state': SEED,
#         'verbose': 0,
#         # 'force_col_wise':True,
#         # 'force_row_wise':True,
#     }

#     gbm = LGBMClassifier(**params)
#     cv_results = cross_validate(gbm, X_train, y_train, scoring='roc_auc', cv=5)
#     gc.collect()
    
#     return cv_results['test_score'].mean()
    

In [54]:
# # sampler = optuna.samplers.TPESampler(seed=SEED)

# study = optuna.create_study(direction='maximize', sampler=sampler,
#                             study_name='lgbm-study_auc0', storage='sqlite:///lgbm-study_auc.db', load_if_exists=True,
#                            )
# study.optimize(objective, n_trials=300)

### GBM HEAD training (from DAE)

##### GBM Dataset

In [25]:
# dae = torch.load('bottle-dae-model_in-133_test.pth')

In [26]:
class Denoised_Dataset():
    def __init__(self, df, dae_model,
                 is_eval=False, is_test=False,
                 dtype='float32'):
        
        self.df = df
        self.is_eval = is_eval
        self.is_test = is_test
        self.dataset = Base_Dataset(df, is_eval=self.is_eval, is_test=self.is_test)
        self.dae_model = dae_model
        self.dae_model.to(DEVICE)
        self.dae_model.eval()
        for p in self.dae_model.parameters():
            p.requires_grad = False
        self.dtype = dtype
        gc.collect()

    def __getitem__(self, index):
        X, y = self.dataset[index]
        X, y = torch.Tensor(X).to(DEVICE),\
               torch.Tensor(y).to(DEVICE)
        X = dae.make_denoise(X)
        X, y = X.detach().cpu().numpy(),\
               y.detach().cpu().numpy()

        return X, y

    def __len__(self):
        return len(self.dataset)


##### GBM HEAD Data Preparing

In [33]:
# df_train_train, df_train_eval = train_test_split(df_train, test_size=EVAL_SIZE, random_state=SEED,
#                                                  shuffle=True, stratify=df_train[target])

# dataset_train = Denoised_Dataset(df_train_train, dae_model=dae, is_eval=True)
# dataset_eval = Denoised_Dataset(df_train_eval, dae_model=dae, is_eval=True)
# dataset_full = Denoised_Dataset(df_train, dae_model=dae, is_eval=True, dtype='float16')

In [34]:
# BATCH_SIZE = 1024*4

# X_train, y_train = [], []
# batches = (-(-len(dataset_train)//BATCH_SIZE))
# index_start = 0
# index_end = BATCH_SIZE

# for batch in range(0, batches):
#     X, y = dataset_train[index_start:index_end]
#     X_train.append(X)
#     y_train.append(y)
#     index_start = index_end
#     index_end += BATCH_SIZE

# X_eval, y_eval = [], []
# batches = (-(-len(dataset_eval)//BATCH_SIZE))
# index_start = 0
# index_end = BATCH_SIZE

# for batch in range(0, batches):
#     X, y = dataset_eval[index_start:index_end]
#     X_eval.append(X)
#     y_eval.append(y)
#     index_start = index_end
#     index_end += BATCH_SIZE

# X_train, y_train = np.concatenate(X_train, axis=0), np.concatenate(y_train, axis=0)
# y_train = y_train.reshape(-1)

# X_eval, y_eval = np.concatenate(X_eval, axis=0), np.concatenate(y_eval, axis=0)
# y_eval = y_eval.reshape(-1)


In [35]:
# BATCH_SIZE = 1024*4

# X_, y_ = [], []
# batches = (-(-len(dataset_full)//BATCH_SIZE))
# index_start = 0
# index_end = BATCH_SIZE

# for batch in range(0, batches):
#     X, y = dataset_full[index_start:index_end]
#     X_.append(X)
#     y_.append(y)
#     index_start = index_end
#     index_end += BATCH_SIZE

# X_, y_ = np.concatenate(X_, axis=0), np.concatenate(y_, axis=0)
# y_ = y_.reshape(-1)

In [57]:
# # %%time
# name = '256_float32.npy'

# # np.save('X_train_'+name, X_train)
# # np.save('y_train_'+name, y_train)
# # np.save('X_eval_'+name, X_eval)
# # np.save('y_eval_'+name, y_eval)

# X_train = np.load('X_train_'+name)
# y_train = np.load('y_train_'+name)
# X_eval = np.load('X_eval_'+name)
# y_eval = np.load('y_eval_'+name)

##### GBM HEAD Training

In [36]:
import lightgbm as lgb
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier


In [60]:
%%time
# gbm = XGBClassifier(n_esimators=2000,
#                     max_depth=16,
#                     # early_stopping_rounds=50,
#                     # eval_metric='auc',
#                     device='cuda')
# gbm = CatBoostClassifier(loss_function='Logloss',
#                          eval_metric='AUC',
#                          # learning_rate=0.03,
#                          # depth=4,
#                          iterations=1024,
#                          early_stopping_rounds=50,
#                          # random_strength=0,
#                          # max_leaves=512,
#                          # fold_permutation_block=64,                         
#                          task_type='GPU')

# gbm = LGBMClassifier(n_estimators=1024,
#                      verbose=500,
#                      learning_rate=0.3,
#                      # max_depth=8,
#                      device='gpu')
# gbm.fit(X_train, y_train, eval_set=[(X_eval, y_eval)])

# gbm.fit(X_train, y_train,
#         eval_set=[(X_eval, y_eval)])

# cv_results = cross_validate(gbm, X_train, y_train, eval_set=[X_eval, y_eval], scoring='roc_auc', cv=5)

[LightGBM] [Info] Number of positive: 1344306, number of negative: 9585252
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 32640
[LightGBM] [Info] Number of data points in the train set: 10929558, number of used features: 128
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3090 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 128 dense feature groups (1334.17 MB) transferred to GPU in 0.765896 secs. 0 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122997 -> initscore=-1.964348
[LightGBM] [Info] Start training from score -1.964348
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and d

In [61]:
# roc_auc_score(y_eval, gbm.predict_proba(X_eval)[:, 1])

0.8724192420189854

##### GBM HEAD optuna

In [38]:
def objective(trial):

    # params = {
    #     'objective': 'binary',
    #     'boosting': 'gbdt', #trial.suggest_categorical('boosting', ['gbdt', 'dart']),
    #     'num_leaves': trial.suggest_int('num_leaves', 16, 64, step=4)-1,
    #     'tree_learner': trial.suggest_categorical('tree_learner', ['serial', 'feature', 'data', 'voting']),
    #     'max_depth': trial.suggest_int('max_depth', 4, 64, step=4),
    #     'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 4, 32, step=4),
    #     'reg_alpha': trial.suggest_float('reg_alpha', 0.0001, 10, log=True),
    #     'reg_lambda': trial.suggest_float('reg_lambda', 0.0001, 10, log=True),
    #     'learning_rate': trial.suggest_float('learning_rate', 1e-2, 0.5, log=True),
    #     'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1/4, 8, log=True),
    #     'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1, log=True),
    #     'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1, log=True),
    #     # 'max_bin': trial.suggest_int('max_bin', 64, 512, step=64)-1,
    #     'n_estimators': trial.suggest_int('n_estimators', 128, 2048, step=128),
    #     # 'n_jobs': 8,
    #     'device': 'gpu',
    #     'random_state': SEED,
    #     'verbose': 0,
    #     # 'force_col_wise':True,
    #     # 'force_row_wise':True,
    # }

    params = {
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.5, log=True),
        'random_seed': SEED,
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.001, 10, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.01, 10, log=True),
        'depth': trial.suggest_int('depth', 4, 16, step=1),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 31, step=5),
        # 'max_leaves': trial.suggest_int('max_leaves', 16, 64, step=10)-1,
        'scale_pos_weight': trial.suggest_int('scale_pos_weight', 1/4, 8, step=1),
        'task_type': 'GPU',
        'early_stopping_rounds': 50,
        'verbose': 0,
    }

    # gbm = LGBMClassifier(**params)
    gbm = CatBoostClassifier(**params)
    gbm.fit(X_train, y_train, eval_set=[(X_eval, y_eval)])
    eval_metric = roc_auc_score(y_eval, gbm.predict_proba(X_eval)[:, 1])
    # cv_results = cross_validate(gbm, X_train, y_train, scoring='roc_auc', cv=5)
    gc.collect()
    
    # return cv_results['test_score'].mean()
    return eval_metric
    

In [39]:
# sampler = optuna.samplers.TPESampler(seed=SEED)

# study = optuna.create_study(direction='maximize', sampler=sampler,
#                             study_name='cgbm-study_auc0', storage='sqlite:///lgbm-study_auc.db', load_if_exists=True,
#                            )
# study.optimize(objective, n_trials=100)

[I 2024-07-24 20:46:02,388] A new study created in RDB with name: cgbm-study_auc0
Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-07-24 20:48:30,958] Trial 0 finished with value: 0.8656141091836125 and parameters: {'learning_rate': 0.17962832718526836, 'l2_leaf_reg': 0.00891589366282252, 'bagging_temperature': 2.970541321550833, 'depth': 16, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 26, 'scale_pos_weight': 5}. Best is trial 0 with value: 0.8656141091836125.
Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-07-24 20:49:47,782] Trial 1 finished with value: 0.8724092635264595 and parameters: {'learning_rate': 0.13899733172806242, 'l2_leaf_reg': 0.0013808487031636222, 'bagging_temperature': 0.0785865842510549, 'depth': 4, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 6, 'scale_pos_weight': 3}. Best is trial 1 with value: 0.8724092635264595.
Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024

### GBM HEAD training (from CosEmb)

##### GBM Dataset

In [21]:
# embedder = torch.load('cosemb256-wnum-128.pth')

# with open(PATH+'pipeline', 'rb') as fp:
#     pipeline = pickle.load(fp)

In [22]:
class Embedded_Dataset():
    def __init__(self, df, embedder,
                 is_eval=False, is_test=False,
                 dtype='float32'):
        
        self.df = df
        self.is_eval = is_eval
        self.is_test = is_test
        self.dataset = Base_Dataset(df, is_eval=self.is_eval, is_test=self.is_test)
        self.embedder = embedder
        self.embedder.to(DEVICE)
        self.embedder.eval()
        for p in self.embedder.parameters():
            p.requires_grad = False
        self.dtype = dtype
        gc.collect()

    def __getitem__(self, index):
        X, y = self.dataset[index]
        X, y = torch.Tensor(X).to(DEVICE),\
               torch.Tensor(y).to(DEVICE)
        X = embedder.make_embs(X)
        X, y = X.detach().cpu().numpy(),\
               y.detach().cpu().numpy()

        return X, y

    def __len__(self):
        return len(self.dataset)


##### GBM HEAD Data Preparing

In [25]:
# df_train_train, df_train_eval = train_test_split(df_train, test_size=EVAL_SIZE, random_state=SEED,
#                                                  shuffle=True, stratify=df_train[target])

# dataset_train = Embedded_Dataset(df_train_train, embedder=embedder, is_eval=True)
# dataset_eval = Embedded_Dataset(df_train_eval, embedder=embedder, is_eval=True)
# dataset_full = Denoised_Dataset(df_train, dae_model=dae, is_eval=True, dtype='float16')

In [26]:
# BATCH_SIZE = 1024*4

# X_train, y_train = [], []
# batches = (-(-len(dataset_train)//BATCH_SIZE))
# index_start = 0
# index_end = BATCH_SIZE

# for batch in range(0, batches):
#     X, y = dataset_train[index_start:index_end]
#     X_train.append(X)
#     y_train.append(y)
#     index_start = index_end
#     index_end += BATCH_SIZE

# X_eval, y_eval = [], []
# batches = (-(-len(dataset_eval)//BATCH_SIZE))
# index_start = 0
# index_end = BATCH_SIZE

# for batch in range(0, batches):
#     X, y = dataset_eval[index_start:index_end]
#     X_eval.append(X)
#     y_eval.append(y)
#     index_start = index_end
#     index_end += BATCH_SIZE

# X_train, y_train = np.concatenate(X_train, axis=0), np.concatenate(y_train, axis=0)
# y_train = y_train.reshape(-1)

# X_eval, y_eval = np.concatenate(X_eval, axis=0), np.concatenate(y_eval, axis=0)
# y_eval = y_eval.reshape(-1)


In [35]:
# BATCH_SIZE = 1024*4

# X_, y_ = [], []
# batches = (-(-len(dataset_full)//BATCH_SIZE))
# index_start = 0
# index_end = BATCH_SIZE

# for batch in range(0, batches):
#     X, y = dataset_full[index_start:index_end]
#     X_.append(X)
#     y_.append(y)
#     index_start = index_end
#     index_end += BATCH_SIZE

# X_, y_ = np.concatenate(X_, axis=0), np.concatenate(y_, axis=0)
# y_ = y_.reshape(-1)

In [57]:
# # %%time
# name = '256_float32.npy'

# # np.save('X_train_'+name, X_train)
# # np.save('y_train_'+name, y_train)
# # np.save('X_eval_'+name, X_eval)
# # np.save('y_eval_'+name, y_eval)

# X_train = np.load('X_train_'+name)
# y_train = np.load('y_train_'+name)
# X_eval = np.load('X_eval_'+name)
# y_eval = np.load('y_eval_'+name)

##### GBM HEAD Training

In [37]:
# %%time
# gbm = XGBClassifier(n_esimators=1024,
#                     max_depth=9,
#                     # early_stopping_rounds=50,
#                     eval_metric='auc',
#                     device='cuda')
# # gbm = CatBoostClassifier(loss_function='Logloss',
# #                          eval_metric='AUC',
# #                          # learning_rate=0.03,
# #                          # depth=4,
# #                          iterations=2048,
# #                          early_stopping_rounds=50,
# #                          # random_strength=0,
# #                          # max_leaves=512,
# #                          # fold_permutation_block=64,                         
# #                          task_type='GPU')

# # gbm = LGBMClassifier(n_estimators=1024,
# #                      verbose=500,
# #                      # learning_rate=0.2,
# #                      # max_depth=8,
# #                      device='gpu')
# gbm.fit(X_train, y_train, eval_set=[(X_eval, y_eval)])

# # gbm.fit(X_train, y_train,
# #         eval_set=[(X_eval, y_eval)])

# # cv_results = cross_validate(gbm, X_train, y_train, eval_set=[X_eval, y_eval], scoring='roc_auc', cv=5)

In [44]:
# roc_auc_score(y_eval, gbm.predict_proba(X_eval)[:, 1])

0.8712647421976389

##### GBM HEAD optuna

In [38]:
def objective(trial):

    # params = {
    #     'objective': 'binary',
    #     'boosting': 'gbdt', #trial.suggest_categorical('boosting', ['gbdt', 'dart']),
    #     'num_leaves': trial.suggest_int('num_leaves', 16, 64, step=4)-1,
    #     'tree_learner': trial.suggest_categorical('tree_learner', ['serial', 'feature', 'data', 'voting']),
    #     'max_depth': trial.suggest_int('max_depth', 4, 64, step=4),
    #     'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 4, 32, step=4),
    #     'reg_alpha': trial.suggest_float('reg_alpha', 0.0001, 10, log=True),
    #     'reg_lambda': trial.suggest_float('reg_lambda', 0.0001, 10, log=True),
    #     'learning_rate': trial.suggest_float('learning_rate', 1e-2, 0.5, log=True),
    #     'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1/4, 8, log=True),
    #     'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1, log=True),
    #     'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1, log=True),
    #     # 'max_bin': trial.suggest_int('max_bin', 64, 512, step=64)-1,
    #     'n_estimators': trial.suggest_int('n_estimators', 128, 2048, step=128),
    #     # 'n_jobs': 8,
    #     'device': 'gpu',
    #     'random_state': SEED,
    #     'verbose': 0,
    #     # 'force_col_wise':True,
    #     # 'force_row_wise':True,
    # }

    params = {
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.5, log=True),
        'random_seed': SEED,
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.001, 10, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.01, 10, log=True),
        'depth': trial.suggest_int('depth', 4, 16, step=1),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 31, step=5),
        # 'max_leaves': trial.suggest_int('max_leaves', 16, 64, step=10)-1,
        'scale_pos_weight': trial.suggest_int('scale_pos_weight', 1/4, 8, step=1),
        'task_type': 'GPU',
        'early_stopping_rounds': 50,
        'verbose': 0,
    }

    # gbm = LGBMClassifier(**params)
    gbm = CatBoostClassifier(**params)
    gbm.fit(X_train, y_train, eval_set=[(X_eval, y_eval)])
    eval_metric = roc_auc_score(y_eval, gbm.predict_proba(X_eval)[:, 1])
    # cv_results = cross_validate(gbm, X_train, y_train, scoring='roc_auc', cv=5)
    gc.collect()
    
    # return cv_results['test_score'].mean()
    return eval_metric
    

In [38]:
# sampler = optuna.samplers.TPESampler(seed=SEED)

# study = optuna.create_study(direction='maximize', sampler=sampler,
#                             study_name='cgbm-study_auc0', storage='sqlite:///lgbm-study_auc.db', load_if_exists=True,
#                            )
# study.optimize(objective, n_trials=100)