<a href="https://colab.research.google.com/github/DoItSon/playdata/blob/main/%EC%98%A4%EC%9D%BC%EB%B6%84%EB%A5%98base(%EC%88%98%EC%A0%95%ED%95%B4%EB%B3%B4%EA%B8%B0).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import random

import warnings
warnings.filterwarnings(action='ignore') 

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Notebook-wide hyper-parameters.
CFG = {
    'EPOCHS': 30,  # number of passes made by train() and student_train()
    'LEARNING_RATE':2e-2,  # initial learning rate handed to the Adam/AdamW optimizers
    'BATCH_SIZE':256,  # batch size of the DataLoaders built from CFG
    'SEED':42  # seed used by seed_everything() and train_test_split()
}

In [None]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Covers Python's ``random``, the ``PYTHONHASHSEED`` environment variable,
    NumPy's global RNG and PyTorch (CPU and all CUDA devices), and puts cuDNN
    into its deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also seed every other visible GPU; this is a no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick possibly different kernels
    # from run to run, which undoes the determinism requested above.
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED'])

In [None]:
SEED = 42
def reset_seeds(seed):
    """Re-seed Python, NumPy and PyTorch and request deterministic cuDNN kernels.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Python-level randomness: hash seed environment variable and the `random` module.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)

    # NumPy global generator.
    np.random.seed(seed)

    # PyTorch: CPU generator first, then the current CUDA device.
    for torch_seeder in (torch.manual_seed, torch.cuda.manual_seed):
        torch_seeder(seed)

    # Ask cuDNN to use deterministic algorithms.
    torch.backends.cudnn.deterministic = True

In [None]:
DATA_PATH = "/content/drive/MyDrive/"

In [None]:
train = pd.read_csv(f'{DATA_PATH}train.csv')
test = pd.read_csv(f'{DATA_PATH}test.csv')

In [None]:
categorical_features2 = ['COMPONENT_ARBITRARY_COMPONENT1', 'COMPONENT_ARBITRARY_COMPONENT2',
       'COMPONENT_ARBITRARY_COMPONENT3', 'COMPONENT_ARBITRARY_COMPONENT4',
       'YEAR_2007', 'YEAR_2008', 'YEAR_2009', 'YEAR_2010', 'YEAR_2011',
       'YEAR_2012', 'YEAR_2013', 'YEAR_2014', 'YEAR_2015', 'YEAR_2016',
       'YEAR_2017', 'YEAR_2018', 'YEAR_2019', 'YEAR_2020', 'YEAR_2021',
       'YEAR_2022']
# Inference(실제 진단 환경)에 사용하는 컬럼
test_stage_features2 = ['ANONYMOUS_1', 'ANONYMOUS_2', 'AG', 'CO', 'CR', 'CU', 'FE', 'H2O', 'MN',
       'MO', 'NI', 'PQINDEX', 'TI', 'V', 'V40', 'ZN',
       'COMPONENT_ARBITRARY_COMPONENT1', 'COMPONENT_ARBITRARY_COMPONENT2',
       'COMPONENT_ARBITRARY_COMPONENT3', 'COMPONENT_ARBITRARY_COMPONENT4',
       'YEAR_2007', 'YEAR_2008', 'YEAR_2009', 'YEAR_2010', 'YEAR_2011',
       'YEAR_2012', 'YEAR_2013', 'YEAR_2014', 'YEAR_2015', 'YEAR_2016',
       'YEAR_2017', 'YEAR_2018', 'YEAR_2019', 'YEAR_2020', 'YEAR_2021',
       'YEAR_2022']

In [None]:
categorical_features = ['COMPONENT_ARBITRARY', 'YEAR']
# Inference(실제 진단 환경)에 사용하는 컬럼
test_stage_features = ['COMPONENT_ARBITRARY', 'ANONYMOUS_1', 'YEAR' , 'ANONYMOUS_2', 'AG', 'CO', 'CR', 'CU', 'FE', 'H2O', 'MN', 'MO', 'NI', 'PQINDEX', 'TI', 'V', 'V40', 'ZN']

In [None]:
train.columns

Index(['ID', 'COMPONENT_ARBITRARY', 'ANONYMOUS_1', 'YEAR',
       'SAMPLE_TRANSFER_DAY', 'ANONYMOUS_2', 'AG', 'AL', 'B', 'BA', 'BE', 'CA',
       'CD', 'CO', 'CR', 'CU', 'FH2O', 'FNOX', 'FOPTIMETHGLY', 'FOXID', 'FSO4',
       'FTBN', 'FE', 'FUEL', 'H2O', 'K', 'LI', 'MG', 'MN', 'MO', 'NA', 'NI',
       'P', 'PB', 'PQINDEX', 'S', 'SB', 'SI', 'SN', 'SOOTPERCENTAGE', 'TI',
       'U100', 'U75', 'U50', 'U25', 'U20', 'U14', 'U6', 'U4', 'V', 'V100',
       'V40', 'ZN', 'Y_LABEL'],
      dtype='object')

- 전처리

In [None]:
train = train.fillna(0)
test = test.fillna(0)

In [None]:
all_X = train.drop(['ID', 'Y_LABEL'], axis = 1) # 정답값 제외
all_y = train['Y_LABEL'] # 정답

test = test.drop(['ID'], axis = 1) # 추론 데이터

In [None]:
all_X.columns

Index(['COMPONENT_ARBITRARY', 'ANONYMOUS_1', 'YEAR', 'SAMPLE_TRANSFER_DAY',
       'ANONYMOUS_2', 'AG', 'AL', 'B', 'BA', 'BE', 'CA', 'CD', 'CO', 'CR',
       'CU', 'FH2O', 'FNOX', 'FOPTIMETHGLY', 'FOXID', 'FSO4', 'FTBN', 'FE',
       'FUEL', 'H2O', 'K', 'LI', 'MG', 'MN', 'MO', 'NA', 'NI', 'P', 'PB',
       'PQINDEX', 'S', 'SB', 'SI', 'SN', 'SOOTPERCENTAGE', 'TI', 'U100', 'U75',
       'U50', 'U25', 'U20', 'U14', 'U6', 'U4', 'V', 'V100', 'V40', 'ZN'],
      dtype='object')

# 원핫인코딩
- 레이블 인코딩을 하고 원핫인코딩을 함! get_dummies쓰지마!
- 원핫인코딩 전에 할일
- 테스트 셋 분리하기 전에 원핫인코딩

In [None]:
# Fit the one-hot encoder on the categorical training columns.
# handle_unknown='ignore' encodes a category that was not seen during fit as an
# all-zero row instead of raising when the encoder is later applied to other data.
enc = OneHotEncoder(handle_unknown='ignore')

tmp = pd.DataFrame(
    enc.fit_transform(all_X[categorical_features]).toarray(),
    columns = enc.get_feature_names_out(),
    # Reuse all_X's index so the concat below aligns row-by-row even if the
    # frame no longer carries a default RangeIndex.
    index = all_X.index,
)
# Append the indicator columns and drop the raw categorical ones.
all_X = pd.concat([all_X,tmp],axis=1).drop(columns = categorical_features)
all_X

Unnamed: 0,ANONYMOUS_1,SAMPLE_TRANSFER_DAY,ANONYMOUS_2,AG,AL,B,BA,BE,CA,CD,...,YEAR_2013,YEAR_2014,YEAR_2015,YEAR_2016,YEAR_2017,YEAR_2018,YEAR_2019,YEAR_2020,YEAR_2021,YEAR_2022
0,1486,7,200,0,3,93,0,0,3059,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1350,51,375,0,2,19,0,0,2978,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2415,2,200,0,110,1,1,0,17,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,7389,2,200,0,8,3,0,0,1960,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3954,4,200,0,1,157,0,0,71,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14090,1616,8,200,0,2,201,1,0,6,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14091,2784,2,200,0,3,85,0,0,2945,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14092,1788,9,550,0,6,0,1,0,13,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14093,2498,19,550,0,2,4,0,0,2244,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Encode the test frame's categorical columns with the encoder that was fitted
# on the training data (transform only, no re-fit), so both frames share the
# same indicator columns.
tmp1 = pd.DataFrame(
    enc.transform(test[categorical_features]).toarray(),
    columns = enc.get_feature_names_out()
)
# Append the indicator columns and drop the raw categorical ones.
# NOTE(review): the concat aligns on index; this assumes `test` still has its default RangeIndex.
test = pd.concat([test,tmp1],axis=1).drop(columns = categorical_features)
test

Unnamed: 0,ANONYMOUS_1,ANONYMOUS_2,AG,CO,CR,CU,FE,H2O,MN,MO,...,YEAR_2013,YEAR_2014,YEAR_2015,YEAR_2016,YEAR_2017,YEAR_2018,YEAR_2019,YEAR_2020,YEAR_2021,YEAR_2022
0,2192,200,0,0,0,1,12,0.0,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2794,200,0,0,2,1,278,0.0,3,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1982,200,0,0,0,16,5,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1404,200,0,0,3,4,163,0.0,4,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8225,200,0,0,0,6,13,0.0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,1714,200,0,0,3,130,1047,0.0,65,5,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6037,4131,200,0,0,5,2,736,0.0,5,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6038,4325,200,0,0,0,0,53,0.0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6039,1364,200,0,0,0,62,2,0.0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
all_X.shape,all_y.shape,test.shape

((14095, 70), (14095,), (6041, 36))

In [None]:
train_X, val_X, train_y, val_y = train_test_split(all_X, all_y, test_size=0.2, random_state=CFG['SEED'], stratify=all_y)
train_X.shape, val_X.shape, train_y.shape, val_y.shape

((11276, 70), (2819, 70), (11276,), (2819,))

# 스케일링

In [None]:
def get_values(value):
    """Return a Series' values as an (n, 1) column array, the 2-D shape the scaler is fed."""
    return value.values.reshape(-1, 1)

# train_X / val_X come out of train_test_split; copy them so the column
# assignments below write into independent frames rather than slices.
train_X = train_X.copy()
val_X = val_X.copy()

# After one-hot encoding the raw names in `categorical_features` no longer exist
# as columns, so checking against them alone would standardise the 0/1 indicator
# columns as well. Skip the encoder's output columns explicitly.
onehot_columns = set(enc.get_feature_names_out())

for col in train_X.columns:
    if col in onehot_columns or col in categorical_features:
        continue
    # Fit on the training split only, then apply the same statistics elsewhere.
    scaler = StandardScaler()
    train_X[col] = scaler.fit_transform(get_values(train_X[col])).ravel()
    val_X[col] = scaler.transform(get_values(val_X[col])).ravel()
    # The test frame only carries a subset of the training columns.
    if col in test.columns:
        test[col] = scaler.transform(get_values(test[col])).ravel()

# 데이터 셋

In [None]:
# class OilDataset(torch.utils.data.Dataset):
#     def __init__(self,x,y=None, distillation=False):
#         super(OilDataset,self).__init()
#         self.x = x
#         self.y = y
#         self.distillation = distillation

#     def __len__(self):
#         return self.x.shape[0]

#     def __getitem__(self,idx):
#         if self.distillation:
#         item = {}
#         item["x"] = torch.Tensor(self.x.iloc[idx])
#             if self.y is not None:
#                 item["z"] = self.y.values[idx]
#         return item

In [None]:
# class OilDataset(torch.utils.data.Dataset):
#     def __init__(self,x,y=None, distillation=False):
#         super(OilDataset,self).__init()
#         self.x = x
#         self.y = y
#         self.distillation = distillation

#     def __len__(self):
#         return self.x.shape[0]

#     def __getitem__(self,idx):
#         if self.distillation:
#             teacher_X  = torch.Tensor(self.x.iloc[idx])
#             student_X = torch.Tensor(self.x[test_stage_features].iloc[idx])
#             if self.y is not None:
#                 y = self.y.values[idx]
#             return teacher_X,student_X,y
#         else:
#             if self.y is None:
#                 test_X = torch.Tensor(self.x.iloc[idx])
#                 return test_X
#             else:
#                 teacher_X  = torch.Tensor(self.x.iloc[idx])
#                 y = self.y.values[idx]
#                 return teacher_X,y
#         return item

In [None]:
class CustomDataset(Dataset):
    """Row-wise tensor view over a feature DataFrame and optional labels.

    Item layout:
        * ``distillation=True``  -> ``(teacher_X, student_X, y)``
        * labels given           -> ``(teacher_X, y)``
        * no labels (inference)  -> ``test_X``

    All tensors are float32; ``y`` has shape ``(1,)``.

    Args:
        data_X: DataFrame of features; every column is fed to the teacher.
        data_y: Labels aligned with ``data_X`` (Series/array), or None for
            unlabeled data.
        distillation: When True each item also carries the student's view of
            the row.
        student_features: Column names that form the student input. Defaults
            to the notebook-level ``test_stage_features2`` list, i.e. the
            one-hot encoded test-stage columns. (``test_stage_features`` still
            names the raw 'COMPONENT_ARBITRARY' / 'YEAR' columns, which are
            dropped by the one-hot cell, so it cannot be used to index the
            encoded frame.)

    Raises:
        ValueError: ``distillation=True`` without labels.
        KeyError: a requested student column is missing from ``data_X``.

    Note:
        The frames are converted to NumPy arrays once in ``__init__``, so
        later in-place edits to ``data_X`` / ``data_y`` are not reflected.
    """

    def __init__(self, data_X, data_y=None, distillation=False, student_features=None):
        super(CustomDataset, self).__init__()
        self.data_X = data_X
        self.data_y = data_y
        self.distillation = distillation

        if distillation and data_y is None:
            raise ValueError("distillation=True requires labels (data_y).")

        # Convert once up front; slicing the DataFrame on every __getitem__
        # call is far slower than indexing an array.
        self._X = np.ascontiguousarray(data_X.to_numpy(dtype=np.float32))
        self._y = None
        if data_y is not None:
            self._y = np.asarray(data_y, dtype=np.float32).reshape(-1, 1)

        self._student_X = None
        if distillation:
            if student_features is None:
                student_features = test_stage_features2
            student_features = list(student_features)
            missing = [c for c in student_features if c not in data_X.columns]
            if missing:
                raise KeyError(f"student feature columns not found in data_X: {missing}")
            self._student_X = np.ascontiguousarray(
                data_X[student_features].to_numpy(dtype=np.float32)
            )

    def __len__(self):
        return len(self.data_X)

    def __getitem__(self, index):
        # torch.tensor copies, so returned tensors never alias the cached arrays.
        features = torch.tensor(self._X[index])
        if self.distillation:
            # Knowledge-distillation training: teacher view, student view, label.
            student_X = torch.tensor(self._student_X[index])
            y = torch.tensor(self._y[index])
            return features, student_X, y
        if self._y is None:
            # Unlabeled (inference) data: features only.
            return features
        y = torch.tensor(self._y[index])
        return features, y

In [None]:
train_dataset = CustomDataset(train_X, train_y,False)
val_dataset = CustomDataset(val_X, val_y,False)

In [None]:
train_dataset[0]

(tensor([ 0.0669, -0.2194, -0.3408, -0.1499, -0.1096,  0.4295, -0.3082, -0.0407,
          1.1846, -0.0626, -0.1179, -0.0117, -0.2461, -0.3737, -0.5897, -0.3583,
         -0.5867, -0.6070, -0.6081, -0.1256, -0.1072, -0.0425, -0.1360, -0.1081,
         -0.2854,  0.0225, -0.4020,  0.6670, -0.1931,  2.7174,  1.5117,  0.2159,
          2.3507, -0.1908, -0.1130, -0.2369, -0.3854, -0.0980, -0.0840, -0.1138,
         -0.0714, -0.1174, -0.1217, -0.1280, -0.1222, -0.1631, -0.1056, -0.5973,
          0.6409, -1.0692, -0.6181, -0.4416,  1.0004, -0.2539, -0.1560, -0.3336,
         -0.2685, -0.2519, -0.2942, -0.2859, -0.3054, -0.3336, -0.2966,  3.7383,
         -0.2631, -0.2606, -0.2201, -0.1642, -0.1890, -0.1622]), tensor([0.]))

In [None]:
train_dl = torch.utils.data.DataLoader(train_dataset,batch_size=2,shuffle=False)
next(iter(train_dl))

[tensor([[ 0.0669, -0.2194, -0.3408, -0.1499, -0.1096,  0.4295, -0.3082, -0.0407,
           1.1846, -0.0626, -0.1179, -0.0117, -0.2461, -0.3737, -0.5897, -0.3583,
          -0.5867, -0.6070, -0.6081, -0.1256, -0.1072, -0.0425, -0.1360, -0.1081,
          -0.2854,  0.0225, -0.4020,  0.6670, -0.1931,  2.7174,  1.5117,  0.2159,
           2.3507, -0.1908, -0.1130, -0.2369, -0.3854, -0.0980, -0.0840, -0.1138,
          -0.0714, -0.1174, -0.1217, -0.1280, -0.1222, -0.1631, -0.1056, -0.5973,
           0.6409, -1.0692, -0.6181, -0.4416,  1.0004, -0.2539, -0.1560, -0.3336,
          -0.2685, -0.2519, -0.2942, -0.2859, -0.3054, -0.3336, -0.2966,  3.7383,
          -0.2631, -0.2606, -0.2201, -0.1642, -0.1890, -0.1622],
         [ 0.5727, -0.2194, -0.3408, -0.1499, -0.1649, -0.4277, -0.3082, -0.0407,
          -0.7973, -0.0626, -0.1179, -0.1089, -0.2539, -0.3737, -0.5897, -0.3583,
          -0.5867, -0.6070, -0.6081, -0.2203, -0.1072, -0.0425, -0.2017, -0.1081,
          -0.2854, -0.2405, -0.40

In [None]:
val_dl = torch.utils.data.DataLoader(val_dataset,batch_size=2,shuffle=False)
next(iter(val_dl))

[tensor([[-0.2420,  0.2880, -0.3408, -0.1499, -0.1511, -0.4085, -0.3082, -0.0407,
          -0.9144, -0.0626, -0.1179, -0.0441, -0.2617, -0.3737, -0.5897, -0.3583,
          -0.5867, -0.6070, -0.6081,  0.2908, -0.1072, -0.0425, -0.2017, -0.1081,
          -0.2513,  0.1978, -0.4020,  0.0217, -0.1931,  2.1481, -0.1966, -0.1936,
           1.7311, -0.1908, -0.1440, -0.2369, -0.3854, -0.0980, -0.0840, -0.1138,
          -0.0714, -0.1174, -0.1217, -0.1280, -0.1222, -0.1631, -0.1056, -0.5973,
           0.7249, -1.0222, -0.6181, -0.4416,  1.0004, -0.2539, -0.1560, -0.3336,
          -0.2685, -0.2519, -0.2942,  3.4978, -0.3054, -0.3336, -0.2966, -0.2675,
          -0.2631, -0.2606, -0.2201, -0.1642, -0.1890, -0.1622],
         [-0.3841, -0.2194, -0.0220, -0.1499, -0.1235, -0.6204, -0.3082, -0.0407,
           1.4121, -0.0626, -0.1179, -0.1089, -0.2694, -0.3737, -0.5897, -0.3583,
          -0.5867, -0.6070, -0.6081, -0.3239, -0.1072, -0.0425, -0.2017, -0.1081,
          -0.2854, -0.2405, -0.35

In [None]:
# Build the test loader from the unlabeled test frame; it previously wrapped
# val_dataset, so `test_dl` silently iterated validation rows.
test_dl = torch.utils.data.DataLoader(CustomDataset(test, None, False),batch_size=2,shuffle=False)
test_dl

<torch.utils.data.dataloader.DataLoader at 0x7fb3aab21050>

In [None]:
data = next(iter(train_dl))
data[0]

tensor([[ 0.0669, -0.2194, -0.3408, -0.1499, -0.1096,  0.4295, -0.3082, -0.0407,
          1.1846, -0.0626, -0.1179, -0.0117, -0.2461, -0.3737, -0.5897, -0.3583,
         -0.5867, -0.6070, -0.6081, -0.1256, -0.1072, -0.0425, -0.1360, -0.1081,
         -0.2854,  0.0225, -0.4020,  0.6670, -0.1931,  2.7174,  1.5117,  0.2159,
          2.3507, -0.1908, -0.1130, -0.2369, -0.3854, -0.0980, -0.0840, -0.1138,
         -0.0714, -0.1174, -0.1217, -0.1280, -0.1222, -0.1631, -0.1056, -0.5973,
          0.6409, -1.0692, -0.6181, -0.4416,  1.0004, -0.2539, -0.1560, -0.3336,
         -0.2685, -0.2519, -0.2942, -0.2859, -0.3054, -0.3336, -0.2966,  3.7383,
         -0.2631, -0.2606, -0.2201, -0.1642, -0.1890, -0.1622],
        [ 0.5727, -0.2194, -0.3408, -0.1499, -0.1649, -0.4277, -0.3082, -0.0407,
         -0.7973, -0.0626, -0.1179, -0.1089, -0.2539, -0.3737, -0.5897, -0.3583,
         -0.5867, -0.6070, -0.6081, -0.2203, -0.1072, -0.0425, -0.2017, -0.1081,
         -0.2854, -0.2405, -0.4020, -0.0705, 

In [None]:
train_X.shape[1]

70

In [None]:
class Teacher(nn.Module):
    """MLP teacher that maps ``in_features`` inputs to a single raw logit.

    The network is three Linear -> BatchNorm1d -> ELU stages with widths
    256, 1024 and 256, followed by a Linear(256, 1) head. No sigmoid is
    applied, so the output is a logit.

    Args:
        in_features: Number of input columns.
    """

    def __init__(self, in_features):
        super(Teacher, self).__init__()
        widths = (in_features, 256, 1024, 256)
        stack = []
        # Hidden stages, created in input-to-output order.
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            stack += [nn.Linear(n_in, n_out), nn.BatchNorm1d(n_out), nn.ELU()]
        # Single-logit output head.
        stack.append(nn.Linear(widths[-1], 1))
        self.classifier = nn.Sequential(*stack)

    def forward(self, x):
        """Return the (batch, 1) logits for ``x``."""
        return self.classifier(x)

In [None]:
model = Teacher(train_X.shape[1]) # SEED 고정을 안해서 값이 다르다.
model(data[0])

tensor([[-1.0071],
        [ 1.1081]], grad_fn=<AddmmBackward0>)

In [None]:
batch_size = 256
loss_fn = torch.nn.BCEWithLogitsLoss()
model = Teacher(train_X.shape[1]).to(device)
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
train_dt = CustomDataset(train_X,train_y)
train_dl = torch.utils.data.DataLoader(train_dt,batch_size=batch_size,shuffle=True)

### run!

In [None]:
data = next(iter(train_dl))
data[1]

In [None]:
def train_loop(train_dl,model,loss_fn,optimizer,device):
    """Run one optimisation pass over ``train_dl`` and return the mean batch loss.

    Args:
        train_dl: Iterable of batches; element 0 is the input, element 1 the target.
        model: Module to train (switched to training mode here).
        loss_fn: Callable ``loss_fn(pred, target)`` returning a scalar tensor.
        optimizer: Optimizer stepping ``model``'s parameters.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(train_dl)``.
    """
    model.train()

    batch_losses = []
    for batch in train_dl:
        inputs = batch[0].to(device)
        targets = batch[1].to(device)

        # Clear stale gradients, then forward / backward / update.
        optimizer.zero_grad()
        batch_loss = loss_fn(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(train_dl)

In [None]:
train_loop(train_dl,model,loss_fn,optimizer,device) # 여기 뭔가 이상함

0.3542283909188377

In [None]:
def competition_metric(true, pred):
    """Return the macro-averaged F1 score of ``pred`` against ``true``."""
    macro_f1 = f1_score(y_true=true, y_pred=pred, average="macro")
    return macro_f1

@torch.no_grad()
def validation_teacher(model, val_loader, criterion, device, threshold=0.35, from_logits=True):
    """Evaluate a teacher model on a validation loader.

    Args:
        model: Network returning one score per row, shape (batch, 1).
        val_loader: Iterable yielding ``(X, y)`` batches.
        criterion: Loss callable ``criterion(pred, target)``.
        device: Device the batches are moved to.
        threshold: Probability cut-off above which a row is labelled 1.
        from_logits: When True (default) the model output is passed through a
            sigmoid before thresholding. The teacher defined alongside this
            function ends in a plain Linear layer and is trained with
            BCEWithLogitsLoss, so its raw output is a logit, not a probability.
            Pass False for a model that already ends in a sigmoid.

    Returns:
        ``(val_loss, val_f1)``: list of per-batch loss values and the macro F1
        of the thresholded predictions.
    """
    model.eval()

    val_loss = []
    pred_scores = []
    true_labels = []

    # Iterate the loader that was passed in (not a notebook-level global).
    for X, y in tqdm(val_loader):
        X = X.float().to(device)
        y = y.float().to(device)

        model_pred = model(X)
        # Use the criterion argument; targets are shaped (batch, 1) like the predictions.
        loss = criterion(model_pred, y.reshape(-1, 1))
        val_loss.append(loss.item())

        scores = torch.sigmoid(model_pred) if from_logits else model_pred
        # Flatten so predictions and labels are both plain 1-D lists of floats.
        pred_scores += scores.reshape(-1).cpu().tolist()
        true_labels += y.reshape(-1).cpu().tolist()

    pred_labels = np.where(np.array(pred_scores) > threshold, 1, 0)
    val_f1 = competition_metric(true_labels, pred_labels)
    return val_loss, val_f1

In [None]:
valid_dt = CustomDataset(val_X,val_y) # 학습데이터를 검증하기 위해서 만듦!
valid_dl = torch.utils.data.DataLoader(valid_dt,batch_size=batch_size,shuffle=False)

In [None]:
# validation_teacher's signature is (model, val_loader, criterion, device);
# passing the loader first is what makes it call .eval() on a DataLoader.
validation_teacher(model, valid_dl, loss_fn, device)

AttributeError: ignored

In [None]:
n_splits = 5
batch_size = 256
epochs = 100
loss_fn = torch.nn.BCEWithLogitsLoss()

In [None]:
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
cv = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)

In [None]:
num_features = train_X.shape[1]

In [None]:
train_X.shape,train_y.shape,test.shape

((11276, 70), (11276,), (6041, 36))

In [None]:
# K-fold training of the teacher: one fresh model per fold, best validation
# F1 of each fold collected in best_score_list.
is_holdout = False  # NOTE(review): defined but not used anywhere in this loop.
seed_everything(SEED)
best_score_list = []

for i,(tri,vai) in enumerate(cv.split(train_X)):

    X_train = train_X.iloc[tri]
    y_train = train_y.iloc[tri]

    X_valid = train_X.iloc[vai]
    y_valid = train_y.iloc[vai]

    # Fresh model and optimizer for every fold.
    model = Teacher(train_X.shape[1]).to(device)
    optimizer = torch.optim.Adam(model.parameters())

    train_dt = CustomDataset(X_train,y_train)
    valid_dt = CustomDataset(X_valid,y_valid)

    train_dl = torch.utils.data.DataLoader(train_dt,batch_size=batch_size,shuffle=True)
    valid_dl = torch.utils.data.DataLoader(valid_dt,batch_size=batch_size,shuffle=False)

    best_score = 0
    patience = 0  # NOTE(review): initialised, but this cell has no early-stopping rule that uses it.

    for e in range(epochs):
        train_loss = train_loop(train_dl, model, loss_fn, optimizer, device)
        # validation_teacher takes (model, val_loader, criterion, device) and
        # returns (per-batch losses, macro F1).
        valid_loss, valid_f1 = validation_teacher(model, valid_dl, loss_fn, device)

        # Keep the best validation F1 seen in this fold.
        if valid_f1 > best_score:
            best_score = valid_f1

    best_score_list.append(best_score)

AttributeError: ignored

In [None]:
next(iter(train_dl))

In [None]:
y_valid

In [None]:
def validation_teacher(model, val_loader, criterion, device, threshold=0.35):
    """Evaluate a teacher model on a validation loader.

    The model output is compared to ``threshold`` directly, so the model is
    expected to return probabilities (e.g. end in a sigmoid).

    Args:
        model: Network returning one score per row, shape (batch, 1).
        val_loader: Iterable yielding ``(X, y)`` batches.
        criterion: Loss callable ``criterion(pred, target)``; targets are
            passed with shape (batch, 1).
        device: Device the batches are moved to.
        threshold: Cut-off above which a row is labelled 1.

    Returns:
        ``(val_loss, val_f1)``: list of per-batch loss values and the macro F1
        of the thresholded predictions.
    """
    model.eval()

    val_loss = []
    pred_scores = []
    true_labels = []

    with torch.no_grad():
        for X, y in tqdm(val_loader):
            X = X.float().to(device)
            y = y.float().to(device)

            model_pred = model(X)

            loss = criterion(model_pred, y.reshape(-1, 1))
            val_loss.append(loss.item())

            # Flatten both sides so the metric receives two 1-D lists rather
            # than a flat prediction list and a nested (n, 1) label list.
            pred_scores += model_pred.reshape(-1).cpu().tolist()
            true_labels += y.reshape(-1).cpu().tolist()

    pred_labels = np.where(np.array(pred_scores) > threshold, 1, 0)
    val_f1 = competition_metric(true_labels, pred_labels)
    return val_loss, val_f1

In [None]:
# Size the teacher from the training frame rather than a hard-coded 70.
model = Teacher(train_X.shape[1])
model.eval()
optimizer = torch.optim.Adam(model.parameters(), lr=CFG['LEARNING_RATE'])
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=1, threshold_mode='abs',min_lr=1e-8, verbose=True)

# train() is declared as (model, optimizer, train_loader, val_loader, scheduler, device);
# the scheduler built above was previously left out of the call, shifting `device`
# into the scheduler slot.
# NOTE(review): train() is defined in a later cell of this notebook and uses nn.BCELoss,
# which needs probabilities in [0, 1] — run that cell first and confirm the Teacher in
# scope ends in a sigmoid.
teacher_model = train(model, optimizer, train_dl, val_dl, scheduler, device)

In [None]:
next(iter(test_dt1))

In [None]:
# The test frame has no labels: pass None explicitly for data_y.
test_dt = CustomDataset(test, None)
test_dl = torch.utils.data.DataLoader(test_dt,batch_size=batch_size,shuffle=False)

# NOTE(review): `test_loop` is not defined anywhere in the visible notebook — confirm
# where it comes from before running this cell.
_,pred = test_loop(test_dl,model,loss_fn,device) # `_` discards the first return value; only pred is kept.
pred

In [None]:
loss_fn = torch.nn.BCEWithLogitsLoss()
epoch = 1000
batch_size = 32 # 8 16 32 64 이렇게 넣어주는 것이 좋다.
num_features = train_X.shape[1]

In [None]:
SEED = 42

In [None]:
from sklearn.model_selection import KFold
cv = KFold(n_splits = 5, shuffle = True, random_state = SEED)

In [None]:
is_holdout = False
for i, (tri,val) in enumerate(cv.split(train_X)):
    print(i)


In [None]:
train_dataset[0]

In [None]:
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False)

In [None]:
class Teacher(nn.Module):
    """MLP teacher that outputs a probability per row.

    Three Linear -> BatchNorm1d -> ReLU stages (256, 1024, 256 units)
    followed by Linear(256, 1) and a Sigmoid, so the output lies in (0, 1)
    and pairs with ``nn.BCELoss``.

    Args:
        in_features: Number of input columns. Defaults to 70, the width of
            the one-hot encoded training frame, so ``Teacher()`` keeps
            working; passing the width explicitly (``Teacher(n)``) is also
            supported.
    """

    def __init__(self, in_features=70):
        super(Teacher, self).__init__()
        self.classifier = nn.Sequential(
            nn.Linear(in_features=in_features, out_features=256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Linear(in_features=256, out_features=1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Linear(in_features=1024, out_features=256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Linear(in_features=256, out_features=1),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return the (batch, 1) probabilities for ``x``."""
        output = self.classifier(x)
        return output

In [None]:
next(iter(train_loader)) 

In [None]:
# reset_seeds(42)

# input_layer = torch.nn.Linear(train.shape[1],16) # x_train.shape[1] = 피처개수 (출력이 8개인 다중회귀)

# data = next(iter(train_dl))
# hidden_layer = input_layer(data["x"])
# hidden_layer

In [None]:
# model = Teacher(train.shape[1])

In [None]:
def train(model, optimizer, train_loader, val_loader, scheduler, device, criterion=None):
    """Train ``model`` for ``CFG["EPOCHS"]`` epochs and return the best-scoring snapshot.

    Args:
        model: Network returning one probability per row, shape (batch, 1).
        optimizer: Optimizer over ``model``'s parameters.
        train_loader: Iterable yielding ``(X, y)`` training batches.
        val_loader: Iterable yielding ``(X, y)`` validation batches.
        scheduler: Scheduler stepped with the validation F1 after every
            epoch, or None to disable scheduling.
        device: Device used for training.
        criterion: Loss module; defaults to ``nn.BCELoss()``.

    Returns:
        A deep copy of the model taken at the epoch with the highest
        validation F1. If the F1 never rises above 0, the model in its final
        state is returned instead of None.
    """
    from copy import deepcopy

    model.to(device)

    best_score = 0
    best_model = None
    if criterion is None:
        criterion = nn.BCELoss()
    criterion = criterion.to(device)

    for epoch in range(CFG["EPOCHS"]):
        train_loss = []

        model.train()
        for X, y in tqdm(train_loader):
            X = X.float().to(device)
            y = y.float().to(device)

            optimizer.zero_grad()

            y_pred = model(X)

            loss = criterion(y_pred, y.reshape(-1, 1))
            loss.backward()

            optimizer.step()

            train_loss.append(loss.item())

        val_loss, val_score = validation_teacher(model, val_loader, criterion, device)

        print(f'Epoch [{epoch}], Train Loss : [{np.mean(train_loss) :.5f}] Val Loss : [{np.mean(val_loss) :.5f}] Val F1 Score : [{val_score:.5f}]')

        if scheduler is not None:
            scheduler.step(val_score)

        if best_score < val_score:
            # Snapshot the weights: keeping a reference to `model` would keep
            # changing as training continues, so the "best" model would just
            # be the last-epoch model.
            best_model = deepcopy(model)
            best_score = val_score

    # Never hand back None: fall back to the trained model if no epoch improved the score.
    return best_model if best_model is not None else model

In [None]:
def competition_metric(true, pred):
    """Score predictions with the macro-averaged F1.

    Args:
        true: Ground-truth labels.
        pred: Predicted labels.

    Returns:
        The value of ``f1_score(..., average="macro")``.
    """
    score = f1_score(y_true=true, y_pred=pred, average="macro")
    return score

def validation_teacher(model, val_loader, criterion, device, threshold=0.35):
    """Compute validation losses and macro F1 for a probability-output teacher.

    The model output is compared to ``threshold`` as-is, so the model must
    return probabilities (the Teacher in this part of the notebook ends in
    ``nn.Sigmoid``).

    Args:
        model: Network returning one probability per row, shape (batch, 1).
        val_loader: Iterable yielding ``(X, y)`` batches.
        criterion: Loss callable ``criterion(pred, target)``; targets are
            passed with shape (batch, 1).
        device: Device the batches are moved to.
        threshold: Probability cut-off above which a row is labelled 1.

    Returns:
        ``(val_loss, val_f1)``: list of per-batch loss values and the macro F1
        of the thresholded predictions.
    """
    model.eval()

    val_loss = []
    pred_scores = []
    true_labels = []

    with torch.no_grad():
        for X, y in tqdm(val_loader):
            X = X.float().to(device)
            y = y.float().to(device)

            model_pred = model(X)

            loss = criterion(model_pred, y.reshape(-1, 1))
            val_loss.append(loss.item())

            # Collect predictions and labels as flat float lists so both
            # inputs to the metric are 1-D.
            pred_scores += model_pred.reshape(-1).cpu().tolist()
            true_labels += y.reshape(-1).cpu().tolist()

    pred_labels = np.where(np.array(pred_scores) > threshold, 1, 0)
    val_f1 = competition_metric(true_labels, pred_labels)
    return val_loss, val_f1

In [None]:
model = Teacher()
model.eval()
optimizer = torch.optim.AdamW(model.parameters(), lr=CFG['LEARNING_RATE'])
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=1, threshold_mode='abs',min_lr=1e-8, verbose=True)

teacher_model = train(model, optimizer, train_loader, val_loader, scheduler, device)

In [None]:
class Student(nn.Module):
    """MLP student that sees only the test-stage features and outputs a probability.

    Three Linear -> BatchNorm1d -> ReLU stages (128, 512, 128 units)
    followed by Linear(128, 1) and a Sigmoid.

    Args:
        in_features: Number of input columns. Defaults to 36, the width of
            the test frame after one-hot encoding in this notebook
            (``test.shape == (6041, 36)``); a first layer sized 18 cannot
            consume those rows. Pass another width if the feature set changes.
    """

    def __init__(self, in_features=36):
        super(Student, self).__init__()
        self.classifier = nn.Sequential(
            nn.Linear(in_features=in_features, out_features=128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Linear(in_features=512, out_features=128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=1),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return the (batch, 1) probabilities for ``x``."""
        output = self.classifier(x)
        return output

In [None]:
def distillation(student_logits, labels, teacher_logits, alpha):
    """Blend the hard-label loss and the teacher-matching loss for the student.

    Both terms are binary cross-entropy computed with ``nn.BCELoss``, so the
    student and teacher tensors must hold values in [0, 1].

    Args:
        student_logits: Student outputs.
        labels: Ground-truth labels; reshaped to (-1, 1) before use.
        teacher_logits: Teacher outputs, used as soft targets.
        alpha: Weight of the hard-label term; the soft term gets ``1 - alpha``.

    Returns:
        Scalar tensor ``alpha * hard + (1 - alpha) * soft``.
    """
    bce = nn.BCELoss()
    hard_targets = labels.reshape(-1, 1)
    soft_term = bce(student_logits, teacher_logits)
    hard_term = bce(student_logits, hard_targets)
    return alpha * hard_term + (1-alpha) * soft_term

In [None]:
def distill_loss(output, target, teacher_output, loss_fn=None, opt=None, alpha=0.1):
    """Compute the distillation loss for one batch and optionally take an optimizer step.

    Args:
        output: Student outputs for the batch.
        target: Ground-truth labels for the batch.
        teacher_output: Teacher outputs for the batch.
        loss_fn: Callable ``loss_fn(output, target, teacher_output, alpha=...)``
            returning a scalar tensor. Defaults to the notebook's
            ``distillation`` function, looked up at call time.
        opt: Optimizer to step. When None (default) no gradients are computed
            and no step is taken, which is what evaluation needs. The default
            is deliberately not a notebook-level optimizer: a default value is
            bound once, when the function is defined, and would capture
            whichever optimizer happened to exist at that moment.
        alpha: Weight forwarded to ``loss_fn``.

    Returns:
        The batch loss as a Python float.
    """
    if loss_fn is None:
        loss_fn = distillation

    loss_b = loss_fn(output, target, teacher_output, alpha=alpha)

    if opt is not None:
        opt.zero_grad()
        loss_b.backward()
        opt.step()

    return loss_b.item()

In [None]:
def student_train(s_model, t_model, optimizer, train_loader, val_loader, scheduler, device):
    """Train the student against labels and a frozen teacher.

    Args:
        s_model: Student network; receives the student features of each batch.
        t_model: Trained teacher network; only run in eval mode under no_grad.
        optimizer: Optimizer over the student's parameters.
        train_loader: Iterable yielding ``(X_t, X_s, y)`` training batches.
        val_loader: Iterable yielding ``(X_t, X_s, y)`` validation batches.
        scheduler: Scheduler stepped with the validation F1 after every
            epoch, or None to disable scheduling.
        device: Device used for training.

    Returns:
        A deep copy of the student taken at the epoch with the highest
        validation F1. If the F1 never rises above 0, the student in its
        final state is returned instead of None.
    """
    from copy import deepcopy

    s_model.to(device)
    t_model.to(device)

    best_score = 0
    best_model = None

    for epoch in range(CFG["EPOCHS"]):
        train_loss = []
        s_model.train()
        t_model.eval()

        for X_t, X_s, y in tqdm(train_loader):
            X_t = X_t.float().to(device)
            X_s = X_s.float().to(device)
            y = y.float().to(device)

            optimizer.zero_grad()

            output = s_model(X_s)
            # The teacher only provides targets; no gradients are needed for it.
            with torch.no_grad():
                teacher_output = t_model(X_t)

            # distill_loss runs backward() and optimizer.step() itself.
            loss_b = distill_loss(output, y, teacher_output, loss_fn=distillation, opt=optimizer)

            train_loss.append(loss_b)

        val_loss, val_score = validation_student(s_model, t_model, val_loader, distill_loss, device)
        print(f'Epoch [{epoch}], Train Loss : [{np.mean(train_loss) :.5f}] Val Loss : [{np.mean(val_loss) :.5f}] Val F1 Score : [{val_score:.5f}]')

        if scheduler is not None:
            scheduler.step(val_score)

        if best_score < val_score:
            # Snapshot the weights: a bare reference to s_model would keep
            # changing as training continues.
            best_model = deepcopy(s_model)
            best_score = val_score

    # Never hand back None: fall back to the trained student if no epoch improved the score.
    return best_model if best_model is not None else s_model

In [None]:
def validation_student(s_model, t_model, val_loader, criterion, device, threshold=0.35):
    """Evaluate the student on a validation loader.

    Args:
        s_model: Student network returning one probability per row, shape (batch, 1).
        t_model: Teacher network whose outputs feed the distillation loss.
        val_loader: Iterable yielding ``(X_t, X_s, y)`` batches.
        criterion: Batch-loss callable with ``distill_loss``'s signature; it is
            called as ``criterion(pred, y, teacher_out, loss_fn=distillation,
            opt=None)`` and must return a float.
        device: Device the batches are moved to.
        threshold: Probability cut-off above which a row is labelled 1.

    Returns:
        ``(val_loss, val_f1)``: list of per-batch loss values and the macro F1
        of the thresholded student predictions.
    """
    s_model.eval()
    t_model.eval()

    val_loss = []
    pred_scores = []
    true_labels = []

    with torch.no_grad():
        for X_t, X_s, y in tqdm(val_loader):
            X_t = X_t.float().to(device)
            X_s = X_s.float().to(device)
            y = y.float().to(device)

            model_pred = s_model(X_s)
            teacher_output = t_model(X_t)

            # Use the criterion that was passed in; opt=None means evaluate only.
            loss_b = criterion(model_pred, y, teacher_output, loss_fn=distillation, opt=None)
            val_loss.append(loss_b)

            # Flatten both sides so the metric receives two 1-D lists.
            pred_scores += model_pred.reshape(-1).cpu().tolist()
            true_labels += y.reshape(-1).cpu().tolist()

    pred_labels = np.where(np.array(pred_scores) > threshold, 1, 0)
    val_f1 = competition_metric(true_labels, pred_labels)
    return val_loss, val_f1

In [None]:
train_dataset = CustomDataset(train_X, train_y, True)
val_dataset = CustomDataset(val_X, val_y, True)

train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False)

In [None]:
student_model = Student()
student_model.eval()
optimizer = torch.optim.AdamW(student_model.parameters(), lr=CFG['LEARNING_RATE'])
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=1, threshold_mode='abs',min_lr=1e-8, verbose=True)

best_student_model = student_train(student_model, teacher_model, optimizer, train_loader, val_loader, scheduler, device)


In [None]:
def choose_threshold(model, val_loader, device):
    """Pick the cut-off that maximises macro F1 on the validation loader.

    Args:
        model: Network applied to the second element of each batch.
        val_loader: Iterable yielding 3-tuples; the first element is ignored,
            the second is the model input and the third the labels.
        device: Device used for the forward passes.

    Returns:
        ``(best_thr, best_score)``. ``best_thr`` is None when no candidate
        scores above 0.
    """
    model.to(device)
    model.eval()

    candidate_thresholds = [0.1, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
    scores = []
    targets = []

    # Collect the model outputs and labels for the whole loader first.
    with torch.no_grad():
        for _, feature_batch, label_batch in tqdm(iter(val_loader)):
            feature_batch = feature_batch.float().to(device)
            label_batch = label_batch.float().to(device)

            batch_scores = model(feature_batch).squeeze(1).to('cpu')
            scores.extend(batch_scores.tolist())
            targets.extend(label_batch.tolist())

    # Then score every candidate cut-off against the same predictions.
    best_thr, best_score = None, 0
    score_array = np.array(scores)
    for thr in candidate_thresholds:
        binarised = np.where(score_array > thr, 1, 0)
        thr_score = competition_metric(targets, binarised)
        if thr_score > best_score:
            best_score, best_thr = thr_score, thr
    return best_thr, best_score

In [None]:
best_threshold, best_score = choose_threshold(best_student_model, val_loader, device)
print(f'Best Threshold : [{best_threshold}], Score : [{best_score:.5f}]')

# 다시 제출 = [0.56523] -> 0.5569827357
# 

In [None]:
test_datasets = CustomDataset(test, None, False)
test_loaders = DataLoader(test_datasets, batch_size = CFG['BATCH_SIZE'], shuffle=False)

In [None]:
def inference(model, test_loader, threshold, device):
    """Predict binary labels for every row served by ``test_loader``.

    Args:
        model: Network returning one score per row, shape (batch, 1).
        test_loader: Iterable yielding feature batches (no labels).
        threshold: Cut-off above which a row is labelled 1.
        device: Device used for the forward passes.

    Returns:
        1-D NumPy array of 0/1 predictions in loader order.
    """
    model.to(device)
    model.eval()

    test_predict = []
    with torch.no_grad():
        for x in tqdm(test_loader):
            x = x.float().to(device)
            model_pred = model(x)

            # Append plain floats. `list += tensor` would extend the list with
            # 0-d tensors, which is slow to convert and inconsistent with the
            # validation loops.
            test_predict += model_pred.reshape(-1).cpu().tolist()

    test_predict = np.where(np.array(test_predict) > threshold, 1, 0)
    print('Done.')
    return test_predict

In [None]:
preds = inference(best_student_model, test_loaders, best_threshold, device)

In [None]:
submit = pd.read_csv('/content/drive/MyDrive/sample_submission.csv')
submit['Y_LABEL'] = preds
submit.head()

In [None]:
submit.to_csv('/content/drive/MyDrive/submit.csv', index=False)