# 1DCNN + LSTM 결합 model

In [1]:
import pandas as pd 
import pandas_datareader as pdr   
import numpy as np
from sklearn.preprocessing import StandardScaler



In [2]:
import random
import os
import torch
random.seed(123)
np.random.seed(123)
os.environ["PYTHONHASHSEED"] = str(123)
torch.manual_seed(123)
torch.cuda.manual_seed(123)               # type: ignore
torch.backends.cudnn.deterministic = True  # type: ignore
torch.backends.cudnn.benchmark = True      # type: ignore

## 1. 데이터 로드 및 전처리

In [3]:
final = pd.read_csv('/home/jbj4278/ETH_data/final_dateadd') #feature포함 데이터

In [4]:
col = ['Date','Open', 'High', 'Low', 'Close', 'Volume','BTC_close', 'DXY', 'BTCD', 'Kimchi_premium', 'S&P500', 'Ethereum DeFi',
   'News_freq','signal']

In [5]:
final = final.loc[:,col]

In [6]:
final.loc[final['signal']==0.0 , "signal"] = 2   # 보합 > 2로
final.loc[final['signal']==1.0 , "signal"] = 0  
 # 상승 > 0으로
final.loc[final['signal']==-1.0 , "signal"] = 1 #하락 > 1로
#(상승, 하락, 보합) > (0 ,1, 2)

In [7]:
final['signal'].value_counts()

0.0    188025
1.0    171867
2.0    163209
Name: signal, dtype: int64

#### 시계열의 연속성을 고려하기 위해 window_size를 설정하여 data를 3차원으로 변경하는 함수

In [8]:
def make_dataset(data, label, window_size):
    feature_list = []
    label_list = []
    for i in range(len(data) - window_size):
        feature_list.append(np.array(data.iloc[i:i+window_size]))
        label_list.append(np.array(label.iloc[i+window_size]))
    return np.array(feature_list), np.array(label_list)

In [9]:
window_size = 10

In [10]:
final.shape

(523101, 14)

#### train+val/ test split

In [11]:
len_test = int(len(final)*0.2)
test_set = final[-len_test:]
tr_set = final[:-len_test]

In [12]:
test_set.shape

(104620, 14)

In [13]:
test_set

Unnamed: 0,Date,Open,High,Low,Close,Volume,BTC_close,DXY,BTCD,Kimchi_premium,S&P500,Ethereum DeFi,News_freq,signal
418481,2022-01-09 20:55:00,3934000,3934000,3931000,3933000,7.091395,52343000.0,95.739,40.435605,-12.94548,4677.04,1.401528e+11,26.0,1.0
418482,2022-01-09 20:56:00,3933000,3935000,3931000,3932000,14.144101,52315000.0,95.739,40.435605,-12.94548,4677.04,1.401528e+11,26.0,1.0
418483,2022-01-09 20:57:00,3932000,3934000,3930000,3933000,19.051929,52313000.0,95.739,40.435605,-12.94548,4677.04,1.401528e+11,26.0,1.0
418484,2022-01-09 20:58:00,3933000,3935000,3930000,3935000,11.908381,52313000.0,95.739,40.435605,-12.94548,4677.04,1.401528e+11,26.0,1.0
418485,2022-01-09 20:59:00,3935000,3936000,3934000,3935000,7.690722,52292000.0,95.739,40.435605,-12.94548,4677.04,1.401528e+11,26.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
523096,2022-03-23 15:15:00,3666000,3667000,3665000,3665000,39.688187,51801000.0,98.793,42.584513,15.24980,4488.22,1.159212e+11,57.0,2.0
523097,2022-03-23 15:16:00,3666000,3669000,3665000,3668000,76.731647,51753000.0,98.793,42.584513,15.24980,4488.22,1.159212e+11,57.0,2.0
523098,2022-03-23 15:17:00,3668000,3671000,3667000,3669000,96.039465,51782000.0,98.793,42.584513,15.24980,4488.22,1.159212e+11,57.0,2.0
523099,2022-03-23 15:18:00,3667000,3674000,3663000,3663000,91.949217,51780000.0,98.793,42.584513,15.24980,4488.22,1.159212e+11,57.0,2.0


In [14]:
tr_set.shape

(418481, 14)

In [15]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torchmetrics.functional import f1_score,precision_recall
device = torch.device("cpu")
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [16]:
import wandb
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mgyeongmocho[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [17]:
import time

def train(dataloader):
 
    model.train()
    total_acc, total_count = 0, 0
    start_time = time.time()

    
    loss_list = []
    for idx, (feature, label) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(feature.to(device))
        loss = criterion(predicted_label, label.to(device))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        
        total_count += label.size(0)
        loss_list.append(loss.item())
        
        if idx+1 % len(feature) == 0 and idx > 0:
            accuracy = total_acc/total_count
            loss_hist = np.mean(loss_list)
            elapsed = time.time() - start_time
            print(f'| epoch {epoch} | {idx}/{len(dataloader)} batches '
                f'| accuracy {accuracy} | loss {loss_hist}')
            total_acc, total_count = 0, 0
            start_time = time.time()

    wandb.log({'train_loss':np.mean(loss_list),'train_acc':total_acc/total_count})  
    return  total_acc/total_count,np.mean(loss_list)
    
            

def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0
    loss_list = []
    
    score,precision,recall = 0, 0, 0
    with torch.no_grad():
        for idx, (feature, label) in enumerate(dataloader):
            feature, label = feature.to(device),label.to(device)
            predicted_label = model(feature.to(device))
            loss = criterion(predicted_label, label.to(device))
            
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
            loss_list.append(loss.item())
            
            
            
            score += f1_score(predicted_label.argmax(1).to(device), label,num_classes=3)
            precision += precision_recall(predicted_label.argmax(1).to(device), label, average='macro', num_classes=3)[0]
            recall += precision_recall(predicted_label.argmax(1).to(device), label, average='macro', num_classes=3)[1]
      
        
        score = score / len(loss_list)
        precision = precision / len(loss_list)
        recall = recall / len(loss_list)
            
    #wandb.log({'val_loss':np.mean(loss_list), 'val_acc':total_acc/total_count,'f1_score':score,
              #'precision':precision,'recall':recall})
    
    return total_acc/total_count, np.mean(loss_list),score


            
   

In [18]:
class EarlyStopping:
    """주어진 patience 이후로 validation loss가 개선되지 않으면 학습을 조기 중지"""
    def __init__(self,path, patience=10, verbose=False, delta=0,):
        """
        Args:
            patience (int): validation loss가 개선된 후 기다리는 기간
                            Default: 7
            verbose (bool): True일 경우 각 validation loss의 개선 사항 메세지 출력
                            Default: False
            delta (float): 개선되었다고 인정되는 monitered quantity의 최소 변화
                            Default: 0
            path (str): checkpoint저장 경로
                            Default: 'checkpoint.pt'
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta
        self.path = path
        

    def __call__(self, val_loss, model):

        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''validation loss가 감소하면 모델을 저장한다.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [19]:
class BlockingTimeSeriesSplit():
    def __init__(self, n_splits):
        self.n_splits = n_splits
    
    def get_n_splits(self, groups):
        return self.n_splits
    
    def split(self, X, y=None, groups=None):
        n_samples = len(X)
        k_fold_size = n_samples // self.n_splits
        indices = np.arange(n_samples)
    
        margin = 0
        for i in range(self.n_splits):
            start = i * k_fold_size
            stop = start + k_fold_size
            mid = int(0.9 * (stop - start)) + start
            yield indices[start: mid], indices[mid + margin: stop]


## 모델 정의

In [20]:
class Conv1dlstm(nn.Module):
    def __init__(self, in_channel=12, out_channels=6):
        super(Conv1dlstm, self).__init__()
        self.conv1d_1 = nn.Conv1d(in_channels=in_channel,
                                out_channels=out_channels,
                                kernel_size=6,
                                stride=1)
                                
        
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)

        self.lstm = nn.LSTM(input_size=6,
                            hidden_size=20,
                            num_layers=1,
                            batch_first = True
                            )
        

      
        self.dense = nn.Linear(20, 3)

    def forward(self, x):
	# Raw x shape : (B, S, F) => (B, 30, 33)
        
        # Shape : (B, F, S) => (B, 33, 30)
        x = x.transpose(1, 2)
        
        # Shape : (B, F, S) == (B, C, S) // C = channel => (B, 33, 21)
        x = self.conv1d_1(x)
        
        x = self.relu(x)
    

        # Shape : (B, 21, 33)
        x = x.transpose(2,1)

        
        
        # LSTM 순전파
        self.lstm.flatten_parameters()
        
        
        out, (hn, cn) = self.lstm(x) # out : (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE)  , hn이 lstm의 마지막 hidden state
       
        x = out[:, -1, :]
        

       
        
        # Shape : (B, H) => (B, 10)
        
        
        # Shape : (B, O) // O = output => (B, 3)
        x = self.dense(x)
        

        return x
        


In [21]:

lr = 0.0001
batch_size = 32


## 모델 학습 (cross validation)

In [34]:
btss = BlockingTimeSeriesSplit(n_splits=5)
n_epochs = 150
scale_cols = ['Open', 'High', 'Low', 'Close', 'Volume', 'BTC_close', 'DXY', 'BTCD',
       'Kimchi_premium', 'S&P500', 'Ethereum DeFi', 'News_freq']
score_list = []
val_loss_list = []
voting_list = []
#cross validation

wandb.init(project='cnnlstm', entity='gyeongmoCho',name='cnnlstm_hidden4x_smallbatch_')
#wandb.watch(model, criterion, log = "all" )


for idx,(tr_idx, val_idx) in enumerate(btss.split(tr_set)):
    
    train_data, val_data = tr_set.iloc[tr_idx], tr_set.iloc[val_idx]
    
    
    scaler = StandardScaler()
    scaler.fit(train_data[scale_cols])#for문을 돌면서 각 train, val을 스케일링ㅇ
    train_feature = scaler.transform(train_data[scale_cols])
    train_feature = pd.DataFrame(train_feature)                                 
    val_feature = scaler.transform(val_data[scale_cols])
    val_feature = pd.DataFrame(val_feature)   

    
    train_feature, train_label = make_dataset(train_feature, train_data['signal'], window_size)
    val_feature, val_label = make_dataset(val_feature, val_data['signal'], window_size)

    
    #모델 선언 
    
    model = Conv1dlstm().to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    criterion = nn.CrossEntropyLoss()
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100, eta_min=0,verbose=True)
    total_accu = None


    print(model)
    
    early_stopping = EarlyStopping(patience = 10, verbose = True)



    #스케일링된 데이터를 torch Tensor로 변경
    val_feature = torch.FloatTensor(val_feature).to(device)
    val_label = torch.LongTensor(val_label).to(device)
    train_feature = torch.FloatTensor(train_feature).to(device)
    train_label  = torch.LongTensor(train_label).to(device)

    train_set = TensorDataset(train_feature,train_label)
    val_set = TensorDataset(val_feature,val_label)
    
    #데이터로더 사용
    BATCH_SIZE = batch_size
    train_dataloader = DataLoader(train_set , batch_size=BATCH_SIZE,shuffle=False)
    valid_dataloader = DataLoader(val_set, batch_size=BATCH_SIZE,shuffle=False)

    print(f"train_feature:{train_feature.shape},val_feature:{val_feature.shape}" )
    

    for epoch in range(1, n_epochs + 1):
        # 현재 learning rate 출력
        print(optimizer.param_groups[0]['lr'])
        epoch_start_time = time.time()
        accu_train, train_loss = train(train_dataloader)
        accu_val,val_loss,score = evaluate(valid_dataloader)
        if total_accu is not None and total_accu > accu_val:
            scheduler.step()
        else:
            total_accu = accu_val
        print('-' * 59)
        print(f'| end of epoch {epoch} | time: {time.time()- epoch_start_time}s | '
            f'valid accuracy {accu_val} | valid loss {val_loss}'
            ) 

        print('-' * 59)
        
        early_stopping(val_loss, model)
        if early_stopping.early_stop:
            print("Early stopping")
            break
    score_list.append(score)
    print("save score")

    
    PATH = f"/home/jbj4278/ETH_data/cnnlstm_hidden4x_smallbatch__{idx}.pt"
    torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),


                }, PATH)
    print('-' * 59)
    print('Fold Finish')
    print('-' * 59)

score_list = list(map(float, score_list))
final_f1_score = np.mean(score_list)
print(final_f1_score)
wandb.log({'mean_f1_score':final_f1_score})



        
    

   

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

Adjusting learning rate of group 0 to 1.0000e-04.
Conv1dlstm(
  (conv1d_1): Conv1d(12, 12, kernel_size=(6,), stride=(1,))
  (relu): ReLU()
  (dropout): Dropout(p=0.2, inplace=False)
  (lstm): LSTM(12, 40, batch_first=True)
  (dense): Linear(in_features=40, out_features=3, bias=True)
)
train_feature:torch.Size([75316, 10, 12]),val_feature:torch.Size([8360, 10, 12])
0.0001
-----------------------------------------------------------
| end of epoch 1 | time: 7.4588422775268555s | valid accuracy 0.29832535885167466 | valid loss 1.1203265563222289
-----------------------------------------------------------
Validation loss decreased (inf --> 1.120327).  Saving model ...
0.0001
-----------------------------------------------------------
| end of epoch 2 | time: 7.566527366638184s | valid accuracy 0.32057416267942584 | valid loss 1.129225084572348
-----------------------------------------------------------
EarlyStopping counter: 1 out of 10
0.0001
Adjusting learning rate of group 0 to 9.9975e-0

-----------------------------------------------------------
| end of epoch 13 | time: 7.54605770111084s | valid accuracy 0.47081339712918663 | valid loss 1.085580562362234
-----------------------------------------------------------
Validation loss decreased (1.087190 --> 1.085581).  Saving model ...
0.0001
-----------------------------------------------------------
| end of epoch 14 | time: 7.569854736328125s | valid accuracy 0.4717703349282297 | valid loss 1.084099128501106
-----------------------------------------------------------
Validation loss decreased (1.085581 --> 1.084099).  Saving model ...
0.0001
Adjusting learning rate of group 0 to 9.9975e-05.
-----------------------------------------------------------
| end of epoch 15 | time: 7.528924942016602s | valid accuracy 0.47165071770334926 | valid loss 1.0827377502244848
-----------------------------------------------------------
Validation loss decreased (1.084099 --> 1.082738).  Saving model ...
9.997532801828658e-05
---------

-----------------------------------------------------------
| end of epoch 39 | time: 7.646445274353027s | valid accuracy 0.4717703349282297 | valid loss 1.0693671937662226
-----------------------------------------------------------
Validation loss decreased (1.069496 --> 1.069367).  Saving model ...
9.997532801828658e-05
-----------------------------------------------------------
| end of epoch 40 | time: 7.595901966094971s | valid accuracy 0.4717703349282297 | valid loss 1.0692603164956769
-----------------------------------------------------------
Validation loss decreased (1.069367 --> 1.069260).  Saving model ...
9.997532801828658e-05
-----------------------------------------------------------
| end of epoch 41 | time: 7.588608264923096s | valid accuracy 0.4717703349282297 | valid loss 1.0691732002123622
-----------------------------------------------------------
Validation loss decreased (1.069260 --> 1.069173).  Saving model ...
9.997532801828658e-05
----------------------------

Adjusting learning rate of group 0 to 9.8429e-05.
-----------------------------------------------------------
| end of epoch 9 | time: 7.511639833450317s | valid accuracy 0.3043062200956938 | valid loss 1.1048969735626046
-----------------------------------------------------------
EarlyStopping counter: 8 out of 10
9.842915805643157e-05
Adjusting learning rate of group 0 to 9.8015e-05.
-----------------------------------------------------------
| end of epoch 10 | time: 7.539645195007324s | valid accuracy 0.3041866028708134 | valid loss 1.1058103014494627
-----------------------------------------------------------
EarlyStopping counter: 9 out of 10
9.801468428384717e-05
Adjusting learning rate of group 0 to 9.7553e-05.
-----------------------------------------------------------
| end of epoch 11 | time: 7.5601770877838135s | valid accuracy 0.30406698564593304 | valid loss 1.1066987682844847
-----------------------------------------------------------
EarlyStopping counter: 10 out of 10


-----------------------------------------------------------
| end of epoch 21 | time: 7.4942708015441895s | valid accuracy 0.47452153110047846 | valid loss 1.0885606908616219
-----------------------------------------------------------
Validation loss decreased (1.089517 --> 1.088561).  Saving model ...
9.879583809693738e-05
-----------------------------------------------------------
| end of epoch 22 | time: 7.39861273765564s | valid accuracy 0.4842105263157895 | valid loss 1.0876586660174012
-----------------------------------------------------------
Validation loss decreased (1.088561 --> 1.087659).  Saving model ...
9.879583809693738e-05
Adjusting learning rate of group 0 to 9.8429e-05.
-----------------------------------------------------------
| end of epoch 23 | time: 7.544625759124756s | valid accuracy 0.4717703349282297 | valid loss 1.0868065957804673
-----------------------------------------------------------
Validation loss decreased (1.087659 --> 1.086807).  Saving model ...

Adjusting learning rate of group 0 to 8.0645e-05.
-----------------------------------------------------------
| end of epoch 44 | time: 7.5542311668396s | valid accuracy 0.4666267942583732 | valid loss 1.0769579235379023
-----------------------------------------------------------
Validation loss decreased (1.077198 --> 1.076958).  Saving model ...
8.064535268264887e-05
Adjusting learning rate of group 0 to 7.9389e-05.
-----------------------------------------------------------
| end of epoch 45 | time: 7.485866546630859s | valid accuracy 0.4666267942583732 | valid loss 1.0767319034074099
-----------------------------------------------------------
Validation loss decreased (1.076958 --> 1.076732).  Saving model ...
7.938926261462371e-05
Adjusting learning rate of group 0 to 7.8104e-05.
-----------------------------------------------------------
| end of epoch 46 | time: 7.47574520111084s | valid accuracy 0.4666267942583732 | valid loss 1.076520005255255
---------------------------------

Adjusting learning rate of group 0 to 4.8429e-05.
-----------------------------------------------------------
| end of epoch 66 | time: 7.587134122848511s | valid accuracy 0.408133971291866 | valid loss 1.0743478902878651
-----------------------------------------------------------
Validation loss decreased (1.074394 --> 1.074348).  Saving model ...
4.842946204609364e-05
Adjusting learning rate of group 0 to 4.6860e-05.
-----------------------------------------------------------
| end of epoch 67 | time: 7.489463567733765s | valid accuracy 0.3937799043062201 | valid loss 1.0743051501175829
-----------------------------------------------------------
Validation loss decreased (1.074348 --> 1.074305).  Saving model ...
4.686047402353438e-05
Adjusting learning rate of group 0 to 4.5295e-05.
-----------------------------------------------------------
| end of epoch 68 | time: 7.6226725578308105s | valid accuracy 0.37272727272727274 | valid loss 1.0742658096870392
----------------------------

Adjusting learning rate of group 0 to 1.6934e-05.
-----------------------------------------------------------
| end of epoch 88 | time: 7.391993284225464s | valid accuracy 0.3047846889952153 | valid loss 1.0739714883211005
-----------------------------------------------------------
Validation loss decreased (1.073974 --> 1.073971).  Saving model ...
1.693440673381744e-05
Adjusting learning rate of group 0 to 1.5773e-05.
-----------------------------------------------------------
| end of epoch 89 | time: 7.5153892040252686s | valid accuracy 0.3050239234449761 | valid loss 1.0739693195765254
-----------------------------------------------------------
Validation loss decreased (1.073971 --> 1.073969).  Saving model ...
1.577264470356559e-05
Adjusting learning rate of group 0 to 1.4645e-05.
-----------------------------------------------------------
| end of epoch 90 | time: 7.428787708282471s | valid accuracy 0.3058612440191388 | valid loss 1.0739676001417728
----------------------------

-----------------------------------------------------------
| end of epoch 2 | time: 7.592371940612793s | valid accuracy 0.4742822966507177 | valid loss 1.080344277711315
-----------------------------------------------------------
Validation loss decreased (1.081319 --> 1.080344).  Saving model ...
0.0001
-----------------------------------------------------------
| end of epoch 3 | time: 7.5555009841918945s | valid accuracy 0.4747607655502392 | valid loss 1.079443926347121
-----------------------------------------------------------
Validation loss decreased (1.080344 --> 1.079444).  Saving model ...
0.0001
-----------------------------------------------------------
| end of epoch 4 | time: 7.516641855239868s | valid accuracy 0.4759569377990431 | valid loss 1.0786118900957908
-----------------------------------------------------------
Validation loss decreased (1.079444 --> 1.078612).  Saving model ...
0.0001
-----------------------------------------------------------
| end of epoch 5 

-----------------------------------------------------------
| end of epoch 28 | time: 7.307977914810181s | valid accuracy 0.47811004784688993 | valid loss 1.071404886154728
-----------------------------------------------------------
EarlyStopping counter: 1 out of 10
9.997532801828658e-05
-----------------------------------------------------------
| end of epoch 29 | time: 7.3443732261657715s | valid accuracy 0.47811004784688993 | valid loss 1.071424197604638
-----------------------------------------------------------
EarlyStopping counter: 2 out of 10
9.997532801828658e-05
-----------------------------------------------------------
| end of epoch 30 | time: 7.306709051132202s | valid accuracy 0.47811004784688993 | valid loss 1.0714590042147019
-----------------------------------------------------------
EarlyStopping counter: 3 out of 10
9.997532801828658e-05
-----------------------------------------------------------
| end of epoch 31 | time: 7.215817928314209s | valid accuracy 0.4781

## f1-score의 fold별 평균이 가장 좋았던 모델로 Test set 성능 측정
outchaanel = 6,hidden state = 20 , batch = 32, lr = 0.0001, mean_f1score = 0.3972

#### test_set dataloader 생성

In [22]:
scale_cols = ['Open', 'High', 'Low', 'Close', 'Volume', 'BTC_close', 'DXY', 'BTCD',
       'Kimchi_premium', 'S&P500', 'Ethereum DeFi', 'News_freq']
scaler = StandardScaler()
scaler.fit(tr_set[scale_cols])#for문을 돌면서 각 train, val을 스케일링
tr_feature = scaler.transform(tr_set[scale_cols])
tr_feature = pd.DataFrame(tr_feature)                                 
test_feature = scaler.transform(test_set[scale_cols])
test_feature = pd.DataFrame(test_feature)   


test_feature, test_label = make_dataset(test_feature, test_set['signal'], window_size)


#스케일링된 데이터를 torch Tensor로 변경
test_feature = torch.FloatTensor(test_feature).to(device)
test_label  = torch.LongTensor(test_label).to(device)

final_test_set = TensorDataset(test_feature,test_label)


#데이터로더 사용
BATCH_SIZE = 32
test_dataloader = DataLoader(final_test_set , batch_size=BATCH_SIZE,shuffle=False)


#### train_set dataloader 생성

In [23]:


scale_cols = ['Open', 'High', 'Low', 'Close', 'Volume', 'BTC_close', 'DXY', 'BTCD',
       'Kimchi_premium', 'S&P500', 'Ethereum DeFi', 'News_freq']



   
scaler = StandardScaler()
scaler.fit(tr_set[scale_cols])#for문을 돌면서 각 train, val을 스케일링ㅇ
train_feature = scaler.transform(tr_set[scale_cols])
train_feature = pd.DataFrame(train_feature)                                 
 

    
train_feature, train_label = make_dataset(train_feature, tr_set['signal'], window_size)
  

train_feature = torch.FloatTensor(train_feature).to(device)
train_label  = torch.LongTensor(train_label).to(device)

train_set = TensorDataset(train_feature,train_label)


#데이터로더 사용
BATCH_SIZE = 32
final_dataloader = DataLoader(train_set , batch_size=BATCH_SIZE,shuffle=False)




        
    

   

In [24]:
#모델 저장 경로
path = f"/home/jbj4278/ETH_data/1DCNNLSTM_final_model.pt"
    

In [25]:
model = Conv1dlstm().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
criterion = nn.CrossEntropyLoss()
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100, eta_min=0,verbose=True)
total_accu = None

Adjusting learning rate of group 0 to 1.0000e-04.


In [26]:
import time

def train(dataloader):
 
    model.train()
    total_acc, total_count = 0, 0
    start_time = time.time()

    
    loss_list = []
    for idx, (feature, label) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(feature.to(device))
        loss = criterion(predicted_label, label.to(device))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        
        total_count += label.size(0)
        loss_list.append(loss.item())
        
        if idx+1 % len(feature) == 0 and idx > 0:
            accuracy = total_acc/total_count
            loss_hist = np.mean(loss_list)
            elapsed = time.time() - start_time
            print(f'| epoch {epoch} | {idx}/{len(dataloader)} batches '
                f'| accuracy {accuracy} | loss {loss_hist}')
            total_acc, total_count = 0, 0
            start_time = time.time()

      
    return  total_acc/total_count,np.mean(loss_list)
    
            

def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0
    loss_list = []
    pred_label = []
    pred_prob = []
    score,precision,recall = 0, 0, 0
    with torch.no_grad():
        for idx, (feature, label) in enumerate(dataloader):
            feature, label = feature.to(device),label.to(device)
            predicted_label = model(feature.to(device))
            loss = criterion(predicted_label, label.to(device))
            
            
            sm = nn.Softmax(dim=1)
            probabilities = sm(predicted_label)
            prob_arr = (probabilities.detach().cpu().numpy())[0]
            pred_prob.append(prob_arr)
            
            
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
            loss_list.append(loss.item())
            pred_label.append(predicted_label.argmax(1))
            
            
            score += f1_score(predicted_label.argmax(1).to(device), label,num_classes=3)
            precision += precision_recall(predicted_label.argmax(1).to(device), label, average='macro', num_classes=3)[0]
            recall += precision_recall(predicted_label.argmax(1).to(device), label, average='macro', num_classes=3)[1]
      
        
        score = score / len(loss_list)
        precision = precision / len(loss_list)
        recall = recall / len(loss_list)
            
    
    
    return total_acc/total_count, np.mean(loss_list),score,pred_label,pred_prob


            
   

## 모델 학습

In [27]:
n_epochs = 100
early_stopping = EarlyStopping(path,patience = 3, verbose = True)


for epoch in range(1, n_epochs + 1):
    # 현재 learning rate 출력
    print(optimizer.param_groups[0]['lr'])
    epoch_start_time = time.time()
    accu_train, train_loss = train(final_dataloader)
    accu_test,test_loss,score,label,prob = evaluate(test_dataloader)
    if total_accu is not None and total_accu > accu_test:
        scheduler.step()
    else:
        total_accu = accu_test
    print('-' * 59)
    print(f'| end of epoch {epoch} | time: {time.time()- epoch_start_time}s | '
        f'valid accuracy {accu_test} | valid loss {test_loss}'
        ) 

    print('-' * 59)

    early_stopping(test_loss, model)
    if early_stopping.early_stop:
        print("Early stopping")
        break
print(score)
torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),


                }, path)
    

0.0001
-----------------------------------------------------------
| end of epoch 1 | time: 42.807490825653076s | valid accuracy 0.35171589714176466 | valid loss 1.0996810310112957
-----------------------------------------------------------
Validation loss decreased (inf --> 1.099681).  Saving model ...
0.0001
-----------------------------------------------------------
| end of epoch 2 | time: 43.30708122253418s | valid accuracy 0.35171589714176466 | valid loss 1.0996076868579292
-----------------------------------------------------------
Validation loss decreased (1.099681 --> 1.099608).  Saving model ...
0.0001
-----------------------------------------------------------
| end of epoch 3 | time: 44.17605471611023s | valid accuracy 0.35171589714176466 | valid loss 1.0994519645650087
-----------------------------------------------------------
Validation loss decreased (1.099608 --> 1.099452).  Saving model ...
0.0001
-----------------------------------------------------------
| end of e

In [28]:
wandb.init(project='result', entity='gyeongmoCho',name='1dcsnn+lstm')
wandb.log({'final_f1_score':0.3534})

[34m[1mwandb[0m: Currently logged in as: [33mgyeongmocho[0m. Use [1m`wandb login --relogin`[0m to force relogin


### 예측 라벨 붙여주기

In [36]:
pred_label = list(map(list, label))

answer = sum(pred_label, [])

final_label = []
for i in answer:
    final_label.append(i.tolist())

In [None]:
a = a.tolist()

#### voting을 위한 확률값 생성

In [38]:
soft, hard = make_result(model,test_dataloader)

In [39]:
soft = pd.DataFrame(soft)

In [35]:
soft

Unnamed: 0,0,1,2
0,0.336474,0.330568,0.332958
1,0.336474,0.330568,0.332958
2,0.336474,0.330568,0.332958
3,0.336474,0.330568,0.332958
4,0.336474,0.330568,0.332958
...,...,...,...
104605,0.337196,0.327918,0.334886
104606,0.337196,0.327918,0.334886
104607,0.337196,0.327918,0.334886
104608,0.336627,0.329800,0.333573


In [None]:
"""
final_labels = pd.DataFrame(final_label,columns = ['pred_labels'])

test_set = test_set[:-10]

index = final_labels.index

test_set.index = index

plus_label_df = test_set.join(final_labels['pred_labels'],how='right') 

plus_label_df = plus_label_df.join(soft,how='right') 

"""


In [None]:

#plus_label_df.to_csv('final_1DCNNLSTM_label.csv')

In [None]:
#plus_label_df['pred_labels'].value_counts()
