<a href="https://colab.research.google.com/github/Batwan01/2024-Challenge/blob/main/best_model/MLP_best_pt/MLP_drop2%2C6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import json
import numpy as np
import random
import pandas as pd
import torch.nn.init as init

from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import ParameterGrid
from google.colab import drive

# Mount Google Drive so the CSV files stored under MyDrive can be read.
drive.mount('/content/drive')

# Data paths
train_csv_path = "/content/drive/MyDrive/Colab Notebooks/contest/samsung/train.csv"
test_csv_path = "/content/drive/MyDrive/Colab Notebooks/contest/samsung/test.csv"
submission_csv_path = '/content/drive/MyDrive/Colab Notebooks/contest/samsung/sample_submission.csv'
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

# Stack train and test so the same feature columns are dropped from both.
com = pd.concat([train, test])
com = com.drop(['x_2', 'x_6'], axis=1)

# Split back at the train/test boundary. The boundary is taken from the
# loaded frame rather than a hard-coded row count, so the cell keeps working
# if train.csv changes size.
n_train = len(train)
train_data = com[:n_train]
X_test = com[n_train:]
# Remove the target column and the first remaining column
# (presumably the ID column -- confirm against the CSV header).
X_test = X_test.drop('y', axis=1).iloc[:, 1:]

print(X_test.shape)

# 잔차 연결
class ResidualMLP(nn.Module):
    """Six-layer MLP with two projected skip connections.

    Layout: fc1 -> [fc2 -> fc3] -> [fc4 -> fc5] -> fc6, each linear layer
    followed by its own PReLU. Each bracketed pair is bypassed by a linear
    projection (``residual1``, ``residual2``) whose output is added to the
    pair's output.

    Args:
        input_size: Number of input features.
        hidden_sizes: Widths of the five hidden layers; entries 0-4 are read.
        output_size: Number of output units.
    """

    # The default is a tuple rather than a list so the default object cannot
    # be mutated and shared between instances.
    def __init__(self, input_size=9, hidden_sizes=(32, 64, 64, 32, 16), output_size=1):
        super().__init__()

        # Six linear layers, each paired with its own PReLU activation.
        self.fc1 = nn.Linear(input_size, hidden_sizes[0])
        self.prelu1 = nn.PReLU()

        self.fc2 = nn.Linear(hidden_sizes[0], hidden_sizes[1])
        self.prelu2 = nn.PReLU()

        self.fc3 = nn.Linear(hidden_sizes[1], hidden_sizes[2])
        self.prelu3 = nn.PReLU()

        self.fc4 = nn.Linear(hidden_sizes[2], hidden_sizes[3])
        self.prelu4 = nn.PReLU()

        self.fc5 = nn.Linear(hidden_sizes[3], hidden_sizes[4])
        self.prelu5 = nn.PReLU()

        self.fc6 = nn.Linear(hidden_sizes[4], output_size)
        self.prelu6 = nn.PReLU()

        # Linear projections that resize the skip path so it can be added to
        # the main path.
        self.residual1 = nn.Linear(hidden_sizes[0], hidden_sizes[2])  # fc1 output -> fc3 width
        self.residual2 = nn.Linear(hidden_sizes[2], hidden_sizes[4])  # fc3 output -> fc5 width

    def forward(self, x):
        """Run the network on ``x`` and return the fc6/PReLU output."""
        out = self.prelu1(self.fc1(x))

        # First residual stage: fc2 -> fc3 plus the projected fc1 activation.
        residual = self.residual1(out)
        out = self.prelu2(self.fc2(out))
        out = self.prelu3(self.fc3(out))
        out = out + residual

        # Second residual stage: fc4 -> fc5 plus the projected stage-one output.
        residual = self.residual2(out)
        out = self.prelu4(self.fc4(out))
        out = self.prelu5(self.fc5(out))
        out = out + residual

        # The output layer is also passed through a PReLU.
        out = self.prelu6(self.fc6(out))

        return out


Mounted at /content/drive
(4986, 9)


In [None]:
import os

# 수정된 모델 저장 함수
class ModelSaver:
    """Keep the lowest-loss checkpoints of a model on disk.

    Per-epoch checkpoints are written to ``result_path``; only the ``top_k``
    with the lowest loss are kept. The single best state dict is also written
    to ``best_model.pt``, with its epoch and loss recorded in
    ``best_epoch.txt``.

    Args:
        model: Module whose ``state_dict()`` is saved.
        result_path: Directory for the checkpoints (created on demand).
        top_k: Number of per-epoch checkpoints to keep (default 3).

    Raises:
        ValueError: If ``top_k`` is negative.
    """

    def __init__(self, model, result_path, top_k=3):
        if top_k < 0:
            raise ValueError(f"top_k must be non-negative, got {top_k}")
        self.model = model
        self.result_path = result_path
        self.top_k = top_k
        self.best_models = []  # sorted (loss, epoch, path) tuples, best first
        self.lowest_loss = float('inf')

    def save_model(self, epoch, loss):
        """Record this epoch's loss and save the model if it ranks high enough.

        Args:
            epoch: Epoch identifier, used in the checkpoint file name.
            loss: Loss used for ranking (lower is better).
        """
        os.makedirs(self.result_path, exist_ok=True)

        current_model_path = os.path.join(self.result_path, f'model_epoch_{epoch}_loss_{loss:.4f}.pt')

        # Rank the candidate before touching the disk, so a checkpoint that
        # would be evicted immediately is never written.
        candidate = (loss, epoch, current_model_path)
        self.best_models.append(candidate)
        self.best_models.sort()
        evicted = None
        if len(self.best_models) > self.top_k:
            evicted = self.best_models.pop(-1)  # highest loss

        if evicted is not candidate:
            torch.save(self.model.state_dict(), current_model_path)
            if evicted is not None:
                try:
                    os.remove(evicted[2])
                except FileNotFoundError:
                    # Already gone (e.g. deleted manually); nothing to clean up.
                    pass

        # Track the overall best model and record which epoch produced it.
        if loss < self.lowest_loss:
            self.lowest_loss = loss
            best_model_path = os.path.join(self.result_path, 'best_model.pt')
            torch.save(self.model.state_dict(), best_model_path)
            with open(os.path.join(self.result_path, 'best_epoch.txt'), 'w', encoding='utf-8') as f:
                f.write(f"Best Epoch: {epoch}\nBest Loss: {loss:.4f}")
            print(f"Save {epoch} epoch result. Loss = {loss:.4f}")


In [None]:

# Split the training data into training and validation sets, then train the
# model while checkpointing on validation loss.

X = torch.tensor(train_data.drop(['ID','y'],axis=1).values, dtype=torch.float32)
y = torch.tensor(train_data['y'].values, dtype = torch.float32).view(-1,1)

# NOTE: no random_state is passed, so the split differs from run to run.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)

# One epoch count drives both the loop and the progress message, so the log
# can no longer report a different total than the loop runs.
num_epochs = 70

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = ResidualMLP()
model.to(device)
criterion = nn.MSELoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

train_losses = []
val_losses = []

# Create the checkpoint helper before the loop so it tracks the whole run.
model_saver = ModelSaver(model, result_path='/content/best')

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Mean of the per-batch losses.
    train_loss = train_loss / len(train_loader)
    train_losses.append(train_loss)

    # ---- validation pass (no gradients) ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            val_loss += loss.item()

    val_loss = val_loss / len(val_loader)
    val_losses.append(val_loss)

    # Checkpoint on validation loss. The saver receives the 0-based epoch
    # index, while the progress line below is 1-based.
    model_saver.save_model(epoch, val_loss)

    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')

Save 0 epoch result. Loss = 5.2522
Epoch 1/300, Train Loss: 289.0839, Validation Loss: 5.2522
Save 1 epoch result. Loss = 3.4962
Epoch 2/300, Train Loss: 3.3445, Validation Loss: 3.4962
Epoch 3/300, Train Loss: 3.2179, Validation Loss: 3.5184
Epoch 4/300, Train Loss: 3.1924, Validation Loss: 3.7847
Save 4 epoch result. Loss = 3.3492
Epoch 5/300, Train Loss: 3.2495, Validation Loss: 3.3492
Epoch 6/300, Train Loss: 3.1790, Validation Loss: 5.6740
Epoch 7/300, Train Loss: 3.1878, Validation Loss: 3.6514
Epoch 8/300, Train Loss: 3.1978, Validation Loss: 3.4060
Epoch 9/300, Train Loss: 3.2200, Validation Loss: 6.3504
Save 9 epoch result. Loss = 3.3089
Epoch 10/300, Train Loss: 3.2109, Validation Loss: 3.3089
Epoch 11/300, Train Loss: 3.1405, Validation Loss: 3.7713
Epoch 12/300, Train Loss: 3.1571, Validation Loss: 3.5064
Epoch 13/300, Train Loss: 3.1864, Validation Loss: 3.9364
Epoch 14/300, Train Loss: 3.1732, Validation Loss: 3.5808
Epoch 15/300, Train Loss: 3.0556, Validation Loss: 3.46

In [None]:
# Predict on the test features with the weights currently held by `model`.
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)

test_dataset = TensorDataset(X_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Send batches to the device the model lives on instead of assuming CUDA.
device = next(model.parameters()).device

model.eval()
test_pred = []
with torch.no_grad():
    for batch in test_loader:
        batch_X = batch[0].to(device)
        outputs = model(batch_X)
        test_pred.extend(outputs.cpu().numpy())

# Collapse the per-row predictions into one flat array.
test_pred = np.array(test_pred).flatten()
submission_df = pd.read_csv(submission_csv_path)
submission_df['y'] = test_pred  # store the predictions in the y column

# Write the submission as a CSV file at the given path.
submission_df.to_csv("/content/result_1.csv", index=False)
submission_df.describe()

Unnamed: 0,y
count,4986.0
mean,84.550102
std,3.568842
min,81.101212
25%,81.970192
50%,82.842384
75%,85.874701
max,94.032776


In [10]:

def find_non_matching_ids(file1, file2):
    """Compare the top-10% IDs (ranked by ``y``) of two prediction CSVs.

    Both files need ``ID`` and ``y`` columns. The top-10% size is computed
    from the row count of ``file1`` and applied to both files.

    Args:
        file1: Path of the reference CSV.
        file2: Path of the CSV compared against it.

    Returns:
        Tuple of (top-10% size, number of IDs in file1's top 10% that are
        missing from file2's top 10%, list of those IDs).
    """
    frames = [pd.read_csv(path) for path in (file1, file2)]
    ranked_first, ranked_second = (
        frame.sort_values(by='y', ascending=False) for frame in frames
    )

    top_10_percent_count = int(len(ranked_first) * 0.1)
    leaders_first = set(ranked_first.head(top_10_percent_count)['ID'])
    leaders_second = set(ranked_second.head(top_10_percent_count)['ID'])

    # IDs ranked in the top slice of file1 but not of file2.
    non_matching_ids = leaders_first.difference(leaders_second)
    num_non_matching = len(non_matching_ids)

    print(f"file1의 상위 10% 항목 개수: {top_10_percent_count}")
    print(f"file1의 상위 10% 중 file2에 없는 항목 개수: {num_non_matching}")
    print(f"file1의 상위 10% 중 file2에 없는 항목 ID: {non_matching_ids}")

    return top_10_percent_count, num_non_matching, list(non_matching_ids)

# Earlier submissions (file1, file2) are each compared against file3.
file1 = '/content/drive/MyDrive/Colab Notebooks/contest/samsung/MLP5_PReLU(0.752).csv'
file2 = '/content/drive/MyDrive/Colab Notebooks/contest/samsung/MLP_Residual_Connection_18.csv'
file3 = '/content/drive/MyDrive/Colab Notebooks/contest/samsung/result/MLP_Residual_Connection_1.csv'

# The result variables are overwritten on each pass, so they end up holding
# the file2-vs-file3 comparison.
for baseline in (file1, file2):
    top_10_percent_count, num_non_matching, non_matching_ids = find_non_matching_ids(baseline, file3)


file1의 상위 10% 항목 개수: 498
file1의 상위 10% 중 file2에 없는 항목 개수: 19
file1의 상위 10% 중 file2에 없는 항목 ID: {'TEST_2000', 'TEST_0784', 'TEST_2632', 'TEST_0347', 'TEST_1706', 'TEST_2505', 'TEST_3482', 'TEST_4828', 'TEST_0665', 'TEST_4051', 'TEST_4705', 'TEST_2499', 'TEST_4472', 'TEST_4881', 'TEST_0813', 'TEST_1978', 'TEST_2682', 'TEST_4971', 'TEST_2426'}
file1의 상위 10% 항목 개수: 498
file1의 상위 10% 중 file2에 없는 항목 개수: 17
file1의 상위 10% 중 file2에 없는 항목 ID: {'TEST_4828', 'TEST_0665', 'TEST_0347', 'TEST_3482', 'TEST_2000', 'TEST_0784', 'TEST_4881', 'TEST_1706', 'TEST_0813', 'TEST_2632', 'TEST_2505', 'TEST_1978', 'TEST_2981', 'TEST_2499', 'TEST_2682', 'TEST_4971', 'TEST_4982'}


In [11]:
# Load the best-epoch checkpoint into the model.
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state dict needs, and avoids the torch.load warning about
# the implicit default.
model.load_state_dict(
    torch.load(
        os.path.join('/content/best', "best_model.pt"),
        map_location='cpu',
        weights_only=True,
    )
)

X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)

test_dataset = TensorDataset(X_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Send batches to the device the model lives on instead of assuming CUDA.
device = next(model.parameters()).device

model.eval()
test_pred = []
with torch.no_grad():
    for batch in test_loader:
        batch_X = batch[0].to(device)
        outputs = model(batch_X)
        test_pred.extend(outputs.cpu().numpy())

# Collapse the per-row predictions into one flat array.
test_pred = np.array(test_pred).flatten()
submission_df = pd.read_csv(submission_csv_path)
submission_df['y'] = test_pred  # store the predictions in the y column

# Write the submission as a CSV file at the given path.
submission_df.to_csv("/content/result_2.csv", index=False)
submission_df.describe()

  torch.load(


Unnamed: 0,y
count,4986.0
mean,84.887337
std,3.604769
min,81.355537
25%,82.247709
50%,83.185173
75%,86.307728
max,94.504547


In [12]:

def find_non_matching_ids(file1, file2):
    """Report which of file1's top-10% IDs are absent from file2's top 10%.

    Rows are ranked by the ``y`` column in descending order. The size of the
    top slice is 10% of file1's row count (truncated) and is used for both
    files.

    Args:
        file1: Path of the reference CSV (needs ``ID`` and ``y`` columns).
        file2: Path of the CSV to compare with (same columns).

    Returns:
        ``(top_count, missing_count, missing_ids)`` where ``missing_ids`` is
        a list of IDs found only in file1's top slice.
    """
    def _top_ids(frame, how_many):
        # IDs of the `how_many` rows with the largest y values.
        ordered = frame.sort_values(by='y', ascending=False)
        return set(ordered.head(how_many)['ID'])

    first_frame = pd.read_csv(file1)
    second_frame = pd.read_csv(file2)

    top_10_percent_count = int(len(first_frame) * 0.1)
    first_top = _top_ids(first_frame, top_10_percent_count)
    second_top = _top_ids(second_frame, top_10_percent_count)

    non_matching_ids = first_top - second_top
    num_non_matching = len(non_matching_ids)

    print(f"file1의 상위 10% 항목 개수: {top_10_percent_count}")
    print(f"file1의 상위 10% 중 file2에 없는 항목 개수: {num_non_matching}")
    print(f"file1의 상위 10% 중 file2에 없는 항목 ID: {non_matching_ids}")

    return top_10_percent_count, num_non_matching, list(non_matching_ids)

# Compare the submission written from the in-memory model (result_1) with the
# one written from the best checkpoint (result_2).
file1 = '/content/result_1.csv'
file2 = '/content/result_2.csv'

comparison = find_non_matching_ids(file1, file2)
top_10_percent_count, num_non_matching, non_matching_ids = comparison

file1의 상위 10% 항목 개수: 498
file1의 상위 10% 중 file2에 없는 항목 개수: 8
file1의 상위 10% 중 file2에 없는 항목 ID: {'TEST_0635', 'TEST_4011', 'TEST_3205', 'TEST_4080', 'TEST_0212', 'TEST_4092', 'TEST_1792', 'TEST_1321'}
