In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import json
import numpy as np
import random
import pandas as pd

from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import ParameterGrid
from google.colab import drive

drive.mount('/content/drive')

# 재현 가능성을 위한 시드 고정
RANDOM_SEED = 36
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Dataloader 시드 고정 (아직 사용 x)
'''
generator = torch.Generator()
generator.manual_seed(RANDOM_SEED)

def worker_init_fn(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

# DataLoader(worker_init_fn=worker_init_fn)
'''

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


'\ngenerator = torch.Generator()\ngenerator.manual_seed(RANDOM_SEED)\n\ndef worker_init_fn(worker_id):\n    worker_seed = torch.initial_seed() % 2**32\n    np.random.seed(worker_seed)\n    random.seed(worker_seed)\n\n# DataLoader(worker_init_fn=worker_init_fn)\n'

In [18]:
# 데이터 경로 설정
train_csv_path = "/content/drive/MyDrive/Colab Notebooks/data/samsung/train.csv"
test_csv_path = "/content/drive/MyDrive/Colab Notebooks/data/samsung/test.csv"
submission_csv_path = '/content/drive/MyDrive/Colab Notebooks/data/samsung/sample_submission.csv'

# 기타 경로 설정 : 저장되는 데이터 이름 등 (model, method 에 맞게 설정)

In [19]:
# MLP - 은닉층 5개
# 현재 최고성능 모델 하이퍼파라미터(GS) : {"batch_size": 32, "hidden_sizes": [32, 64, 64, 32, 16], "learning_rate": 0.001}
# Epoch : 93
# 제출 성능 : 0.752
class MLP5Hidden(nn.Module):
    def __init__(self, input_size=11, hidden_sizes=[32, 64, 64, 32, 16], output_size=1):
        super(MLP5Hidden, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_sizes[0])
        self.fc2 = nn.Linear(hidden_sizes[0], hidden_sizes[1])
        self.fc3 = nn.Linear(hidden_sizes[1], hidden_sizes[2])
        self.fc4 = nn.Linear(hidden_sizes[2], hidden_sizes[3])
        self.fc5 = nn.Linear(hidden_sizes[3], hidden_sizes[4])
        self.fc6 = nn.Linear(hidden_sizes[4], output_size)
        self.lrelu = nn.LeakyReLU()

    def forward(self, x):
        x = self.lrelu(self.fc1(x))
        x = self.lrelu(self.fc2(x))
        x = self.lrelu(self.fc3(x))
        x = self.lrelu(self.fc4(x))
        x = self.lrelu(self.fc5(x))
        x = self.fc6(x)
        return x

In [20]:
# 잔차 연결
class ResidualMLP(nn.Module):
    def __init__(self, input_size=11, hidden_sizes=[32, 64, 64, 32, 16], output_size=1):
        super(ResidualMLP, self).__init__()

        # 6개의 MLP 레이어 정의
        self.fc1 = nn.Linear(input_size, hidden_sizes[0])
        self.prelu1 = nn.PReLU()

        self.fc2 = nn.Linear(hidden_sizes[0], hidden_sizes[1])
        self.prelu2 = nn.PReLU()

        self.fc3 = nn.Linear(hidden_sizes[1], hidden_sizes[2])
        self.prelu3 = nn.PReLU()

        self.fc4 = nn.Linear(hidden_sizes[2], hidden_sizes[3])
        self.prelu4 = nn.PReLU()

        self.fc5 = nn.Linear(hidden_sizes[3], hidden_sizes[4])
        self.prelu5 = nn.PReLU()

        self.fc6 = nn.Linear(hidden_sizes[4], output_size)
        self.prelu6 = nn.PReLU()

        # 1x1 Linear 레이어로 Residual Connection의 크기 맞추기
        self.residual1 = nn.Linear(hidden_sizes[0], hidden_sizes[2])  # fc1 -> fc3
        self.residual2 = nn.Linear(hidden_sizes[2], hidden_sizes[4])  # fc3 -> fc5

    def forward(self, x):
        # 첫 번째 MLP 레이어
        out = self.prelu1(self.fc1(x))

        # 첫 번째 Residual 연결 (fc1 -> fc2 -> fc3)
        residual = self.residual1(out)  # 크기 맞추기
        out = self.prelu2(self.fc2(out))
        out = self.prelu3(self.fc3(out))
        out += residual  # 첫 번째 Residual Connection

        # 두 번째 Residual 연결 (fc3 -> fc4 -> fc5)
        residual = self.residual2(out)  # 크기 맞추기
        out = self.prelu4(self.fc4(out))
        out = self.prelu5(self.fc5(out))
        out += residual  # 두 번째 Residual Connection

        # 최종 출력 레이어 (fc6) 및 PReLU 적용
        out = self.prelu6(self.fc6(out))  # 최종 레이어에도 PReLU 적용

        return out

In [21]:
# 훈련 데이터를 훈련 + 검증 데이터로 나누고 train_dataset, val_dataset 을 반환하는 함수:
train_data = pd.read_csv(train_csv_path)

X = torch.tensor(train_data.iloc[:,1:-1].values, dtype=torch.float32)
y = torch.tensor(train_data.iloc[:,-1].values, dtype = torch.float32).view(-1,1)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)

In [22]:
# model = MLP5Hidden()
model = ResidualMLP()
model.cuda()
criterion = nn.MSELoss().cuda()
optimizer = optim.Adam(model.parameters(), lr=0.001)


generator = torch.Generator()
generator.manual_seed(RANDOM_SEED)

def worker_init_fn(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, worker_init_fn = worker_init_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, worker_init_fn = worker_init_fn)

train_losses = []
val_losses = []

for epoch in range(60):
    model.train()
    train_loss = 0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.cuda(), batch_y.cuda()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    train_loss = train_loss / len(train_loader)
    train_losses.append(train_loss)


    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            batch_X, batch_y = batch_X.cuda(), batch_y.cuda()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            val_loss += loss.item()

    val_loss = val_loss / len(val_loader)
    val_losses.append(val_loss)

    print(f'Epoch {epoch+1}/300, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')

Epoch 1/300, Train Loss: 251.8373, Validation Loss: 3.7005
Epoch 2/300, Train Loss: 3.7622, Validation Loss: 3.2366
Epoch 3/300, Train Loss: 3.4991, Validation Loss: 2.9415
Epoch 4/300, Train Loss: 3.4001, Validation Loss: 7.3093
Epoch 5/300, Train Loss: 3.4928, Validation Loss: 3.4298
Epoch 6/300, Train Loss: 3.5050, Validation Loss: 2.8929
Epoch 7/300, Train Loss: 3.4367, Validation Loss: 3.8403
Epoch 8/300, Train Loss: 3.4361, Validation Loss: 3.3910
Epoch 9/300, Train Loss: 3.3806, Validation Loss: 2.8130
Epoch 10/300, Train Loss: 3.3967, Validation Loss: 3.6004
Epoch 11/300, Train Loss: 3.4466, Validation Loss: 3.0652
Epoch 12/300, Train Loss: 3.3466, Validation Loss: 2.7684
Epoch 13/300, Train Loss: 3.2517, Validation Loss: 2.6838
Epoch 14/300, Train Loss: 3.2848, Validation Loss: 3.0916
Epoch 15/300, Train Loss: 3.4036, Validation Loss: 3.0973
Epoch 16/300, Train Loss: 3.2418, Validation Loss: 2.8394
Epoch 17/300, Train Loss: 3.2738, Validation Loss: 4.0514
Epoch 18/300, Train L

In [23]:
X_test = pd.read_csv(test_csv_path).iloc[:,1:]
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)

test_dataset = TensorDataset(X_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
model.eval()
test_pred = []
with torch.no_grad():
    for batch in test_loader:
        batch_X = batch[0].cuda()
        outputs = model(batch_X)
        test_pred.extend(outputs.cpu().numpy())  # 예측값을 CPU로 이동하여 리스트에 추가

test_pred = np.array(test_pred).flatten()
submission_df = pd.read_csv(submission_csv_path)
submission_df['y'] = test_pred  # 예측 결과를 y 컬럼에 추가

# 지정된 경로에 CSV 파일로 저장
submission_df.to_csv("/content/drive/MyDrive/Colab Notebooks/data/samsung/result/MLP_Residual_Connection_36.csv", index=False)

In [24]:
submission_df.describe()

Unnamed: 0,y
count,4986.0
mean,84.804443
std,3.416158
min,81.383751
25%,82.308115
50%,83.127857
75%,86.356033
max,93.401047


In [29]:
import pandas as pd

def find_non_matching_ids(file1, file2):
    # 두 개의 CSV 파일을 읽어옴
    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)

    # y 값 기준으로 내림차순 정렬
    df1_sorted = df1.sort_values(by='y', ascending=False)
    df2_sorted = df2.sort_values(by='y', ascending=False)

    # file1의 상위 10% 항목 계산
    top_10_percent_count = int(len(df1_sorted) * 0.1)
    top_10_percent_ids_df1 = set(df1_sorted.head(top_10_percent_count)['ID'])

    # file2의 상위 10% ID 추출
    top_10_percent_ids_df2 = set(df2_sorted.head(top_10_percent_count)['ID'])

    # file1의 상위 10% 중 file2의 상위 10%에 없는 ID 계산
    non_matching_ids = top_10_percent_ids_df1 - top_10_percent_ids_df2
    num_non_matching = len(non_matching_ids)

    # 결과 출력
    print(f"file1의 상위 10% 항목 개수: {top_10_percent_count}")
    print(f"file1의 상위 10% 중 file2에 없는 항목 개수: {num_non_matching}")
    print(f"file1의 상위 10% 중 file2에 없는 항목 ID: {non_matching_ids}")

    return top_10_percent_count, num_non_matching, list(non_matching_ids)

# 사용 예시
file1 ='/content/drive/MyDrive/Colab Notebooks/data/samsung/MLP5_PReLU(0.752).csv' # best 성능 파일
file2 = '/content/drive/MyDrive/Colab Notebooks/data/samsung/result/MLP_Residual_Connection_36.csv' # 측정하고자 하는 파일
top_10_percent_count, num_non_matching, non_matching_ids = find_non_matching_ids(file1, file2)


file1의 상위 10% 항목 개수: 498
file1의 상위 10% 중 file2에 없는 항목 개수: 8
file1의 상위 10% 중 file2에 없는 항목 ID: {'TEST_1178', 'TEST_2426', 'TEST_4051', 'TEST_1502', 'TEST_1592', 'TEST_3172', 'TEST_1309', 'TEST_1478'}
