In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import json
import numpy as np
import random
import pandas as pd

from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import ParameterGrid
from google.colab import drive

# Mount Google Drive at /content/drive so the CSV paths below resolve.
drive.mount('/content/drive')

# Seed every random number generator used in this notebook for reproducibility.
RANDOM_SEED = 18
for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, np.random.seed, random.seed):
    seed_fn(RANDOM_SEED)

# cuDNN: deterministic mode on, benchmark mode off.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# DataLoader seeding (not used yet) -- kept below as an inert string literal.
'''
generator = torch.Generator()
generator.manual_seed(RANDOM_SEED)

def worker_init_fn(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

# DataLoader(worker_init_fn=worker_init_fn)
'''

# Locations of the input CSV files on the mounted drive.
train_csv_path = "/content/drive/MyDrive/train.csv"
test_csv_path = "/content/drive/MyDrive/test.csv"
submission_csv_path = '/content/drive/MyDrive/sample_submission.csv'

Mounted at /content/drive


In [2]:
from sklearn.preprocessing import StandardScaler

In [3]:
# MLP - 은닉층 5개
'''
Grid Search 하이퍼파라미터 범위

[Epoch 100 기준]
param_grid = {
    'hidden_sizes': [[64, 128, 128, 64, 32], [128, 256, 256, 128, 64], [32, 64, 64, 32, 16]],
    'learning_rate': [0.01, 0.001],
    'batch_size': [32, 64, 128],
}
'''
# 현재 최고성능 모델 하이퍼파라미터(GS) : {"batch_size": 32, "hidden_sizes": [32, 64, 64, 32, 16], "learning_rate": 0.001}
# Epoch : 93
# Top 10% threshold: 92.9191
# 제출 성능 : 0.736
class MLP5Hidden(nn.Module):
    """Fully connected network with five hidden layers and a linear output.

    Every hidden ``nn.Linear`` layer is followed by the same ``nn.PReLU``
    module instance, so all five activations share that module's learnable
    parameter. The output layer (``fc6``) has no activation.

    Args:
        input_size: Number of input features per sample.
        hidden_sizes: Widths of the five hidden layers, in order. Must
            contain exactly five entries.
        output_size: Number of values produced per sample.

    Raises:
        ValueError: If ``hidden_sizes`` does not contain exactly five entries.
    """

    NUM_HIDDEN_LAYERS = 5

    def __init__(self, input_size=11, hidden_sizes=(32, 64, 64, 32, 16), output_size=1):
        super().__init__()

        # Accept any sequence; the default is a tuple so that no mutable
        # object is shared between calls.
        hidden_sizes = list(hidden_sizes)
        if len(hidden_sizes) != self.NUM_HIDDEN_LAYERS:
            raise ValueError(
                f"hidden_sizes must contain exactly {self.NUM_HIDDEN_LAYERS} "
                f"entries, got {len(hidden_sizes)}: {hidden_sizes}"
            )

        # widths[i] -> widths[i + 1] is the shape of layer fc(i + 1).
        widths = [input_size, *hidden_sizes, output_size]

        # Attribute names and creation order are kept (fc1..fc6, then prelu)
        # so state_dict keys and seeded initialisation stay the same.
        self.fc1 = nn.Linear(widths[0], widths[1])
        self.fc2 = nn.Linear(widths[1], widths[2])
        self.fc3 = nn.Linear(widths[2], widths[3])
        self.fc4 = nn.Linear(widths[3], widths[4])
        self.fc5 = nn.Linear(widths[4], widths[5])
        self.fc6 = nn.Linear(widths[5], widths[6])
        self.prelu = nn.PReLU()

    def forward(self, x):
        """Apply the five PReLU-activated hidden layers, then the linear output layer."""
        for hidden_layer in (self.fc1, self.fc2, self.fc3, self.fc4, self.fc5):
            x = self.prelu(hidden_layer(x))
        return self.fc6(x)

In [4]:
# Training table: the first column is skipped, the last column is the target,
# and every column in between is a feature.
train_data = pd.read_csv(train_csv_path)
y = train_data.iloc[:, -1]
X = train_data.iloc[:, 1:-1]

# Test table: the first column is skipped, all remaining columns are features.
test_csv_path = "/content/drive/MyDrive/test.csv"
X_test = pd.read_csv(test_csv_path).iloc[:, 1:]

# Train features stacked on top of test features, as a plain NumPy array.
X_concat = pd.concat([X, X_test]).values

In [5]:
# Summary statistics of the training feature columns (displayed as the cell output).
X.describe()

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,x_10
count,40118.0,40118.0,40118.0,40118.0,40118.0,40118.0,40118.0,40118.0,40118.0,40118.0,40118.0
mean,1.047842,-2.204742,1.201854,0.868693,-0.277682,-1.775907,0.488336,-0.168241,0.653337,0.288575,0.078679
std,0.040025,0.309915,0.036752,0.048606,0.039428,0.043249,0.015579,0.037058,0.03768,0.042907,0.023325
min,0.907816,-2.548461,1.056123,0.741617,-0.581258,-1.817216,0.463784,-0.231485,0.574932,0.222945,-0.060792
25%,1.028898,-2.42727,1.176222,0.83269,-0.285265,-1.805732,0.474365,-0.196759,0.622813,0.252706,0.074605
50%,1.054757,-2.330303,1.212876,0.871334,-0.265732,-1.79016,0.4881,-0.174313,0.663973,0.281562,0.086375
75%,1.076332,-2.061368,1.231058,0.908141,-0.255766,-1.76042,0.500362,-0.14561,0.688103,0.317799,0.093821
max,1.112987,-0.971386,1.253528,0.964632,-0.24313,-1.545138,0.585086,-0.055259,0.700005,0.428381,0.097734


In [6]:
# 1. Feature scaling.
# X_concat holds the training rows followed by the test rows, and the scaler
# is fit on all of them at once.
# NOTE(review): this means the test rows contribute to the scaling statistics.
# Fitting on the training rows only is the usual practice; behavior is left
# unchanged here so results stay comparable with earlier runs -- confirm intent.
n_train = len(y)  # number of training rows (replaces the hard-coded 40118)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_concat)
assert X_scaled.shape[0] >= n_train, (
    "X_concat has fewer rows than the training target; "
    "expected training rows followed by test rows"
)
X = X_scaled[:n_train]
X_test = X_scaled[n_train:]

# # 2. Outlier handling (drop rows using a z-score filter) -- disabled experiment
# z_scores = np.abs(stats.zscore(X_scaled))
# X_clean = X_scaled[(z_scores < 3).all(axis=1)]
# y_clean = y[(z_scores < 3).all(axis=1)]

# 3. Target transform (log transform) for the outlier-filtered variant -- disabled
# y_log = np.log1p(y_clean)

# # Convert to tensors
# X_tensor = torch.tensor(X_clean, dtype=torch.float32)
# y_tensor = torch.tensor(y_log, dtype=torch.float32).view(-1, 1)

#--------------------------------------------
# 3. Target transform: y_log = log(1 + y)
y_log = np.log1p(y)

# Convert to tensors. The target goes through np.asarray so the tensor is
# built from the raw values instead of from the pandas Series object.
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(np.asarray(y_log), dtype=torch.float32).view(-1, 1)
#---------------------------------------------

# Train/validation split (80% / 20%), seeded for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X_tensor, y_tensor, test_size=0.2, random_state=RANDOM_SEED
)

train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)

In [7]:
# best_hidden_sizes = best_params['hidden_sizes']
# best_learning_rate = best_params['learning_rate']
# best_batch_size = best_params['batch_size']

# Hyperparameters for this run. best_hidden_sizes equals the MLP5Hidden
# defaults, which is the architecture the model was previously built with
# implicitly (the old 3-element value was never passed to the model).
best_hidden_sizes = [32, 64, 64, 32, 16]
best_learning_rate = 0.001
best_batch_size = 32
NUM_EPOCHS = 100

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = MLP5Hidden(hidden_sizes=best_hidden_sizes).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=best_learning_rate)

train_loader = DataLoader(train_dataset, batch_size=best_batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=best_batch_size, shuffle=False)


def exp_space_loss(pred_log, target_log):
    """Return ``criterion`` applied to ``exp(pred_log)`` and ``exp(target_log)``.

    The targets were transformed with log1p, so exp() yields y + 1 rather
    than y. The same +1 offset is learned on the prediction side, which is why
    predictions are later mapped back with expm1.
    """
    return criterion(torch.exp(pred_log), torch.exp(target_log))


train_losses = []
val_losses = []

for epoch in range(NUM_EPOCHS):
    # ---- training pass ----
    model.train()
    train_loss = 0.0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        loss = exp_space_loss(model(batch_X), batch_y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    train_loss = train_loss / len(train_loader)
    train_losses.append(train_loss)

    # ---- validation pass (no gradient tracking) ----
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            val_loss += exp_space_loss(model(batch_X), batch_y).item()

    val_loss = val_loss / len(val_loader)
    val_losses.append(val_loss)

    # The denominator now reports the real epoch count (it used to print 300).
    print(f'Epoch {epoch+1}/{NUM_EPOCHS}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')

Epoch 1/300, Train Loss: 646.9441, Validation Loss: 65.1769
Epoch 2/300, Train Loss: 37.4009, Validation Loss: 19.5240
Epoch 3/300, Train Loss: 14.9734, Validation Loss: 14.7700
Epoch 4/300, Train Loss: 8.8953, Validation Loss: 6.7168
Epoch 5/300, Train Loss: 7.2210, Validation Loss: 5.6555
Epoch 6/300, Train Loss: 6.9486, Validation Loss: 8.1389
Epoch 7/300, Train Loss: 6.5112, Validation Loss: 4.2881
Epoch 8/300, Train Loss: 5.7391, Validation Loss: 4.7958
Epoch 9/300, Train Loss: 5.5621, Validation Loss: 7.0027
Epoch 10/300, Train Loss: 5.1179, Validation Loss: 4.0894
Epoch 11/300, Train Loss: 5.1298, Validation Loss: 12.4453
Epoch 12/300, Train Loss: 4.5654, Validation Loss: 3.5951
Epoch 13/300, Train Loss: 4.6760, Validation Loss: 3.7381
Epoch 14/300, Train Loss: 4.9164, Validation Loss: 4.8272
Epoch 15/300, Train Loss: 5.0142, Validation Loss: 5.9233
Epoch 16/300, Train Loss: 4.4659, Validation Loss: 5.8500
Epoch 17/300, Train Loss: 4.4374, Validation Loss: 4.2540
Epoch 18/300, T

In [8]:
import os

# X_test already holds the scaled test features at this point.
X_test_scaled = X_test
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)

test_dataset = TensorDataset(X_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=best_batch_size, shuffle=False)

# Run inference on whichever device the model's parameters live on, instead
# of assuming a GPU is present.
inference_device = next(model.parameters()).device

model.eval()
test_pred = []
with torch.no_grad():
    for batch in test_loader:
        batch_X = batch[0].to(inference_device)
        outputs = model(batch_X)
        test_pred.extend(outputs.cpu().numpy())  # move predictions to the CPU and collect them

test_pred = np.array(test_pred).flatten()

# Undo the log1p target transform to get predictions on the original scale.
test_pred_original_scale = np.expm1(test_pred)

# Threshold for the top 10% of predictions (90th percentile) and the matching mask.
threshold = np.percentile(test_pred_original_scale, 90)
top_10_percent_mask = test_pred_original_scale >= threshold

# Load the sample submission file and overwrite its 'y' column with the predictions.
submission_df = pd.read_csv(submission_csv_path)
if len(submission_df) != len(test_pred_original_scale):
    raise ValueError(
        f"sample submission has {len(submission_df)} rows but "
        f"{len(test_pred_original_scale)} predictions were produced"
    )
submission_df['y'] = test_pred_original_scale

# Save the submission as a CSV file at the given path.
submission_df.to_csv("/content/drive/MyDrive/new_MLP5.csv", index=False)

In [9]:
# Summary statistics of the submission frame, whose 'y' column holds the predictions.
submission_df.describe()

Unnamed: 0,y
count,4986.0
mean,83.983414
std,3.305408
min,80.719887
25%,81.61619
50%,82.360104
75%,85.36348
max,91.791939
