In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import json
import numpy as np
import random
import pandas as pd

from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import ParameterGrid
from google.colab import drive

drive.mount('/content/drive')

# 재현 가능성을 위한 시드 고정
RANDOM_SEED = 18
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Dataloader 시드 고정 (아직 사용 x)

generator = torch.Generator()
generator.manual_seed(RANDOM_SEED)

def worker_init_fn(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

# DataLoader(worker_init_fn=worker_init_fn)

# 기타 경로 설정 : 저장되는 데이터 이름 등 (model, method 에 맞게 설정)

Mounted at /content/drive


In [13]:
# 데이터 경로 설정
train_csv_path = "/content/drive/MyDrive/Colab Notebooks/data/samsung/train.csv"
test_csv_path = "/content/drive/MyDrive/Colab Notebooks/data/samsung/test.csv"
submission_csv_path = '/content/drive/MyDrive/Colab Notebooks/data/samsung/sample_submission.csv'

In [27]:
# MLP - 은닉층 5개
# 현재 최고성능 모델 하이퍼파라미터(GS) : {"batch_size": 32, "hidden_sizes": [32, 64, 64, 32, 16], "learning_rate": 0.001}
# Epoch : 93
# 제출 성능 : 0.752

class MLP5Hidden(nn.Module):
    def __init__(self, input_size=11, hidden_sizes=[32, 64, 64, 32, 16], output_size=1):
        super(MLP5Hidden, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_sizes[0])
        self.fc2 = nn.Linear(hidden_sizes[0], hidden_sizes[1])
        self.fc3 = nn.Linear(hidden_sizes[1], hidden_sizes[2])
        self.fc4 = nn.Linear(hidden_sizes[2], hidden_sizes[3])
        self.fc5 = nn.Linear(hidden_sizes[3], hidden_sizes[4])
        self.fc6 = nn.Linear(hidden_sizes[4], output_size)
        self.elu = nn.ELU()

    def forward(self, x):
        x = self.elu(self.fc1(x))
        x = self.elu(self.fc2(x))
        x = self.elu(self.fc3(x))
        x = self.elu(self.fc4(x))
        x = self.elu(self.fc5(x))
        x = self.fc6(x)
        return x



In [28]:
# 훈련 데이터를 훈련 + 검증 데이터로 나누고 train_dataset, val_dataset 을 반환하는 함수:
train_data = pd.read_csv(train_csv_path)

X = torch.tensor(train_data.iloc[:,1:-1].values, dtype=torch.float32)
y = torch.tensor(train_data.iloc[:,-1].values, dtype = torch.float32).view(-1,1)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)

In [29]:
batch_size = 32
learning_rate = 0.001

model = MLP5Hidden()
model.cuda()
criterion = nn.MSELoss().cuda()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, worker_init_fn=worker_init_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

train_losses = []
val_losses = []

# 검증 손실이 정체되면 학습률을 줄임
from torch.optim.lr_scheduler import ReduceLROnPlateau
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)

# early stop
best_val_loss = float('inf')
patience = 10
trigger_times = 0

# epochs
epochs = 100
for epoch in range(epochs):
    model.train()
    train_loss = 0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.cuda(), batch_y.cuda()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    train_loss = train_loss / len(train_loader)
    train_losses.append(train_loss)


    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            batch_X, batch_y = batch_X.cuda(), batch_y.cuda()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            val_loss += loss.item()

    val_loss = val_loss / len(val_loader)
    val_losses.append(val_loss)

    print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        trigger_times = 0
    else:
        trigger_times += 1
        if trigger_times >= patience:
            print(f'Early stopping at epoch {epoch+1}')
            break
    scheduler.step(val_loss)



Epoch 1/100, Train Loss: 361.4540, Validation Loss: 7.5634
Epoch 2/100, Train Loss: 3.7519, Validation Loss: 3.6782
Epoch 3/100, Train Loss: 3.0281, Validation Loss: 4.2205
Epoch 4/100, Train Loss: 3.1266, Validation Loss: 3.5074
Epoch 5/100, Train Loss: 3.1257, Validation Loss: 3.3096
Epoch 6/100, Train Loss: 3.0783, Validation Loss: 3.7842
Epoch 7/100, Train Loss: 3.1585, Validation Loss: 3.3515
Epoch 8/100, Train Loss: 3.0985, Validation Loss: 3.2924
Epoch 9/100, Train Loss: 3.1034, Validation Loss: 3.7062
Epoch 10/100, Train Loss: 3.0107, Validation Loss: 3.3970
Epoch 11/100, Train Loss: 3.0850, Validation Loss: 3.2779
Epoch 12/100, Train Loss: 3.0313, Validation Loss: 3.4998
Epoch 13/100, Train Loss: 3.0427, Validation Loss: 3.6608
Epoch 14/100, Train Loss: 2.9959, Validation Loss: 3.3729
Epoch 15/100, Train Loss: 3.0291, Validation Loss: 3.2947
Epoch 16/100, Train Loss: 3.0688, Validation Loss: 3.2603
Epoch 17/100, Train Loss: 2.9813, Validation Loss: 3.2475
Epoch 18/100, Train L

In [47]:
X_test = pd.read_csv(test_csv_path).iloc[:,1:]
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)

test_dataset = TensorDataset(X_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
model.eval()
test_pred = []
with torch.no_grad():
    for batch in test_loader:
        batch_X = batch[0].cuda()
        outputs = model(batch_X)
        test_pred.extend(outputs.cpu().numpy())  # 예측값을 CPU로 이동하여 리스트에 추가

test_pred = np.array(test_pred).flatten()
submission_df = pd.read_csv(submission_csv_path)
submission_df['y'] = test_pred  # 예측 결과를 y 컬럼에 추가

# 지정된 경로에 CSV 파일로 저장
submission_df.to_csv("/content/drive/MyDrive/Colab Notebooks/data/samsung/result/new_MLP5_ELU_2.csv", index=False)

torch.save(model.state_dict(), '모델명_특징_param.pth')

In [48]:
submission_df.describe()

Unnamed: 0,y
count,4986.0
mean,84.904381
std,3.655252
min,81.48951
25%,82.289679
50%,83.118317
75%,86.316305
max,94.688248


In [49]:
print("Train Losses:", np.mean(train_losses))
print("Validation Losses:", np.mean(val_losses))

test_pred = []
with torch.no_grad():
    for batch in test_loader:
        batch_X = batch[0].cuda()
        outputs = model(batch_X)
        test_pred.extend(outputs.cpu().numpy())

last_pred = np.array(test_pred)
threshold = np.percentile(last_pred, 90)
print(f"Top 10% threshold: {threshold:.4f}")

Train Losses: 7.6264996292132325
Validation Losses: 3.387999682619594
Top 10% threshold: 91.6495


In [50]:
import pandas as pd

def find_non_matching_ids(file1, file2):
    # 두 개의 CSV 파일을 읽어옴
    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)

    # y 값 기준으로 내림차순 정렬
    df1_sorted = df1.sort_values(by='y', ascending=False)
    df2_sorted = df2.sort_values(by='y', ascending=False)

    # file1의 상위 10% 항목 계산
    top_10_percent_count = int(len(df1_sorted) * 0.1)
    top_10_percent_ids_df1 = set(df1_sorted.head(top_10_percent_count)['ID'])

    # file2의 상위 10% ID 추출
    top_10_percent_ids_df2 = set(df2_sorted.head(top_10_percent_count)['ID'])

    # file1의 상위 10% 중 file2의 상위 10%에 없는 ID 계산
    non_matching_ids = top_10_percent_ids_df1 - top_10_percent_ids_df2
    num_non_matching = len(non_matching_ids)

    # 결과 출력
    print(f"file1의 상위 10% ID 개수: {top_10_percent_count}")
    print(f"일치하는 ID 개수: {top_10_percent_count-num_non_matching}")
    print(f"file1의 상위 10% 중 file2에 없는 ID 개수: {num_non_matching}")
    print(f"file1의 상위 10% 중 file2에 없는 ID: {non_matching_ids}")

    return top_10_percent_count, num_non_matching, list(non_matching_ids)


In [51]:
ids = []

In [60]:
# 현재 best 모델 (기준 모델)
'''
MLP5_PReLU
Method : 활성화 함수 ReLU 사용

epoch = 93
lr = 0.001
batch = 32

hidden_size = [32, 64, 64, 32, 16]
'''

best_csv = "/content/drive/MyDrive/Colab Notebooks/data/samsung/MLP5_PReLU(0.752).csv"

In [61]:
# 기준 모델
'''
MLP5_744
Method : 활성화 함수 ReLU 사용

epoch = 93
lr = 0.001
batch = 32

hidden_size = [32, 64, 128, 64, 32]
'''
mlp5_744 = "/content/drive/MyDrive/Colab Notebooks/data/samsung/result/new_MLP5_ELU.csv"
_, _, mlp5_744_id = find_non_matching_ids(best_csv, mlp5_744)
ids.append(mlp5_744_id)

file1의 상위 10% ID 개수: 498
일치하는 ID 개수: 470
file1의 상위 10% 중 file2에 없는 ID 개수: 28
file1의 상위 10% 중 file2에 없는 ID: {'TEST_0784', 'TEST_0493', 'TEST_2613', 'TEST_1309', 'TEST_2577', 'TEST_2538', 'TEST_1178', 'TEST_2632', 'TEST_4707', 'TEST_1362', 'TEST_4949', 'TEST_0103', 'TEST_0935', 'TEST_1909', 'TEST_2275', 'TEST_4578', 'TEST_1792', 'TEST_1574', 'TEST_1502', 'TEST_2426', 'TEST_4876', 'TEST_3536', 'TEST_0898', 'TEST_4511', 'TEST_4221', 'TEST_0108', 'TEST_0037', 'TEST_0691'}


In [62]:

ids

[['TEST_0784',
  'TEST_0493',
  'TEST_2613',
  'TEST_1309',
  'TEST_2577',
  'TEST_2538',
  'TEST_1178',
  'TEST_2632',
  'TEST_4707',
  'TEST_1362',
  'TEST_4949',
  'TEST_0103',
  'TEST_0935',
  'TEST_1909',
  'TEST_2275',
  'TEST_4578',
  'TEST_1792',
  'TEST_1574',
  'TEST_1502',
  'TEST_2426',
  'TEST_4876',
  'TEST_3536',
  'TEST_0898',
  'TEST_4511',
  'TEST_4221',
  'TEST_0108',
  'TEST_0037',
  'TEST_0691']]