In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import json
import numpy as np
import random
import pandas as pd

from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import ParameterGrid
from google.colab import drive

drive.mount('/content/drive')

# 재현 가능성을 위한 시드 고정
RANDOM_SEED = 18
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Dataloader 시드 고정 (아직 사용 x)

generator = torch.Generator()
generator.manual_seed(RANDOM_SEED)

def worker_init_fn(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

# DataLoader(worker_init_fn=worker_init_fn)

# 기타 경로 설정 : 저장되는 데이터 이름 등 (model, method 에 맞게 설정)

Mounted at /content/drive


In [13]:
# 데이터 경로 설정
train_csv_path = "/content/drive/MyDrive/Colab Notebooks/data/samsung/train.csv"
test_csv_path = "/content/drive/MyDrive/Colab Notebooks/data/samsung/test.csv"
submission_csv_path = '/content/drive/MyDrive/Colab Notebooks/data/samsung/sample_submission.csv'

In [14]:
# MLP - 은닉층 5개
# 현재 최고성능 모델 하이퍼파라미터(GS) : {"batch_size": 32, "hidden_sizes": [32, 64, 64, 32, 16], "learning_rate": 0.001}
# Epoch : 93
# 제출 성능 : 0.752
class MLP5Hidden(nn.Module):
    def __init__(self, input_size=11, hidden_sizes=[32, 64, 64, 32, 16], output_size=1):
        super(MLP5Hidden, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_sizes[0])
        self.fc2 = nn.Linear(hidden_sizes[0], hidden_sizes[1])
        self.fc3 = nn.Linear(hidden_sizes[1], hidden_sizes[2])
        self.fc4 = nn.Linear(hidden_sizes[2], hidden_sizes[3])
        self.fc5 = nn.Linear(hidden_sizes[3], hidden_sizes[4])
        self.fc6 = nn.Linear(hidden_sizes[4], output_size)
        self.gelu = nn.GELU()

    def forward(self, x):
        x = self.gelu(self.fc1(x))
        x = self.gelu(self.fc2(x))
        x = self.gelu(self.fc3(x))
        x = self.gelu(self.fc4(x))
        x = self.gelu(self.fc5(x))
        x = self.fc6(x)
        return x


In [15]:
# 훈련 데이터를 훈련 + 검증 데이터로 나누고 train_dataset, val_dataset 을 반환하는 함수:
train_data = pd.read_csv(train_csv_path)

X = torch.tensor(train_data.iloc[:,1:-1].values, dtype=torch.float32)
y = torch.tensor(train_data.iloc[:,-1].values, dtype = torch.float32).view(-1,1)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)

In [16]:
batch_size = 32
learning_rate = 0.001

model = MLP5Hidden()
model.cuda()
criterion = nn.MSELoss().cuda()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, worker_init_fn=worker_init_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

train_losses = []
val_losses = []

# 검증 손실이 정체되면 학습률을 줄임
from torch.optim.lr_scheduler import ReduceLROnPlateau
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)

# early stop
best_val_loss = float('inf')
patience = 10
trigger_times = 0

# epochs
epochs = 100
for epoch in range(epochs):
    model.train()
    train_loss = 0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.cuda(), batch_y.cuda()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    train_loss = train_loss / len(train_loader)
    train_losses.append(train_loss)


    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            batch_X, batch_y = batch_X.cuda(), batch_y.cuda()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            val_loss += loss.item()

    val_loss = val_loss / len(val_loader)
    val_losses.append(val_loss)

    print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        trigger_times = 0
    else:
        trigger_times += 1
        if trigger_times >= patience:
            print(f'Early stopping at epoch {epoch+1}')
            break
    scheduler.step(val_loss)



Epoch 1/100, Train Loss: 428.9682, Validation Loss: 3.7752
Epoch 2/100, Train Loss: 3.0757, Validation Loss: 3.3679
Epoch 3/100, Train Loss: 3.0631, Validation Loss: 3.3075
Epoch 4/100, Train Loss: 3.0340, Validation Loss: 3.5641
Epoch 5/100, Train Loss: 3.1164, Validation Loss: 3.4901
Epoch 6/100, Train Loss: 3.0115, Validation Loss: 3.2508
Epoch 7/100, Train Loss: 3.1138, Validation Loss: 3.2514
Epoch 8/100, Train Loss: 3.0986, Validation Loss: 3.3028
Epoch 9/100, Train Loss: 3.1243, Validation Loss: 3.3738
Epoch 10/100, Train Loss: 3.0424, Validation Loss: 3.2671
Epoch 11/100, Train Loss: 3.0716, Validation Loss: 3.2757
Epoch 12/100, Train Loss: 3.0756, Validation Loss: 3.4557
Epoch 13/100, Train Loss: 2.7399, Validation Loss: 3.2249
Epoch 14/100, Train Loss: 2.7564, Validation Loss: 3.2268
Epoch 15/100, Train Loss: 2.7557, Validation Loss: 3.2179
Epoch 16/100, Train Loss: 2.7585, Validation Loss: 3.3859
Epoch 17/100, Train Loss: 2.7630, Validation Loss: 3.2276
Epoch 18/100, Train L

In [17]:
X_test = pd.read_csv(test_csv_path).iloc[:,1:]
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)

test_dataset = TensorDataset(X_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
model.eval()
test_pred = []
with torch.no_grad():
    for batch in test_loader:
        batch_X = batch[0].cuda()
        outputs = model(batch_X)
        test_pred.extend(outputs.cpu().numpy())  # 예측값을 CPU로 이동하여 리스트에 추가

test_pred = np.array(test_pred).flatten()
submission_df = pd.read_csv(submission_csv_path)
submission_df['y'] = test_pred  # 예측 결과를 y 컬럼에 추가

# 지정된 경로에 CSV 파일로 저장
submission_df.to_csv("/content/drive/MyDrive/Colab Notebooks/data/samsung/result/new_MLP5_PReLU_2.csv", index=False)

torch.save(model.state_dict(), '모델명_특징_param.pth')

In [18]:
submission_df.describe()

Unnamed: 0,y
count,4986.0
mean,84.878448
std,3.583705
min,81.422081
25%,82.344269
50%,83.10778
75%,86.340916
max,94.309654


In [25]:
print("Train Losses:", train_losses)
print("Validation Losses:", val_losses)

test_pred = []
with torch.no_grad():
    for batch in test_loader:
        batch_X = batch[0].cuda()
        outputs = model(batch_X)
        test_pred.extend(outputs.cpu().numpy())

last_pred = np.array(test_pred)
threshold = np.percentile(last_pred, 90)
print(f"Top 10% threshold: {threshold:.4f}")

Train Losses: [428.9681717792038, 3.0756759805194402, 3.063133762044422, 3.033994324185912, 3.1164079599104757, 3.0114553653350025, 3.1137757225264817, 3.098626083771467, 3.1243357249532835, 3.0424248382434294, 3.071552916870992, 3.0755677622550746, 2.739932391721967, 2.7563902703025644, 2.7556620231891795, 2.7584865616777483, 2.7630392131753125, 2.7675823100423766, 2.7516456684942616, 2.7745518760452956, 2.7578393233381027, 2.7117900780642614, 2.7108591698102673, 2.7114720206674288, 2.7106124694539924, 2.7123499002794205, 2.7120348069508555, 2.7111706632084527, 2.7120095062945206, 2.712374596629043, 2.7138194327696725, 2.7108148552722495, 2.7143921774263275, 2.711206669940549, 2.7096855750350155, 2.710398358457229, 2.710128556754986, 2.7116711248429204, 2.709723783989371, 2.705670871679947, 2.70580231246064, 2.7058029167673525, 2.705850609991391, 2.705887478465216, 2.705802839869635, 2.705891272541533, 2.705481084251214, 2.705210870783683, 2.7051790491533896, 2.7050978095676466, 2.705

In [20]:
import pandas as pd

def find_non_matching_ids(file1, file2):
    # 두 개의 CSV 파일을 읽어옴
    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)

    # y 값 기준으로 내림차순 정렬
    df1_sorted = df1.sort_values(by='y', ascending=False)
    df2_sorted = df2.sort_values(by='y', ascending=False)

    # file1의 상위 10% 항목 계산
    top_10_percent_count = int(len(df1_sorted) * 0.1)
    top_10_percent_ids_df1 = set(df1_sorted.head(top_10_percent_count)['ID'])

    # file2의 상위 10% ID 추출
    top_10_percent_ids_df2 = set(df2_sorted.head(top_10_percent_count)['ID'])

    # file1의 상위 10% 중 file2의 상위 10%에 없는 ID 계산
    non_matching_ids = top_10_percent_ids_df1 - top_10_percent_ids_df2
    num_non_matching = len(non_matching_ids)

    # 결과 출력
    print(f"file1의 상위 10% ID 개수: {top_10_percent_count}")
    print(f"일치하는 ID 개수: {top_10_percent_count-num_non_matching}")
    print(f"file1의 상위 10% 중 file2에 없는 ID 개수: {num_non_matching}")
    print(f"file1의 상위 10% 중 file2에 없는 ID: {non_matching_ids}")

    return top_10_percent_count, num_non_matching, list(non_matching_ids)


In [21]:
ids = []

In [22]:
# 현재 best 모델 (기준 모델)
'''
MLP5_PReLU
Method : 활성화 함수 ReLU 사용

epoch = 93
lr = 0.001
batch = 32

hidden_size = [32, 64, 64, 32, 16]
'''

best_csv = "/content/drive/MyDrive/Colab Notebooks/data/samsung/MLP5_PReLU.csv"

In [23]:
# 기준 모델
'''
MLP5_744
Method : 활성화 함수 ReLU 사용

epoch = 93
lr = 0.001
batch = 32

hidden_size = [32, 64, 128, 64, 32]
'''
mlp5_744 = "/content/drive/MyDrive/Colab Notebooks/data/samsung/result/new_MLP5_PReLU_2.csv"
_, _, mlp5_744_id = find_non_matching_ids(best_csv, mlp5_744)
ids.append(mlp5_744_id)

file1의 상위 10% ID 개수: 498
일치하는 ID 개수: 473
file1의 상위 10% 중 file2에 없는 ID 개수: 25
file1의 상위 10% 중 file2에 없는 ID: {'TEST_0784', 'TEST_0493', 'TEST_1309', 'TEST_2538', 'TEST_1178', 'TEST_2632', 'TEST_4707', 'TEST_1362', 'TEST_4949', 'TEST_0103', 'TEST_0935', 'TEST_2275', 'TEST_4578', 'TEST_1792', 'TEST_1574', 'TEST_1502', 'TEST_2426', 'TEST_4876', 'TEST_4881', 'TEST_3536', 'TEST_0898', 'TEST_4511', 'TEST_3811', 'TEST_0037', 'TEST_0691'}


In [24]:

ids

[['TEST_0784',
  'TEST_0493',
  'TEST_1309',
  'TEST_2538',
  'TEST_1178',
  'TEST_2632',
  'TEST_4707',
  'TEST_1362',
  'TEST_4949',
  'TEST_0103',
  'TEST_0935',
  'TEST_2275',
  'TEST_4578',
  'TEST_1792',
  'TEST_1574',
  'TEST_1502',
  'TEST_2426',
  'TEST_4876',
  'TEST_4881',
  'TEST_3536',
  'TEST_0898',
  'TEST_4511',
  'TEST_3811',
  'TEST_0037',
  'TEST_0691']]