<a href="https://colab.research.google.com/github/Batwan01/2024-Challenge/blob/main/history/24-9-11/claude.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive so files stored there can be accessed by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
# MLP with five hidden layers and two residual (skip) connections
class MLP5Residual(nn.Module):
    """Fully connected network: input projection, five hidden layers, output head.

    Two skip connections are used: the activated input projection is added
    to the pre-activation output of ``fc3``, and the result of that first
    residual stage is added to the pre-activation output of ``fc5``.

    Args:
        input_size: Number of features per input row.
        hidden_size: Width shared by every hidden layer.
        output_size: Number of values produced per input row.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Layers are registered in the order fc_in, fc1..fc5, fc_out so that
        # state_dict keys and seeded initialisation stay the same.
        self.fc_in = nn.Linear(input_size, hidden_size)
        for layer_idx in range(1, 6):
            setattr(self, f"fc{layer_idx}", nn.Linear(hidden_size, hidden_size))
        self.fc_out = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the network output for the batch ``x``."""
        act = self.relu

        # Input projection; also the skip input of the first residual stage.
        stem = act(self.fc_in(x))

        # First residual stage: fc1 -> fc2 -> fc3, skip added before the ReLU.
        hidden = act(self.fc1(stem))
        hidden = act(self.fc2(hidden))
        stage_one = act(self.fc3(hidden) + stem)

        # Second residual stage: fc4 -> fc5, skip added before the ReLU.
        hidden = act(self.fc4(stage_one))
        stage_two = act(self.fc5(hidden) + stage_one)

        # Linear head, no activation on the output.
        return self.fc_out(stage_two)

# Data loading and preprocessing
def load_and_preprocess_data(train_path, test_path):
    """Read the train/test CSV files and standardise the feature columns.

    The first column of both files is skipped (presumably an ID column;
    verify against the data). In the training file the last column is used
    as the target ``y`` and everything in between as features; in the test
    file every column after the first is a feature. The original note
    describes the features as ``x_0`` to ``x_10``.

    Args:
        train_path: Path of the training CSV.
        test_path: Path of the test CSV.

    Returns:
        Tuple ``(X_train_scaled, y_train, X_test_scaled)``. The scaler is
        fitted on the training features only and then applied to both sets;
        ``y_train`` is returned unscaled.
    """
    train_frame = pd.read_csv(train_path)
    test_frame = pd.read_csv(test_path)

    train_features = train_frame.iloc[:, 1:-1].values
    train_target = train_frame.iloc[:, -1].values
    test_features = test_frame.iloc[:, 1:].values

    # Fit on the training features only, then reuse those statistics for test.
    feature_scaler = StandardScaler().fit(train_features)
    train_features_scaled = feature_scaler.transform(train_features)
    test_features_scaled = feature_scaler.transform(test_features)

    return train_features_scaled, train_target, test_features_scaled

# Target scaling function (0 ~ 100 -> -0.5 ~ 0.5 with the defaults)
def scale_y(y, min_val=0, max_val=100, new_min=-0.5, new_max=0.5):
    """Linearly map ``y`` from ``[min_val, max_val]`` to ``[new_min, new_max]``.

    Values outside the source range are not clipped; they are mapped with
    the same linear formula.
    """
    # Position of y inside the source range (0 at min_val, 1 at max_val).
    fraction = (y - min_val) / (max_val - min_val)
    target_span = new_max - new_min
    return fraction * target_span + new_min

# Inverse target scaling function (-0.5 ~ 0.5 -> 0 ~ 100 with the defaults)
def inverse_scale_y(y_scaled, min_val=0, max_val=100, new_min=-0.5, new_max=0.5):
    """Linearly map ``y_scaled`` from ``[new_min, new_max]`` back to ``[min_val, max_val]``.

    Values outside the scaled range are not clipped; they are mapped with
    the same linear formula.
    """
    # Position of y_scaled inside the scaled range (0 at new_min, 1 at new_max).
    fraction = (y_scaled - new_min) / (new_max - new_min)
    original_span = max_val - min_val
    return fraction * original_span + min_val

# Model training function
def train_model(model, train_loader, criterion, optimizer, num_epochs, device):
    """Train ``model`` in place and report the mean batch loss of every epoch.

    Args:
        model: Module to optimise; it is moved to ``device`` in place.
        train_loader: Iterable yielding ``(batch_X, batch_y)`` tensor pairs.
            It does not need to define ``__len__``.
        criterion: Loss callable applied to ``(outputs, targets)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.
        device: Device on which the model and batches are placed.

    Returns:
        List with one entry per epoch: the mean of the per-batch losses.

    Raises:
        ValueError: If ``train_loader`` yields no batches in an epoch, since
            no average loss can be computed.
    """
    model.to(device)
    epoch_losses = []
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        num_batches = 0
        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            optimizer.zero_grad()
            outputs = model(batch_X)
            # Targets arrive as (batch,); add a trailing dim to match the
            # (batch, 1)-shaped model output expected by the criterion.
            loss = criterion(outputs, batch_y.unsqueeze(1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1

        # Count batches while iterating instead of calling len(train_loader):
        # this avoids a bare ZeroDivisionError on an empty loader and also
        # works for loaders that have no length.
        if num_batches == 0:
            raise ValueError("train_loader yielded no batches; cannot compute an average loss")

        avg_loss = total_loss / num_batches
        epoch_losses.append(avg_loss)
        print(f'Epoch [{epoch+1}/{num_epochs}], Average Loss: {avg_loss:.4f}')

    return epoch_losses

# Main execution: train the model, save it, and write the submission file
if __name__ == "__main__":
    import os  # local import: only this script section needs it

    # File paths
    train_path = '/content/drive/MyDrive/Colab Notebooks/contest/samsung/train.csv'
    test_path = '/content/drive/MyDrive/Colab Notebooks/contest/samsung/test.csv'
    submission_csv_path = '/content/drive/MyDrive/Colab Notebooks/contest/samsung/sample_submission.csv'
    results_dir = '/content/drive/MyDrive/Colab Notebooks/contest/samsung/results'
    model_path = os.path.join(results_dir, 'mlp5_residual_model.pth')
    prediction_path = os.path.join(results_dir, 'MLP_Residual_normalize.csv')

    # Create the output folder before training so that a missing directory
    # cannot make torch.save / to_csv fail after the full training run.
    os.makedirs(results_dir, exist_ok=True)

    # Hyperparameters
    input_size = 11  # x_0 to x_10
    hidden_size = 64
    output_size = 1
    batch_size = 32
    learning_rate = 0.001
    num_epochs = 100

    # Device selection: use the GPU when one is available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load and preprocess the data
    X_train, y_train, X_test = load_and_preprocess_data(train_path, test_path)

    # Scale the target with scale_y's default ranges
    y_train_scaled = scale_y(y_train)

    # Build the training dataset and loader
    train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32),
                                  torch.tensor(y_train_scaled, dtype=torch.float32))
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Initialise the model, loss function and optimizer
    model = MLP5Residual(input_size, hidden_size, output_size)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Train the model (train_model also moves it to `device` in place)
    train_model(model, train_loader, criterion, optimizer, num_epochs, device)

    print("Training completed!")

    # Save the trained weights
    torch.save(model.state_dict(), model_path)
    print("Model saved!")

    # Predict on the test data; X_test is already a NumPy array
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
    test_dataset = TensorDataset(X_test_tensor)
    # Reuse batch_size instead of a second hard-coded 32; shuffle stays off so
    # predictions keep the row order of the test file.
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    model.eval()
    test_pred = []
    with torch.no_grad():
        for batch in test_loader:
            batch_X = batch[0].to(device)
            outputs = model(batch_X)
            test_pred.extend(outputs.cpu().numpy())

    test_pred = np.array(test_pred).flatten()

    # Map the predictions back to the original target range
    test_pred_original = inverse_scale_y(test_pred)

    # Build the submission file from the sample submission template
    submission_df = pd.read_csv(submission_csv_path)
    submission_df['y'] = test_pred_original
    print(submission_df['y'])

    # Save the predictions as CSV in the results folder
    submission_df.to_csv(prediction_path, index=False)
    print("Predictions saved to 'MLP_Residual_normalize.csv'!")

Epoch [1/100], Average Loss: 0.0012
Epoch [2/100], Average Loss: 0.0004
Epoch [3/100], Average Loss: 0.0004
Epoch [4/100], Average Loss: 0.0003
Epoch [5/100], Average Loss: 0.0003
Epoch [6/100], Average Loss: 0.0003
Epoch [7/100], Average Loss: 0.0003
Epoch [8/100], Average Loss: 0.0003
Epoch [9/100], Average Loss: 0.0003
Epoch [10/100], Average Loss: 0.0003
Epoch [11/100], Average Loss: 0.0003
Epoch [12/100], Average Loss: 0.0003
Epoch [13/100], Average Loss: 0.0003
Epoch [14/100], Average Loss: 0.0003
Epoch [15/100], Average Loss: 0.0003
Epoch [16/100], Average Loss: 0.0003
Epoch [17/100], Average Loss: 0.0003
Epoch [18/100], Average Loss: 0.0003
Epoch [19/100], Average Loss: 0.0003
Epoch [20/100], Average Loss: 0.0003
Epoch [21/100], Average Loss: 0.0003
Epoch [22/100], Average Loss: 0.0003
Epoch [23/100], Average Loss: 0.0003
Epoch [24/100], Average Loss: 0.0003
Epoch [25/100], Average Loss: 0.0003
Epoch [26/100], Average Loss: 0.0003
Epoch [27/100], Average Loss: 0.0003
Epoch [28/

In [12]:
def find_non_matching_ids(file1, file2):
    """Report which top-10% IDs of ``file1`` are missing from the top of ``file2``.

    Both CSV files must contain ``ID`` and ``y`` columns. Rows are ranked by
    ``y`` in descending order. The cut-off size is 10% of the row count of
    ``file1`` (rounded down) and the same size is applied to ``file2``.

    Args:
        file1: Path of the reference CSV.
        file2: Path of the CSV being compared against the reference.

    Returns:
        Tuple ``(top_count, num_missing, missing_ids)`` where ``missing_ids``
        is a list of the IDs in ``file1``'s top slice that are absent from
        ``file2``'s top slice. The list is built from a set, so its order is
        not defined.
    """
    # Read both CSV files first, then rank each by y (highest first).
    reference_frame = pd.read_csv(file1)
    candidate_frame = pd.read_csv(file2)
    reference_ranked = reference_frame.sort_values(by='y', ascending=False)
    candidate_ranked = candidate_frame.sort_values(by='y', ascending=False)

    # Size of the top slice, derived from file1 only.
    top_count = int(len(reference_ranked) * 0.1)

    # IDs in the top slice of each ranking.
    reference_top_ids = set(reference_ranked['ID'].head(top_count))
    candidate_top_ids = set(candidate_ranked['ID'].head(top_count))

    # IDs that file1 ranks in its top slice but file2 does not.
    missing_ids = reference_top_ids.difference(candidate_top_ids)
    missing_total = len(missing_ids)

    # Print the summary.
    print(f"file1의 상위 10% 항목 개수: {top_count}")
    print(f"file1의 상위 10% 중 file2에 없는 항목 개수: {missing_total}")
    print(f"file1의 상위 10% 중 file2에 없는 항목 ID: {missing_ids}")

    return top_count, missing_total, list(missing_ids)

# Usage example: compare the new submission against two reference submissions
file1 = '/content/drive/MyDrive/Colab Notebooks/contest/samsung/MLP_Residual_Connection_drop_x2_x6(0.752).csv'  # best-performing file
file2 = '/content/drive/MyDrive/Colab Notebooks/contest/samsung/MLP_Residual_Connection_Mul_0.752.csv'
file3 = '/content/drive/MyDrive/Colab Notebooks/contest/samsung/results/MLP_Residual_normalize.csv'  # file to be evaluated
# Each reference is checked in turn; the variables keep the result of the last call.
for reference_file in (file1, file2):
    top_10_percent_count, num_non_matching, non_matching_ids = find_non_matching_ids(reference_file, file3)

file1의 상위 10% 항목 개수: 498
file1의 상위 10% 중 file2에 없는 항목 개수: 36
file1의 상위 10% 중 file2에 없는 항목 ID: {'TEST_0037', 'TEST_3536', 'TEST_4551', 'TEST_0108', 'TEST_0103', 'TEST_1309', 'TEST_4179', 'TEST_2220', 'TEST_4033', 'TEST_2577', 'TEST_4707', 'TEST_2613', 'TEST_0367', 'TEST_4070', 'TEST_3512', 'TEST_1909', 'TEST_2275', 'TEST_1178', 'TEST_4876', 'TEST_2538', 'TEST_1574', 'TEST_1502', 'TEST_0493', 'TEST_0935', 'TEST_4949', 'TEST_0898', 'TEST_2845', 'TEST_0635', 'TEST_4011', 'TEST_2609', 'TEST_4578', 'TEST_1362', 'TEST_0107', 'TEST_4221', 'TEST_3672', 'TEST_3042'}
file1의 상위 10% 항목 개수: 498
file1의 상위 10% 중 file2에 없는 항목 개수: 23
file1의 상위 10% 중 file2에 없는 항목 ID: {'TEST_4146', 'TEST_0108', 'TEST_0103', 'TEST_2220', 'TEST_4511', 'TEST_4033', 'TEST_2577', 'TEST_4707', 'TEST_2613', 'TEST_4070', 'TEST_4982', 'TEST_1909', 'TEST_4876', 'TEST_2538', 'TEST_2632', 'TEST_0493', 'TEST_4949', 'TEST_0635', 'TEST_4578', 'TEST_1362', 'TEST_0107', 'TEST_4221', 'TEST_3042'}
