# 요구사항 1

In [108]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split


class TitanicDataset(Dataset):
  def __init__(self, X, y):
    self.X = torch.FloatTensor(X)
    self.y = torch.LongTensor(y)

  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    feature = self.X[idx]
    target = self.y[idx]
    return {'input': feature, 'target': target}

  def __str__(self):
    str = "Data Size: {0}, Input Shape: {1}, Target Shape: {2}".format(
      len(self.X), self.X.shape, self.y.shape
    )
    return str

In [109]:
class TitanicTestDataset(Dataset):
  def __init__(self, X):
    self.X = torch.FloatTensor(X)

  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    feature = self.X[idx]
    return {'input': feature}

  def __str__(self):
    str = "Data Size: {0}, Input Shape: {1}".format(
      len(self.X), self.X.shape
    )
    return str

In [110]:
def get_preprocessed_dataset_1(all_df):
    # Pclass별 Fare 평균값을 사용하여 Fare 결측치 메우기
    Fare_mean = all_df[["Pclass", "Fare"]].groupby("Pclass").mean().reset_index()
    Fare_mean.columns = ["Pclass", "Fare_mean"]
    all_df = pd.merge(all_df, Fare_mean, on="Pclass", how="left")
    all_df.loc[(all_df["Fare"].isnull()), "Fare"] = all_df["Fare_mean"]

    return all_df

In [111]:
def get_preprocessed_dataset_2(all_df):
    # name을 세 개의 컬럼으로 분리하여 다시 all_df에 합침
    name_df = all_df["Name"].str.split("[,.]", n=2, expand=True)
    name_df.columns = ["family_name", "honorific", "name"]
    name_df["family_name"] = name_df["family_name"].str.strip()
    name_df["honorific"] = name_df["honorific"].str.strip()
    name_df["name"] = name_df["name"].str.strip()
    all_df = pd.concat([all_df, name_df], axis=1)

    return all_df

In [112]:
def get_preprocessed_dataset_3(all_df):
    # honorific별 Age 평균값을 사용하여 Age 결측치 메우기
    honorific_age_mean = all_df[["honorific", "Age"]].groupby("honorific").median().round().reset_index()
    honorific_age_mean.columns = ["honorific", "honorific_age_mean", ]
    all_df = pd.merge(all_df, honorific_age_mean, on="honorific", how="left")
    all_df.loc[(all_df["Age"].isnull()), "Age"] = all_df["honorific_age_mean"]
    all_df = all_df.drop(["honorific_age_mean"], axis=1)

    return all_df

In [113]:
def get_preprocessed_dataset_4(all_df):
    all_df["family_num"] = all_df["Parch"] + all_df["SibSp"]
    all_df.loc[all_df["family_num"] == 0, "alone"] = 1
    all_df["alone"].fillna(0, inplace=True)
    all_df = all_df.drop(["PassengerId", "Name", "family_name", "name", "Ticket", "Cabin"], axis=1)
    
    return all_df

In [114]:
def get_preprocessed_dataset_5(all_df):
    all_df.loc[
    ~(
            (all_df["honorific"] == "Mr") |
            (all_df["honorific"] == "Miss") |
            (all_df["honorific"] == "Mrs") |
            (all_df["honorific"] == "Master")
    ),
    "honorific"
    ] = "other"
    all_df["Embarked"].fillna("missing", inplace=True)
    
    return all_df

In [115]:
def get_preprocessed_dataset_6(all_df):
    category_features = all_df.columns[all_df.dtypes == "object"]
    from sklearn.preprocessing import LabelEncoder
    for category_feature in category_features:
        le = LabelEncoder()
        if all_df[category_feature].dtypes == "object":
          le = le.fit(all_df[category_feature])
          all_df[category_feature] = le.transform(all_df[category_feature])

    return all_df

In [116]:
def get_preprocessed_dataset():
    CURRENT_FILE_PATH = os.path.abspath(os.getcwd()) # ipynb 에서 절대 경로를 얻기 위해서 사용

    train_data_path = os.path.join(CURRENT_FILE_PATH, "train.csv")
    test_data_path = os.path.join(CURRENT_FILE_PATH, "test.csv")

    train_df = pd.read_csv(train_data_path)
    test_df = pd.read_csv(test_data_path)

    # 데이터 전처리 과정
    all_df = pd.concat([train_df, test_df], sort=False)
    all_df = get_preprocessed_dataset_1(all_df)
    all_df = get_preprocessed_dataset_2(all_df)
    all_df = get_preprocessed_dataset_3(all_df)
    all_df = get_preprocessed_dataset_4(all_df)
    all_df = get_preprocessed_dataset_5(all_df)
    all_df = get_preprocessed_dataset_6(all_df)

    train_X = all_df[~all_df["Survived"].isnull()].drop("Survived", axis=1).reset_index(drop=True)
    train_y = train_df["Survived"]

    test_X = all_df[all_df["Survived"].isnull()].drop("Survived", axis=1).reset_index(drop=True)

    # 데이터셋 초기화
    dataset = TitanicDataset(train_X.values, train_y.values)
    train_dataset, validation_dataset = random_split(dataset, [0.8, 0.2])
    test_dataset = TitanicTestDataset(test_X.values)

    return train_dataset, validation_dataset, test_dataset

In [117]:
from torch import nn


class MyModel(nn.Module):
  def __init__(self, n_input, n_output):
    super().__init__()

    self.model = nn.Sequential(
      nn.Linear(n_input, 30),
      nn.GELU(),
      nn.Linear(30, 48),
      nn.GELU(),
      nn.Linear(48, 64),
      nn.GELU(),
      nn.Linear(64, 48),
      nn.GELU(),
      nn.Linear(48, 30),
      nn.GELU(),
      nn.Linear(30, n_output),
    )

  def forward(self, x):
    x = self.model(x)
    return x

네트워크 구성을 변경했습니다.

In [118]:
def test(test_data_loader):
  print("[TEST]")
  batch = next(iter(test_data_loader))
  print("{0}".format(batch['input'].shape))
  my_model = MyModel(n_input=11, n_output=2)
  output_batch = my_model(batch['input'])
  prediction_batch = torch.argmax(output_batch, dim=1)
  for idx, prediction in enumerate(prediction_batch, start=892):
      print(idx, prediction.item())

테스트 데이터의 ID는 892부터 시작합니다.

In [119]:
if __name__ == "__main__":
  train_dataset, validation_dataset, test_dataset = get_preprocessed_dataset()

  train_data_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
  validation_data_loader = DataLoader(dataset=validation_dataset, batch_size=16, shuffle=True)
  test_data_loader = DataLoader(dataset=test_dataset, batch_size=len(test_dataset))

# 요구사항 2 - 활성화 함수

Wandb 접속

In [120]:
import wandb
wandb.login()

True

소스 코드

In [129]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from datetime import datetime
import wandb
import os
import sys
from pathlib import Path

BASE_PATH = str(Path(os.getcwd()).resolve().parent.parent.parent)
sys.path.append(BASE_PATH)

def training_loop(model, optimizer, train_data_loader, validation_data_loader):
  n_epochs = wandb.config.epochs
  loss_fn = nn.CrossEntropyLoss() # MSELoss → CrossEntropyLoss
  next_print_epoch = 100

  for epoch in range(1, n_epochs + 1):
    loss_train = 0.0
    num_trains = 0
    for train_batch in train_data_loader:
      output_train = model(train_batch['input'])
      loss = loss_fn(output_train, train_batch['target'])
      loss_train += loss.item()
      num_trains += 1

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

    loss_validation = 0.0
    num_validations = 0
    with torch.no_grad():
      for validation_batch in validation_data_loader:
        output_validation = model(validation_batch['input'])
        loss = loss_fn(output_validation, validation_batch['target'])
        loss_validation += loss.item()
        num_validations += 1

    wandb.log({
      "Epoch": epoch,
      "Training loss": loss_train / num_trains,
      "Validation loss": loss_validation / num_validations
    })

    if epoch >= next_print_epoch:
      print(
        f"Epoch {epoch}, "
        f"Training loss {loss_train / num_trains:.4f}, "
        f"Validation loss {loss_validation / num_validations:.4f}"
      )
      next_print_epoch += 100

def main():

  wandb.init(
    mode="online",
    project="my_model_training",
    notes="My first wandb experiment",
    tags=["my_model"],
    name=datetime.now().astimezone().strftime('%Y-%m-%d_%H-%M-%S'),
    config={
      'epochs': 1000,
      'batch_size': 32, # 512 → 32
      'learning_rate': 1e-3,
      'n_hidden_unit_list': [20, 20],
    }
  )

  train_dataset, validation_dataset, test_dataset = get_preprocessed_dataset()

  train_data_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
  validation_data_loader = DataLoader(dataset=validation_dataset, batch_size=16, shuffle=True)
  test_data_loader = DataLoader(dataset=test_dataset, batch_size=len(test_dataset))

  linear_model = MyModel(n_input=11, n_output=2)
  optimizer = optim.Adam(linear_model.parameters(), lr=wandb.config.learning_rate)

  wandb.watch(linear_model)
  
  training_loop(
    model=linear_model,
    optimizer=optimizer,
    train_data_loader=train_data_loader,
    validation_data_loader=validation_data_loader
  )
  wandb.finish()

  # 요구사항 3
  linear_model.eval()

  batch = next(iter(test_data_loader))
  output_batch = linear_model(batch['input'])
  prediction_batch = torch.argmax(output_batch, dim=1)

  predictions = []

  for idx, prediction in enumerate(prediction_batch, start=892):
      predictions.append((idx, prediction.item()))

  df = pd.DataFrame(predictions, columns=['PassengerId', 'Survived'])
  df.to_csv('submission.csv', index=False)

  print('Finished.')

if __name__ == "__main__":
  main()



Epoch 100, Training loss 0.3373, Validation loss 0.4442
Epoch 200, Training loss 0.3416, Validation loss 0.4802
Epoch 300, Training loss 0.2765, Validation loss 0.6975
Epoch 400, Training loss 0.2617, Validation loss 0.6207
Epoch 500, Training loss 0.2327, Validation loss 1.0093
Epoch 600, Training loss 0.2252, Validation loss 0.6765
Epoch 700, Training loss 0.2794, Validation loss 0.5582
Epoch 800, Training loss 0.2224, Validation loss 0.6741
Epoch 900, Training loss 0.2302, Validation loss 0.8024
Epoch 1000, Training loss 0.1832, Validation loss 0.7594


0,1
Epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
Training loss,█▆▅▅▄▄▄▄▄▄▅▃▃▃▃▃▃▂▂▂▂▂▃▃▂▂▃▂▁▂▁▁▁▁▁▁▂▁▂▂
Validation loss,▂▁▂▂▁▁▂▃▂▁▁▃▂▃▄▃▃▆▄▂▆▄▃▄▄▃▆▇▅▄█▅▄▄▇▄▄▄▄▅

0,1
Epoch,1000.0
Training loss,0.18323
Validation loss,0.75937


Finished.


<b>MSELoss 에서 CrossEntropyLoss 로 변경</b><br>
분류 문제에 더 적합한 손실 함수로 변경했습니다.

<h3>Wandb Graph</h3>

https://wandb.ai/tkdwns1610/my_model_training/reports/Training-loss-23-10-15-22-55-25---Vmlldzo1Njc3ODMw<br>
https://wandb.ai/tkdwns1610/my_model_training/reports/Validation-loss-23-10-15-22-55-16---Vmlldzo1Njc3ODI5<br>

# 요구사항 3 - 테스트 진행

# 요구사항 4 - 제출 및 등수확인

# 숙제 후기

딥 러닝으로 무언가를 예측할 수 있다는 점이 흥미롭습니다.<br>
Training Loss가 감소하는 데 반해, Validation Loss가 증가하는 과적합 문제를 겪었습니다.