## 첫 번째 시도

- 현재 데이터 구조 = 1캐릭터당 여러 장비 행
- 1행 1캐릭터 구조로 바꾸는 것이 다소 어려워서, 머신러닝 대신 딥러닝 이용할 예정

In [None]:
# Load the item table with pandas' default NA parsing disabled, so the
# literal string 'None' stays a value instead of becoming a missing value.

import pandas as pd
df = pd.read_csv(
    '/content/drive/MyDrive/data/item_nomissingvalues_copy.csv',
    keep_default_na=False,
    na_values=[],
)

# Load the table that carries the '전투력' (combat power) column.
df_stat = pd.read_csv('/content/drive/MyDrive/data/merged/stat_merged.csv')

In [None]:
# Attach the '전투력' (combat power) column of df_stat to the item rows in df.

# 1. Keep only the join keys and the target, with exactly one row per
#    (nickname, subclass). Rows without a target are dropped first so a
#    valid value is preferred, then duplicates are removed on the KEYS
#    (not on the whole row). Otherwise a character that appears with two
#    different '전투력' values would survive de-duplication and the left
#    join below would multiply every item row of that character.
merge_keys = ['nickname', 'subclass']
df_stat_trimmed = (
    df_stat[merge_keys + ['전투력']]
    .dropna(subset=['전투력'])
    .drop_duplicates(subset=merge_keys, keep='first')
)

# 2. Left join: every item row is kept and gains the character's '전투력'.
df = df.merge(df_stat_trimmed, on=merge_keys, how='left')

# 3. Drop item rows whose character has no '전투력'.
df.dropna(subset=['전투력'], inplace=True)

In [None]:
# Print the column dtypes and non-null counts of the merged frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1059756 entries, 0 to 1063723
Data columns (total 38 columns):
 #   Column                               Non-Null Count    Dtype  
---  ------                               --------------    -----  
 0   nickname                             1059756 non-null  object 
 1   subclass                             1059756 non-null  object 
 2   equipment_slot                       1059756 non-null  object 
 3   item_name                            1059756 non-null  object 
 4   boss_damage_total                    1059756 non-null  int64  
 5   ignore_monster_armor_total           1059756 non-null  int64  
 6   all_stat_total                       1059756 non-null  int64  
 7   damage_total                         1059756 non-null  int64  
 8   potential_option_grade               1059756 non-null  object 
 9   additional_potential_option_grade    1059756 non-null  object 
 10  exceptional_upgrade                  1059756 non-null  bool   
 11  bos

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import torch

# Feature configuration: categorical and numeric input columns.
cat_cols = [
    'subclass', 'equipment_slot', 'main_stat_type',
    'item_group', 'starforce_scroll_flag',
    'potential_option_grade', 'additional_potential_option_grade',
    'main_pot_grade_summary', 'add_pot_grade_summary', 'potential_status',
]

num_cols = [
    'boss_damage_total', 'ignore_monster_armor_total', 'all_stat_total',
    'damage_total', 'boss_damage_add', 'damage_add', 'all_stat_add',
    'starforce', 'special_ring_level', 'bonus_stat_total',
    'mainstat_total', 'power_total', 'mainstat_add', 'power_add',
    'mainstat_etc', 'power_etc', 'mainstat_starforce', 'power_starforce',
]

# Label encoding: fit one encoder per categorical column, keep it in
# `encoders`, and replace the column with its integer codes.
encoders = {}
for col in cat_cols:
    encoder = LabelEncoder().fit(df[col])
    encoders[col] = encoder
    df[col] = encoder.transform(df[col])

# Standardise the numeric columns.
scaler = StandardScaler()
scaler.fit(df[num_cols])
df[num_cols] = scaler.transform(df[num_cols])

In [None]:
# Build one fixed-length item sequence per character.
feature_cols = cat_cols + num_cols
feature_dim = len(feature_cols)
max_len = 24  # maximum number of items kept per character

character_inputs = []
character_labels = []
attention_masks = []  # 1 for a real item row, 0 for padding

for name, group in df.groupby('nickname'):
    item_rows = group[feature_cols].values
    valid_len = len(item_rows)
    kept = min(valid_len, max_len)

    # Start from an all-zero (max_len, feature_dim) block and copy the first
    # `kept` item rows in: shorter characters are zero-padded, longer ones
    # are truncated to max_len.
    features = np.zeros((max_len, feature_dim))
    features[:kept] = item_rows[:max_len]
    mask = [1] * kept + [0] * (max_len - kept)

    character_inputs.append(torch.tensor(features, dtype=torch.float32))
    attention_masks.append(torch.tensor(mask, dtype=torch.float32))
    # The label is taken from the first row of the group.
    character_labels.append(group['전투력'].iloc[0])

In [None]:
# Stack the per-character lists into batch tensors:
# X is (B, T, D), mask is (B, T), y is (B,).
X, mask = (torch.stack(seq) for seq in (character_inputs, attention_masks))
y = torch.tensor(character_labels)

In [None]:
# 모델 정의
import torch.nn as nn

class DeepSetMasked(nn.Module):
    """Masked sum-pooling regressor over a padded set of item vectors.

    ``phi`` encodes every item independently, padded positions are zeroed
    with the mask, the encodings are summed over the item axis, and ``rho``
    maps the pooled vector to one scalar per sample.
    """

    def __init__(self, input_dim):
        """Build the per-item encoder and the pooled-vector regressor.

        Args:
            input_dim: size of the last axis of the item tensor.
        """
        super().__init__()
        self.phi = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
        )
        self.rho = nn.Sequential(
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        )

    def forward(self, x, mask):  # x: (B, T, D), mask: (B, T)
        """Return one prediction per sample, shape (B,).

        Args:
            x: item features, (B, T, D).
            mask: (B, T); multiplied into the encodings, so positions with
                value 0 do not contribute to the pooled sum.
        """
        encoded = self.phi(x)  # (B, T, H)
        masked_encoded = encoded * mask.unsqueeze(-1)  # broadcast mask over H
        aggregated = masked_encoded.sum(dim=1)  # (B, H)
        # Squeeze only the trailing size-1 axis. A bare .squeeze() would also
        # drop the batch axis when B == 1 and return a 0-d tensor that no
        # longer matches the (1,)-shaped target.
        return self.rho(aggregated).squeeze(-1)  # (B,)

In [None]:
# DataLoader 만들기
from torch.utils.data import DataLoader, TensorDataset

# Wrap the tensors in a dataset and iterate it in shuffled batches of 64.
dataset = TensorDataset(X, mask, y)
train_loader = DataLoader(dataset=dataset, shuffle=True, batch_size=64)

In [None]:
# Training loop
# Pick the GPU when one is available and fall back to the CPU otherwise,
# instead of failing on a hard-coded "cuda" device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = DeepSetMasked(input_dim=X.shape[-1]).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0

    for xb, mb, yb in train_loader:
        xb = xb.to(device)
        mb = mb.to(device)
        yb = yb.to(device).float()  # cast labels to float32

        optimizer.zero_grad()
        preds = model(xb, mb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()

        # loss is the batch mean; weight by batch size so that the epoch
        # figure printed below is a per-sample average.
        epoch_loss += loss.item() * len(xb)

    print(f"Epoch {epoch+1} - Loss: {epoch_loss / len(dataset):.4f}")

Epoch 1 - Loss: 20313777141531684.0000
Epoch 2 - Loss: 10104777078699980.0000
Epoch 3 - Loss: 9557737416839084.0000
Epoch 4 - Loss: 8866219926050771.0000
Epoch 5 - Loss: 7874012985708706.0000
Epoch 6 - Loss: 6750520516183836.0000
Epoch 7 - Loss: 6049198103661723.0000
Epoch 8 - Loss: 5757279609114496.0000
Epoch 9 - Loss: 5630692330944702.0000
Epoch 10 - Loss: 5557885620930596.0000


In [None]:
# Inference over the whole dataset in one forward pass.
model.eval()
# Run on whatever device the model's parameters live on rather than a
# hard-coded "cuda", so the cell also works on a CPU-only runtime.
eval_device = next(model.parameters()).device
with torch.no_grad():
    preds = model(X.to(eval_device), mask.to(eval_device)).cpu().numpy()
    y_true = y.cpu().numpy()

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Regression metrics between the labels and the predictions.
r2 = r2_score(y_true, preds)
mse = mean_squared_error(y_true, preds)
rmse = np.sqrt(mse)

print(f"RMSE: {rmse:,.0f}")
print(f"R²: {r2:.4f}")

RMSE: 74,356,325
R²: 0.4940


## 두 번째 시도

In [None]:
# Load the item table with pandas' default NA parsing disabled, so the
# literal string 'None' stays a value instead of becoming a missing value.

import pandas as pd
df = pd.read_csv(
    '/content/drive/MyDrive/data/item_nomissingvalues_copy.csv',
    keep_default_na=False,
    na_values=[],
)

# Load the table that carries the '전투력' (combat power) column.
df_stat = pd.read_csv('/content/drive/MyDrive/data/merged/stat_merged.csv')

In [None]:
# Attach the '전투력' (combat power) column of df_stat to the item rows in df.

# 1. Keep only the join keys and the target, with exactly one row per
#    (nickname, subclass). Rows without a target are dropped first so a
#    valid value is preferred, then duplicates are removed on the KEYS
#    (not on the whole row). Otherwise a character that appears with two
#    different '전투력' values would survive de-duplication and the left
#    join below would multiply every item row of that character.
merge_keys = ['nickname', 'subclass']
df_stat_trimmed = (
    df_stat[merge_keys + ['전투력']]
    .dropna(subset=['전투력'])
    .drop_duplicates(subset=merge_keys, keep='first')
)

# 2. Left join: every item row is kept and gains the character's '전투력'.
df = df.merge(df_stat_trimmed, on=merge_keys, how='left')

# 3. Drop item rows whose character has no '전투력'.
df.dropna(subset=['전투력'], inplace=True)

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Columns used as model inputs.
cat_cols = [
    'subclass', 'equipment_slot', 'main_stat_type',
    'item_group', 'starforce_scroll_flag',
    'potential_option_grade', 'additional_potential_option_grade',
    'main_pot_grade_summary', 'add_pot_grade_summary', 'potential_status',
]

num_cols = [
    'boss_damage_total', 'ignore_monster_armor_total', 'all_stat_total',
    'damage_total', 'boss_damage_add', 'damage_add', 'all_stat_add',
    'starforce', 'special_ring_level', 'bonus_stat_total',
    'mainstat_total', 'power_total', 'mainstat_add', 'power_add',
    'mainstat_etc', 'power_etc', 'mainstat_starforce', 'power_starforce',
]

# Missing values: simply drop rows lacking any input or the target.
df = df.dropna(subset=[*cat_cols, *num_cols, '전투력'])

# Categorical encoding: one fitted LabelEncoder per column, kept in `encoders`.
encoders = {}
for col in cat_cols:
    le = LabelEncoder().fit(df[col])
    df[col] = le.transform(df[col])
    encoders[col] = le

# Standardise the numeric columns.
scaler = StandardScaler()
scaler.fit(df[num_cols])
df[num_cols] = scaler.transform(df[num_cols])

# Log-transformed target: log(1 + 전투력).
df['log_전투력'] = np.log1p(df['전투력'])

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch import nn

# === 1. Preprocessing setup ===
cat_cols = [
    'subclass', 'equipment_slot', 'main_stat_type',
    'item_group', 'starforce_scroll_flag',
    'potential_option_grade', 'additional_potential_option_grade',
    'main_pot_grade_summary', 'add_pot_grade_summary', 'potential_status',
]

num_cols = [
    'boss_damage_total', 'ignore_monster_armor_total', 'all_stat_total',
    'damage_total', 'boss_damage_add', 'damage_add', 'all_stat_add',
    'starforce', 'special_ring_level', 'bonus_stat_total',
    'mainstat_total', 'power_total', 'mainstat_add', 'power_add',
    'mainstat_etc', 'power_etc', 'mainstat_starforce', 'power_starforce',
]

# Keep only the configured columns that are actually present in df.
existing_cat_cols = [col for col in cat_cols if col in df.columns]
existing_num_cols = [col for col in num_cols if col in df.columns]
target_col = '전투력'

df = df.dropna(subset=[*existing_cat_cols, *existing_num_cols, target_col])

# Encoding + standardisation
for col in existing_cat_cols:
    df[col] = LabelEncoder().fit_transform(df[col])

scaler = StandardScaler()
df[existing_num_cols] = scaler.fit_transform(df[existing_num_cols])

df['log_전투력'] = np.log1p(df[target_col])

# === 2. Data preparation ===
# One sample per df row: inputs are the feature columns, target is log_전투력.
X = torch.tensor(
    df[existing_cat_cols + existing_num_cols].to_numpy(),
    dtype=torch.float32,
)
y = torch.tensor(df['log_전투력'].to_numpy(), dtype=torch.float32)

dataset = TensorDataset(X, y)
loader = DataLoader(dataset=dataset, shuffle=True, batch_size=64)

# === 3. 모델 정의 ===
class SimpleRegressor(nn.Module):
    """Three-layer MLP that maps a feature vector to one scalar."""

    def __init__(self, input_dim):
        """Build the MLP.

        Args:
            input_dim: number of input features per sample.
        """
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        """Return predictions with the trailing size-1 axis removed.

        Only the last axis is squeezed. A bare .squeeze() would also drop
        the batch axis for a single-sample batch and return a 0-d tensor.
        """
        return self.model(x).squeeze(-1)

# Model, optimiser and loss.
model = SimpleRegressor(X.shape[1])
optimizer = torch.optim.Adam(lr=1e-3, params=model.parameters())
criterion = nn.MSELoss()

# === 4. Training ===
for epoch in range(10):
    model.train()
    total_loss = 0.0
    for xb, yb in loader:
        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        # Weight the batch-mean loss by batch size so the printed value is
        # a per-sample average over the dataset.
        total_loss += len(xb) * loss.item()
    print(f"Epoch {epoch+1}: Loss = {total_loss / len(dataset):.4f}")

# === 5. Evaluation ===
# Predict on the same X the model was trained on, then undo the log1p
# transform so the metrics are on the original target scale.
model.eval()
with torch.no_grad():
    y_pred = model(X).numpy()
y_true = y.numpy()
y_pred_exp, y_true_exp = np.expm1(y_pred), np.expm1(y_true)

r2 = r2_score(y_true_exp, y_pred_exp)
rmse = np.sqrt(mean_squared_error(y_true_exp, y_pred_exp))

print(f"📊 최종 평가 결과")
print(f"RMSE: {rmse:,.0f}")
print(f"R²: {r2:.4f}")

Epoch 1: Loss = 1.3431
Epoch 2: Loss = 0.9169
Epoch 3: Loss = 0.9009
Epoch 4: Loss = 0.8930
Epoch 5: Loss = 0.8864
Epoch 6: Loss = 0.8815
Epoch 7: Loss = 0.8775
Epoch 8: Loss = 0.8762
Epoch 9: Loss = 0.8728
Epoch 10: Loss = 0.8708
📊 최종 평가 결과
RMSE: 89,449,549
R²: 0.2594


## 세 번째 시도

In [None]:
# Masked DeepSets로 시퀀스 학습

# (1) 데이터 전처리
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Categorical / numeric columns to use.
cat_cols = [
    'subclass', 'equipment_slot', 'main_stat_type',
    'item_group', 'starforce_scroll_flag',
    'potential_option_grade', 'additional_potential_option_grade',
    'main_pot_grade_summary', 'add_pot_grade_summary', 'potential_status',
]

num_cols = [
    'boss_damage_total', 'ignore_monster_armor_total', 'all_stat_total',
    'damage_total', 'boss_damage_add', 'damage_add', 'all_stat_add',
    'starforce', 'special_ring_level', 'bonus_stat_total',
    'mainstat_total', 'power_total', 'mainstat_add', 'power_add',
    'mainstat_etc', 'power_etc', 'mainstat_starforce', 'power_starforce',
]

# Keep only the columns that exist in df.
cat_cols = [col for col in cat_cols if col in df.columns]
num_cols = [col for col in num_cols if col in df.columns]

# The target column must be present.
if not ('전투력' in df.columns):
    raise ValueError("❌ '전투력' 컬럼이 존재하지 않습니다.")

# Drop rows lacking any input or the target.
df = df.dropna(subset=[*cat_cols, *num_cols, '전투력'])

# Categorical encoding: one fitted LabelEncoder per column, kept in `encoders`.
encoders = {}
for col in cat_cols:
    le = LabelEncoder().fit(df[col])
    df[col] = le.transform(df[col])
    encoders[col] = le

# Standardise the numeric columns.
scaler = StandardScaler()
scaler.fit(df[num_cols])
df[num_cols] = scaler.transform(df[num_cols])

# Log-transformed target, stored alongside the raw one for optional use.
df['log_전투력'] = np.log1p(df['전투력'])

In [None]:
# (2) 캐릭터별 장비 시퀀스 구성(nickname 기준 groupby)
import torch
import numpy as np

feature_cols = cat_cols + num_cols
feature_dim = len(feature_cols)
max_len = 24  # maximum number of items kept per character

character_inputs = []
character_labels = []
attention_masks = []

for name, group in df.groupby('nickname'):
    item_rows = group[feature_cols].values
    valid_len = len(item_rows)
    kept = min(valid_len, max_len)

    # Zero block of shape (max_len, feature_dim) filled with the first
    # `kept` rows: short groups end up zero-padded, long ones truncated.
    features = np.zeros((max_len, feature_dim))
    features[:kept] = item_rows[:max_len]
    mask = [1] * kept + [0] * (max_len - kept)

    character_inputs.append(torch.tensor(features, dtype=torch.float32))
    attention_masks.append(torch.tensor(mask, dtype=torch.float32))
    # Raw target from the first row of the group ('log_전투력' is the
    # alternative column).
    character_labels.append(group['전투력'].iloc[0])

# Batch tensors: X (B, T, D), mask (B, T), y (B,).
X, mask = (torch.stack(seq).float() for seq in (character_inputs, attention_masks))
y = torch.tensor(character_labels, dtype=torch.float32)

In [None]:
# (3) Masked DeepSets 모델 정의

import torch
import torch.nn as nn

class MaskedDeepSets(nn.Module):
    """Masked sum-pooling regressor over a padded set of item vectors.

    ``phi`` encodes each item, the mask zeroes padded positions, the
    encodings are summed over the item axis and ``rho`` produces one scalar
    per sample.
    """

    def __init__(self, input_dim):
        """Build the per-item encoder ``phi`` and the regressor ``rho``.

        Args:
            input_dim: size of the last axis of the item tensor.
        """
        super().__init__()
        self.phi = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU()
        )
        self.rho = nn.Sequential(
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x, mask):
        """Return one prediction per sample, shape (B,).

        Args:
            x: item features, (B, T, D).
            mask: (B, T); positions with value 0 are excluded from the sum.
        """
        encoded = self.phi(x)                          # (B, T, 64)
        masked = encoded * mask.unsqueeze(-1)          # (B, T, 64)
        pooled = masked.sum(dim=1)                     # (B, 64)
        # Squeeze only the trailing axis so a batch of one still yields
        # shape (1,) instead of a 0-d tensor.
        return self.rho(pooled).squeeze(-1)            # (B,)

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Dataset and loader
dataset = TensorDataset(X, mask, y)
loader = DataLoader(dataset=dataset, shuffle=True, batch_size=64)

# Model, optimiser and loss
model = MaskedDeepSets(input_dim=X.shape[-1])
optimizer = torch.optim.Adam(lr=1e-3, params=model.parameters())
criterion = nn.MSELoss()

# Training
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for xb, mb, yb in loader:
        optimizer.zero_grad()
        preds = model(xb, mb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        # Batch-mean loss weighted by batch size -> per-sample epoch average.
        total_loss += len(xb) * loss.item()
    print(f"Epoch {epoch+1}: Loss = {total_loss / len(dataset):.4f}")

# Evaluation on the same tensors used for training.
model.eval()
with torch.no_grad():
    preds = model(X, mask).numpy()
y_true = y.numpy()

r2 = r2_score(y_true, preds)
rmse = np.sqrt(mean_squared_error(y_true, preds))

print(f"\n📊 최종 평가 결과:")
print(f"RMSE: {rmse:,.0f}")
print(f"R²: {r2:.4f}")

Epoch 1: Loss = 21320032586292988.0000
Epoch 2: Loss = 10198710239161158.0000
Epoch 3: Loss = 9711300646582294.0000
Epoch 4: Loss = 9142901835325022.0000
Epoch 5: Loss = 8357941116880978.0000
Epoch 6: Loss = 7326946706100004.0000
Epoch 7: Loss = 6429572304124114.0000
Epoch 8: Loss = 5940441267996914.0000
Epoch 9: Loss = 5726394045816610.0000
Epoch 10: Loss = 5622332282307216.0000

📊 최종 평가 결과:
RMSE: 74,712,527
R²: 0.4891


In [None]:
# 모델 튜닝 시도 (1)
# 범주형 Feature -> Embedding vector로 표현

import torch
import torch.nn as nn

class MaskedDeepSetsWithEmbedding(nn.Module):
    """Masked sum-pooling regressor with embedded categorical features.

    Every categorical column gets its own ``nn.Embedding``. The embeddings
    are concatenated with the continuous features, encoded per item by
    ``phi``, masked, summed over the item axis and regressed by ``rho``.
    """

    def __init__(self, embedding_info, num_cont_features, emb_dim=8):
        """Build the embeddings, encoder and regressor.

        Args:
            embedding_info: mapping of column name -> number of classes.
                Its iteration order defines which slice of ``x_cat`` each
                embedding reads in ``forward``.
            num_cont_features: size of the last axis of ``x_cont``.
            emb_dim: embedding width used for every categorical column.
        """
        super().__init__()
        self.embeddings = nn.ModuleDict({
            col: nn.Embedding(num_classes, emb_dim)
            for col, num_classes in embedding_info.items()
        })
        self.total_emb_dim = len(embedding_info) * emb_dim
        self.input_dim = self.total_emb_dim + num_cont_features

        self.phi = nn.Sequential(
            nn.Linear(self.input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU()
        )
        self.rho = nn.Sequential(
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x_cat, x_cont, mask):
        """Return one prediction per sample, shape (B,).

        Args:
            x_cat: integer category codes, (B, T, num_cat).
            x_cont: continuous features, (B, T, num_cont).
            mask: (B, T); positions with value 0 are excluded from the sum.
        """
        # The i-th embedding (in ModuleDict order) reads x_cat[..., i].
        embs = [self.embeddings[col](x_cat[..., i]) for i, col in enumerate(self.embeddings)]
        emb_cat = torch.cat(embs, dim=-1)  # (B, T, total_emb_dim)
        x = torch.cat([emb_cat, x_cont], dim=-1)  # (B, T, input_dim)

        encoded = self.phi(x)
        masked = encoded * mask.unsqueeze(-1)
        pooled = masked.sum(dim=1)
        # Squeeze only the trailing axis so a batch of one still yields
        # shape (1,) instead of a 0-d tensor.
        return self.rho(pooled).squeeze(-1)

In [None]:
# embedding 모델 학습을 위한 입력 분리 및 텐서 생성

# 1. Embedding sizes: number of distinct values per categorical column.
embedding_info = {col: df[col].nunique() for col in cat_cols}

# 2. Split features into categorical and continuous parts.
feature_cols = cat_cols + num_cols
feature_dim = len(feature_cols)
max_len = 24

x_cat_list = []
x_cont_list = []
attention_masks = []
character_labels = []

for name, group in df.groupby('nickname'):
    valid_len = len(group)
    kept = min(valid_len, max_len)

    # Typed zero blocks filled with the first `kept` item rows: short
    # groups end up zero-padded, long ones truncated to max_len.
    cat_feats = np.zeros((max_len, len(cat_cols)), dtype=np.int64)
    cont_feats = np.zeros((max_len, len(num_cols)), dtype=np.float32)
    cat_feats[:kept] = group[cat_cols].values.astype(np.int64)[:max_len]
    cont_feats[:kept] = group[num_cols].values.astype(np.float32)[:max_len]
    mask = [1] * kept + [0] * (max_len - kept)

    x_cat_list.append(torch.tensor(cat_feats, dtype=torch.long))
    x_cont_list.append(torch.tensor(cont_feats, dtype=torch.float32))
    attention_masks.append(torch.tensor(mask, dtype=torch.float32))
    # Raw target from the first row of the group ('log_전투력' is the
    # alternative column).
    character_labels.append(group['전투력'].iloc[0])

# Stack into batch tensors.
x_cat, x_cont, mask = (
    torch.stack(seq) for seq in (x_cat_list, x_cont_list, attention_masks)
)  # (B, T, num_cat), (B, T, num_cont), (B, T)
y = torch.tensor(character_labels, dtype=torch.float32)  # (B,)

# Shape check (displayed as the cell result).
x_cat.shape, x_cont.shape, mask.shape, y.shape

(torch.Size([45770, 24, 10]),
 torch.Size([45770, 24, 18]),
 torch.Size([45770, 24]),
 torch.Size([45770]))

In [None]:
# embedding 모델 학습 루프 + 평가 코드
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# Dataset and DataLoader
dataset = TensorDataset(x_cat, x_cont, mask, y)
loader = DataLoader(dataset=dataset, shuffle=True, batch_size=64)

# Model, optimiser and loss
model = MaskedDeepSetsWithEmbedding(embedding_info, num_cont_features=x_cont.shape[-1])
optimizer = torch.optim.Adam(lr=1e-3, params=model.parameters())
criterion = nn.MSELoss()

# Training
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for xcb, xfb, mb, yb in loader:
        optimizer.zero_grad()
        preds = model(xcb, xfb, mb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        # Batch-mean loss weighted by batch size -> per-sample epoch average.
        total_loss += len(xcb) * loss.item()
    print(f"Epoch {epoch+1}: Loss = {total_loss / len(dataset):.4f}")

# Evaluation on the same tensors used for training.
model.eval()
with torch.no_grad():
    preds = model(x_cat, x_cont, mask).numpy()
y_true = y.numpy()

r2 = r2_score(y_true, preds)
rmse = np.sqrt(mean_squared_error(y_true, preds))

print(f"\n📊 최종 평가 결과:")
print(f"RMSE: {rmse:,.0f}")
print(f"R²: {r2:.4f}")

Epoch 1: Loss = 19354970680136156.0000
Epoch 2: Loss = 6985824516823240.0000
Epoch 3: Loss = 5738621470542747.0000
Epoch 4: Loss = 5513033042183031.0000
Epoch 5: Loss = 5432516061833051.0000
Epoch 6: Loss = 5383590933385174.0000
Epoch 7: Loss = 5341979742175306.0000
Epoch 8: Loss = 5311223582819306.0000
Epoch 9: Loss = 5291353764650500.0000
Epoch 10: Loss = 5272607709977308.0000

📊 최종 평가 결과:
RMSE: 72,540,977
R²: 0.5184


## 네 번째 시도

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np

# Feature configuration
cat_cols = [
    'subclass', 'equipment_slot', 'main_stat_type',
    'item_group', 'starforce_scroll_flag',
    'potential_option_grade', 'additional_potential_option_grade',
    'main_pot_grade_summary', 'add_pot_grade_summary', 'potential_status',
]

num_cols = [
    'boss_damage_total', 'ignore_monster_armor_total', 'all_stat_total',
    'damage_total', 'boss_damage_add', 'damage_add', 'all_stat_add',
    'starforce', 'special_ring_level', 'bonus_stat_total',
    'mainstat_total', 'power_total', 'mainstat_add', 'power_add',
    'mainstat_etc', 'power_etc', 'mainstat_starforce', 'power_starforce',
]

# Drop rows lacking any input or the target.
df = df.dropna(subset=[*cat_cols, *num_cols, '전투력'])

# Fit one LabelEncoder per categorical column, keep it in `encoders`, and
# replace the column with its integer codes.
encoders = {}
for col in cat_cols:
    encoders[col] = LabelEncoder().fit(df[col])
    df[col] = encoders[col].transform(df[col])

# Standardise the numeric columns.
scaler = StandardScaler()
scaler.fit(df[num_cols])
df[num_cols] = scaler.transform(df[num_cols])

# Log-transformed target: log(1 + 전투력).
df['log_전투력'] = np.log1p(df['전투력'])

In [None]:
# 시퀀스 구성
import torch

max_len = 24
feature_dim = len(cat_cols + num_cols)
x_cat_list, x_cont_list, mask_list, y_list = [], [], [], []

for name, group in df.groupby('nickname'):
    valid_len = len(group)
    kept = min(valid_len, max_len)

    # Typed zero blocks filled with the first `kept` item rows: short
    # groups end up zero-padded, long ones truncated to max_len.
    cat_feats = np.zeros((max_len, len(cat_cols)), dtype=np.int64)
    cont_feats = np.zeros((max_len, len(num_cols)), dtype=np.float32)
    cat_feats[:kept] = group[cat_cols].values.astype(np.int64)[:max_len]
    cont_feats[:kept] = group[num_cols].values.astype(np.float32)[:max_len]
    mask = [1] * kept + [0] * (max_len - kept)

    x_cat_list.append(torch.tensor(cat_feats, dtype=torch.long))
    x_cont_list.append(torch.tensor(cont_feats, dtype=torch.float32))
    mask_list.append(torch.tensor(mask, dtype=torch.float32))
    # Log-scale label, log(1 + 전투력), from the first row of the group.
    y_list.append(np.log1p(group['전투력'].iloc[0]))

x_cat, x_cont, mask = (
    torch.stack(seq) for seq in (x_cat_list, x_cont_list, mask_list)
)
y = torch.tensor(y_list, dtype=torch.float32)

# Embedding sizes: number of distinct values per categorical column.
embedding_info = {col: df[col].nunique() for col in cat_cols}

In [None]:
# 모델 정의
import torch.nn as nn

class MaskedDeepSetsWithEmbedding(nn.Module):
    """Masked sum-pooling regressor with embeddings and a Softplus output.

    Same pipeline as the plain variant (embed -> concat -> ``phi`` -> mask
    -> sum -> ``rho``), but ``rho`` ends in ``nn.Softplus`` so predictions
    are always positive.
    """

    def __init__(self, embedding_info, num_cont_features, emb_dim=8):
        """Build the embeddings, encoder and regressor.

        Args:
            embedding_info: mapping of column name -> number of classes.
                Its iteration order defines which slice of ``x_cat`` each
                embedding reads in ``forward``.
            num_cont_features: size of the last axis of ``x_cont``.
            emb_dim: embedding width used for every categorical column.
        """
        super().__init__()
        self.embeddings = nn.ModuleDict({
            col: nn.Embedding(num_classes, emb_dim)
            for col, num_classes in embedding_info.items()
        })
        self.total_emb_dim = len(embedding_info) * emb_dim
        self.input_dim = self.total_emb_dim + num_cont_features

        self.phi = nn.Sequential(
            nn.Linear(self.input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU()
        )
        self.rho = nn.Sequential(
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Softplus()  # keeps the log-scale prediction from blowing up
        )

    def forward(self, x_cat, x_cont, mask):
        """Return one positive prediction per sample, shape (B,).

        Args:
            x_cat: integer category codes; the last axis indexes the columns.
            x_cont: continuous features with the same leading axes as x_cat.
            mask: multiplied into the encodings; positions with value 0 are
                excluded from the sum over axis 1.
        """
        # The i-th embedding (in ModuleDict order) reads x_cat[..., i].
        embs = [self.embeddings[col](x_cat[..., i]) for i, col in enumerate(self.embeddings)]
        emb_cat = torch.cat(embs, dim=-1)
        x = torch.cat([emb_cat, x_cont], dim=-1)
        encoded = self.phi(x)
        masked = encoded * mask.unsqueeze(-1)
        pooled = masked.sum(dim=1)
        # Squeeze only the trailing axis so a batch of one still yields
        # shape (1,) instead of a 0-d tensor.
        return self.rho(pooled).squeeze(-1)

In [None]:
# 학습
from torch.utils.data import DataLoader, TensorDataset

# Dataset and loader
dataset = TensorDataset(x_cat, x_cont, mask, y)
loader = DataLoader(dataset=dataset, shuffle=True, batch_size=64)

# Model, optimiser and loss
model = MaskedDeepSetsWithEmbedding(embedding_info, x_cont.shape[-1])
optimizer = torch.optim.Adam(lr=1e-3, params=model.parameters())
criterion = nn.MSELoss()

for epoch in range(10):
    model.train()
    total_loss = 0.0
    for xcb, xfb, mb, yb in loader:
        optimizer.zero_grad()
        pred = model(xcb, xfb, mb)
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()
        # Batch-mean loss weighted by batch size -> per-sample epoch average.
        total_loss += len(xcb) * loss.item()
    print(f"Epoch {epoch+1}: Loss = {total_loss / len(dataset):.4f}")

Epoch 1: Loss = 5.7401
Epoch 2: Loss = 0.7835
Epoch 3: Loss = 0.6542
Epoch 4: Loss = 0.6242
Epoch 5: Loss = 0.6122
Epoch 6: Loss = 0.5977
Epoch 7: Loss = 0.5904
Epoch 8: Loss = 0.5796
Epoch 9: Loss = 0.5688
Epoch 10: Loss = 0.5471


In [None]:
# 평가
from sklearn.metrics import mean_squared_error, r2_score

# Predict on the same tensors used for training.
model.eval()
with torch.no_grad():
    pred_log = model(x_cat, x_cont, mask).numpy()
y_log = y.numpy()

# Stability: clip both log-scale arrays to [0, 50] before exponentiating.
pred_log, y_log = pred_log.clip(0, 50), y_log.clip(0, 50)

# Undo the log1p transform to get back to the original target scale.
pred_real, y_real = np.expm1(pred_log), np.expm1(y_log)

# Metrics: RMSE and R² on the original scale, RMSLE on the log1p scale.
r2 = r2_score(y_real, pred_real)
rmse = np.sqrt(mean_squared_error(y_real, pred_real))
rmsle = np.sqrt(mean_squared_error(np.log1p(y_real), np.log1p(pred_real)))

print(f"📊 평가 결과:")
print(f"RMSE: {rmse:,.0f}")
print(f"RMSLE: {rmsle:.4f}")
print(f"R²: {r2:.4f}")

📊 평가 결과:
RMSE: 73,975,112
RMSLE: 0.7019
R²: 0.4992


## 다섯 번째 시도

In [None]:
# 1. 시드 고정
import torch
import random
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

def set_seed(seed=42):
    """Seed PyTorch, ``random`` and NumPy, and set the cuDNN flags.

    cuDNN is put in deterministic mode with benchmarking turned off.

    Args:
        seed: integer seed passed to all three generators.
    """
    for seed_fn in (torch.manual_seed, random.seed, np.random.seed):
        seed_fn(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic, cudnn.benchmark = True, False

set_seed(42)

# 2. Sort the frame by nickname.
# NOTE(review): x_cat / x_cont / mask / y are not rebuilt in this cell, so
# this sort does not reorder them; confirm whether a rebuild was intended.
df = df.sort_values(by="nickname")

# Derived feature: per-character item count (row sum of the mask), repeated
# along axis 1 and appended to x_cont as one extra last-axis channel.
equip_counts = mask.sum(dim=1).view(-1, 1, 1).repeat(1, x_cont.shape[1], 1)
x_cont_with_count = torch.cat([x_cont, equip_counts], dim=2)

# 3. 딥러닝 모델 정의
class DeepMaskedModel(nn.Module):
    """Deeper masked sum-pooling regressor with embedded categoricals.

    Categorical codes are embedded per column, concatenated with the
    continuous features, encoded per item by a three-layer ``phi``, masked,
    summed over the item axis and regressed by ``rho``. The output layer is
    a plain ``Linear`` with no activation.
    """

    def __init__(self, embedding_info, num_cont_features, emb_dim=8):
        """Build the embeddings, encoder and regressor.

        Args:
            embedding_info: mapping of column name -> number of classes.
                Its iteration order defines which slice of ``x_cat`` each
                embedding reads in ``forward``.
            num_cont_features: size of the last axis of ``x_cont``.
            emb_dim: embedding width used for every categorical column.
        """
        super().__init__()
        self.embeddings = nn.ModuleDict({
            col: nn.Embedding(num_classes, emb_dim)
            for col, num_classes in embedding_info.items()
        })
        self.total_emb_dim = len(embedding_info) * emb_dim
        self.input_dim = self.total_emb_dim + num_cont_features

        self.phi = nn.Sequential(
            nn.Linear(self.input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU()
        )
        self.rho = nn.Sequential(
            nn.Linear(64, 32),
            nn.ReLU(),
            # No Softplus: raw linear output. NOTE(review): the original
            # note says this predicts the target directly; the scale depends
            # on how the caller built `y`, so confirm.
            nn.Linear(32, 1)
        )

    def forward(self, x_cat, x_cont, mask):
        """Return one prediction per sample, shape (B,).

        Args:
            x_cat: integer category codes; the last axis indexes the columns.
            x_cont: continuous features with the same leading axes as x_cat.
            mask: multiplied into the encodings; positions with value 0 are
                excluded from the sum over axis 1.
        """
        # The i-th embedding (in ModuleDict order) reads x_cat[..., i].
        embs = [self.embeddings[col](x_cat[..., i]) for i, col in enumerate(self.embeddings)]
        emb_cat = torch.cat(embs, dim=-1)
        x = torch.cat([emb_cat, x_cont], dim=-1)
        encoded = self.phi(x)
        masked = encoded * mask.unsqueeze(-1)
        pooled = masked.sum(dim=1)
        # Squeeze only the trailing axis. A bare .squeeze() returns a 0-d
        # tensor for a batch of one, which np.concatenate cannot join with
        # the other batch outputs.
        return self.rho(pooled).squeeze(-1)

# 4. K-Fold training and evaluation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmses, r2s = [], []

for fold, (train_idx, val_idx) in enumerate(kf.split(x_cat)):
    print(f"\n📂 Fold {fold + 1}")

    # Slice every tensor with the fold's index arrays.
    train_set = TensorDataset(
        x_cat[train_idx], x_cont_with_count[train_idx], mask[train_idx], y[train_idx]
    )
    val_set = TensorDataset(
        x_cat[val_idx], x_cont_with_count[val_idx], mask[val_idx], y[val_idx]
    )

    train_loader = DataLoader(dataset=train_set, shuffle=True, batch_size=64)
    val_loader = DataLoader(dataset=val_set, shuffle=False, batch_size=256)

    # A fresh model and optimiser for each fold.
    model = DeepMaskedModel(embedding_info, x_cont_with_count.shape[-1])
    optimizer = torch.optim.Adam(lr=1e-3, params=model.parameters())
    criterion = nn.MSELoss()

    for epoch in range(10):
        model.train()
        total_loss = 0
        for xcb, xfb, mb, yb in train_loader:
            optimizer.zero_grad()
            preds = model(xcb, xfb, mb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()
            # Batch-mean loss weighted by batch size -> per-sample average.
            total_loss += len(xcb) * loss.item()
        print(f"Epoch {epoch+1}: Loss = {total_loss / len(train_loader.dataset):.4f}")

    # Per-fold evaluation on the held-out split.
    model.eval()
    preds_all, y_all = [], []
    with torch.no_grad():
        for xcb, xfb, mb, yb in val_loader:
            preds = model(xcb, xfb, mb)
            preds_all.append(preds.numpy())
            y_all.append(yb.numpy())

    pred_all, y_true = np.concatenate(preds_all), np.concatenate(y_all)

    # Metrics are computed on whatever scale `y` is stored in.
    r2 = r2_score(y_true, pred_all)
    rmse = np.sqrt(mean_squared_error(y_true, pred_all))
    rmses.append(rmse)
    r2s.append(r2)

    print(f"✅ Fold {fold+1} RMSE: {rmse:,.2f}, R²: {r2:.4f}")

# Mean over folds
print("\n🎯 K-Fold 평균 성능:")
print(f"Avg RMSE: {np.mean(rmses):,.2f}")
print(f"Avg R²: {np.mean(r2s):.4f}")


📂 Fold 1
Epoch 1: Loss = 3.4886
Epoch 2: Loss = 0.7343
Epoch 3: Loss = 0.6853
Epoch 4: Loss = 0.6459
Epoch 5: Loss = 0.6000
Epoch 6: Loss = 0.5665
Epoch 7: Loss = 0.5513
Epoch 8: Loss = 0.5325
Epoch 9: Loss = 0.5008
Epoch 10: Loss = 0.4938
✅ Fold 1 RMSE: 0.73, R²: 0.6740

📂 Fold 2
Epoch 1: Loss = 4.0717
Epoch 2: Loss = 0.7298
Epoch 3: Loss = 0.6257
Epoch 4: Loss = 0.5967
Epoch 5: Loss = 0.5543
Epoch 6: Loss = 0.5527
Epoch 7: Loss = 0.5215
Epoch 8: Loss = 0.5127
Epoch 9: Loss = 0.4905
Epoch 10: Loss = 0.4871
✅ Fold 2 RMSE: 0.71, R²: 0.6544

📂 Fold 3
Epoch 1: Loss = 4.6613
Epoch 2: Loss = 0.6714
Epoch 3: Loss = 0.6011
Epoch 4: Loss = 0.5780
Epoch 5: Loss = 0.5465
Epoch 6: Loss = 0.5278
Epoch 7: Loss = 0.5015
Epoch 8: Loss = 0.4608
Epoch 9: Loss = 0.4540
Epoch 10: Loss = 0.4435
✅ Fold 3 RMSE: 1.00, R²: 0.4164

📂 Fold 4
Epoch 1: Loss = 4.8802
Epoch 2: Loss = 0.7252
Epoch 3: Loss = 0.6534
Epoch 4: Loss = 0.6133
Epoch 5: Loss = 0.5635
Epoch 6: Loss = 0.5344
Epoch 7: Loss = 0.5197
Epoch 8: L

In [None]:
# MSE -> HuberLoss로 바꾸고 다시 실행

# 1. 시드 고정
import torch
import random
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

def set_seed(seed=42):
    """Seed PyTorch, ``random`` and NumPy, and set the cuDNN flags.

    cuDNN is put in deterministic mode with benchmarking turned off.

    Args:
        seed: integer seed passed to all three generators.
    """
    for seed_fn in (torch.manual_seed, random.seed, np.random.seed):
        seed_fn(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic, cudnn.benchmark = True, False

set_seed(42)

# 2. Sort the frame by nickname.
# NOTE(review): x_cat / x_cont / mask / y are not rebuilt in this cell, so
# this sort does not reorder them; confirm whether a rebuild was intended.
df = df.sort_values(by="nickname")

# Derived feature: per-character item count (row sum of the mask), repeated
# along axis 1 and appended to x_cont as one extra last-axis channel.
equip_counts = mask.sum(dim=1).view(-1, 1, 1).repeat(1, x_cont.shape[1], 1)
x_cont_with_count = torch.cat([x_cont, equip_counts], dim=2)

# 3. 딥러닝 모델 정의
class DeepMaskedModel(nn.Module):
    """DeepSets-style regressor over a padded set of equipment rows.

    Every row's categorical ids are embedded, concatenated with its continuous
    features and encoded by ``phi``. Encodings are multiplied by the mask,
    summed over the item axis and mapped to one scalar per sample by ``rho``.

    Args:
        embedding_info: mapping of categorical column name -> number of
            classes; iteration order must match the last axis of ``x_cat``.
        num_cont_features: size of the last axis of ``x_cont``.
        emb_dim: embedding width shared by all categorical columns.
    """

    def __init__(self, embedding_info, num_cont_features, emb_dim=8):
        super().__init__()
        self.embeddings = nn.ModuleDict({
            col: nn.Embedding(num_classes, emb_dim)
            for col, num_classes in embedding_info.items()
        })
        self.total_emb_dim = len(embedding_info) * emb_dim
        self.input_dim = self.total_emb_dim + num_cont_features

        # Per-item encoder.
        self.phi = nn.Sequential(
            nn.Linear(self.input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU()
        )
        # Set-level head applied to the pooled encoding.
        self.rho = nn.Sequential(
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)  # Softplus removed: predict combat power directly
        )

    def forward(self, x_cat, x_cont, mask):
        """Return one prediction per sample, always shaped ``(batch,)``.

        Args:
            x_cat: integer category ids, ``(batch, items, n_categorical)``.
            x_cont: float features, ``(batch, items, num_cont_features)``.
            mask: ``(batch, items)``; multiplied into the item encodings, so a
                0 entry removes that item from the pooled sum.
        """
        embs = [self.embeddings[col](x_cat[..., i]) for i, col in enumerate(self.embeddings)]
        emb_cat = torch.cat(embs, dim=-1)
        x = torch.cat([emb_cat, x_cont], dim=-1)
        encoded = self.phi(x)
        masked = encoded * mask.unsqueeze(-1)
        pooled = masked.sum(dim=1)
        # Squeeze only the trailing feature axis: a bare squeeze() would also
        # drop the batch axis for a single-sample batch and return a 0-d tensor,
        # which mismatches the target shape in the loss.
        return self.rho(pooled).squeeze(-1)

# 4. K-Fold training and evaluation (Huber loss)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmses, r2s = [], []

for fold, (train_idx, val_idx) in enumerate(kf.split(x_cat)):
    print(f"\n📂 Fold {fold + 1}")

    train_set = TensorDataset(x_cat[train_idx], x_cont_with_count[train_idx], mask[train_idx], y[train_idx])
    val_set = TensorDataset(x_cat[val_idx], x_cont_with_count[val_idx], mask[val_idx], y[val_idx])

    train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=256, shuffle=False)

    # A fresh model and optimizer per fold so folds do not share weights.
    model = DeepMaskedModel(embedding_info, x_cont_with_count.shape[-1])
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.HuberLoss(delta=1.0)

    for epoch in range(10):
        model.train()
        total_loss = 0
        for xcb, xfb, mb, yb in train_loader:
            optimizer.zero_grad()
            # reshape(-1) keeps predictions 1-D even when the last batch holds a
            # single sample (the model's squeeze() would otherwise return a 0-d
            # tensor that no longer matches yb).
            preds = model(xcb, xfb, mb).reshape(-1)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()
            # Weight the batch-mean loss by batch size to get a per-sample epoch mean.
            total_loss += loss.item() * len(xcb)
        print(f"Epoch {epoch+1}: Loss = {total_loss / len(train_loader.dataset):.4f}")

    # Per-fold evaluation on the held-out split
    model.eval()
    preds_all, y_all = [], []
    with torch.no_grad():
        for xcb, xfb, mb, yb in val_loader:
            # 1-D per batch so np.concatenate never receives a 0-d array.
            preds = model(xcb, xfb, mb).reshape(-1)
            preds_all.append(preds.numpy())
            y_all.append(yb.numpy())

    pred_all = np.concatenate(preds_all)
    y_true = np.concatenate(y_all)

    rmse = np.sqrt(mean_squared_error(y_true, pred_all))
    r2 = r2_score(y_true, pred_all)
    rmses.append(rmse)
    r2s.append(r2)

    print(f"✅ Fold {fold+1} RMSE: {rmse:,.2f}, R²: {r2:.4f}")

# Report the average over all folds
print("\n🎯 K-Fold 평균 성능:")
print(f"Avg RMSE: {np.mean(rmses):,.2f}")
print(f"Avg R²: {np.mean(r2s):.4f}")


📂 Fold 1
Epoch 1: Loss = 0.4246
Epoch 2: Loss = 0.1550
Epoch 3: Loss = 0.1408
Epoch 4: Loss = 0.1360
Epoch 5: Loss = 0.1363
Epoch 6: Loss = 0.1286
Epoch 7: Loss = 0.1292
Epoch 8: Loss = 0.1271
Epoch 9: Loss = 0.1229
Epoch 10: Loss = 0.1238
✅ Fold 1 RMSE: 0.77, R²: 0.6381

📂 Fold 2
Epoch 1: Loss = 0.4636
Epoch 2: Loss = 0.1583
Epoch 3: Loss = 0.1492
Epoch 4: Loss = 0.1358
Epoch 5: Loss = 0.1315
Epoch 6: Loss = 0.1290
Epoch 7: Loss = 0.1218
Epoch 8: Loss = 0.1197
Epoch 9: Loss = 0.1187
Epoch 10: Loss = 0.1171
✅ Fold 2 RMSE: 0.76, R²: 0.6004

📂 Fold 3
Epoch 1: Loss = 0.4842
Epoch 2: Loss = 0.1509
Epoch 3: Loss = 0.1406
Epoch 4: Loss = 0.1307
Epoch 5: Loss = 0.1260
Epoch 6: Loss = 0.1266
Epoch 7: Loss = 0.1258
Epoch 8: Loss = 0.1241
Epoch 9: Loss = 0.1102
Epoch 10: Loss = 0.1194
✅ Fold 3 RMSE: 0.97, R²: 0.4501

📂 Fold 4
Epoch 1: Loss = 0.4939
Epoch 2: Loss = 0.1481
Epoch 3: Loss = 0.1389
Epoch 4: Loss = 0.1369
Epoch 5: Loss = 0.1300
Epoch 6: Loss = 0.1256
Epoch 7: Loss = 0.1261
Epoch 8: L

In [None]:
# Step 1: build fine-tuning tensors from the characters with the largest errors
model.eval()
with torch.no_grad():
    pred_all = model(x_cat, x_cont_with_count, mask).reshape(-1).numpy()
    y_all = y.numpy()

# Target the top 10% of characters by absolute error
abs_errors = np.abs(pred_all - y_all)
threshold = np.percentile(abs_errors, 90)  # 90th percentile = cut-off for the top 10%
error_indices = np.where(abs_errors > threshold)[0]
print(f"🎯 상위 10% 오차 캐릭터 수: {len(error_indices)}")

# Slice out the tensors for those characters only
x_cat_ft = x_cat[error_indices]
x_cont_ft = x_cont_with_count[error_indices]
mask_ft = mask[error_indices]
y_ft = y[error_indices]

from torch.utils.data import DataLoader, TensorDataset
ft_dataset = TensorDataset(x_cat_ft, x_cont_ft, mask_ft, y_ft)
ft_loader = DataLoader(ft_dataset, batch_size=32, shuffle=True)

# Step 2: continue training the existing model in place (3 epochs, smaller learning rate)
model.train()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.MSELoss()

for epoch in range(3):
    total_loss = 0
    for xcb, xfb, mb, yb in ft_loader:
        optimizer.zero_grad()
        # The subset size is not a multiple of 32 in general; when the final
        # batch holds one sample the model's squeeze() returns a 0-d tensor and
        # MSELoss warns about mismatched input/target sizes. reshape(-1) keeps
        # predictions shaped like yb.
        preds = model(xcb, xfb, mb).reshape(-1)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * len(xcb)
    print(f"🔧 Fine-tune Epoch {epoch+1}: Loss = {total_loss / len(ft_loader.dataset):.4f}")

# Step 3: re-evaluate RMSE / R² over all characters
# NOTE(review): the fine-tuning subset is drawn from these same tensors, so the
# numbers below are in-sample, not a held-out estimate.
model.eval()
with torch.no_grad():
    final_pred = model(x_cat, x_cont_with_count, mask).reshape(-1).numpy()

from sklearn.metrics import mean_squared_error, r2_score
final_rmse = np.sqrt(mean_squared_error(y_all, final_pred))
final_r2 = r2_score(y_all, final_pred)

print(f"\n📊 Fine-tune 이후 전체 평가:")
print(f"RMSE: {final_rmse:,.2f}")
print(f"R²: {final_r2:.4f}")

🎯 상위 10% 오차 캐릭터 수: 4577


  return F.mse_loss(input, target, reduction=self.reduction)


🔧 Fine-tune Epoch 1: Loss = 4.3509
🔧 Fine-tune Epoch 2: Loss = 4.0787
🔧 Fine-tune Epoch 3: Loss = 3.8983

📊 Fine-tune 이후 전체 평가:
RMSE: 0.76
R²: 0.6604


In [None]:
# Step 1: build fine-tuning tensors from the characters with the largest errors
model.eval()
with torch.no_grad():
    pred_all = model(x_cat, x_cont_with_count, mask).reshape(-1).numpy()
    y_all = y.numpy()

# Target the top 30% of characters by absolute error
abs_errors = np.abs(pred_all - y_all)
threshold = np.percentile(abs_errors, 70)  # 70th percentile = cut-off for the top 30%
error_indices = np.where(abs_errors > threshold)[0]
print(f"🎯 상위 30% 오차 캐릭터 수: {len(error_indices)}")

# Slice out the tensors for those characters only
x_cat_ft = x_cat[error_indices]
x_cont_ft = x_cont_with_count[error_indices]
mask_ft = mask[error_indices]
y_ft = y[error_indices]

from torch.utils.data import DataLoader, TensorDataset
ft_dataset = TensorDataset(x_cat_ft, x_cont_ft, mask_ft, y_ft)
ft_loader = DataLoader(ft_dataset, batch_size=32, shuffle=True)

# Step 2: continue training the existing model in place (3 epochs, smaller learning rate)
model.train()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.MSELoss()

for epoch in range(3):
    total_loss = 0
    for xcb, xfb, mb, yb in ft_loader:
        optimizer.zero_grad()
        # Keep predictions 1-D so a single-sample final batch still matches yb
        # (the model's squeeze() would otherwise return a 0-d tensor).
        preds = model(xcb, xfb, mb).reshape(-1)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * len(xcb)
    print(f"🔧 Fine-tune Epoch {epoch+1}: Loss = {total_loss / len(ft_loader.dataset):.4f}")

# Step 3: re-evaluate RMSE / R² over all characters
# NOTE(review): the fine-tuning subset is drawn from these same tensors, so the
# numbers below are in-sample, not a held-out estimate.
model.eval()
with torch.no_grad():
    final_pred = model(x_cat, x_cont_with_count, mask).reshape(-1).numpy()

from sklearn.metrics import mean_squared_error, r2_score
final_rmse = np.sqrt(mean_squared_error(y_all, final_pred))
final_r2 = r2_score(y_all, final_pred)

print(f"\n📊 Fine-tune 이후 전체 평가:")
print(f"RMSE: {final_rmse:,.2f}")
print(f"R²: {final_r2:.4f}")

🎯 상위 30% 오차 캐릭터 수: 13731
🔧 Fine-tune Epoch 1: Loss = 1.4933
🔧 Fine-tune Epoch 2: Loss = 1.4428
🔧 Fine-tune Epoch 3: Loss = 1.4206

📊 Fine-tune 이후 전체 평가:
RMSE: 0.71
R²: 0.7001


In [None]:
import torch
import pandas as pd

# Checkpoint file name (the metrics in the name are written by hand)
save_path = "best_model_r2_0700_rmse_071.pt"

# Persist only the weights (state_dict) of whatever `model` currently is
torch.save(obj=model.state_dict(), f=save_path)

print(f"✅ 모델이 저장되었습니다: {save_path}")

# Performance summary table, written column by column
results = pd.DataFrame({
    "모델": [
        "기본 DeepSets",
        "DeepSets + HuberLoss",
        "DeepSets + Fine-Tune (10%)",
        "DeepSets + Fine-Tune (30%)",
        "Attention 구조",
        "Attention 구조 + 잘못된 fine-tune",
    ],
    "RMSE": [0.79, 0.83, 0.76, 0.71, 0.76, 1.12],
    "R²": [0.6235, 0.5880, 0.6604, 0.7001, 0.6547, 0.2606],
})
results

✅ 모델이 저장되었습니다: best_model_r2_0700_rmse_071.pt


Unnamed: 0,모델,RMSE,R²
0,기본 DeepSets,0.79,0.6235
1,DeepSets + HuberLoss,0.83,0.588
2,DeepSets + Fine-Tune (10%),0.76,0.6604
3,DeepSets + Fine-Tune (30%),0.71,0.7001
4,Attention 구조,0.76,0.6547
5,Attention 구조 + 잘못된 fine-tune,1.12,0.2606


In [None]:
# Colab-only helper: hand the saved checkpoint file to the browser as a download.
from google.colab import files
files.download("best_model_r2_0700_rmse_071.pt")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# 필요한 라이브러리 불러오기
import torch
import torch.nn as nn
import torch.nn.functional as F

# 불러올 모델 클래스 정의
class DeepMaskedModel(nn.Module):
    """DeepSets-style regressor over a padded set of equipment rows.

    Every row's categorical ids are embedded, concatenated with its continuous
    features and encoded by ``phi``. Encodings are multiplied by the mask,
    summed over the item axis and mapped to one scalar per sample by ``rho``.

    Args:
        embedding_info: mapping of categorical column name -> number of
            classes; iteration order must match the last axis of ``x_cat``.
        num_cont_features: size of the last axis of ``x_cont``.
        emb_dim: embedding width shared by all categorical columns.
    """

    def __init__(self, embedding_info, num_cont_features, emb_dim=8):
        super().__init__()
        self.embeddings = nn.ModuleDict({
            col: nn.Embedding(num_classes, emb_dim)
            for col, num_classes in embedding_info.items()
        })
        self.total_emb_dim = len(embedding_info) * emb_dim
        self.input_dim = self.total_emb_dim + num_cont_features

        # Per-item encoder.
        self.phi = nn.Sequential(
            nn.Linear(self.input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU()
        )
        # Set-level head applied to the pooled encoding.
        self.rho = nn.Sequential(
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x_cat, x_cont, mask):
        """Return one prediction per sample, always shaped ``(batch,)``.

        Args:
            x_cat: integer category ids, ``(batch, items, n_categorical)``.
            x_cont: float features, ``(batch, items, num_cont_features)``.
            mask: ``(batch, items)``; multiplied into the item encodings, so a
                0 entry removes that item from the pooled sum.
        """
        embs = [self.embeddings[col](x_cat[..., i]) for i, col in enumerate(self.embeddings)]
        emb_cat = torch.cat(embs, dim=-1)
        x = torch.cat([emb_cat, x_cont], dim=-1)
        encoded = self.phi(x)
        masked = encoded * mask.unsqueeze(-1)
        pooled = masked.sum(dim=1)
        # Squeeze only the trailing feature axis so a single-sample batch keeps
        # its batch dimension instead of collapsing to a 0-d tensor.
        return self.rho(pooled).squeeze(-1)

# Instantiate the model, then load the .pt checkpoint into it.
# NOTE: this load raises RuntimeError for this checkpoint. The recorded error lists
# unexpected keys (attention_score.*, rho.4.*) and 16-wide embeddings, i.e. the file
# holds weights of a different architecture than DeepMaskedModel(emb_dim=8).
model = DeepMaskedModel(embedding_info, x_cont_with_count.shape[-1])

# Load the model weights (strict key/shape matching is the load_state_dict default)
model.load_state_dict(torch.load("best_model_r2_0700_rmse_071.pt"))

# Switch to inference mode
model.eval()

RuntimeError: Error(s) in loading state_dict for DeepMaskedModel:
	Unexpected key(s) in state_dict: "attention_score.weight", "attention_score.bias", "rho.4.weight", "rho.4.bias". 
	size mismatch for embeddings.subclass.weight: copying a param with shape torch.Size([46, 16]) from checkpoint, the shape in current model is torch.Size([46, 8]).
	size mismatch for embeddings.equipment_slot.weight: copying a param with shape torch.Size([24, 16]) from checkpoint, the shape in current model is torch.Size([24, 8]).
	size mismatch for embeddings.main_stat_type.weight: copying a param with shape torch.Size([6, 16]) from checkpoint, the shape in current model is torch.Size([6, 8]).
	size mismatch for embeddings.item_group.weight: copying a param with shape torch.Size([15, 16]) from checkpoint, the shape in current model is torch.Size([15, 8]).
	size mismatch for embeddings.starforce_scroll_flag.weight: copying a param with shape torch.Size([2, 16]) from checkpoint, the shape in current model is torch.Size([2, 8]).
	size mismatch for embeddings.potential_option_grade.weight: copying a param with shape torch.Size([6, 16]) from checkpoint, the shape in current model is torch.Size([6, 8]).
	size mismatch for embeddings.additional_potential_option_grade.weight: copying a param with shape torch.Size([6, 16]) from checkpoint, the shape in current model is torch.Size([6, 8]).
	size mismatch for embeddings.main_pot_grade_summary.weight: copying a param with shape torch.Size([42, 16]) from checkpoint, the shape in current model is torch.Size([42, 8]).
	size mismatch for embeddings.add_pot_grade_summary.weight: copying a param with shape torch.Size([55, 16]) from checkpoint, the shape in current model is torch.Size([55, 8]).
	size mismatch for embeddings.potential_status.weight: copying a param with shape torch.Size([27, 16]) from checkpoint, the shape in current model is torch.Size([27, 8]).
	size mismatch for phi.0.weight: copying a param with shape torch.Size([256, 179]) from checkpoint, the shape in current model is torch.Size([256, 99]).
	size mismatch for rho.0.weight: copying a param with shape torch.Size([128, 64]) from checkpoint, the shape in current model is torch.Size([32, 64]).
	size mismatch for rho.0.bias: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for rho.2.weight: copying a param with shape torch.Size([64, 128]) from checkpoint, the shape in current model is torch.Size([1, 32]).
	size mismatch for rho.2.bias: copying a param with shape torch.Size([64]) from checkpoint, the shape in current model is torch.Size([1]).

In [None]:
# 필요한 라이브러리 불러오기
import torch
import torch.nn as nn
import torch.nn.functional as F

# 불러올 모델 클래스 정의
class AttentionMaskedModelV2(nn.Module):
    """Attention-pooled set regressor over a padded set of equipment rows.

    Rows are embedded and encoded by ``phi`` exactly like the sum-pooling
    model, but pooling is a softmax-weighted sum whose scores come from a
    learned linear layer; ``rho`` maps the pooled vector to one scalar.

    Args:
        embedding_info: mapping of categorical column name -> number of
            classes; iteration order must match the last axis of ``x_cat``.
        num_cont_features: size of the last axis of ``x_cont``.
        emb_dim: embedding width shared by all categorical columns.
        dropout: dropout probability applied to the attention weights.
    """

    def __init__(self, embedding_info, num_cont_features, emb_dim=16, dropout=0.1):
        super().__init__()
        self.embeddings = nn.ModuleDict({
            col: nn.Embedding(num_classes, emb_dim)
            for col, num_classes in embedding_info.items()
        })
        self.total_emb_dim = len(embedding_info) * emb_dim
        self.input_dim = self.total_emb_dim + num_cont_features

        # Per-item encoder.
        self.phi = nn.Sequential(
            nn.Linear(self.input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU()
        )

        # One scalar attention score per encoded item.
        self.attention_score = nn.Linear(64, 1)
        self.dropout = nn.Dropout(dropout)

        # Set-level head applied to the attention-pooled encoding.
        self.rho = nn.Sequential(
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x_cat, x_cont, mask):
        """Return one prediction per sample, always shaped ``(batch,)``.

        Args:
            x_cat: integer category ids, ``(batch, items, n_categorical)``.
            x_cont: float features, ``(batch, items, num_cont_features)``.
            mask: ``(batch, items)``; items where ``mask == 0`` receive zero
                attention weight.
        """
        embs = [self.embeddings[col](x_cat[..., i]) for i, col in enumerate(self.embeddings)]
        emb_cat = torch.cat(embs, dim=-1)
        x = torch.cat([emb_cat, x_cont], dim=-1)
        h = self.phi(x)
        attn_scores = self.attention_score(h).squeeze(-1)
        # -inf scores become exactly zero weight after softmax.
        # NOTE(review): a row whose mask is entirely 0 would produce NaN weights.
        attn_scores = attn_scores.masked_fill(mask == 0, float('-inf'))
        attn_weights = F.softmax(attn_scores, dim=1).unsqueeze(-1)
        weighted_sum = torch.sum(h * self.dropout(attn_weights), dim=1)
        # Squeeze only the trailing feature axis so a single-sample batch keeps
        # its batch dimension instead of collapsing to a 0-d tensor.
        return self.rho(weighted_sum).squeeze(-1)

In [None]:
# Build the model with the same embedding_info and continuous-feature width as at training time
model = AttentionMaskedModelV2(embedding_info, x_cont_with_count.shape[-1])

# Load the weights. map_location="cpu" lets a checkpoint that was saved from a
# CUDA model load on a CPU-only runtime; for CPU-saved checkpoints it is a no-op.
model.load_state_dict(torch.load("best_model_r2_0700_rmse_071.pt", map_location="cpu"))

# Switch to evaluation mode (disables the attention dropout)
model.eval()
print("모델 로딩 성공")

모델 로딩 성공


# 가장 성능 좋았던 모델 저장하기

In [None]:
import pandas as pd
import numpy as np
import torch
import random
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

In [None]:
# 1. 시드 설정

def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and force deterministic cuDNN.

    Args:
        seed: integer seed applied to every random number generator.
    """
    # cuDNN: turn autotuning off and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    # The three seeders are independent of each other, so order is irrelevant.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

set_seed(42)

# 2. Load the data and merge in the target column
# na_values=[] with keep_default_na=False turns off pandas' default NA parsing,
# so literal strings such as 'None' stay as text.
df = pd.read_csv('/content/drive/MyDrive/data/item_nomissingvalues_copy.csv', na_values=[], keep_default_na=False)
df_stat = pd.read_csv('/content/drive/MyDrive/data/merged/stat_merged.csv')
# Keep one (nickname, subclass, 전투력) row per distinct combination, left-join it
# onto the item rows, then drop item rows that found no target.
df_stat_trimmed = df_stat[['nickname', 'subclass', '전투력']].drop_duplicates()
df = df.merge(df_stat_trimmed, on=['nickname', 'subclass'], how='left')
df.dropna(subset=['전투력'], inplace=True)

# 3. Preprocessing
# Categorical columns: label-encoded below and later fed to embeddings.
cat_cols = [
    'subclass', 'equipment_slot', 'main_stat_type', 'item_group',
    'starforce_scroll_flag', 'potential_option_grade', 'additional_potential_option_grade',
    'main_pot_grade_summary', 'add_pot_grade_summary', 'potential_status'
]

# Numeric columns: standardized below.
num_cols = [
    'boss_damage_total', 'ignore_monster_armor_total', 'all_stat_total', 'damage_total',
    'boss_damage_add', 'damage_add', 'all_stat_add', 'starforce', 'special_ring_level',
    'bonus_stat_total', 'mainstat_total', 'power_total', 'mainstat_add', 'power_add',
    'mainstat_etc', 'power_etc', 'mainstat_starforce', 'power_starforce'
]

df = df.dropna(subset=cat_cols + num_cols + ['전투력'])
# NOTE(review): encoders and scaler are fit on the whole frame, i.e. before the
# K-fold split further down, so validation rows influence the fitted statistics.
encoders = {col: LabelEncoder().fit(df[col]) for col in cat_cols}
for col in cat_cols:
    df[col] = encoders[col].transform(df[col])
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])
# NOTE(review): the sequence-building code below recomputes log1p from '전투력'
# and does not read this column.
df['log_전투력'] = np.log1p(df['전투력'])  # log transform

# 4. Build fixed-length per-character sequences
df = df.sort_values("nickname")
max_len = 24  # every character is padded or truncated to this many item rows
x_cat_list, x_cont_list, mask_list, y_list = [], [], [], []

# One group per nickname -> one training sample.
# NOTE(review): the target was merged on (nickname, subclass) but grouping uses
# nickname only — presumably nicknames are unique; verify.
for name, group in df.groupby("nickname"):
    cat = group[cat_cols].values.astype(np.int64)
    cont = group[num_cols].values.astype(np.float32)
    valid_len = len(group)

    if valid_len < max_len:
        # Pad with zeros (category id 0 / feature value 0.0) and mark the
        # padded rows with mask 0.
        pad_cat = np.zeros((max_len - valid_len, len(cat_cols)))
        pad_cont = np.zeros((max_len - valid_len, len(num_cols)))
        pad_mask = [0] * (max_len - valid_len)
        cat = np.vstack([cat, pad_cat])
        cont = np.vstack([cont, pad_cont])
        mask = [1] * valid_len + pad_mask
    else:
        # Groups with max_len rows or more keep only their first max_len rows.
        cat = cat[:max_len]
        cont = cont[:max_len]
        mask = [1] * max_len

    x_cat_list.append(torch.tensor(cat, dtype=torch.long))
    x_cont_list.append(torch.tensor(cont, dtype=torch.float32))
    mask_list.append(torch.tensor(mask, dtype=torch.float32))
    # Target: log1p of the 전투력 value on the group's first row.
    y_list.append(np.log1p(group['전투력'].iloc[0]))

x_cat = torch.stack(x_cat_list)
x_cont = torch.stack(x_cont_list)
mask = torch.stack(mask_list) # characters with fewer than 24 equipped items: the missing slots are masked out
y = torch.tensor(y_list, dtype=torch.float32)

# Append the per-character item count (mask.sum) as one extra continuous
# feature, repeated on every item row.
equip_counts = mask.sum(dim=1).unsqueeze(1).repeat(1, x_cont.shape[1]).unsqueeze(2)
x_cont_with_count = torch.cat([x_cont, equip_counts], dim=2)
# Number of distinct encoded values per categorical column (embedding table sizes).
embedding_info = {col: df[col].nunique() for col in cat_cols}

# 5. 모델 정의
class DeepMaskedModel(nn.Module): # Deepsets
    """DeepSets-style regressor over a padded set of equipment rows.

    Every row's categorical ids are embedded, concatenated with its continuous
    features and encoded by ``phi``. Encodings are multiplied by the mask,
    summed over the item axis and mapped to one scalar per sample by ``rho``.

    Args:
        embedding_info: mapping of categorical column name -> number of
            classes; iteration order must match the last axis of ``x_cat``.
        num_cont_features: size of the last axis of ``x_cont``.
        emb_dim: embedding width shared by all categorical columns.
    """

    def __init__(self, embedding_info, num_cont_features, emb_dim=8):
        super().__init__()
        self.embeddings = nn.ModuleDict({
            col: nn.Embedding(num_classes, emb_dim)
            for col, num_classes in embedding_info.items()
        })
        self.total_emb_dim = len(embedding_info) * emb_dim
        self.input_dim = self.total_emb_dim + num_cont_features

        # Per-item encoder.
        self.phi = nn.Sequential(
            nn.Linear(self.input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU()
        )
        # Set-level head applied to the pooled encoding.
        self.rho = nn.Sequential(
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x_cat, x_cont, mask):
        """Return one prediction per sample, always shaped ``(batch,)``.

        Args:
            x_cat: integer category ids, ``(batch, items, n_categorical)``.
            x_cont: float features, ``(batch, items, num_cont_features)``.
            mask: ``(batch, items)``; multiplied into the item encodings, so a
                0 entry removes that item from the pooled sum.
        """
        embs = [self.embeddings[col](x_cat[..., i]) for i, col in enumerate(self.embeddings)]
        emb_cat = torch.cat(embs, dim=-1)
        x = torch.cat([emb_cat, x_cont], dim=-1)
        encoded = self.phi(x)
        masked = encoded * mask.unsqueeze(-1)
        pooled = masked.sum(dim=1)
        # Squeeze only the trailing feature axis so a single-sample batch keeps
        # its batch dimension instead of collapsing to a 0-d tensor.
        return self.rho(pooled).squeeze(-1)

# 6. K-Fold training + hard-example fine-tuning
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmses, r2s = [], []
y_all = y.numpy()  # targets for every character; indexed with the fold index arrays below

for fold, (train_idx, val_idx) in enumerate(kf.split(x_cat)):
    print(f"\n📂 Fold {fold + 1}")

    train_set = TensorDataset(x_cat[train_idx], x_cont_with_count[train_idx], mask[train_idx], y[train_idx])
    val_set = TensorDataset(x_cat[val_idx], x_cont_with_count[val_idx], mask[val_idx], y[val_idx])

    train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=256, shuffle=False)

    # A fresh model and optimizer per fold so folds do not share weights.
    model = DeepMaskedModel(embedding_info, x_cont_with_count.shape[-1])
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.MSELoss()

    for epoch in range(10):
        model.train()
        total_loss = 0
        for xcb, xfb, mb, yb in train_loader:
            optimizer.zero_grad()
            # reshape(-1) keeps predictions 1-D even for a single-sample batch.
            preds = model(xcb, xfb, mb).reshape(-1)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * len(xcb)
        print(f"Epoch {epoch+1}: Loss = {total_loss / len(train_loader.dataset):.4f}")

    # Fine-tune on the 30% of TRAINING characters with the largest error.
    # Hard examples are picked from train_idx only: choosing them from the full
    # data set would train on the validation fold and leak it into the metrics.
    model.eval()
    with torch.no_grad():
        pred_train = model(
            x_cat[train_idx], x_cont_with_count[train_idx], mask[train_idx]
        ).reshape(-1).numpy()
    abs_errors = np.abs(pred_train - y_all[train_idx])
    threshold = np.percentile(abs_errors, 70)
    error_indices = train_idx[abs_errors > threshold]

    ft_set = TensorDataset(
        x_cat[error_indices],
        x_cont_with_count[error_indices],
        mask[error_indices],
        y[error_indices]
    )
    ft_loader = DataLoader(ft_set, batch_size=32, shuffle=True)

    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    for epoch in range(3):
        total_loss = 0
        for xcb, xfb, mb, yb in ft_loader:
            optimizer.zero_grad()
            preds = model(xcb, xfb, mb).reshape(-1)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * len(xcb)
        print(f"🔧 Fine-tune Epoch {epoch+1}: Loss = {total_loss / len(ft_loader.dataset):.4f}")

    # Evaluate on the held-out fold only; val_loader is unshuffled, so the
    # concatenated predictions line up with val_idx.
    model.eval()
    val_preds = []
    with torch.no_grad():
        for xcb, xfb, mb, _ in val_loader:
            val_preds.append(model(xcb, xfb, mb).reshape(-1).numpy())
    final_pred = np.concatenate(val_preds)

    final_rmse = np.sqrt(mean_squared_error(y_all[val_idx], final_pred))
    final_r2 = r2_score(y_all[val_idx], final_pred)

    print(f"✅ Fold {fold+1} Final RMSE: {final_rmse:,.2f}, R²: {final_r2:.4f}")
    rmses.append(final_rmse)
    r2s.append(final_r2)

# Report the cross-validated average
print(f"\n🎯 평균 성능: RMSE = {np.mean(rmses):,.2f}, R² = {np.mean(r2s):.4f}")

# Save the model
# NOTE: `model` here is the network trained in the LAST fold, while the file
# name carries the metrics averaged over all folds.
torch.save(model.state_dict(), f"best_model_r2_{np.mean(r2s):.4f}_rmse_{np.mean(rmses):.2f}.pt")
print("✅ 모델 저장 완료!")


📂 Fold 1
Epoch 1: Loss = 3.4882
Epoch 2: Loss = 0.7373
Epoch 3: Loss = 0.6872
Epoch 4: Loss = 0.6495
Epoch 5: Loss = 0.6062
Epoch 6: Loss = 0.5705
Epoch 7: Loss = 0.5539
Epoch 8: Loss = 0.5349
Epoch 9: Loss = 0.5038
Epoch 10: Loss = 0.5001
🔧 Fine-tune Epoch 1: Loss = 1.2077
🔧 Fine-tune Epoch 2: Loss = 1.1489
🔧 Fine-tune Epoch 3: Loss = 1.1188
✅ Fold 1 Final RMSE: 0.67, R²: 0.7328

📂 Fold 2
Epoch 1: Loss = 3.4654
Epoch 2: Loss = 0.7116
Epoch 3: Loss = 0.6481
Epoch 4: Loss = 0.6123
Epoch 5: Loss = 0.5787
Epoch 6: Loss = 0.5548
Epoch 7: Loss = 0.5422
Epoch 8: Loss = 0.5196
Epoch 9: Loss = 0.4873
Epoch 10: Loss = 0.4991
🔧 Fine-tune Epoch 1: Loss = 1.1377
🔧 Fine-tune Epoch 2: Loss = 1.0677
🔧 Fine-tune Epoch 3: Loss = 1.0406
✅ Fold 2 Final RMSE: 0.72, R²: 0.6951

📂 Fold 3
Epoch 1: Loss = 4.4264
Epoch 2: Loss = 0.6740
Epoch 3: Loss = 0.6047
Epoch 4: Loss = 0.5652
Epoch 5: Loss = 0.5611
Epoch 6: Loss = 0.5318
Epoch 7: Loss = 0.5050
Epoch 8: Loss = 0.5109
Epoch 9: Loss = 0.4738
Epoch 10: Loss 

# 모델 사용하기

- 현재 상황: 딥러닝 기반 전투력 예측기는 꽤나 잘 만들어짐
- 현재 문제점: 다만 이 모델을 사용하려면 뉴비 유저가 아이템 24개의 정보를 모두 세세하게 입력해야(입력값이 지나치게 많음) 정확한 전투력을 예측받을 수 있음

In [None]:
# 사용할 모델 다시 불러오기
# 1. 시드 설정

def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and force deterministic cuDNN.

    Args:
        seed: integer seed applied to every random number generator.
    """
    # cuDNN: turn autotuning off and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    # The three seeders are independent of each other, so order is irrelevant.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

set_seed(42)

# 2. Load the data and merge in the target column
# na_values=[] with keep_default_na=False turns off pandas' default NA parsing,
# so literal strings such as 'None' stay as text.
df = pd.read_csv('/content/drive/MyDrive/data/item_nomissingvalues_copy.csv', na_values=[], keep_default_na=False)
df_stat = pd.read_csv('/content/drive/MyDrive/data/merged/stat_merged.csv')
# Keep one (nickname, subclass, 전투력) row per distinct combination, left-join it
# onto the item rows, then drop item rows that found no target.
df_stat_trimmed = df_stat[['nickname', 'subclass', '전투력']].drop_duplicates()
df = df.merge(df_stat_trimmed, on=['nickname', 'subclass'], how='left')
df.dropna(subset=['전투력'], inplace=True)

# 3. Preprocessing
# Categorical columns: label-encoded below and later fed to embeddings.
cat_cols = [
    'subclass', 'equipment_slot', 'main_stat_type', 'item_group',
    'starforce_scroll_flag', 'potential_option_grade', 'additional_potential_option_grade',
    'main_pot_grade_summary', 'add_pot_grade_summary', 'potential_status'
]

# Numeric columns: standardized below.
num_cols = [
    'boss_damage_total', 'ignore_monster_armor_total', 'all_stat_total', 'damage_total',
    'boss_damage_add', 'damage_add', 'all_stat_add', 'starforce', 'special_ring_level',
    'bonus_stat_total', 'mainstat_total', 'power_total', 'mainstat_add', 'power_add',
    'mainstat_etc', 'power_etc', 'mainstat_starforce', 'power_starforce'
]

df = df.dropna(subset=cat_cols + num_cols + ['전투력'])
# NOTE(review): encoders and scaler are fit on the whole frame, i.e. before the
# K-fold split further down, so validation rows influence the fitted statistics.
encoders = {col: LabelEncoder().fit(df[col]) for col in cat_cols}
for col in cat_cols:
    df[col] = encoders[col].transform(df[col])
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])
# NOTE(review): the sequence-building code below recomputes log1p from '전투력'
# and does not read this column.
df['log_전투력'] = np.log1p(df['전투력'])  # log transform

# 4. Build fixed-length per-character sequences
df = df.sort_values("nickname")
max_len = 24  # every character is padded or truncated to this many item rows
x_cat_list, x_cont_list, mask_list, y_list = [], [], [], []

# One group per nickname -> one training sample.
# NOTE(review): the target was merged on (nickname, subclass) but grouping uses
# nickname only — presumably nicknames are unique; verify.
for name, group in df.groupby("nickname"):
    cat = group[cat_cols].values.astype(np.int64)
    cont = group[num_cols].values.astype(np.float32)
    valid_len = len(group)

    if valid_len < max_len:
        # Pad with zeros (category id 0 / feature value 0.0) and mark the
        # padded rows with mask 0.
        pad_cat = np.zeros((max_len - valid_len, len(cat_cols)))
        pad_cont = np.zeros((max_len - valid_len, len(num_cols)))
        pad_mask = [0] * (max_len - valid_len)
        cat = np.vstack([cat, pad_cat])
        cont = np.vstack([cont, pad_cont])
        mask = [1] * valid_len + pad_mask
    else:
        # Groups with max_len rows or more keep only their first max_len rows.
        cat = cat[:max_len]
        cont = cont[:max_len]
        mask = [1] * max_len

    x_cat_list.append(torch.tensor(cat, dtype=torch.long))
    x_cont_list.append(torch.tensor(cont, dtype=torch.float32))
    mask_list.append(torch.tensor(mask, dtype=torch.float32))
    # Target: log1p of the 전투력 value on the group's first row.
    y_list.append(np.log1p(group['전투력'].iloc[0]))

x_cat = torch.stack(x_cat_list)
x_cont = torch.stack(x_cont_list)
mask = torch.stack(mask_list) # characters with fewer than 24 equipped items: the missing slots are masked out
y = torch.tensor(y_list, dtype=torch.float32)

# Append the per-character item count (mask.sum) as one extra continuous
# feature, repeated on every item row.
equip_counts = mask.sum(dim=1).unsqueeze(1).repeat(1, x_cont.shape[1]).unsqueeze(2)
x_cont_with_count = torch.cat([x_cont, equip_counts], dim=2)
# Number of distinct encoded values per categorical column (embedding table sizes).
embedding_info = {col: df[col].nunique() for col in cat_cols}

# 5. 모델 정의
class DeepMaskedModel(nn.Module): # Deepsets
    """DeepSets-style regressor over a padded set of equipment rows.

    Every row's categorical ids are embedded, concatenated with its continuous
    features and encoded by ``phi``. Encodings are multiplied by the mask,
    summed over the item axis and mapped to one scalar per sample by ``rho``.

    Args:
        embedding_info: mapping of categorical column name -> number of
            classes; iteration order must match the last axis of ``x_cat``.
        num_cont_features: size of the last axis of ``x_cont``.
        emb_dim: embedding width shared by all categorical columns.
    """

    def __init__(self, embedding_info, num_cont_features, emb_dim=8):
        super().__init__()
        self.embeddings = nn.ModuleDict({
            col: nn.Embedding(num_classes, emb_dim)
            for col, num_classes in embedding_info.items()
        })
        self.total_emb_dim = len(embedding_info) * emb_dim
        self.input_dim = self.total_emb_dim + num_cont_features

        # Per-item encoder.
        self.phi = nn.Sequential(
            nn.Linear(self.input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU()
        )
        # Set-level head applied to the pooled encoding.
        self.rho = nn.Sequential(
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x_cat, x_cont, mask):
        """Return one prediction per sample, always shaped ``(batch,)``.

        Args:
            x_cat: integer category ids, ``(batch, items, n_categorical)``.
            x_cont: float features, ``(batch, items, num_cont_features)``.
            mask: ``(batch, items)``; multiplied into the item encodings, so a
                0 entry removes that item from the pooled sum.
        """
        embs = [self.embeddings[col](x_cat[..., i]) for i, col in enumerate(self.embeddings)]
        emb_cat = torch.cat(embs, dim=-1)
        x = torch.cat([emb_cat, x_cont], dim=-1)
        encoded = self.phi(x)
        masked = encoded * mask.unsqueeze(-1)
        pooled = masked.sum(dim=1)
        # Squeeze only the trailing feature axis so a single-sample batch keeps
        # its batch dimension instead of collapsing to a 0-d tensor.
        return self.rho(pooled).squeeze(-1)

# 6. K-Fold training + hard-example fine-tuning
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmses, r2s = [], []
y_all = y.numpy()  # targets for every character; indexed with the fold index arrays below

for fold, (train_idx, val_idx) in enumerate(kf.split(x_cat)):
    print(f"\n📂 Fold {fold + 1}")

    train_set = TensorDataset(x_cat[train_idx], x_cont_with_count[train_idx], mask[train_idx], y[train_idx])
    val_set = TensorDataset(x_cat[val_idx], x_cont_with_count[val_idx], mask[val_idx], y[val_idx])

    train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=256, shuffle=False)

    # A fresh model and optimizer per fold so folds do not share weights.
    model = DeepMaskedModel(embedding_info, x_cont_with_count.shape[-1])
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.MSELoss()

    for epoch in range(10):
        model.train()
        total_loss = 0
        for xcb, xfb, mb, yb in train_loader:
            optimizer.zero_grad()
            # reshape(-1) keeps predictions 1-D even for a single-sample batch.
            preds = model(xcb, xfb, mb).reshape(-1)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * len(xcb)
        print(f"Epoch {epoch+1}: Loss = {total_loss / len(train_loader.dataset):.4f}")

    # Fine-tune on the 30% of TRAINING characters with the largest error.
    # Hard examples are picked from train_idx only: choosing them from the full
    # data set would train on the validation fold and leak it into the metrics.
    model.eval()
    with torch.no_grad():
        pred_train = model(
            x_cat[train_idx], x_cont_with_count[train_idx], mask[train_idx]
        ).reshape(-1).numpy()
    abs_errors = np.abs(pred_train - y_all[train_idx])
    threshold = np.percentile(abs_errors, 70)
    error_indices = train_idx[abs_errors > threshold]

    ft_set = TensorDataset(
        x_cat[error_indices],
        x_cont_with_count[error_indices],
        mask[error_indices],
        y[error_indices]
    )
    ft_loader = DataLoader(ft_set, batch_size=32, shuffle=True)

    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    for epoch in range(3):
        total_loss = 0
        for xcb, xfb, mb, yb in ft_loader:
            optimizer.zero_grad()
            preds = model(xcb, xfb, mb).reshape(-1)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * len(xcb)
        print(f"🔧 Fine-tune Epoch {epoch+1}: Loss = {total_loss / len(ft_loader.dataset):.4f}")

    # Evaluate on the held-out fold only; val_loader is unshuffled, so the
    # concatenated predictions line up with val_idx.
    model.eval()
    val_preds = []
    with torch.no_grad():
        for xcb, xfb, mb, _ in val_loader:
            val_preds.append(model(xcb, xfb, mb).reshape(-1).numpy())
    final_pred = np.concatenate(val_preds)

    final_rmse = np.sqrt(mean_squared_error(y_all[val_idx], final_pred))
    final_r2 = r2_score(y_all[val_idx], final_pred)

    print(f"✅ Fold {fold+1} Final RMSE: {final_rmse:,.2f}, R²: {final_r2:.4f}")
    rmses.append(final_rmse)
    r2s.append(final_r2)

# Report the cross-validated average
print(f"\n🎯 평균 성능: RMSE = {np.mean(rmses):,.2f}, R² = {np.mean(r2s):.4f}")


📂 Fold 1
Epoch 1: Loss = 3.4882
Epoch 2: Loss = 0.7373
Epoch 3: Loss = 0.6872
Epoch 4: Loss = 0.6495
Epoch 5: Loss = 0.6062
Epoch 6: Loss = 0.5705
Epoch 7: Loss = 0.5539
Epoch 8: Loss = 0.5349
Epoch 9: Loss = 0.5038
Epoch 10: Loss = 0.5001
🔧 Fine-tune Epoch 1: Loss = 1.2077
🔧 Fine-tune Epoch 2: Loss = 1.1489
🔧 Fine-tune Epoch 3: Loss = 1.1188
✅ Fold 1 Final RMSE: 0.67, R²: 0.7328

📂 Fold 2
Epoch 1: Loss = 3.4654
Epoch 2: Loss = 0.7116
Epoch 3: Loss = 0.6481
Epoch 4: Loss = 0.6123
Epoch 5: Loss = 0.5787
Epoch 6: Loss = 0.5548
Epoch 7: Loss = 0.5422
Epoch 8: Loss = 0.5196
Epoch 9: Loss = 0.4873
Epoch 10: Loss = 0.4991
🔧 Fine-tune Epoch 1: Loss = 1.1377
🔧 Fine-tune Epoch 2: Loss = 1.0677
🔧 Fine-tune Epoch 3: Loss = 1.0406
✅ Fold 2 Final RMSE: 0.72, R²: 0.6951

📂 Fold 3
Epoch 1: Loss = 4.4264
Epoch 2: Loss = 0.6740
Epoch 3: Loss = 0.6047
Epoch 4: Loss = 0.5652
Epoch 5: Loss = 0.5611
Epoch 6: Loss = 0.5318
Epoch 7: Loss = 0.5050
Epoch 8: Loss = 0.5109
Epoch 9: Loss = 0.4738
Epoch 10: Loss 

## 중요도가 높은 장비 부위 확인

In [None]:
def compute_slot_importance(model, x_cat, x_cont, mask, y, cat_cols, equipment_slot_col, device='cpu'):
    """Estimate per-slot importance by perturbing the equipment-slot code.

    For every distinct code found in the equipment-slot column, the positions
    holding that code are overwritten with random codes and the increase in
    RMSE over the unperturbed prediction is recorded.

    Args:
        model: Callable as ``model(x_cat, x_cont, mask)``; put in eval mode
            and moved to ``device``.
        x_cat: Integer tensor indexed as ``[user, item, categorical column]``.
        x_cont: Continuous feature tensor passed through to the model.
        mask: Item mask tensor passed through to the model.
        y: Target tensor, one value per user.
        cat_cols: Column names giving the order of the last axis of ``x_cat``.
        equipment_slot_col: Name of the slot column inside ``cat_cols``.
        device: Device used for inference.

    Returns:
        List of ``(slot_code, rmse_delta)`` sorted by descending delta.
    """
    model.eval()
    model.to(device)

    # Index of the equipment-slot column on the last axis of x_cat.
    slot_idx = cat_cols.index(equipment_slot_col)

    # These inputs never change, so transfer them to the device only once.
    x_cont_dev = x_cont.to(device)
    mask_dev = mask.to(device)

    # detach/cpu keeps this working for GPU or grad-tracking targets;
    # flattening prevents a silent (N, 1) x (N,) broadcast in the RMSE.
    y_true = y.detach().cpu().numpy().reshape(-1)

    def _rmse(x_cat_variant):
        """RMSE of the model on one variant of the categorical input."""
        with torch.no_grad():
            pred = model(x_cat_variant.to(device), x_cont_dev, mask_dev)
        pred = pred.detach().cpu().numpy().reshape(-1)
        return np.sqrt(((y_true - pred) ** 2).mean())

    base_rmse = _rmse(x_cat)

    # Loop-invariant values: the slot column, its codes and the random range.
    slot_column = x_cat[:, :, slot_idx]
    unique_slots = torch.unique(slot_column).tolist()
    max_val = slot_column.max().item() + 1 if slot_column.numel() else 0
    slot_importance = {}

    for slot in tqdm(unique_slots, desc="📊 장비 중요도 분석 중"):
        # Positions whose slot code equals the slot being analysed.
        slot_mask = slot_column == slot

        # Work on a copy so the caller's tensor is left untouched.
        x_cat_perturbed = x_cat.clone()

        # Replace only those positions, and only in the slot column.
        rand_vals = torch.randint(0, max_val, slot_mask.shape, device=x_cat.device)
        x_cat_perturbed[:, :, slot_idx][slot_mask] = rand_vals[slot_mask]

        # Importance = how much the error grows after the perturbation.
        slot_importance[slot] = _rmse(x_cat_perturbed) - base_rmse

    return sorted(slot_importance.items(), key=lambda x: -x[1])

In [None]:
# Compute the importance score of every equipment slot
slot_scores = compute_slot_importance(
    model=model,
    x_cat=x_cat,
    x_cont=x_cont_with_count,
    mask=mask,
    y=y,
    cat_cols=cat_cols,
    equipment_slot_col='equipment_slot'
)

# Decode the integer slot codes back to slot names with the fitted encoder
equipment_slot_encoder = encoders['equipment_slot']
decoded_scores = []
for slot, delta in slot_scores:
    slot_label = equipment_slot_encoder.inverse_transform([slot])[0]
    decoded_scores.append((slot_label, delta))

# Print the Top-N slots
top_n = 10
for rank, (slot_name, delta) in enumerate(decoded_scores[:top_n], start=1):
    print(f"{rank}. {slot_name} ➜ 전투력 예측 RMSE 변화량: {delta:.4f}")

📊 장비 중요도 분석 중: 100%|██████████| 24/24 [00:40<00:00,  1.71s/it]

1. 무기 ➜ 전투력 예측 RMSE 변화량: 5.0923
2. 뱃지 ➜ 전투력 예측 RMSE 변화량: 0.2533
3. 모자 ➜ 전투력 예측 RMSE 변화량: 0.1940
4. 훈장 ➜ 전투력 예측 RMSE 변화량: 0.1739
5. 하의 ➜ 전투력 예측 RMSE 변화량: 0.1460
6. 신발 ➜ 전투력 예측 RMSE 변화량: 0.1455
7. 장갑 ➜ 전투력 예측 RMSE 변화량: 0.1450
8. 망토 ➜ 전투력 예측 RMSE 변화량: 0.1434
9. 어깨장식 ➜ 전투력 예측 RMSE 변화량: 0.1300
10. 포켓 아이템 ➜ 전투력 예측 RMSE 변화량: 0.1274





## 중요도 Top 3 (무기, 뱃지, 모자)만 사용한 경량 모델 구축 및 실험

In [None]:
# Settings
top_slots = ['무기', '뱃지', '모자']
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the item rows (keeping the literal string 'None' as a value) and
# attach each character's combat power; rows without a target are dropped.
df = pd.read_csv('/content/drive/MyDrive/data/item_nomissingvalues_copy.csv', na_values=[], keep_default_na=False)
df_stat = pd.read_csv('/content/drive/MyDrive/data/merged/stat_merged.csv')
df_stat_trimmed = df_stat[['nickname', 'subclass', '전투력']].drop_duplicates()
df = df.merge(df_stat_trimmed, on=['nickname', 'subclass'], how='left')
df.dropna(subset=['전투력'], inplace=True)

# Keep only the Top-3 equipment slots
df = df[df['equipment_slot'].isin(top_slots)].copy()

# Preprocessing: categorical and numeric feature columns
cat_cols = [
    'subclass', 'equipment_slot', 'main_stat_type', 'item_group',
    'starforce_scroll_flag', 'potential_option_grade', 'additional_potential_option_grade',
    'main_pot_grade_summary', 'add_pot_grade_summary', 'potential_status'
]
num_cols = [
    'boss_damage_total', 'ignore_monster_armor_total', 'all_stat_total', 'damage_total',
    'boss_damage_add', 'damage_add', 'all_stat_add', 'starforce', 'special_ring_level',
    'bonus_stat_total', 'mainstat_total', 'power_total', 'mainstat_add', 'power_add',
    'mainstat_etc', 'power_etc', 'mainstat_starforce', 'power_starforce'
]

df = df.dropna(subset=cat_cols + num_cols + ['전투력'])

# Label-encode every categorical column, keeping the fitted encoders
encoders = {}
for col in cat_cols:
    encoders[col] = LabelEncoder().fit(df[col])
for col, encoder in encoders.items():
    df[col] = encoder.transform(df[col])

# Standardise the numeric columns and add the log-scaled target
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])
df['log_전투력'] = np.log1p(df['전투력'])

# Build one fixed-length item sequence per nickname
max_len = 3  # number of items (weapon, badge, hat)
df = df.sort_values("nickname")

x_cat_list, x_cont_list, mask_list, y_list = [], [], [], []

for _, items in df.groupby("nickname"):
    # Rows beyond max_len are dropped; missing rows stay zero-padded.
    n_items = min(len(items), max_len)

    cat_block = np.zeros((max_len, len(cat_cols)), dtype=np.int64)
    cont_block = np.zeros((max_len, len(num_cols)), dtype=np.float32)
    cat_block[:n_items] = items[cat_cols].values.astype(np.int64)[:n_items]
    cont_block[:n_items] = items[num_cols].values.astype(np.float32)[:n_items]

    # 1 marks a real item row, 0 marks padding.
    validity = [1] * n_items + [0] * (max_len - n_items)

    x_cat_list.append(torch.tensor(cat_block, dtype=torch.long))
    x_cont_list.append(torch.tensor(cont_block, dtype=torch.float32))
    mask_list.append(torch.tensor(validity, dtype=torch.float32))
    y_list.append(np.log1p(items['전투력'].iloc[0]))

x_cat = torch.stack(x_cat_list)
x_cont = torch.stack(x_cont_list)
mask = torch.stack(mask_list)
y = torch.tensor(y_list, dtype=torch.float32)

# Append the number of real items as an extra continuous feature on every row
equip_counts = mask.sum(dim=1, keepdim=True).repeat(1, x_cont.shape[1]).unsqueeze(-1)
x_cont_with_count = torch.cat([x_cont, equip_counts], dim=2)

# Vocabulary size of every categorical column, used to size the embeddings
embedding_info = {}
for col in cat_cols:
    embedding_info[col] = df[col].nunique()

In [None]:
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score

# 모델 정의 (기존 구조 그대로 재사용)
class DeepMaskedModel(nn.Module):
    """Masked sum-pooling regressor over a set of equipment items.

    Each item row is embedded (categorical columns) and concatenated with its
    continuous features, encoded by ``phi``, zeroed where ``mask`` is 0,
    summed over the item axis and mapped to one scalar per sample by ``rho``.

    Args:
        embedding_info: Mapping ``column name -> number of classes``. Its
            iteration order must match the column order of the last axis of
            ``x_cat``.
        num_cont_features: Size of the last axis of ``x_cont``.
        emb_dim: Embedding width used for every categorical column.
    """

    def __init__(self, embedding_info, num_cont_features, emb_dim=8):
        super().__init__()
        self.embeddings = nn.ModuleDict({
            col: nn.Embedding(num_classes, emb_dim)
            for col, num_classes in embedding_info.items()
        })
        self.total_emb_dim = len(embedding_info) * emb_dim
        self.input_dim = self.total_emb_dim + num_cont_features

        # Per-item encoder.
        self.phi = nn.Sequential(
            nn.Linear(self.input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU()
        )
        # Head applied to the pooled representation.
        self.rho = nn.Sequential(
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def _embed(self, x_cat):
        """Embed column ``i`` of ``x_cat`` with the ``i``-th embedding table."""
        embs = [self.embeddings[col](x_cat[..., i]) for i, col in enumerate(self.embeddings)]
        return torch.cat(embs, dim=-1)

    def forward(self, x_cat, x_cont, mask):
        """Return one prediction per sample, shape ``(batch,)``."""
        emb_cat = self._embed(x_cat)
        x = torch.cat([emb_cat, x_cont], dim=-1)
        encoded = self.phi(x)
        # Padded items contribute nothing to the pooled vector.
        masked = encoded * mask.unsqueeze(-1)
        pooled = masked.sum(dim=1)
        # squeeze(-1) rather than squeeze(): a bare squeeze() also drops the
        # batch axis when the batch holds one sample, yielding a 0-dim tensor
        # that broadcasts against a (1,) target in the loss.
        return self.rho(pooled).squeeze(-1)

# 5-fold training and evaluation.
# Hard examples are mined from the TRAINING fold only and the reported
# metrics are computed on the HELD-OUT fold, so they measure generalisation.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmses = []
r2s = []


def _predict_in_batches(model, loader):
    """Return the model's predictions over `loader` as a flat numpy array."""
    model.eval()
    outputs = []
    with torch.no_grad():
        for xcb, xfb, mb, _ in loader:
            preds = model(xcb.to(device), xfb.to(device), mb.to(device))
            # reshape(-1) keeps single-sample batches one-dimensional.
            outputs.append(preds.reshape(-1).cpu())
    return torch.cat(outputs).numpy()


for fold, (train_idx, val_idx) in enumerate(kf.split(x_cat)):
    print(f"\n📂 Fold {fold + 1}")

    train_set = TensorDataset(x_cat[train_idx], x_cont_with_count[train_idx], mask[train_idx], y[train_idx])
    val_set = TensorDataset(x_cat[val_idx], x_cont_with_count[val_idx], mask[val_idx], y[val_idx])

    train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
    # Unshuffled view of the training fold so predictions line up with train_idx.
    train_eval_loader = DataLoader(train_set, batch_size=256, shuffle=False)
    val_loader = DataLoader(val_set, batch_size=256, shuffle=False)

    model = DeepMaskedModel(embedding_info, x_cont_with_count.shape[-1]).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.MSELoss()

    for epoch in range(10):
        model.train()
        total_loss = 0
        for xcb, xfb, mb, yb in train_loader:
            optimizer.zero_grad()
            preds = model(xcb.to(device), xfb.to(device), mb.to(device))
            loss = criterion(preds.reshape(-1), yb.to(device))
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * len(xcb)
        print(f"Epoch {epoch+1}: Loss = {total_loss / len(train_loader.dataset):.4f}")

    # Fine-tune on the 30% of training samples with the largest absolute error
    train_pred = _predict_in_batches(model, train_eval_loader)
    abs_errors = np.abs(train_pred - y[train_idx].numpy())
    threshold = np.percentile(abs_errors, 70)
    error_indices = train_idx[abs_errors > threshold]

    # An empty selection (all errors equal) would make the DataLoader fail.
    if len(error_indices) > 0:
        ft_set = TensorDataset(
            x_cat[error_indices], x_cont_with_count[error_indices], mask[error_indices], y[error_indices]
        )
        ft_loader = DataLoader(ft_set, batch_size=32, shuffle=True)

        model.train()
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
        for epoch in range(3):
            total_loss = 0
            for xcb, xfb, mb, yb in ft_loader:
                optimizer.zero_grad()
                preds = model(xcb.to(device), xfb.to(device), mb.to(device))
                loss = criterion(preds.reshape(-1), yb.to(device))
                loss.backward()
                optimizer.step()
                total_loss += loss.item() * len(xcb)
            print(f"🔧 Fine-tune Epoch {epoch+1}: Loss = {total_loss / len(ft_loader.dataset):.4f}")

    # Evaluate on the held-out fold only
    final_pred = _predict_in_batches(model, val_loader)
    y_val = y[val_idx].numpy()
    final_rmse = np.sqrt(mean_squared_error(y_val, final_pred))
    final_r2 = r2_score(y_val, final_pred)
    print(f"✅ Fold {fold+1} Final RMSE: {final_rmse:,.2f}, R²: {final_r2:.4f}")
    rmses.append(final_rmse)
    r2s.append(final_r2)

# Final performance averaged over the folds
print(f"\n Top-3 장비 기반 평균 성능: RMSE = {np.mean(rmses):,.2f}, R² = {np.mean(r2s):.4f}")


📂 Fold 1
Epoch 1: Loss = 10.3448
Epoch 2: Loss = 0.5981
Epoch 3: Loss = 0.5621
Epoch 4: Loss = 0.5551
Epoch 5: Loss = 0.5630
Epoch 6: Loss = 0.5667
Epoch 7: Loss = 0.5554
Epoch 8: Loss = 0.5606
Epoch 9: Loss = 0.5485
Epoch 10: Loss = 0.5381
🔧 Fine-tune Epoch 1: Loss = 1.3257
🔧 Fine-tune Epoch 2: Loss = 1.2987
🔧 Fine-tune Epoch 3: Loss = 1.2880
✅ Fold 1 Final RMSE: 0.86, R²: 0.5394

📂 Fold 2
Epoch 1: Loss = 8.7736
Epoch 2: Loss = 0.6277
Epoch 3: Loss = 0.5846
Epoch 4: Loss = 0.5818
Epoch 5: Loss = 0.5789
Epoch 6: Loss = 0.5630
Epoch 7: Loss = 0.5674
Epoch 8: Loss = 0.5500
Epoch 9: Loss = 0.5501
Epoch 10: Loss = 0.5539
🔧 Fine-tune Epoch 1: Loss = 1.3338
🔧 Fine-tune Epoch 2: Loss = 1.3002
🔧 Fine-tune Epoch 3: Loss = 1.2948
✅ Fold 2 Final RMSE: 0.79, R²: 0.6108

📂 Fold 3
Epoch 1: Loss = 8.3407
Epoch 2: Loss = 0.6105
Epoch 3: Loss = 0.5752
Epoch 4: Loss = 0.5794
Epoch 5: Loss = 0.5650
Epoch 6: Loss = 0.5713
Epoch 7: Loss = 0.5651
Epoch 8: Loss = 0.5648
Epoch 9: Loss = 0.5433
Epoch 10: Loss

## 중요도 Top 5 (무기, 뱃지, 모자, 훈장, 장갑)만 사용한 경량 모델 구축 및 실험

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Settings
top_5_slots = ['무기', '뱃지', '모자', '훈장', '장갑']
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the item rows (keeping the literal string 'None' as a value) and
# attach each character's combat power; rows without a target are dropped.
df = pd.read_csv('/content/drive/MyDrive/data/item_nomissingvalues_copy.csv', na_values=[], keep_default_na=False)
df_stat = pd.read_csv('/content/drive/MyDrive/data/merged/stat_merged.csv')
df_stat_trimmed = df_stat[['nickname', 'subclass', '전투력']].drop_duplicates()
df = df.merge(df_stat_trimmed, on=['nickname', 'subclass'], how='left')
df.dropna(subset=['전투력'], inplace=True)

# Keep only the Top-5 equipment slots
df = df[df['equipment_slot'].isin(top_5_slots)].copy()

# Preprocessing: categorical and numeric feature columns
cat_cols = [
    'subclass', 'equipment_slot', 'main_stat_type', 'item_group',
    'starforce_scroll_flag', 'potential_option_grade', 'additional_potential_option_grade',
    'main_pot_grade_summary', 'add_pot_grade_summary', 'potential_status'
]
num_cols = [
    'boss_damage_total', 'ignore_monster_armor_total', 'all_stat_total', 'damage_total',
    'boss_damage_add', 'damage_add', 'all_stat_add', 'starforce', 'special_ring_level',
    'bonus_stat_total', 'mainstat_total', 'power_total', 'mainstat_add', 'power_add',
    'mainstat_etc', 'power_etc', 'mainstat_starforce', 'power_starforce'
]

df = df.dropna(subset=cat_cols + num_cols + ['전투력'])

# Label-encode every categorical column, keeping the fitted encoders
encoders = {}
for col in cat_cols:
    encoders[col] = LabelEncoder().fit(df[col])
for col, encoder in encoders.items():
    df[col] = encoder.transform(df[col])

# Standardise the numeric columns and add the log-scaled target
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])
df['log_전투력'] = np.log1p(df['전투력'])

# Build one fixed-length item sequence per nickname
max_len = 5  # number of items
df = df.sort_values("nickname")

x_cat_list, x_cont_list, mask_list, y_list = [], [], [], []

for _, items in df.groupby("nickname"):
    # Rows beyond max_len are dropped; missing rows stay zero-padded.
    n_items = min(len(items), max_len)

    cat_block = np.zeros((max_len, len(cat_cols)), dtype=np.int64)
    cont_block = np.zeros((max_len, len(num_cols)), dtype=np.float32)
    cat_block[:n_items] = items[cat_cols].values.astype(np.int64)[:n_items]
    cont_block[:n_items] = items[num_cols].values.astype(np.float32)[:n_items]

    # 1 marks a real item row, 0 marks padding.
    validity = [1] * n_items + [0] * (max_len - n_items)

    x_cat_list.append(torch.tensor(cat_block, dtype=torch.long))
    x_cont_list.append(torch.tensor(cont_block, dtype=torch.float32))
    mask_list.append(torch.tensor(validity, dtype=torch.float32))
    y_list.append(np.log1p(items['전투력'].iloc[0]))

x_cat = torch.stack(x_cat_list)
x_cont = torch.stack(x_cont_list)
mask = torch.stack(mask_list)
y = torch.tensor(y_list, dtype=torch.float32)

# Append the number of real items as an extra continuous feature on every row
equip_counts = mask.sum(dim=1, keepdim=True).repeat(1, x_cont.shape[1]).unsqueeze(-1)
x_cont_with_count = torch.cat([x_cont, equip_counts], dim=2)

# Vocabulary size of every categorical column, used to size the embeddings
embedding_info = {}
for col in cat_cols:
    embedding_info[col] = df[col].nunique()

In [None]:
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score

# 모델 정의 (기존 구조 그대로 재사용)
class DeepMaskedModel(nn.Module):
    """Masked sum-pooling regressor over a set of equipment items.

    Each item row is embedded (categorical columns) and concatenated with its
    continuous features, encoded by ``phi``, zeroed where ``mask`` is 0,
    summed over the item axis and mapped to one scalar per sample by ``rho``.

    Args:
        embedding_info: Mapping ``column name -> number of classes``. Its
            iteration order must match the column order of the last axis of
            ``x_cat``.
        num_cont_features: Size of the last axis of ``x_cont``.
        emb_dim: Embedding width used for every categorical column.
    """

    def __init__(self, embedding_info, num_cont_features, emb_dim=8):
        super().__init__()
        self.embeddings = nn.ModuleDict({
            col: nn.Embedding(num_classes, emb_dim)
            for col, num_classes in embedding_info.items()
        })
        self.total_emb_dim = len(embedding_info) * emb_dim
        self.input_dim = self.total_emb_dim + num_cont_features

        # Per-item encoder.
        self.phi = nn.Sequential(
            nn.Linear(self.input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU()
        )
        # Head applied to the pooled representation.
        self.rho = nn.Sequential(
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def _embed(self, x_cat):
        """Embed column ``i`` of ``x_cat`` with the ``i``-th embedding table."""
        embs = [self.embeddings[col](x_cat[..., i]) for i, col in enumerate(self.embeddings)]
        return torch.cat(embs, dim=-1)

    def forward(self, x_cat, x_cont, mask):
        """Return one prediction per sample, shape ``(batch,)``."""
        emb_cat = self._embed(x_cat)
        x = torch.cat([emb_cat, x_cont], dim=-1)
        encoded = self.phi(x)
        # Padded items contribute nothing to the pooled vector.
        masked = encoded * mask.unsqueeze(-1)
        pooled = masked.sum(dim=1)
        # squeeze(-1) rather than squeeze(): a bare squeeze() also drops the
        # batch axis when the batch holds one sample, yielding a 0-dim tensor
        # that broadcasts against a (1,) target in the loss.
        return self.rho(pooled).squeeze(-1)

# 5-fold training and evaluation.
# Hard examples are mined from the TRAINING fold only and the reported
# metrics are computed on the HELD-OUT fold, so they measure generalisation.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmses = []
r2s = []


def _predict_in_batches(model, loader):
    """Return the model's predictions over `loader` as a flat numpy array."""
    model.eval()
    outputs = []
    with torch.no_grad():
        for xcb, xfb, mb, _ in loader:
            preds = model(xcb.to(device), xfb.to(device), mb.to(device))
            # reshape(-1) keeps single-sample batches one-dimensional.
            outputs.append(preds.reshape(-1).cpu())
    return torch.cat(outputs).numpy()


for fold, (train_idx, val_idx) in enumerate(kf.split(x_cat)):
    print(f"\n📂 Fold {fold + 1}")

    train_set = TensorDataset(x_cat[train_idx], x_cont_with_count[train_idx], mask[train_idx], y[train_idx])
    val_set = TensorDataset(x_cat[val_idx], x_cont_with_count[val_idx], mask[val_idx], y[val_idx])

    train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
    # Unshuffled view of the training fold so predictions line up with train_idx.
    train_eval_loader = DataLoader(train_set, batch_size=256, shuffle=False)
    val_loader = DataLoader(val_set, batch_size=256, shuffle=False)

    model = DeepMaskedModel(embedding_info, x_cont_with_count.shape[-1]).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.MSELoss()

    for epoch in range(10):
        model.train()
        total_loss = 0
        for xcb, xfb, mb, yb in train_loader:
            optimizer.zero_grad()
            preds = model(xcb.to(device), xfb.to(device), mb.to(device))
            loss = criterion(preds.reshape(-1), yb.to(device))
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * len(xcb)
        print(f"Epoch {epoch+1}: Loss = {total_loss / len(train_loader.dataset):.4f}")

    # Fine-tune on the 30% of training samples with the largest absolute error
    train_pred = _predict_in_batches(model, train_eval_loader)
    abs_errors = np.abs(train_pred - y[train_idx].numpy())
    threshold = np.percentile(abs_errors, 70)
    error_indices = train_idx[abs_errors > threshold]

    # An empty selection (all errors equal) would make the DataLoader fail.
    if len(error_indices) > 0:
        ft_set = TensorDataset(
            x_cat[error_indices], x_cont_with_count[error_indices], mask[error_indices], y[error_indices]
        )
        ft_loader = DataLoader(ft_set, batch_size=32, shuffle=True)

        model.train()
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
        for epoch in range(3):
            total_loss = 0
            for xcb, xfb, mb, yb in ft_loader:
                optimizer.zero_grad()
                preds = model(xcb.to(device), xfb.to(device), mb.to(device))
                loss = criterion(preds.reshape(-1), yb.to(device))
                loss.backward()
                optimizer.step()
                total_loss += loss.item() * len(xcb)
            print(f"🔧 Fine-tune Epoch {epoch+1}: Loss = {total_loss / len(ft_loader.dataset):.4f}")

    # Evaluate on the held-out fold only
    final_pred = _predict_in_batches(model, val_loader)
    y_val = y[val_idx].numpy()
    final_rmse = np.sqrt(mean_squared_error(y_val, final_pred))
    final_r2 = r2_score(y_val, final_pred)
    print(f"✅ Fold {fold+1} Final RMSE: {final_rmse:,.2f}, R²: {final_r2:.4f}")
    rmses.append(final_rmse)
    r2s.append(final_r2)

# Final performance averaged over the folds
print(f"\n Top-5 장비 기반 평균 성능: RMSE = {np.mean(rmses):,.2f}, R² = {np.mean(r2s):.4f}")


📂 Fold 1
Epoch 1: Loss = 8.2146
Epoch 2: Loss = 0.7032
Epoch 3: Loss = 0.5847
Epoch 4: Loss = 0.5602
Epoch 5: Loss = 0.5450
Epoch 6: Loss = 0.5460
Epoch 7: Loss = 0.5583
Epoch 8: Loss = 0.5517
Epoch 9: Loss = 0.5293
Epoch 10: Loss = 0.5296
🔧 Fine-tune Epoch 1: Loss = 1.3020
🔧 Fine-tune Epoch 2: Loss = 1.2799
🔧 Fine-tune Epoch 3: Loss = 1.2626
✅ Fold 1 Final RMSE: 0.75, R²: 0.6493

📂 Fold 2
Epoch 1: Loss = 8.4987
Epoch 2: Loss = 0.6026
Epoch 3: Loss = 0.5527
Epoch 4: Loss = 0.5299
Epoch 5: Loss = 0.5319
Epoch 6: Loss = 0.5229
Epoch 7: Loss = 0.5277
Epoch 8: Loss = 0.5118
Epoch 9: Loss = 0.4880
Epoch 10: Loss = 0.5087
🔧 Fine-tune Epoch 1: Loss = 1.3041
🔧 Fine-tune Epoch 2: Loss = 1.2635
🔧 Fine-tune Epoch 3: Loss = 1.2449
✅ Fold 2 Final RMSE: 0.70, R²: 0.6969

📂 Fold 3
Epoch 1: Loss = 7.2001
Epoch 2: Loss = 0.6577
Epoch 3: Loss = 0.5735
Epoch 4: Loss = 0.5499
Epoch 5: Loss = 0.5594
Epoch 6: Loss = 0.5455
Epoch 7: Loss = 0.5395
Epoch 8: Loss = 0.5342
Epoch 9: Loss = 0.5315
Epoch 10: Loss 

## 코사인 유사도 기반 유사 유저 추천

In [None]:
# phi(x) 벡터 추출 함수 정의
def extract_user_vector(model, x_cat, x_cont, mask):
    """Return the masked, sum-pooled ``phi`` representation of each sample.

    Runs the model's embedding and ``phi`` encoder without gradients, zeroes
    the rows where ``mask`` is 0, sums over the item axis and returns the
    result as a numpy array with one row per sample.
    """
    model.eval()
    with torch.no_grad():
        item_features = torch.cat([model._embed(x_cat), x_cont], dim=-1)
        item_codes = model.phi(item_features)
        user_vectors = (item_codes * mask.unsqueeze(-1)).sum(dim=1)
    return user_vectors.cpu().numpy()

In [None]:
# 고레벨 유저 벡터 생성 & 저장
from torch.utils.data import DataLoader
import numpy as np

# High-level users: the top 30% of samples by (log) combat power
high_power_idx = np.argsort(-y.numpy())[:int(len(y) * 0.3)]

# The tensors were built by iterating groupby("nickname") over a frame sorted
# by nickname, so sample i corresponds to the i-th unique nickname.
# Computed once: the per-user lookup previously rescanned the whole frame.
nickname_order = df['nickname'].unique()

# Extract the pooled vectors in batches instead of one forward pass per user
model.eval()
vector_batch_size = 1024
hl_vecs = []
for start in range(0, len(high_power_idx), vector_batch_size):
    batch_idx = high_power_idx[start:start + vector_batch_size]
    hl_vecs.append(
        extract_user_vector(
            model,
            x_cat[batch_idx].to(device),
            x_cont_with_count[batch_idx].to(device),
            mask[batch_idx].to(device)
        )
    )

hl_vecs = np.concatenate(hl_vecs)
# Rebuild from a Python list so the array has a unicode dtype (not object)
# and can be loaded later without allow_pickle.
hl_nicks = np.array(nickname_order[high_power_idx].tolist())

# Save
np.save("high_level_user_vectors.npy", hl_vecs)
np.save("high_level_user_nicks.npy", hl_nicks)

In [None]:
# 직업 기반 평균값 사전 구축 -> 유저가 정보를 입력하지 않는 경우에도 자동으로 채워줄 수 있게끔
from collections import defaultdict
import numpy as np

# Decode every label-encoded categorical column back to its original values
df_decoded = df.copy()
for col in cat_cols:
    if col in encoders:
        df_decoded[col] = encoders[col].inverse_transform(df[col])

# Profile dictionary: subclass -> slot -> column -> typical value
subclass_profiles = defaultdict(lambda: defaultdict(dict))
top_slots = ['무기', '모자', '장갑']  # medal and badge are excluded because users cannot enhance those slots

# Build the dictionary
for subclass in df_decoded['subclass'].unique():
    for slot in top_slots:
        # Rows of this subclass/slot combination
        filtered = df_decoded[
            (df_decoded['subclass'] == subclass) &
            (df_decoded['equipment_slot'] == slot)
        ]
        if filtered.empty:
            continue

        profile = subclass_profiles[subclass][slot]

        # Categorical columns: store the most frequent (decoded) value
        for col in cat_cols:
            mode_val = filtered[col].mode(dropna=True)
            if not mode_val.empty:
                profile[col] = mode_val.iloc[0]

        # Numeric columns: average the standardised values, then map all of
        # them back to the original scale with a single inverse_transform
        # (the scaler is per-feature, so this equals the per-column result).
        scaled_means = filtered[num_cols].mean().to_numpy().reshape(1, -1)
        original_means = scaler.inverse_transform(scaled_means)[0]
        for col, original_val in zip(num_cols, original_means):
            if np.isnan(original_val):
                continue
            profile[col] = round(original_val, 2)

In [None]:
# Usage example: a 나이트로드 user who did not enter any glove information
slot_profile = subclass_profiles['나이트로드']['장갑']

for label, key in (
    ("🔹 주스탯 유형:", 'main_stat_type'),
    ("🔹 스타포스 평균:", 'starforce'),
    ("🔹 장비 세트:", 'item_group'),
):
    print(label, slot_profile.get(key, 'N/A'))

🔹 주스탯 유형: LUK
🔹 스타포스 평균: 20.8
🔹 장비 세트: 아케인셰이드


In [None]:
# 직업 기반 평균값 -> 저장 코드
import pickle

# The outer defaultdict is converted to a plain dict before pickling.
profiles_to_store = dict(subclass_profiles)
with open("subclass_profiles.pkl", "wb") as out_file:
    pickle.dump(profiles_to_store, out_file)

In [None]:
# Load the per-subclass profile dictionary back from disk
with open("subclass_profiles.pkl", "rb") as in_file:
    subclass_profiles = pickle.load(in_file)

In [None]:
# Example user input: the character's subclass plus one dict per equipment slot.
# NOTE(review): the *_option_N_grade keys below are not listed in cat_cols or
# num_cols as defined in this notebook, so encode_and_scale never reads them —
# confirm whether they should feed main_pot_grade_summary / add_pot_grade_summary.
user_input = {
    'subclass': '카인',
    '무기': {
        'item_group': '도전자',       # equipment set
        'starforce': 15,              # StarForce enhancement level
        'mainstat_total': 124,        # final main stat on the item
        'power_total': 80,            # final attack / magic attack on the item
        'all_stat_total': 12,         # final all-stat on the item
        'potential_option_grade': '유니크',            # potential grade
        'additional_potential_option_grade': '에픽',   # additional potential grade
        'potential_option_1_grade': 'S',               # category of potential line 1
        'potential_option_2_grade': 'S',               # category of potential line 2
        'potential_option_3_grade': 'B',               # category of potential line 3
        'additional_potential_option_1_grade': 'B',    # category of additional potential line 1
        'additional_potential_option_2_grade': '기타', # category of additional potential line 2
        'additional_potential_option_3_grade': '기타'  # category of additional potential line 3
},
    '모자': {
        'item_group': '파프니르',       # equipment set
        'starforce': 20,                # StarForce enhancement level
        'mainstat_total': 300,          # final main stat on the item
        'power_total': 78,              # final attack / magic attack on the item
        'all_stat_total': 6,            # final all-stat on the item
        'potential_option_grade': '레전드리',          # potential grade
        'additional_potential_option_grade': '에픽',   # additional potential grade
        'potential_option_1_grade': 'S',               # category of potential line 1
        'potential_option_2_grade': 'S',               # category of potential line 2
        'potential_option_3_grade': 'S',               # category of potential line 3
        'additional_potential_option_1_grade': 'B',    # category of additional potential line 1
        'additional_potential_option_2_grade': '기타', # category of additional potential line 2
        'additional_potential_option_3_grade': 'B'     # category of additional potential line 3
},
    '장갑': {
        'item_group': '앱솔랩스',       # equipment set
        'starforce': 18,                # StarForce enhancement level
        'mainstat_total': 100,          # final main stat on the item
        'power_total': 70,              # final attack / magic attack on the item
        'all_stat_total': 0,            # final all-stat on the item
        'potential_option_grade': '유니크',            # potential grade
        'additional_potential_option_grade': '에픽',   # additional potential grade
        'potential_option_1_grade': 'S',               # category of potential line 1
        'potential_option_2_grade': 'S',               # category of potential line 2
        'potential_option_3_grade': '기타',            # category of potential line 3
        'additional_potential_option_1_grade': 'S',    # category of additional potential line 1
        'additional_potential_option_2_grade': 'S',    # category of additional potential line 2
        'additional_potential_option_3_grade': '기타'  # category of additional potential line 3
}
}

In [None]:
# 중요도 높은 아이템 정보 입력 기반 유사 유저 추천
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Main stat -> subclasses that use it
main_stat_map = {
    "STR": [
        "히어로", "아델", "소울마스터", "아란", "제로", "팔라딘", "다크나이트", "카이저",
        "데몬슬레이어", "미하일", "블래스터", "은월", "바이퍼", "스트라이커", "캐논마스터", "아크",
    ],
    "DEX": [
        "윈드브레이커", "메르세데스", "보우마스터", "패스파인더", "신궁", "카인", "와일드헌터",
        "엔젤릭버스터", "캡틴", "메카닉",
    ],
    "INT": [
        "비숍", "아크메이지(불,독)", "아크메이지(썬,콜)", "라라", "배틀메이지", "에반", "루미너스",
        "키네시스", "플레임위자드", "일리움",
    ],
    "LUK": ["나이트워커", "섀도어", "나이트로드", "듀얼블레이더", "팬텀", "호영", "칼리", "카데나"],
    "HP": ["데몬어벤져"],
    "STR_DEX_LUK": ["제논"],
}


def get_main_stat(subclass):
    """Return the main stat key whose job list contains ``subclass``.

    The first matching entry of ``main_stat_map`` wins; "기타" is returned
    when the subclass is not listed anywhere.
    """
    matches = (stat for stat, jobs in main_stat_map.items() if subclass in jobs)
    return next(matches, "기타")

# Fill in equipment slots the user did not provide
def fill_missing_slots(user_input, subclass_profiles, top_slots):
    """Return per-slot feature dicts for every slot in ``top_slots``.

    Slots present in ``user_input`` are taken from it; missing slots fall
    back to the profile stored for the user's subclass (or an empty dict).
    A missing or ``None`` ``main_stat_type`` is set to the main stat derived
    from the subclass.

    The returned dicts are shallow copies, so neither ``user_input`` nor
    ``subclass_profiles`` is modified.

    Args:
        user_input: Dict with a ``'subclass'`` key and optional per-slot dicts.
        subclass_profiles: Mapping ``subclass -> slot -> column -> value``.
        top_slots: Slot names to produce, in output order.

    Returns:
        Dict ``slot -> feature dict`` in ``top_slots`` order.
    """
    subclass = user_input['subclass']
    main_stat = get_main_stat(subclass)
    filled = {}

    for slot in top_slots:
        if slot in user_input:
            source = user_input[slot]
        else:
            source = subclass_profiles.get(subclass, {}).get(slot, {})

        # Copy before editing: the original stored the caller's dict itself
        # and then wrote 'main_stat_type' into it.
        slot_values = dict(source)
        if slot_values.get('main_stat_type') is None:
            slot_values['main_stat_type'] = main_stat

        filled[slot] = slot_values

    return filled

# Encoding + scaling + tensor conversion
def encode_and_scale(filled_slots, subclass, encoders, scaler, cat_cols, num_cols, profiles=None):
    """Turn per-slot feature dicts into the model's input tensors.

    Args:
        filled_slots: Dict ``slot -> feature dict``; one item row per slot,
            in iteration order.
        subclass: Subclass used for the ``'subclass'`` column and for profile
            lookups.
        encoders: Dict ``column -> fitted encoder`` exposing ``classes_`` and
            ``transform``.
        scaler: Fitted scaler exposing ``transform`` over ``num_cols``.
        cat_cols: Categorical column names, in model input order.
        num_cols: Numeric column names, in model input order.
        profiles: Optional mapping ``subclass -> slot -> column -> value``
            supplying values for columns absent from a slot dict. Defaults to
            the notebook-level ``subclass_profiles``.

    Returns:
        Tuple ``(x_cat, x_cont, mask)`` with a leading batch axis of size 1.
        ``x_cont`` carries the item count as an extra trailing feature.
    """
    import warnings

    if profiles is None:
        # Backward-compatible default: the notebook-level dictionary.
        profiles = subclass_profiles

    x_cat_rows, x_cont_rows = [], []

    for slot, slot_values in filled_slots.items():
        # Profile values for this subclass/slot, looked up once per slot.
        slot_defaults = profiles.get(subclass, {}).get(slot, {})

        row_cat = []
        for col in cat_cols:
            if col == 'subclass':
                val = subclass
            elif col == 'equipment_slot':
                val = slot
            else:
                val = slot_values.get(col, slot_defaults.get(col, None))

            if isinstance(val, list):
                val = '+'.join(val)

            classes = encoders[col].classes_
            if val is None:
                # No value anywhere: use the encoder's first class.
                val = classes[0]
            elif val not in classes:
                # Unseen category: keep the first-class fallback, but make it
                # visible instead of silently changing the input.
                warnings.warn(
                    f"Unknown value {val!r} for column {col!r} (slot {slot!r}); "
                    f"using {classes[0]!r} instead."
                )
                val = classes[0]

            row_cat.append(encoders[col].transform([val])[0])

        row_cont = [slot_values.get(col, slot_defaults.get(col, 0.0)) for col in num_cols]

        x_cat_rows.append(row_cat)
        x_cont_rows.append(row_cont)

    x_cat_tensor = torch.tensor(x_cat_rows, dtype=torch.long).unsqueeze(0)
    x_cont_tensor = torch.tensor(scaler.transform(x_cont_rows), dtype=torch.float32).unsqueeze(0)
    mask_tensor = torch.tensor([[1] * len(filled_slots)], dtype=torch.float32)

    # Append the number of items as an extra continuous feature on every row.
    equip_counts = mask_tensor.sum(dim=1, keepdim=True).repeat(1, x_cont_tensor.shape[1], 1)
    x_cont_tensor = torch.cat([x_cont_tensor, equip_counts], dim=2)

    return x_cat_tensor, x_cont_tensor, mask_tensor

# Extract the pooled phi(x) vector
def extract_user_vector(model, x_cat, x_cont, mask):
    """Return the masked, sum-pooled ``phi`` vector of the first sample.

    Inputs are moved to the notebook-level ``device``. Only row 0 of the
    pooled batch is returned, as a 1-D numpy array.
    """
    model.eval()
    with torch.no_grad():
        item_features = torch.cat(
            [model._embed(x_cat.to(device)), x_cont.to(device)], dim=-1
        )
        item_codes = model.phi(item_features)
        item_weights = mask.to(device).unsqueeze(-1)
        user_vectors = (item_codes * item_weights).sum(dim=1)
    first_user = user_vectors.cpu().numpy()[0]
    return first_user

# Similarity search restricted to users of the same subclass
def recommend_similar_users(user_vec, hl_vecs, hl_nicks, df_stat_trimmed, target_subclass, top_k=5):
    """Return the ``top_k`` high-level users most similar to ``user_vec``.

    Args:
        user_vec: 1-D vector of the querying user.
        hl_vecs: 2-D array with one row per high-level user.
        hl_nicks: Nicknames aligned with the rows of ``hl_vecs``.
        df_stat_trimmed: DataFrame with ``'nickname'`` and ``'subclass'``
            columns, used to select users of ``target_subclass``.
        target_subclass: Subclass candidates must belong to.
        top_k: Maximum number of recommendations.

    Returns:
        List of ``(nickname, cosine_similarity)`` sorted by descending
        similarity; empty when no candidate exists or ``top_k <= 0``.
    """
    # Row index of every high-level nickname
    nick_to_idx = {nick: i for i, nick in enumerate(hl_nicks)}
    same_subclass = df_stat_trimmed.loc[
        df_stat_trimmed['subclass'] == target_subclass, 'nickname'
    ]

    # dict.fromkeys drops repeated indices while keeping their order: the
    # stat table may list a nickname more than once, which would otherwise
    # recommend the same user several times.
    valid_indices = list(dict.fromkeys(
        nick_to_idx[nick] for nick in same_subclass if nick in nick_to_idx
    ))

    # cosine_similarity rejects an empty candidate matrix, and a slice of
    # [-0:] would return every candidate, so both cases return early.
    if not valid_indices or top_k <= 0:
        return []

    hl_vecs_filtered = hl_vecs[valid_indices]
    hl_nicks_filtered = [hl_nicks[i] for i in valid_indices]

    # Cosine similarity against every candidate, best matches first
    sims = cosine_similarity(user_vec.reshape(1, -1), hl_vecs_filtered).flatten()
    top_idx = sims.argsort()[-top_k:][::-1]
    return [(hl_nicks_filtered[i], sims[i]) for i in top_idx]

# 🔧 End-to-end example run
# Example user input: the character's subclass plus one dict per equipment slot.
# NOTE(review): the *_option_N_grade keys below are not listed in cat_cols or
# num_cols as defined in this notebook, so encode_and_scale never reads them —
# confirm whether they should feed main_pot_grade_summary / add_pot_grade_summary.
user_input = {
    'subclass': '카인',
    '무기': {
        'item_group': '도전자',       # equipment set
        'starforce': 15,              # StarForce enhancement level
        'mainstat_total': 124,        # final main stat on the item
        'power_total': 80,            # final attack / magic attack on the item
        'all_stat_total': 12,         # final all-stat on the item
        'potential_option_grade': '유니크',            # potential grade
        'additional_potential_option_grade': '에픽',   # additional potential grade
        'potential_option_1_grade': 'S',               # category of potential line 1
        'potential_option_2_grade': 'S',               # category of potential line 2
        'potential_option_3_grade': 'B',               # category of potential line 3
        'additional_potential_option_1_grade': 'B',    # category of additional potential line 1
        'additional_potential_option_2_grade': '기타', # category of additional potential line 2
        'additional_potential_option_3_grade': '기타'  # category of additional potential line 3
},
    '모자': {
        'item_group': '파프니르',       # equipment set
        'starforce': 20,                # StarForce enhancement level
        'mainstat_total': 300,          # final main stat on the item
        'power_total': 78,              # final attack / magic attack on the item
        'all_stat_total': 6,            # final all-stat on the item
        'potential_option_grade': '레전드리',          # potential grade
        'additional_potential_option_grade': '에픽',   # additional potential grade
        'potential_option_1_grade': 'S',               # category of potential line 1
        'potential_option_2_grade': 'S',               # category of potential line 2
        'potential_option_3_grade': 'S',               # category of potential line 3
        'additional_potential_option_1_grade': 'B',    # category of additional potential line 1
        'additional_potential_option_2_grade': '기타', # category of additional potential line 2
        'additional_potential_option_3_grade': 'B'     # category of additional potential line 3
},
    '장갑': {
        'item_group': '앱솔랩스',       # equipment set
        'starforce': 18,                # StarForce enhancement level
        'mainstat_total': 100,          # final main stat on the item
        'power_total': 70,              # final attack / magic attack on the item
        'all_stat_total': 0,            # final all-stat on the item
        'potential_option_grade': '유니크',            # potential grade
        'additional_potential_option_grade': '에픽',   # additional potential grade
        'potential_option_1_grade': 'S',               # category of potential line 1
        'potential_option_2_grade': 'S',               # category of potential line 2
        'potential_option_3_grade': '기타',            # category of potential line 3
        'additional_potential_option_1_grade': 'S',    # category of additional potential line 1
        'additional_potential_option_2_grade': 'S',    # category of additional potential line 2
        'additional_potential_option_3_grade': '기타'  # category of additional potential line 3
}
}

top_slots = ['무기', '모자', '장갑']

# 1) Complete the user's slots, 2) convert them to model inputs,
# 3) extract the pooled vector
filled = fill_missing_slots(user_input, subclass_profiles, top_slots)
encoded_inputs = encode_and_scale(
    filled_slots=filled,
    subclass=user_input['subclass'],
    encoders=encoders,
    scaler=scaler,
    cat_cols=cat_cols,
    num_cols=num_cols
)
x_cat_new, x_cont_new, mask_new = encoded_inputs

user_vec = extract_user_vector(model, x_cat_new, x_cont_new, mask_new)

# df_stat_trimmed is assumed to hold at least the 'nickname' and 'subclass' columns
recommendations = recommend_similar_users(
    user_vec=user_vec,
    hl_vecs=hl_vecs,
    hl_nicks=hl_nicks,
    df_stat_trimmed=df_stat_trimmed,
    target_subclass=user_input['subclass']
)

# Report header, then one line per recommended user
print(f"\n📌 직업: {user_input['subclass']}  |  주스탯: {get_main_stat(user_input['subclass'])}")
print("✨ 유사한 고레벨 유저 추천 ✨\n")
for rank, (nick, sim) in enumerate(recommendations, start=1):
    print(f"{rank}. 닉네임: {nick}  |  유사도: {sim:.4f}")


📌 직업: 카인  |  주스탯: DEX
✨ 유사한 고레벨 유저 추천 ✨

1. 닉네임: 샌핑  |  유사도: 0.9997
2. 닉네임: 김만떽  |  유사도: 0.9997
3. 닉네임: 꾸운감자  |  유사도: 0.9997
4. 닉네임: 카링까지  |  유사도: 0.9997
5. 닉네임: 동양의인재  |  유사도: 0.9997




In [None]:
# Combat-power (전투력) prediction test using only the top-3 slots (무기, 모자, 장갑),
# evaluated with 5-fold cross-validation on log1p(전투력).
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

# --- Settings ---
top_3_slots = ['무기', '모자', '장갑']
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # NOTE(review): not used below

# --- Load data ---
# na_values=[] + keep_default_na=False keeps the literal string 'None' as a
# value instead of letting pandas parse it as a missing value.
df = pd.read_csv(
    '/content/drive/MyDrive/data/item_nomissingvalues_copy.csv',
    na_values=[],
    keep_default_na=False,
)
df_stat = pd.read_csv('/content/drive/MyDrive/data/merged/stat_merged.csv')
df_stat_trimmed = df_stat[['nickname', 'subclass', '전투력']].drop_duplicates()
# Left-join the target onto every item row, then discard rows without a target.
df = df.merge(df_stat_trimmed, how='left', on=['nickname', 'subclass']).dropna(subset=['전투력'])

# --- Feature lists ---
cat_cols = [
    'subclass', 'equipment_slot', 'main_stat_type', 'item_group',
    'starforce_scroll_flag', 'potential_option_grade', 'additional_potential_option_grade',
    'main_pot_grade_summary', 'add_pot_grade_summary', 'potential_status'
]
num_cols = [
    'boss_damage_total', 'ignore_monster_armor_total', 'all_stat_total', 'damage_total',
    'boss_damage_add', 'damage_add', 'all_stat_add', 'starforce', 'special_ring_level',
    'bonus_stat_total', 'mainstat_total', 'power_total', 'mainstat_add', 'power_add',
    'mainstat_etc', 'power_etc', 'mainstat_starforce', 'power_starforce'
]

# --- Keep only rows for the top-3 slots with complete features ---
in_top_slots = df['equipment_slot'].isin(top_3_slots)
df = df.loc[in_top_slots].dropna(subset=[*cat_cols, *num_cols, '전투력']).copy()

# --- Encode categoricals / standardise numerics (fitted on the filtered frame) ---
encoders = {}
for col in cat_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    encoders[col] = encoder
scaler = StandardScaler().fit(df[num_cols])
df[num_cols] = scaler.transform(df[num_cols])
df['log_전투력'] = np.log1p(df['전투력'])

# --- Build one fixed-length sequence per nickname ---
max_len = len(top_3_slots)
df = df.sort_values(by="nickname")

n_cat, n_num = len(cat_cols), len(num_cols)
x_cat_list, x_cont_list, mask_list, y_list = [], [], [], []

for _, rows in df.groupby("nickname"):
    # Rows beyond max_len are dropped; missing positions stay zero with mask 0.
    keep = min(len(rows), max_len)

    cat_block = np.zeros((max_len, n_cat), dtype=np.int64)
    cont_block = np.zeros((max_len, n_num), dtype=np.float32)
    row_mask = np.zeros(max_len, dtype=np.float32)

    cat_block[:keep] = rows[cat_cols].values.astype(np.int64)[:keep]
    cont_block[:keep] = rows[num_cols].values.astype(np.float32)[:keep]
    row_mask[:keep] = 1.0

    x_cat_list.append(torch.tensor(cat_block, dtype=torch.long))
    x_cont_list.append(torch.tensor(cont_block, dtype=torch.float32))
    mask_list.append(torch.tensor(row_mask, dtype=torch.float32))
    # Target is taken from the group's first row.
    y_list.append(np.log1p(rows['전투력'].iloc[0]))

x_cat = torch.stack(x_cat_list)
x_cont = torch.stack(x_cont_list)
mask = torch.stack(mask_list)
y = torch.tensor(y_list, dtype=torch.float32)

# --- Append the per-character equipment count as one extra continuous feature ---
# (N,) -> (N, 1, 1) -> broadcast along the sequence axis -> (N, max_len, 1)
equip_counts = mask.sum(dim=1, keepdim=True).unsqueeze(2).expand(-1, x_cont.shape[1], 1)
x_cont = torch.cat([x_cont, equip_counts], dim=2)

# Number of distinct codes per categorical column, passed to the model constructor.
embedding_info = {col: df[col].nunique() for col in cat_cols}

# --- Train and evaluate with 5-fold CV ---
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmses, r2s = [], []
all_tensors = (x_cat, x_cont, mask, y)

for fold, (train_idx, val_idx) in enumerate(kf.split(x_cat)):
    # Fresh model and optimizer for every fold.
    model = DeepMaskedModel(embedding_info, x_cont.shape[-1])
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = torch.nn.MSELoss()

    train_set = torch.utils.data.TensorDataset(*(t[train_idx] for t in all_tensors))
    val_set = torch.utils.data.TensorDataset(*(t[val_idx] for t in all_tensors))
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=64, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_set, batch_size=256)

    for epoch in range(5):
        model.train()
        for cat_b, cont_b, mask_b, target_b in train_loader:
            optimizer.zero_grad()
            loss = criterion(model(cat_b, cont_b, mask_b), target_b)
            loss.backward()
            optimizer.step()

    # Collect validation predictions without tracking gradients.
    model.eval()
    pred_chunks, target_chunks = [], []
    with torch.no_grad():
        for cat_b, cont_b, mask_b, target_b in val_loader:
            pred_chunks.append(model(cat_b, cont_b, mask_b).numpy())
            target_chunks.append(target_b.numpy())

    preds = np.concatenate(pred_chunks)
    targets = np.concatenate(target_chunks)
    rmse = np.sqrt(mean_squared_error(targets, preds))
    r2 = r2_score(targets, preds)
    rmses.append(rmse)
    r2s.append(r2)
    print(f"Fold {fold+1}: RMSE={rmse:.4f}, R2={r2:.4f}")

print(f"\n📊 최종 평균 성능 (무기+모자+장갑): RMSE={np.mean(rmses):.4f}, R2={np.mean(r2s):.4f}")

Fold 1: RMSE=0.5819, R2=0.5351
Fold 2: RMSE=0.6507, R2=0.5182
Fold 3: RMSE=0.5945, R2=0.6485
Fold 4: RMSE=0.6057, R2=0.5964
Fold 5: RMSE=0.5636, R2=0.7043

📊 최종 평균 성능 (무기+모자+장갑): RMSE=0.5993, R2=0.6005


In [None]:
# Combat-power (전투력) prediction test using 7 selected slots
# (무기, 모자, 하의, 신발, 장갑, 망토, 어깨장식) and a reduced set of core features,
# evaluated with 5-fold cross-validation on log1p(전투력).
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Settings
selected_slots = ['무기', '모자', '하의', '신발', '장갑', '망토', '어깨장식']
selected_features = ['item_group', 'starforce', 'mainstat_total', 'power_total',
                     'potential_option_grade', 'additional_potential_option_grade']
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # NOTE(review): not used below

# Load data.
# FIX: pass na_values=[] and keep_default_na=False, like every other load of
# this file. With pandas' defaults the literal string 'None' (a real value in
# the grade columns) is parsed as NaN, and the dropna on selected_features
# below would then silently discard those item rows.
df = pd.read_csv(
    '/content/drive/MyDrive/data/item_nomissingvalues_copy.csv',
    na_values=[],
    keep_default_na=False,
)
df_stat = pd.read_csv('/content/drive/MyDrive/data/merged/stat_merged.csv')
df_stat_trimmed = df_stat[['nickname', 'subclass', '전투력']].drop_duplicates()
# Left-join the target onto every item row, then drop rows without a target.
df = df.merge(df_stat_trimmed, on=['nickname', 'subclass'], how='left')
df.dropna(subset=['전투력'], inplace=True)

# Keep only the selected slots; .copy() avoids SettingWithCopyWarning on the
# column assignments further down.
df = df[df['equipment_slot'].isin(selected_slots)].copy()
df = df.dropna(subset=selected_features + ['전투력'])

# Split the selected features into categorical and numeric columns
cat_cols = ['item_group', 'potential_option_grade', 'additional_potential_option_grade']
num_cols = ['starforce', 'mainstat_total', 'power_total']

# Label-encode categoricals and standardise numerics (fitted on the filtered frame)
encoders = {col: LabelEncoder().fit(df[col]) for col in cat_cols}
for col in cat_cols:
    df[col] = encoders[col].transform(df[col])
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])
df['log_전투력'] = np.log1p(df['전투력'])

# Build one fixed-length sequence (max_len rows) per nickname
max_len = len(selected_slots)
df = df.sort_values("nickname")

x_cat_list, x_cont_list, mask_list, y_list = [], [], [], []

for name, group in df.groupby("nickname"):
    cat = group[cat_cols].values.astype(np.int64)
    cont = group[num_cols].values.astype(np.float32)
    valid_len = len(group)

    if valid_len < max_len:
        # Pad short sequences with zero rows; the mask marks them as invalid.
        pad_cat = np.zeros((max_len - valid_len, len(cat_cols)))
        pad_cont = np.zeros((max_len - valid_len, len(num_cols)))
        pad_mask = [0] * (max_len - valid_len)
        cat = np.vstack([cat, pad_cat])
        cont = np.vstack([cont, pad_cont])
        mask = [1] * valid_len + pad_mask
    else:
        # Truncate groups that have more rows than max_len.
        cat = cat[:max_len]
        cont = cont[:max_len]
        mask = [1] * max_len

    x_cat_list.append(torch.tensor(cat, dtype=torch.long))
    x_cont_list.append(torch.tensor(cont, dtype=torch.float32))
    mask_list.append(torch.tensor(mask, dtype=torch.float32))
    # Target is taken from the group's first row.
    y_list.append(np.log1p(group['전투력'].iloc[0]))

x_cat = torch.stack(x_cat_list)
x_cont = torch.stack(x_cont_list)
mask = torch.stack(mask_list)
y = torch.tensor(y_list, dtype=torch.float32)

# Append the per-character equipment count as one extra continuous feature:
# (N,) -> (N, 1) -> (N, max_len) -> (N, max_len, 1)
equip_counts = mask.sum(dim=1).unsqueeze(1).repeat(1, x_cont.shape[1]).unsqueeze(2)
x_cont = torch.cat([x_cont, equip_counts], dim=2)

# Number of distinct codes per categorical column, passed to the model constructor
embedding_info = {col: df[col].nunique() for col in cat_cols}

# Train and evaluate with 5-fold CV
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmses, r2s = [], []

for fold, (train_idx, val_idx) in enumerate(kf.split(x_cat)):
    # Fresh model and optimizer for every fold
    model = DeepMaskedModel(embedding_info, x_cont.shape[-1])
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = torch.nn.MSELoss()

    train_set = torch.utils.data.TensorDataset(x_cat[train_idx], x_cont[train_idx], mask[train_idx], y[train_idx])
    val_set = torch.utils.data.TensorDataset(x_cat[val_idx], x_cont[val_idx], mask[val_idx], y[val_idx])
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=64, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_set, batch_size=256)

    for epoch in range(5):
        model.train()
        for xc, xc2, m, yt in train_loader:
            optimizer.zero_grad()
            pred = model(xc, xc2, m)
            loss = criterion(pred, yt)
            loss.backward()
            optimizer.step()

    # Collect validation predictions without tracking gradients
    model.eval()
    preds, targets = [], []
    with torch.no_grad():
        for xc, xc2, m, yt in val_loader:
            pred = model(xc, xc2, m)
            preds.append(pred.numpy())
            targets.append(yt.numpy())

    preds = np.concatenate(preds)
    targets = np.concatenate(targets)
    rmse = np.sqrt(mean_squared_error(targets, preds))
    r2 = r2_score(targets, preds)
    rmses.append(rmse)
    r2s.append(r2)
    print(f"Fold {fold+1}: RMSE={rmse:.4f}, R2={r2:.4f}")

print(f"\n📊 최종 평균 성능 (7개 장비 핵심 피처): RMSE={np.mean(rmses):.4f}, R2={np.mean(r2s):.4f}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = encoders[col].transform(df[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = encoders[col].transform(df[col])


Fold 1: RMSE=0.7845, R2=0.4024
Fold 2: RMSE=0.6897, R2=0.5501
Fold 3: RMSE=0.6644, R2=0.5335
Fold 4: RMSE=0.6826, R2=0.4811
Fold 5: RMSE=0.6304, R2=0.5600

📊 최종 평균 성능 (7개 장비 핵심 피처): RMSE=0.6903, R2=0.5054
