In [None]:
import pandas as pd
import numpy as np



In [None]:
# Load Data
op = 0 # 0: Local, 1: Kaggle

if not op: # Local
    train_path = './data/train.csv'
    test_path = './data/test.csv'
    sub_path = './data/sample_submission.csv'
    save_path = './data/submission.csv'
else:  # Kaggle
    train_path = '/kaggle/input/playground-series-s5e4/train.csv'
    test_path = '/kaggle/input/playground-series-s5e4/test.csv'
    sub_path = '/kaggle/input/playground-series-s5e4/sample_submission.csv'    
    save_path = '/kaggle/working/submission.csv'
    

df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
df_sub = pd.read_csv(sub_path)


# Check NaN
print(df_train.isnull().sum())
print("=====================================")
print(df_test.isnull().sum())

df_train.head()

id                                  0
Podcast_Name                        0
Episode_Title                       0
Episode_Length_minutes          87093
Genre                               0
Host_Popularity_percentage          0
Publication_Day                     0
Publication_Time                    0
Guest_Popularity_percentage    146030
Number_of_Ads                       1
Episode_Sentiment                   0
Listening_Time_minutes              0
dtype: int64
id                                 0
Podcast_Name                       0
Episode_Title                      0
Episode_Length_minutes         28736
Genre                              0
Host_Popularity_percentage         0
Publication_Day                    0
Publication_Time                   0
Guest_Popularity_percentage    48832
Number_of_Ads                      0
Episode_Sentiment                  0
dtype: int64


Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
0,0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,Positive,31.41998
1,1,Joke Junction,Episode 26,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01241
2,2,Study Sessions,Episode 16,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.92531
3,3,Digital Digest,Episode 45,67.17,Technology,57.22,Monday,Morning,78.7,2.0,Positive,46.27824
4,4,Mind & Body,Episode 86,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral,75.61031


In [3]:
# Check the type of string data
comparion_cols = ['Podcast_Name', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']

for col in comparion_cols:
    re = np.array_equal(
        np.sort(df_train[col].unique()), 
        np.sort(df_test[col].unique())
    )
    print(f"{col} : {re}")

Podcast_Name : True
Genre : True
Publication_Day : True
Publication_Time : True
Episode_Sentiment : True


In [4]:
# Drop Feature 
drop_cols = ['Episode_Title']
df_train = df_train.drop(columns=drop_cols)
df_test = df_test.drop(columns=drop_cols)

# Fill NaN
def fill_NaN(df, target_col, group_col, method):
    df[target_col] = df[target_col].fillna(df.groupby(group_col)[target_col].transform(method))

nan_list = [
    ['Episode_Length_minutes','Podcast_Name','mean'],
    ['Number_of_Ads', 'Podcast_Name','mean'], 
    ['Guest_Popularity_percentage', 'Podcast_Name','mean']
]

for l in nan_list:
    fill_NaN(df_train, l[0], l[1], l[2])
    fill_NaN(df_test, l[0], l[1], l[2])


print(df_train.isnull().sum())
print("=====================================")
print(df_test.isnull().sum())
df_train.head()

id                             0
Podcast_Name                   0
Episode_Length_minutes         0
Genre                          0
Host_Popularity_percentage     0
Publication_Day                0
Publication_Time               0
Guest_Popularity_percentage    0
Number_of_Ads                  0
Episode_Sentiment              0
Listening_Time_minutes         0
dtype: int64
id                             0
Podcast_Name                   0
Episode_Length_minutes         0
Genre                          0
Host_Popularity_percentage     0
Publication_Day                0
Publication_Time               0
Guest_Popularity_percentage    0
Number_of_Ads                  0
Episode_Sentiment              0
dtype: int64


Unnamed: 0,id,Podcast_Name,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
0,0,Mystery Matters,64.388461,True Crime,74.81,Thursday,Night,52.241037,0.0,Positive,31.41998
1,1,Joke Junction,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01241
2,2,Study Sessions,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.92531
3,3,Digital Digest,67.17,Technology,57.22,Monday,Morning,78.7,2.0,Positive,46.27824
4,4,Mind & Body,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral,75.61031


In [5]:
# Encoding - One-hot
onehot_cols = ['Podcast_Name', 'Genre', 'Publication_Time', 'Publication_Day', 'Episode_Sentiment']
df_train = pd.get_dummies(df_train, columns=onehot_cols, drop_first=False)
df_test = pd.get_dummies(df_test, columns=onehot_cols, drop_first=False)
df_train, df_test = df_train.align(df_test, join='left', axis=1, fill_value=0)

print(df_train.columns.size == df_test.columns.size)
df_train


True


Unnamed: 0,id,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Listening_Time_minutes,Podcast_Name_Athlete's Arena,Podcast_Name_Brain Boost,Podcast_Name_Business Briefs,Podcast_Name_Business Insights,...,Publication_Day_Friday,Publication_Day_Monday,Publication_Day_Saturday,Publication_Day_Sunday,Publication_Day_Thursday,Publication_Day_Tuesday,Publication_Day_Wednesday,Episode_Sentiment_Negative,Episode_Sentiment_Neutral,Episode_Sentiment_Positive
0,0,64.388461,74.81,52.241037,0.0,31.41998,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True
1,1,119.800000,66.95,75.950000,2.0,88.01241,False,False,False,False,...,False,False,True,False,False,False,False,True,False,False
2,2,73.900000,69.97,8.970000,0.0,44.92531,False,False,False,False,...,False,False,False,False,False,True,False,True,False,False
3,3,67.170000,57.22,78.700000,2.0,46.27824,False,False,False,False,...,False,True,False,False,False,False,False,False,False,True
4,4,110.510000,80.07,58.680000,3.0,75.61031,False,False,False,False,...,False,True,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749995,749995,75.660000,69.36,52.431959,0.0,56.87058,False,False,False,False,...,False,False,True,False,False,False,False,True,False,False
749996,749996,75.750000,35.21,51.273843,2.0,45.46242,False,False,True,False,...,False,False,True,False,False,False,False,False,True,False
749997,749997,30.980000,78.58,84.890000,0.0,15.26000,False,False,False,False,...,False,False,False,False,True,False,False,True,False,False
749998,749998,108.980000,45.39,93.270000,0.0,100.72939,False,False,False,False,...,False,False,False,False,True,False,False,True,False,False


In [None]:
import torch
from torch import nn, optim
import numpy as np
from torch.amp import GradScaler, autocast

# 0. 이전 DataLoader 워커 정리 (노트북 세션에 남아있는 워커가 있다면 제거)
import gc
try:
    del train_loader, val_loader
except NameError:
    pass
gc.collect()

# 1. 디바이스 설정 & CuDNN 벤치마크
device = torch.device(
    'cuda' if torch.cuda.is_available()
    else 'mps'  if torch.backends.mps.is_available()
    else 'cpu'
)
print(f"✅ Device: {device}")
if device.type == 'cuda':
    print(f"🚀 GPU: {torch.cuda.get_device_name(0)}, "
          f"{torch.cuda.get_device_properties(0).total_memory/1e9:.1f} GB")
    torch.backends.cudnn.benchmark = True

# 2. 데이터 로드 & CPU 정규화
X_raw = df_train.drop(columns=['Listening_Time_minutes','id']).to_numpy(dtype=np.float32)
y_raw = df_train['Listening_Time_minutes'].to_numpy(dtype=np.float32).reshape(-1,1)

X_cpu = torch.from_numpy(X_raw)
y_cpu = torch.from_numpy(y_raw)

# 정규화 파라미터
x_mean, x_std = X_cpu.mean(0), X_cpu.std(0)
y_mean, y_std = y_cpu.mean(), y_cpu.std()

# CPU에서 정규화
X_cpu = (X_cpu - x_mean) / x_std
y_cpu = (y_cpu - y_mean) / y_std

print(f"✅ X shape: {X_cpu.shape}, y shape: {y_cpu.shape}")

# 3. train/val split (풀 배치)
n_total = X_cpu.size(0)
n_train = int(n_total * 0.8)

X_tr_cpu = X_cpu[:n_train]
y_tr_cpu = y_cpu[:n_train]
X_val_cpu = X_cpu[n_train:]
y_val_cpu = y_cpu[n_train:]

# 4. GPU로 전체 텐서 이동
X_tr  = X_tr_cpu.to(device, non_blocking=True)
y_tr  = y_tr_cpu.to(device, non_blocking=True)
X_val = X_val_cpu.to(device, non_blocking=True)
y_val = y_val_cpu.to(device, non_blocking=True)

print(f"✅ Training on device: X_tr {X_tr.device}, y_tr {y_tr.device}")

# 5. 모델 정의 (마지막에 Softplus 추가하여 음수 예측 방지)
class PodcastModel(nn.Module):
    def __init__(self, in_features):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features, 300), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(300, 200),         nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(200, 100),         nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(100, 100),         nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(100,  50),         nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(50,    1),
        )
        self.apply(self._init_weights)
    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            nn.init.kaiming_normal_(m.weight)
            m.bias.data.fill_(0.01)
    def forward(self, x):
        return self.net(x)

model     = PodcastModel(X_tr.size(1)).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4, momentum=0.9)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=30
)
scaler    = GradScaler()

print(f"✅ Model on: {next(model.parameters()).device}")

# 6. 풀 배치 학습 루프 (Mixed Precision + Early Stopping)
epochs, print_every = 10000, 100
best_val, patience, stop_cnt = float('inf'), 30, 0

for ep in range(1, epochs+1):
    # — Train —
    model.train()
    optimizer.zero_grad()
    with autocast(device_type=device.type):
        preds = model(X_tr)
        loss  = criterion(preds, y_tr)
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
    train_loss = loss.item()

    # — Validate —
    model.eval()
    with torch.no_grad(), autocast(device_type=device.type):
        val_loss = criterion(model(X_val), y_val).item()

    scheduler.step(val_loss)

    # Early stopping
    if val_loss < best_val:
        best_val, stop_cnt = val_loss, 0
    else:
        stop_cnt += 1
        if stop_cnt >= patience:
            print(f"Early stopping at epoch {ep}")
            break

    # Logging
    if ep % print_every == 0:
        lr = optimizer.param_groups[0]['lr']
        used = torch.cuda.memory_allocated()/1e9 if device.type=='cuda' else 0
        print(f"Epoch {ep:4d} | Train {train_loss:.4f} | Val {val_loss:.4f} | "
              f"LR {lr:.1e} | GPU Mem {used:.2f} GB")

# 7. 최종 검증 RMSE 계산
rmse_norm = np.sqrt(best_val)
rmse_real = rmse_norm * y_std.item()
print(f"[Validation RMSE] normalized: {rmse_norm:.4f}, real: {rmse_real:.4f}")

# 8. 테스트셋 전처리 & 예측 (풀 배치)
X_test_raw = df_test.drop(columns=['Listening_Time_minutes','id']).to_numpy(dtype=np.float32)
X_test_cpu = torch.from_numpy(X_test_raw)
X_test_cpu = (X_test_cpu - x_mean) / x_std
X_test = X_test_cpu.to(device, non_blocking=True)

model.eval()
with torch.no_grad(), autocast(device_type=device.type):
    preds_test = model(X_test).cpu()

# 추가로 만약 Softplus 대신 클리핑 적용을 원하면:
preds_test = torch.clamp(preds_test, min=0)

predictions = (preds_test * y_std + y_mean).squeeze().numpy()

# 9. 제출 파일 생성
df_sub = df_test.copy()
df_sub['Listening_Time_minutes'] = predictions
df_sub[['id','Listening_Time_minutes']].to_csv(save_path, index=False)
print(f"✅ Submission saved to {save_path}")

# 10. 메모리 정리
torch.mps.empty_cache()

✅ Device: mps
✅ X shape: torch.Size([750000, 76]), y shape: torch.Size([750000, 1])
✅ Training on device: X_tr mps:0, y_tr mps:0


TypeError: Adam.__init__() got an unexpected keyword argument 'momentum'

: 