# Import

In [12]:
%pip install pytorch-tabnet

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import torch
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetRegressor



# Data Load

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# Data Preprocessing

In [4]:
#특성과 타겟 변수 분리
train = train.drop(columns=['ID'], axis = 1)
test = test.drop(columns=['ID'], axis = 1)

In [5]:
# 설립연도 타입 변환 (int -> object)
train['설립연도'] =train['설립연도'].astype('object')
test['설립연도'] =test['설립연도'].astype('object')

category_features = ['설립연도','국가','분야','투자단계','기업가치(백억원)']
numeric_features = ['직원 수','고객수(백만명)','총 투자금(억원)','연매출(억원)','SNS 팔로워 수(백만명)']
bool_features = ['인수여부','상장여부']

# LabelEncoder 객체를 각 범주형 feature별로 따로 저장하여 사용
encoders = {}

# 범주형 데이터를 encoding
for feature in category_features:
    encoders[feature] = LabelEncoder()
    train[feature] = train[feature].fillna('Missing')
    test[feature] = test[feature].fillna('Missing')
    train[feature] = encoders[feature].fit_transform(train[feature])
    test[feature] = encoders[feature].transform(test[feature])

# 불리언 값을 0과 1로 변환 ('Yes' → 1, 'No' → 0 으로 변환)
bool_map = {'Yes': 1, 'No': 0}

for feature in bool_features:
    train[feature] = train[feature].map(bool_map)
    test[feature] = test[feature].map(bool_map)

# 수치형 변수 결측치를 평균값으로 대체
for feature in numeric_features:
    mean_value = train[feature].mean()
    train[feature] = train[feature].fillna(mean_value)
    test[feature] = test[feature].fillna(mean_value)

# TabNet용 범주형 변수 인덱스(cat_idxs) 및 차원(cat_dims) 설정
features = [col for col in train.columns if col != '성공확률']
cat_idxs = [features.index(col) for col in category_features]
cat_dims = [train[col].max() + 1 for col in category_features]

  train[feature] = train[feature].fillna('Missing')
  test[feature] = test[feature].fillna('Missing')


In [11]:
# [1] 기업가치 문자열 처리 함수
def parse_value_range(value):
    try:
        return float(value)
    except:
        if isinstance(value, str) and '이상' in value:
            return float(value.replace('이상', '').strip())
        return np.nan

# [2] 변수 정의
category_features = ['설립연도','국가','분야','투자단계','기업가치(백억원)']
numeric_features = ['직원 수','고객수(백만명)','총 투자금(억원)','연매출(억원)','SNS 팔로워 수(백만명)']
bool_features = ['인수여부','상장여부']

# [3] 범주형 처리
train['설립연도'] = train['설립연도'].astype('object')
test['설립연도'] = test['설립연도'].astype('object')

encoders = {}
for feature in category_features:
    encoders[feature] = LabelEncoder()
    train[feature] = train[feature].fillna('Missing').astype(str)
    test[feature] = test[feature].fillna('Missing').astype(str)
    train[feature] = encoders[feature].fit_transform(train[feature])
    test[feature] = encoders[feature].transform(test[feature])

# [4] 불리언 처리
bool_map = {'Yes': 1, 'No': 0}
for feature in bool_features:
    train[feature] = train[feature].map(bool_map)
    test[feature] = test[feature].map(bool_map)

# [5] 기업가치 수치화
train['기업가치(중간값)'] = train['기업가치(백억원)'].apply(parse_value_range)
test['기업가치(중간값)'] = test['기업가치(백억원)'].apply(parse_value_range)

# [6] 수치형 결측값 처리
all_numeric_features = numeric_features + ['기업가치(중간값)']
for feature in all_numeric_features:
    mean_value = train[feature].mean()
    train[feature] = train[feature].fillna(mean_value)
    test[feature] = test[feature].fillna(mean_value)

# [7] 파생 변수 생성 함수
def create_derived_features(df):
    df = df.copy()
    df['직원 1인당 매출'] = df['연매출(억원)'] / df['직원 수']
    df['고객 1인당 매출'] = df['연매출(억원)'] / df['고객수(백만명)']
    df['고객 1인당 투자금'] = df['총 투자금(억원)'] / df['고객수(백만명)']
    df['투자 대비 가치'] = df['기업가치(중간값)'] / df['총 투자금(억원)']
    df['SNS비율'] = df['SNS 팔로워 수(백만명)'] / df['고객수(백만명)']
    df.fillna(0, inplace=True)
    return df

train = create_derived_features(train)
test = create_derived_features(test)

# [8] TabNet 입력용 feature 설정
features = [col for col in train.columns if col != '성공확률']
cat_idxs = [features.index(col) for col in category_features]
cat_dims = [train[col].nunique() for col in category_features]


  train[feature] = train[feature].fillna('Missing').astype(str)
  test[feature] = test[feature].fillna('Missing').astype(str)


# K-Fold Model Training

In [None]:
# 타겟 지정
target = train['성공확률']
X = train[features]
y = target

# KFold 설정
N_FOLDS = 5
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

models = [] # 모델 저장 리스트
cv_scores = []

for fold, (train_idx, valid_idx) in enumerate(kf.split(X)):
    print(f"\n🔁 Fold {fold+1}/{N_FOLDS}")

    X_train = X.iloc[train_idx].values
    y_train = y.iloc[train_idx].values.reshape(-1, 1)

    X_valid = X.iloc[valid_idx].values
    y_valid = y.iloc[valid_idx].values.reshape(-1, 1)

    # 비지도 사전학습
    print("▶ Pretraining...")

    pretrainer = TabNetPretrainer(
        cat_idxs=cat_idxs,
        cat_dims=cat_dims,
        seed=42,
        verbose=0
    )

    pretrainer.fit(
        X_train=X_train,
        max_epochs=100,
        batch_size=512,
        virtual_batch_size=64
    )


    # 지도 학습
    print("▶ Fine-tuning...")
    model = TabNetRegressor(
        n_d=32,
        n_a=32,
        n_steps=5,
        gamma=1.5,
        lambda_sparse=1e-4,
        optimizer_fn=torch.optim.AdamW,
        optimizer_params=dict(lr=2e-2),
        seed=42,
        verbose=0
    )

    model.fit(
        X_train=X_train, y_train=y_train,
        eval_set=[(X_valid, y_valid)],
        from_unsupervised=pretrainer,
        eval_metric=['mae'],
        max_epochs=100,
        patience=10
    )

    # 모델을 메모리에 저장
    models.append(model)
    cv_scores.append(model.best_cost)

print("\n✅ 모든 fold 모델 학습 완료!")


# Fold별 성능 평가
all_mae, all_rmse, all_r2 = [], [], []

for fold, (train_idx, valid_idx) in enumerate(kf.split(X)):
    X_valid = X.iloc[valid_idx].values
    y_valid = y.iloc[valid_idx].values.reshape(-1, 1)

    preds = models[fold].predict(X_valid)

    mae = mean_absolute_error(y_valid, preds)
    mse = mean_squared_error(y_valid, preds)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_valid, preds)

    print(f"Fold {fold+1}: MAE = {mae:.4f}, RMSE = {rmse:.4f}, R² = {r2:.4f}")
    all_mae.append(mae)
    all_rmse.append(rmse)
    all_r2.append(r2)

# 전체 평균 출력
print("\n📊 Overall Performance:")
print(f"Average MAE  : {np.mean(all_mae):.4f}")
print(f"Average RMSE : {np.mean(all_rmse):.4f}")
print(f"Average R²   : {np.mean(all_r2):.4f}")


🔁 Fold 1/5
▶ Pretraining...




# K-Fold Model Prediction

In [9]:
# 저장된 모델들로 예측
predictions_list = []

for fold, model in enumerate(models):
    print(f"Predict with fold {fold+1}")
    preds = model.predict(test[features].values)
    predictions_list.append(preds)

# 평균 예측
final_predictions = np.mean(predictions_list, axis=0)
print(final_predictions)

Predict with fold 1
Predict with fold 2
Predict with fold 3
Predict with fold 4
Predict with fold 5
[[0.5266214 ]
 [0.54752624]
 [0.56245315]
 ...
 [0.56448543]
 [0.5250262 ]
 [0.55547893]]


# Submission

In [None]:
sample_submission['성공확률'] = final_predictions
sample_submission.to_csv('./baseline_submission.csv', index = False, encoding = 'utf-8-sig')