In [5]:
!pip install pytorch-tabnet
# ✅ 필요 패키지 설치
# %pip install pytorch-tabnet lightgbm xgboost

import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
#from sklearn.linear_model import Ridge
from lightgbm import LGBMRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import torch
from pytorch_tabnet.tab_model import TabNetRegressor
import xgboost as xgb
import lightgbm as lgb



In [6]:
# ✅ 데이터 로딩
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# ✅ 기본 전처리
train.drop(columns=['ID'], inplace=True)
test.drop(columns=['ID'], inplace=True)

# 기업가치 수치화

def parse_value_range(value):
    try:
        return float(value)
    except:
        if isinstance(value, str) and '이상' in value:
            return float(value.replace('이상', '').strip())
        return np.nan

# 변수 분류
category_features = ['설립연도','국가','분야','투자단계','기업가치(백억원)']
numeric_features = ['직원 수','고객수(백만명)','총 투자금(억원)','연매출(억원)','SNS 팔로워 수(백만명)']
bool_features = ['인수여부','상장여부']

# 범주형 처리
train['설립연도'] = train['설립연도'].astype('object')
test['설립연도'] = test['설립연도'].astype('object')

encoders = {}
for feature in category_features:
    encoders[feature] = LabelEncoder()
    train[feature] = train[feature].fillna('Missing').astype(str)
    test[feature] = test[feature].fillna('Missing').astype(str)
    train[feature] = encoders[feature].fit_transform(train[feature])
    test[feature] = encoders[feature].transform(test[feature])

# 불리언 처리
bool_map = {'Yes': 1, 'No': 0}
for feature in bool_features:
    train[feature] = train[feature].map(bool_map)
    test[feature] = test[feature].map(bool_map)

# 기업가치 수치화
train['기업가치(중간값)'] = train['기업가치(백억원)'].apply(parse_value_range)
test['기업가치(중간값)'] = test['기업가치(백억원)'].apply(parse_value_range)

# 수치형 결측치 처리
all_numeric_features = numeric_features + ['기업가치(중간값)']
for feature in all_numeric_features:
    mean_value = train[feature].mean()
    train[feature] = train[feature].fillna(mean_value)
    test[feature] = test[feature].fillna(mean_value)

# 파생 변수 생성
def create_derived_features(df):
    df = df.copy()
    df['직원 1인당 매출'] = df['연매출(억원)'] / df['직원 수']
    df['고객 1인당 매출'] = df['연매출(억원)'] / df['고객수(백만명)']
    df['고객 1인당 투자금'] = df['총 투자금(억원)'] / df['고객수(백만명)']
    df['투자 대비 가치'] = df['기업가치(중간값)'] / df['총 투자금(억원)']
    df['SNS비율'] = df['SNS 팔로워 수(백만명)'] / df['고객수(백만명)']

    # 추가 변수
    df['투자 대비 연매출'] = df['연매출(억원)'] / (df['총 투자금(억원)'] + 1e-3)
    df['가치 대비 연매출'] = df['연매출(억원)'] / (df['기업가치(중간값)'] + 1e-3)
    df['연매출 성장 잠재력'] = df['SNS 팔로워 수(백만명)'] * df['고객수(백만명)']
    df.fillna(0, inplace=True)
    return df

  train[feature] = train[feature].fillna('Missing').astype(str)
  test[feature] = test[feature].fillna('Missing').astype(str)


In [7]:
train = create_derived_features(train)
test = create_derived_features(test)

# ✅ X, y 준비
features = [col for col in train.columns if col != '성공확률']
X = train[features]
y = train['성공확률']

# ✅ 앙상블 준비
N_FOLDS = 5
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

# OOF & Test 예측 저장
tabnet_oof = np.zeros((X.shape[0],))
xgb_oof = np.zeros((X.shape[0],))
lgb_oof = np.zeros((X.shape[0],))

tabnet_test_preds = np.zeros((test.shape[0], N_FOLDS))
xgb_test_preds = np.zeros((test.shape[0], N_FOLDS))
lgb_test_preds = np.zeros((test.shape[0], N_FOLDS))

for fold, (train_idx, valid_idx) in enumerate(kf.split(X)):
    print(f"\n🔁 Fold {fold+1}/{N_FOLDS}")
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    # TabNet
    tabnet = TabNetRegressor(
        n_d=32,
        n_a=32,
        n_steps=5,
        gamma=1.5,
        lambda_sparse=1e-4,
        optimizer_fn=torch.optim.AdamW,
        optimizer_params=dict(lr=2e-2),
        seed=42,
        verbose=0
    )

    tabnet.fit(X_train.values, y_train.values.reshape(-1,1),
               eval_set=[(X_valid.values, y_valid.values.reshape(-1,1))],
               patience=10, max_epochs=200)
    tabnet_oof[valid_idx] = tabnet.predict(X_valid.values).squeeze()
    tabnet_test_preds[:, fold] = tabnet.predict(test.values).squeeze()

    # XGBoost
    xgb_model = xgb.XGBRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
      )
    xgb_model.fit(X_train, y_train)
    xgb_oof[valid_idx] = xgb_model.predict(X_valid)
    xgb_test_preds[:, fold] = xgb_model.predict(test)

    # LightGBM
    lgb_model = lgb.LGBMRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
    )
    lgb_model.fit(X_train, y_train)
    lgb_oof[valid_idx] = lgb_model.predict(X_valid)
    lgb_test_preds[:, fold] = lgb_model.predict(test)

# ✅ 스태킹 메타 모델 학습
stacked_X = np.vstack([tabnet_oof, xgb_oof, lgb_oof]).T
stacked_test = np.vstack([
    tabnet_test_preds.mean(axis=1),
    xgb_test_preds.mean(axis=1),
    lgb_test_preds.mean(axis=1)
]).T

meta_model = LGBMRegressor(
    n_estimators=300,
    learning_rate=0.03,
    max_depth=3,
    random_state=42,
    n_jobs=-1
)
meta_model.fit(stacked_X, y)
final_preds = meta_model.predict(stacked_test)

# ✅ 제출
sample_submission['성공확률'] = final_preds
sample_submission.to_csv('stacking_submission.csv', index=False, encoding='utf-8-sig')

print("\n✅ 스태킹 앙상블 완료 및 제출 파일 저장됨!")


🔁 Fold 1/5

Early stopping occurred at epoch 46 with best_epoch = 36 and best_val_0_mse = 0.07567




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000946 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3210
[LightGBM] [Info] Number of data points in the train set: 3500, number of used features: 21
[LightGBM] [Info] Start training from score 0.534486

🔁 Fold 2/5

Early stopping occurred at epoch 65 with best_epoch = 55 and best_val_0_mse = 0.06322




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000968 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3209
[LightGBM] [Info] Number of data points in the train set: 3501, number of used features: 21
[LightGBM] [Info] Start training from score 0.537104

🔁 Fold 3/5

Early stopping occurred at epoch 75 with best_epoch = 65 and best_val_0_mse = 0.05999




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000956 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3207
[LightGBM] [Info] Number of data points in the train set: 3501, number of used features: 21
[LightGBM] [Info] Start training from score 0.537332

🔁 Fold 4/5

Early stopping occurred at epoch 51 with best_epoch = 41 and best_val_0_mse = 0.0621




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000987 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3208
[LightGBM] [Info] Number of data points in the train set: 3501, number of used features: 21
[LightGBM] [Info] Start training from score 0.538018

🔁 Fold 5/5

Early stopping occurred at epoch 63 with best_epoch = 53 and best_val_0_mse = 0.05914




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000952 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3208
[LightGBM] [Info] Number of data points in the train set: 3501, number of used features: 21
[LightGBM] [Info] Start training from score 0.539760
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000168 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 4376, number of used features: 3
[LightGBM] [Info] Start training from score 0.537340

✅ 스태킹 앙상블 완료 및 제출 파일 저장됨!




In [8]:
#final_preds # 내림차순 정렬해서 보기

# 2차원 → 1차원으로 평탄화 (flatten 또는 ravel 사용)
flat_preds = final_preds.flatten()

# 내림차순 정렬
sorted_preds = np.sort(flat_preds)[::-1]
print(sorted_preds)

[0.71982014 0.70978599 0.67803871 ... 0.4187887  0.41854157 0.41808014]
