In [1]:
!pip install numpy pandas scikit-learn



In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from math import sqrt

In [9]:
# Load the training and test tables from the local Data directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('./Data/train.csv', './Data/test.csv')
)

# Keep an untouched snapshot of the test table; its 'ID' column is needed
# when the submission files are written.
original_test_data = test_data.copy()

In [None]:
# Use 'Income' as the prediction target and the remaining columns as features.
# The drop list previously contained an empty column label (''), which makes
# DataFrame.drop raise KeyError. 'Race' is dropped instead, consistent with the
# tuned-stacking cell further down.
# NOTE(review): confirm 'Race' is the column that was meant to be excluded.
X = train_data.drop(['ID', 'Income', 'Race'], axis=1)
y = train_data['Income']

# One-hot encode the categorical variables.
X = pd.get_dummies(X)

# Encode the test features from the untouched snapshot and store them under a
# separate name: `test_data` is left as loaded, so this cell can be re-run and
# later cells that still expect an 'ID' column in `test_data` keep working.
test_features = pd.get_dummies(original_test_data.drop(['ID'], axis=1))
# Align the test columns with the training columns; categories that never occur
# in the test set become all-zero columns, unseen extra columns are discarded.
test_features = test_features.reindex(columns=X.columns, fill_value=0)

# Base (level-0) models.
base_models = [
    ('rf_model', RandomForestRegressor(random_state=42)),
    ('gb_model', GradientBoostingRegressor(random_state=42))
]

# Hold out 20% of the rows; the meta-model is trained on base-model
# predictions for these held-out rows only.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# One column of predictions per base model.
val_pred = np.zeros((X_val.shape[0], len(base_models)))
test_pred = np.zeros((test_features.shape[0], len(base_models)))

# Fit each base model on the training split and collect its predictions.
for i, (name, model) in enumerate(base_models):
    print(f'Training {name}...')
    model.fit(X_train, y_train)
    val_pred[:, i] = model.predict(X_val)
    test_pred[:, i] = model.predict(test_features)

# Train the meta-model on the hold-out predictions.
meta_model = LinearRegression()
meta_model.fit(val_pred, y_val)

# Final stacked prediction for the test set.
final_predictions = meta_model.predict(test_pred)

# Save the predictions in the submission format (ID, Income).
submission = pd.DataFrame({'ID': original_test_data['ID'], 'Income': final_predictions})
submission.to_csv('stacked_submission.csv', index=False)

print('Stacked model predictions saved to stacked_submission.csv')

In [10]:
# Separate features and target; 'ID' and 'Race' are excluded from the features.
X = pd.get_dummies(train_data.drop(['ID', 'Income', 'Race'], axis=1))
y = train_data['Income']

# Build the test features from the untouched snapshot rather than from
# `test_data` itself: `test_data` is rebound below without its 'ID' column, so
# dropping 'ID' from it again (cell re-run, or an earlier cell that already
# encoded it) would raise KeyError.
test_data = pd.get_dummies(original_test_data.drop(['ID'], axis=1))
# Align the test columns with the training columns (missing dummies -> 0).
test_data = test_data.reindex(columns=X.columns, fill_value=0)

# Standardize the features; the scaler is fit on the training data only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_data_scaled = scaler.transform(test_data)

# Hyperparameter grid shared by both base models.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 3, 10],
    'min_samples_split': [2, 5]
}

# Shuffled 5-fold splitter used for cross-validation in the grid searches.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Base models wrapped in a grid search over `param_grid`.
rf = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=kf)
gb = GridSearchCV(GradientBoostingRegressor(random_state=42), param_grid, cv=kf)

base_models = [
    ('rf_model', rf),
    ('gb_model', gb)
]

# Meta-model that combines the base-model predictions.
meta_model = Ridge()

# Train the stacked ensemble and predict on the test set.
def stack_models_predict(X, y, test_data, base_models, meta_model, n_folds=5, random_state=42):
    """Fit a two-level stacked ensemble and return its test-set predictions.

    The meta-model is trained on *out-of-fold* base-model predictions: every
    training row is predicted by a copy of the base model that was fit without
    that row. Training the meta-model on in-sample predictions instead (base
    model fit and evaluated on the same rows) leaks the target and makes the
    meta-model favour whichever base model overfits the most.

    Parameters
    ----------
    X : array-like or DataFrame, one row per training sample
        Training features.
    y : array-like or Series
        Training target, aligned with the rows of ``X``.
    test_data : array-like or DataFrame
        Test features with the same columns as ``X``.
    base_models : list of (name, estimator) tuples
        Level-0 models exposing ``fit``/``predict``. Each estimator is fit in
        place on the full training data; that fit produces the test-set
        predictions. If a fitted estimator exposes ``best_estimator_`` (e.g. a
        scikit-learn search object), the per-fold copies are made from that
        tuned estimator so the hyperparameter search is not repeated per fold.
    meta_model : estimator
        Level-1 model exposing ``fit``/``predict``; fit in place.
    n_folds : int, default 5
        Number of folds for the out-of-fold predictions; clamped to the range
        ``[2, n_samples]``.
    random_state : int, default 42
        Seed for shuffling rows into folds.

    Returns
    -------
    The result of ``meta_model.predict`` on the base-model test predictions
    (one value per test row).

    Raises
    ------
    ValueError
        If fewer than two training samples are supplied.
    """
    import copy  # local import: only needed to copy estimators for the folds

    def _rows(data, idx):
        # Positional row selection for both pandas objects and NumPy arrays.
        return data.iloc[idx] if hasattr(data, 'iloc') else np.asarray(data)[idx]

    n_samples = X.shape[0]
    if n_samples < 2:
        raise ValueError('stack_models_predict needs at least 2 training samples')
    n_folds = max(2, min(int(n_folds), n_samples))

    # Shuffle row positions once and cut them into disjoint folds.
    shuffled = np.random.RandomState(random_state).permutation(n_samples)
    folds = np.array_split(shuffled, n_folds)

    oof_pred = np.zeros((n_samples, len(base_models)))
    test_pred = np.zeros((test_data.shape[0], len(base_models)))

    for i, (_, model) in enumerate(base_models):
        # The full-data fit is used for the test predictions.
        model.fit(X, y)
        test_pred[:, i] = model.predict(test_data)

        # Per-fold copies produce the meta-model's training features.
        template = getattr(model, 'best_estimator_', model)
        for k, held_out in enumerate(folds):
            train_idx = np.concatenate([f for j, f in enumerate(folds) if j != k])
            fold_model = copy.deepcopy(template)
            fold_model.fit(_rows(X, train_idx), _rows(y, train_idx))
            oof_pred[held_out, i] = fold_model.predict(_rows(X, held_out))

    meta_model.fit(oof_pred, y)
    return meta_model.predict(test_pred)

# Run the stacked ensemble on the scaled data to obtain the final predictions.
final_predictions = stack_models_predict(
    X=X_scaled,
    y=y,
    test_data=test_data_scaled,
    base_models=base_models,
    meta_model=meta_model,
)

# Assemble the submission table (ID, Income) and write it to disk.
submission = original_test_data[['ID']].assign(Income=final_predictions)
submission.to_csv('stacked_submission_with_improvements.csv', index=False)

print('Stacked model with improvements predictions saved to stacked_submission_with_improvements.csv')

Stacked model with improvements predictions saved to stacked_submission_with_improvements.csv
