# In [None]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import f1_score, classification_report, confusion_matrix, make_scorer, accuracy_score
from sklearn.impute import KNNImputer
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from imblearn.over_sampling import SMOTE
import seaborn as sns
import matplotlib.pyplot as plt
from optuna.samplers import TPESampler

# Load the data.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# === Missing-value imputation ===
# The imputer is fitted on train only and then re-used on test, so test rows
# are filled from their single nearest training neighbour (n_neighbors=1).
columns_fill_knn = ['해당층', '총층', '전용면적', '방수', '욕실수', '총주차대수']
imputer = KNNImputer(n_neighbors=1)
train[columns_fill_knn] = imputer.fit_transform(train[columns_fill_knn])
test[columns_fill_knn] = imputer.transform(test[columns_fill_knn])

# === Label encoding ===
# Each encoder is fitted on train and test together so that a category that
# appears only in test does not raise at transform time.
label_encode_cols = ['중개사무소', '게재일', '제공플랫폼', '방향']
for col in label_encode_cols:
    le = LabelEncoder()
    combined_data = pd.concat([train[col], test[col]], axis=0).astype(str)
    le.fit(combined_data)
    train[col] = le.transform(train[col].astype(str))
    test[col] = le.transform(test[col].astype(str))

# === One-hot encoding ===
one_hot_cols = ['매물확인방식', '주차가능여부']
one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
train_encoded = one_hot_encoder.fit_transform(train[one_hot_cols])
test_encoded = one_hot_encoder.transform(test[one_hot_cols])
# Label the encoded columns "<source column>_<category>" instead of the
# anonymous integers 0..k-1 a bare DataFrame would assign; this keeps every
# column label a string and lets selected features be identified later.
one_hot_names = one_hot_encoder.get_feature_names_out(one_hot_cols)
train = pd.concat(
    [
        train.drop(columns=one_hot_cols),
        pd.DataFrame(train_encoded, columns=one_hot_names, index=train.index),
    ],
    axis=1,
)
test = pd.concat(
    [
        test.drop(columns=one_hot_cols),
        pd.DataFrame(test_encoded, columns=one_hot_names, index=test.index),
    ],
    axis=1,
)

# === Drop the ID column ===
# The test IDs are kept aside because the submission file needs them.
train = train.drop(columns=['ID'])
test_id = test['ID']
test = test.drop(columns=['ID'])

# Split features and target.
X = train.drop(columns=['허위매물여부'])
y = train['허위매물여부']

# Defensive cast: scikit-learn rejects frames whose column labels are not all
# strings.
X.columns = X.columns.astype(str)
test.columns = test.columns.astype(str)

# Oversample the minority class with SMOTE. The resampled frame keeps the
# column labels of X, so no further cast is needed.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Optuna objective function
def objective(trial):
    """Return the mean 5-fold CV accuracy of a SMOTE -> RFE -> random-forest pipeline.

    Oversampling and feature selection are fitted inside each training fold
    rather than once on the whole dataset. Fitting them up front leaks
    information into the validation folds (synthetic samples interpolated
    from validation rows, features chosen with validation labels) and
    inflates the score the study optimises.

    Args:
        trial: Optuna trial used to sample ``n_features_to_select``,
            ``n_estimators`` and ``max_depth``.

    Returns:
        Mean accuracy over the five stratified folds, evaluated on original
        (non-resampled) validation rows of the module-level ``X`` / ``y``.
    """
    # Local import: the file's top-level imports do not include the
    # imbalanced-learn pipeline, which is the one that supports samplers.
    from imblearn.pipeline import Pipeline

    # Suggested hyperparameters (names and ranges unchanged, so
    # ``study.best_params`` keeps the same keys).
    n_features_to_select = trial.suggest_int('n_features_to_select', 5, X.shape[1])
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 3, 20)

    forest_params = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'random_state': 42,
    }

    # SMOTE only runs during fit, so validation folds are scored untouched.
    # RFE then ranks features on the resampled training fold, and the final
    # forest is trained on the selected features.
    # NOTE(review): RFE now runs once per fold, so a trial costs roughly five
    # times the original feature-selection work; raise RFE's ``step`` if the
    # search becomes too slow.
    pipeline = Pipeline([
        ('smote', SMOTE(random_state=42)),
        ('rfe', RFE(RandomForestClassifier(**forest_params),
                    n_features_to_select=n_features_to_select)),
        ('model', RandomForestClassifier(**forest_params)),
    ])

    # Stratified folds keep the class ratio of ``y`` in every split.
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # NOTE(review): accuracy is kept as the metric; on imbalanced validation
    # folds an F1-based scorer may be more informative - confirm against the
    # task's evaluation metric.
    cv_scores = cross_val_score(pipeline, X, y, cv=kf, scoring='accuracy', n_jobs=-1)

    return cv_scores.mean()

# Run the hyperparameter search (seeded TPE sampler for reproducibility).
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=80, show_progress_bar=True)

# Report the best hyperparameters and score.
print(f"Best hyperparameters: {study.best_params}")
print(f"Best score: {study.best_value}")

# Refit with the best hyperparameters. ``best_params`` is read once and
# unpacked into the three module-level names used below.
best_params = study.best_params
best_n_features_to_select = best_params['n_features_to_select']
best_n_estimators = best_params['n_estimators']
best_max_depth = best_params['max_depth']

# Feature selection is fitted on the full SMOTE-resampled training data.
rfe = RFE(
    RandomForestClassifier(n_estimators=best_n_estimators, max_depth=best_max_depth, random_state=42),
    n_features_to_select=best_n_features_to_select,
)
X_res_rfe = rfe.fit_transform(X_res, y_res)
model = RandomForestClassifier(n_estimators=best_n_estimators, max_depth=best_max_depth, random_state=42)
model.fit(X_res_rfe, y_res)

# Apply the same feature selection to the test data; indexing by
# ``X_res.columns`` guarantees the column order the selector was fitted on.
test_rfe = rfe.transform(test[X_res.columns])

# Classifier predictions. The training rows are transformed from ``X``
# directly instead of slicing the first ``len(train)`` rows of the resampled
# matrix, which only works while SMOTE happens to emit the original samples
# first.
train['RFC'] = model.predict(rfe.transform(X))
test['RFC'] = model.predict(test_rfe)

# === Predict on the test data and save the submission ===
submission = pd.DataFrame({'ID': test_id, '허위매물여부': test['RFC']})
submission.to_csv('submission_8.csv', index=False)
print("Submission file saved to 'submission_8.csv'")