In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score

# 1️⃣ Load the dataset
DATA_PATH = "final_features_full.csv"
df = pd.read_csv(DATA_PATH)

# 2️⃣ Separate the target column ("label") from the feature columns
y = df["label"]
X = df.drop(columns=["label"])

# 3️⃣ Columns to standardize
# NOTE(review): the top_*_sector columns look like encoded categories rather
# than numeric quantities — confirm that standardizing them is intended.
columns_to_scale = [
    'raised_amount_usd', 'funding_total_usd', 'raised_amount_m', 'raised_amount_b',
    'raised_amount_k', 'num_investors', 'success_rate', 'failure_rate', 'success_amount',
    'fail_amount', 'total_amount', 'avg_funding_per_investor', 'num_of_success_companies',
    'num_of_failed_companies', 'funded_at_year', 'funded_at_month', 'funded_at_day',
    'funding_total_m', 'funding_total_b', 'funding_total_k', 'company_age', 'funding_duration_days',
    'avg_funding_duration_per_round', 'funding_duration_months', 'funding_duration_years',
    'founded_at_year', 'founded_at_month', 'founded_at_day', 'first_funding_year',
    'first_funding_month', 'first_funding_day', 'last_funding_year', 'last_funding_month',
    'last_funding_day', 'top_past_sector', 'top_current_sector', 'top_future_sector', 'funding_frequency'
]

# 4️⃣ Split train/test BEFORE fitting the scaler.
# Fitting StandardScaler on the full dataset lets the test rows influence the
# mean/variance applied to the training rows (data leakage) and makes the
# held-out scores optimistic. test_size and random_state are unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The scaling statistics are learned from the training rows only.
scaler = StandardScaler()
scaler.fit(X_train_raw[columns_to_scale])


def _scale_features(frame):
    """Return *frame* with ``columns_to_scale`` standardized by ``scaler``.

    The scaled columns come first, followed by the remaining columns left
    untouched. The row index of *frame* is preserved so the two parts line up
    when concatenated.
    """
    scaled_part = pd.DataFrame(
        scaler.transform(frame[columns_to_scale]),
        columns=columns_to_scale,
        index=frame.index,
    )
    remaining_part = frame.drop(columns=columns_to_scale)
    return pd.concat([scaled_part, remaining_part], axis=1)


X_train = _scale_features(X_train_raw)
X_test = _scale_features(X_test_raw)

# Full feature matrix scaled with the train-fitted scaler; retained in case
# other cells reference X_final. It keeps X's row index instead of resetting it.
X_final = _scale_features(X)

# 5️⃣ Base learners: a shallow decision tree, XGBoost and a random forest,
# all seeded with the same value for reproducibility.
RANDOM_STATE = 42

tree_clf = DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE)
# NOTE(review): `use_label_encoder` is version-dependent in xgboost — confirm
# it is still accepted by the installed release.
xgb_clf = XGBClassifier(
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=RANDOM_STATE,
)
forest_clf = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)

base_learners = [
    ('dt', tree_clf),
    ('xgb', xgb_clf),
    ('rf', forest_clf),
]

# 6️⃣ Meta learner: logistic regression fitted on the base learners' outputs
meta_learner = LogisticRegression(max_iter=200, random_state=RANDOM_STATE)

# 7️⃣ Assemble the stacking model.
# passthrough=False: the meta learner receives only the base learners'
# predictions, not the original features.
stacking_config = {
    'estimators': base_learners,
    'final_estimator': meta_learner,
    'cv': 5,
    'n_jobs': -1,
    'passthrough': False,
}
stacking_model = StackingClassifier(**stacking_config)

# 8️⃣ Train the stacking model on the training split
stacking_model.fit(X_train, y_train)

# 9️⃣ Predict on the held-out split and report the metrics
y_pred = stacking_model.predict(X_test)
# Second column of predict_proba is used as the score for the ROC AUC.
y_proba = stacking_model.predict_proba(X_test)[:, 1]

# Each metric is evaluated right before it is printed, in this order.
for caption, metric_fn, predictions in (
    ("Accuracy:", accuracy_score, y_pred),
    ("F1 score:", f1_score, y_pred),
    ("AUC score:", roc_auc_score, y_proba),
):
    print(caption, metric_fn(y_test, predictions))

report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

Accuracy: 0.949928469241774
F1 score: 0.9689755603393694
AUC score: 0.9809036272638261

Classification Report:
               precision    recall  f1-score   support

         0.0       0.88      0.86      0.87       959
         1.0       0.97      0.97      0.97      3934

    accuracy                           0.95      4893
   macro avg       0.92      0.91      0.92      4893
weighted avg       0.95      0.95      0.95      4893

