In [None]:
!pip install catboost



In [None]:
import pandas as pd
from scipy.optimize import minimize
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.stats.mstats import winsorize
import warnings
from lightgbm import LGBMClassifier
warnings.filterwarnings("ignore")

In [None]:
# --- load data ---
# Read the TESS table; everything after a '#' on a line is ignored by the parser.
data = pd.read_csv(
    filepath_or_buffer="/content/tess.csv",
    comment="#",
)

In [None]:
# --- binary label ---
# label is 1 when 'tfopwg_disp' is 'KP' or 'CP' and 0 for anything else
# (missing values therefore map to 0).
positive_dispositions = ['KP', 'CP']
data['label'] = data['tfopwg_disp'].isin(positive_dispositions).astype('int64')

In [None]:
# --- feature columns (the numeric int/float ones) ---
# Built from three groups purely for readability; the resulting order is what
# every downstream step sees.
feature_cols = (
    # sky coordinates
    ['ra', 'dec']
    # columns with the 'st_' prefix
    + ['st_teff', 'st_logg', 'st_rad', 'st_dist', 'st_pmra', 'st_pmdec', 'st_tmag']
    # columns with the 'pl_' prefix
    + ['pl_orbper', 'pl_rade', 'pl_trandep', 'pl_trandurh',
       'pl_eqt', 'pl_insol', 'pl_tranmid', 'pl_pnum']
)

In [None]:
# --- winsorize ---
# Replace the lowest 1% and highest 1% of each feature with the nearest remaining
# value to limit the influence of extreme outliers.
#
# Only the observed (non-NaN) values are passed to winsorize. With its default
# nan_policy='propagate', scipy sorts NaNs to the end and counts them as part of the
# upper tail, which can either overwrite missing values with a near-maximum value or
# leave the real upper tail unclipped. Missing entries stay NaN here and are handled
# by the imputation step.
#
# NOTE: the limits are derived from all rows of `data`, i.e. before the train/test split.
for col in feature_cols:
    observed = data[col].notna()
    if not observed.any():
        continue  # all-missing column: nothing to clip
    clipped = winsorize(data.loc[observed, col].to_numpy(), limits=[0.01, 0.01])
    # winsorize returns a masked array; store it back as a plain ndarray
    data.loc[observed, col] = np.asarray(clipped)

In [None]:
# --- fill missing values with KNNImputer ---
# Each missing entry is estimated from the 5 nearest rows; the imputer is fitted on
# all rows of `data`. The imputed array is wrapped back into a DataFrame with a fresh
# default index, and the label index is reset as well so X and y stay aligned by position.
imputer = KNNImputer(n_neighbors=5)
imputed_values = imputer.fit_transform(data[feature_cols])
X = pd.DataFrame(data=imputed_values, columns=feature_cols)
y = data['label'].reset_index(drop=True)

In [None]:
# --- train/test split ---
# Hold out 20% of the rows as a test set. Stratifying on y keeps the class ratio
# the same in both parts, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# --- models configured with the best hyperparameters ---
# (how these values were found is not shown in this notebook)
# One shared seed is used for all three learners so runs are repeatable.
seed = 42

cat_model = CatBoostClassifier(
    iterations=992,
    learning_rate=0.1477,
    depth=5,
    l2_leaf_reg=2.42,
    loss_function='Logloss',
    verbose=0,  # silence per-iteration training output
    random_seed=seed
)

# `use_label_encoder` is intentionally not passed: it is deprecated in XGBoost and
# ignored (with a warning) by current releases; the labels here are already 0/1.
xgb_model = XGBClassifier(
    n_estimators=654,
    learning_rate=0.0582,
    max_depth=7,
    subsample=0.8595,
    colsample_bytree=0.9961,
    eval_metric='logloss',
    random_state=seed
)

# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0, which is
# not set here -- confirm whether row subsampling was intended for this model.
lgbm_model = LGBMClassifier(
    n_estimators=1000,
    learning_rate=0.0582,
    max_depth=7,
    subsample=0.8595,
    colsample_bytree=0.9961,
    random_state=seed,
    verbosity=-1
)

In [None]:
# --- fit ---
# Train each of the three base learners on the training split.
for base_model in (cat_model, xgb_model, lgbm_model):
    base_model.fit(X_train, y_train)

In [None]:
# Blend weights for the ensemble. w1 and w2 are fixed constants; w3 is whatever
# remains so that the three weights add up to 1.
# (An alternative fixed split of 0.4 / 0.35 / 0.25 is kept here for reference.)
w1 = 0.5931129060965006
w2 = 0.13535353285850302
w3 = 1 - w1 - w2

In [None]:
# --- ensemble prediction on the held-out test set ---
# `weights` holds the blend weights that are actually applied below,
# in the order: cat, xgb, lgbm.
weights = [w1, w2, w3]

# Positive-class probability from each base model (column 1 of predict_proba).
cat_pred_proba = cat_model.predict_proba(X_test)[:, 1]
xgb_pred_proba = xgb_model.predict_proba(X_test)[:, 1]
lgbm_pred_proba = lgbm_model.predict_proba(X_test)[:, 1]

# Weighted average of the three probabilities, then a 0.5 cut-off for hard 0/1 labels.
ensemble_pred_proba = (
    weights[0] * cat_pred_proba
    + weights[1] * xgb_pred_proba
    + weights[2] * lgbm_pred_proba
)
ensemble_pred = (ensemble_pred_proba >= 0.5).astype(int)


In [None]:
# --- test metrics ---
# Score the hard ensemble predictions against the held-out labels, one line per metric.
test_metrics = (
    ("Ensemble ACCURACY:", accuracy_score),
    ("Ensemble PRECISION:", precision_score),
    ("Ensemble RECALL:", recall_score),
    ("Ensemble F1:", f1_score),
)
for caption, scorer in test_metrics:
    print(caption, scorer(y_test, ensemble_pred))

Ensemble ACCURACY: 0.8807040417209909
Ensemble PRECISION: 0.7115384615384616
Ensemble RECALL: 0.4457831325301205
Ensemble F1: 0.5481481481481482


In [None]:
# --- 20-fold stratified cross-validation of the ensemble ---
# fold_acc collects one accuracy value per fold; the splitter shuffles rows with
# the shared seed so the fold assignment is repeatable.
fold_acc = []
kf = StratifiedKFold(
    n_splits=20,
    shuffle=True,
    random_state=seed,
)

In [None]:
# Start from an empty list so that re-running this cell does not append a second
# set of fold scores to the previous ones.
fold_acc = []

# NOTE: X was winsorized and imputed using all rows before these splits, so the
# validation folds are not fully independent of the preprocessing.
for i, (train_idx, val_idx) in enumerate(kf.split(X, y), 1):
    X_tr, X_va = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_va = y.iloc[train_idx], y.iloc[val_idx]

    # Refit the three base models on this fold's training part
    # (calling fit again retrains the same estimator objects).
    cat_model.fit(X_tr, y_tr)
    xgb_model.fit(X_tr, y_tr)
    lgbm_model.fit(X_tr, y_tr)

    # Positive-class probabilities on the validation part.
    cat_p = cat_model.predict_proba(X_va)[:, 1]
    xgb_p = xgb_model.predict_proba(X_va)[:, 1]
    lgbm_p = lgbm_model.predict_proba(X_va)[:, 1]

    # Same weighted blend and 0.5 cut-off as for the test-set evaluation.
    ensemble_p = (w1*cat_p + w2*xgb_p + w3*lgbm_p)
    # Stored under a fold-specific name so the test-set `ensemble_pred` is not
    # overwritten by the predictions of the last fold.
    fold_pred = (ensemble_p >= 0.5).astype(int)

    acc = accuracy_score(y_va, fold_pred)
    fold_acc.append(acc)
    print(f"Fold {i}: ACC = {acc:.4f}")

Fold 1: ACC = 0.8776
Fold 2: ACC = 0.8698
Fold 3: ACC = 0.8698
Fold 4: ACC = 0.8854
Fold 5: ACC = 0.9141
Fold 6: ACC = 0.8646
Fold 7: ACC = 0.8828
Fold 8: ACC = 0.8932
Fold 9: ACC = 0.9034
Fold 10: ACC = 0.8825
Fold 11: ACC = 0.8851
Fold 12: ACC = 0.8799
Fold 13: ACC = 0.8825
Fold 14: ACC = 0.8851
Fold 15: ACC = 0.8930
Fold 16: ACC = 0.8695
Fold 17: ACC = 0.9008
Fold 18: ACC = 0.8695
Fold 19: ACC = 0.8668
Fold 20: ACC = 0.8956


In [None]:
import pickle

# google.colab is only importable inside Colab. Import it defensively so that the
# final training and the pickle export still run in other environments.
try:
    from google.colab import files
except ImportError:
    files = None

# Refit every base model on the complete dataset before exporting.
cat_model.fit(X, y)
xgb_model.fit(X, y)
lgbm_model.fit(X, y)

# NOTE: only the three fitted models are stored; the imputer, the feature list and
# the blend weights (w1, w2, w3) are not part of this file.
ensemble_models = {
    'catboost': cat_model,
    'xgboost': xgb_model,
    'lightgbm': lgbm_model
}

filename = 'ensemble_models.pkl'
with open(filename, 'wb') as f:
    pickle.dump(ensemble_models, f)

print(f"Ensemble models saved to {filename}")

# Trigger the browser download only when the Colab helper is available.
if files is not None:
    files.download(filename)
else:
    print("google.colab is not available; skipping browser download.")

Ensemble models saved to ensemble_models.pkl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Summarise the per-fold accuracies collected during cross-validation:
# the mean over all folds and the single best fold.
print("\n--- 20-Fold Summary ---")
for caption, reducer in (
    ("Average ACCURACY:", np.mean),
    ("Best Fold ACCURACY:", np.max),
):
    print(caption, reducer(fold_acc))


--- 20-Fold Summary ---
Average ACCURACY: 0.883543434508268
Best Fold ACCURACY: 0.9140625
