In [9]:
import optuna
from optuna.samplers import TPESampler
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
from catboost import CatBoostRegressor
import random

import os
from warnings import filterwarnings
filterwarnings('ignore')

np.random.seed(42)
random.seed(42)
os.environ['PYTHONHASHSEED'] = '42'

In [10]:
# 1) Veriyi oku
data = pd.read_table(r"C:\Users\borac\Desktop\LUAD_miRNA\input\TCGA_LUAD_miRNA_expression_disease_status.txt", header=0, index_col=0)

# 2) Transpoz al
data = data.transpose()

X = data.iloc[:, :-1]  # Son sütun hariç tüm sütunlar
y = data.iloc[:, -1]   # Son sütun

y = pd.to_numeric(y, errors='coerce')  # Hatalı dönüşüm varsa NaN olur

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# 5️⃣ Optuna Objective Fonksiyonu
def objective(trial):
    params = {
        "iterations": trial.suggest_int("iterations", 10, 1000),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.3, log=True),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1.0, 10.0),
        "border_count": trial.suggest_int("border_count", 1, 255),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
        "random_strength": trial.suggest_float("random_strength", 0.0, 1.0),
        "random_seed": 42,
        "verbose": 2,
        "loss_function": "RMSE"
    }

    model = CatBoostRegressor(**params,)
    
    # 5-fold CV ile RMSE (negatif işareti dahil)
    scores = cross_val_score(
        model,
        X_train_scaled,
        y_train,
        cv=5,
        scoring="r2"
    )

    return scores.mean()  # maximize (negatif RMSE olduğundan büyük = daha iyi)


In [None]:
sampler = TPESampler(seed=42)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=100, show_progress_bar=True)

print("\n🎯 En iyi parametreler:")
for k, v in study.best_params.items():
    print(f"  {k}: {v}")

print(f"\nOrtalama CV R²: {study.best_value:.4f}")

best_params = study.best_params

[I 2025-10-22 12:10:19,580] A new study created in memory with name: no-name-0f1926c8-6bb0-4556-af9c-3f38bbc6368a


  0%|          | 0/100 [00:00<?, ?it/s]

0:	learn: 0.2222125	total: 260ms	remaining: 1m 38s
2:	learn: 0.1672951	total: 313ms	remaining: 39.4s
4:	learn: 0.1342167	total: 364ms	remaining: 27.3s
6:	learn: 0.1165663	total: 416ms	remaining: 22.2s
8:	learn: 0.0990602	total: 469ms	remaining: 19.4s
10:	learn: 0.0871781	total: 524ms	remaining: 17.6s
12:	learn: 0.0720549	total: 578ms	remaining: 16.4s
14:	learn: 0.0624605	total: 628ms	remaining: 15.3s
16:	learn: 0.0561018	total: 679ms	remaining: 14.5s
18:	learn: 0.0511571	total: 732ms	remaining: 13.9s
20:	learn: 0.0479974	total: 784ms	remaining: 13.4s
22:	learn: 0.0437431	total: 838ms	remaining: 13s
24:	learn: 0.0411168	total: 892ms	remaining: 12.7s
26:	learn: 0.0386476	total: 946ms	remaining: 12.4s
28:	learn: 0.0363341	total: 1s	remaining: 12.1s
30:	learn: 0.0341655	total: 1.1s	remaining: 12.4s
32:	learn: 0.0321292	total: 1.15s	remaining: 12.2s
34:	learn: 0.0302164	total: 1.21s	remaining: 11.9s
36:	learn: 0.0284180	total: 1.26s	remaining: 11.7s
38:	learn: 0.0267308	total: 1.32s	remaini

In [6]:
best_model = CatBoostRegressor(
    **best_params,
    loss_function="RMSE",
    random_seed=42,
    verbose=2
)
best_model.fit(X_train_scaled, y_train)

0:	learn: 0.2172908	total: 16.7ms	remaining: 5.77s
2:	learn: 0.1675500	total: 37.9ms	remaining: 4.33s
4:	learn: 0.1305855	total: 59.6ms	remaining: 4.07s
6:	learn: 0.1084039	total: 81.4ms	remaining: 3.94s
8:	learn: 0.0936330	total: 104ms	remaining: 3.88s
10:	learn: 0.0817261	total: 125ms	remaining: 3.8s
12:	learn: 0.0713348	total: 146ms	remaining: 3.74s
14:	learn: 0.0628903	total: 168ms	remaining: 3.7s
16:	learn: 0.0566744	total: 189ms	remaining: 3.67s
18:	learn: 0.0535627	total: 211ms	remaining: 3.63s
20:	learn: 0.0495966	total: 232ms	remaining: 3.6s
22:	learn: 0.0474696	total: 255ms	remaining: 3.58s
24:	learn: 0.0458638	total: 277ms	remaining: 3.56s
26:	learn: 0.0422611	total: 299ms	remaining: 3.53s
28:	learn: 0.0403086	total: 321ms	remaining: 3.5s
30:	learn: 0.0394165	total: 343ms	remaining: 3.48s
32:	learn: 0.0376525	total: 365ms	remaining: 3.46s
34:	learn: 0.0355121	total: 386ms	remaining: 3.43s
36:	learn: 0.0335176	total: 408ms	remaining: 3.41s
38:	learn: 0.0305898	total: 430ms	re

<catboost.core.CatBoostRegressor at 0x16160a597f0>

In [12]:
# 8️⃣ Test performansı (R² ve MSE)

y_pred = best_model.predict(X_test_scaled)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("\n📈 Model Performansı:")
print(f"  R² Score: {r2:.4f}")
print(f"  Mean Squared Error (MSE): {mse:.4f}")


📈 Model Performansı:
  R² Score: 0.8568
  Mean Squared Error (MSE): 0.0149


In [13]:
# 9️⃣ Özellik önemleri (Feature Importances)

feat_imp_df = pd.DataFrame({
    "Feature": X.columns,
    "Importance": best_model.get_feature_importance()
}).sort_values(by="Importance", ascending=False)


In [14]:
# Özellik önemlerini kaydet
feat_imp_df.to_csv(
    "C:/Users/borac/Desktop/LUAD_miRNA/output/feature_selection/cat_boost/LUAD_mirna_CatBoost_SelectedFeatures2.txt",
    sep='\t', index=False
)

feat_imp_df.to_csv(
    "C:/Users/borac/Desktop/LUAD_miRNA/output/feature_selection/cat_boost/LUAD_mirna_CatBoost_SelectedFeatures2.csv",
    index=False
)
print(f"\nToplam seçilen özellik sayısı: {len(feat_imp_df)}")
print(feat_imp_df.head())



Toplam seçilen özellik sayısı: 1881
           Feature  Importance
286    hsa-mir-210   74.404853
1849   hsa-mir-9-2    3.381347
1458   hsa-mir-625    2.355705
180   hsa-mir-135b    2.267598
714    hsa-mir-429    1.995024


In [15]:
print("\n🎯 En iyi parametreler (tekrar):")
for k, v in study.best_params.items():
    print(f"  {k}: {v}")

print(f"\nOrtalama CV R²: {study.best_value:.4f}")


🎯 En iyi parametreler (tekrar):
  iterations: 346
  learning_rate: 0.19577473064950685
  depth: 5
  l2_leaf_reg: 4.662674123433278
  border_count: 132
  bagging_temperature: 0.4157157058728041
  random_strength: 0.5160886348622336

Ortalama CV R²: 0.8275
