In [7]:
import optuna
import xgboost as xgb
import pandas as pd
import numpy as np
import random, os

from optuna.samplers import TPESampler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
from warnings import filterwarnings
filterwarnings("ignore")

# 1) Fixed randomness (reproducibility)
# -----------------------------------------------------------------------------
# One seed value is shared by every random source seeded in this cell.
SEED = 42

random.seed(SEED)      # Python's built-in `random` module
np.random.seed(SEED)   # NumPy's legacy global random state

# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
# assignment presumably only influences processes launched after this point,
# not the hashing of the already-running kernel -- confirm this is intended.
os.environ["PYTHONHASHSEED"] = str(SEED)

In [8]:
# 1) Read the tab-separated input table; the first row is the header and the
#    first column becomes the index.
#    NOTE(review): this is a machine-specific absolute path -- the cell only
#    runs where this file exists at exactly this location.
data = pd.read_table(
    r"C:\Users\borac\Desktop\LUAD_miRNA\input\TCGA_LUAD_miRNA_expression_disease_status.txt",
    header=0,
    index_col=0,
)

# 2) Transpose so that the file's columns become rows
#    (presumably one row per sample afterwards -- confirm against the file).
data = data.transpose()

# 3) Split into features and target: every column except the last is a
#    feature, the last column is the target.
X = data.iloc[:, :-1]

# 4) Convert the target to numeric; values that cannot be parsed become NaN.
y = pd.to_numeric(data.iloc[:, -1], errors="coerce")

# Rows whose target failed to parse are NaN at this point. Remove them (and
# say so) so that no NaN label reaches the split / model fitting below.
# print() is used instead of warnings.warn because warnings are globally
# silenced at the top of the notebook.
target_is_valid = y.notna().to_numpy()
n_invalid_targets = int((~target_is_valid).sum())
if n_invalid_targets:
    print(f"Dropping {n_invalid_targets} row(s) whose target value is not numeric.")
    X = X.loc[target_is_valid]
    y = y.loc[target_is_valid]

# 5) Hold out 30% of the rows as a test set (fixed random_state for a
#    repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# 6) Standardise the features: fit the scaler on the training rows only and
#    apply the same transformation to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# 4) Optuna objective function
# -----------------------------------------------------------------------------
def objective(trial):
    """Score one hyper-parameter draw for an XGBoost regressor.

    Parameters
    ----------
    trial : optuna trial object
        Used to draw the tunable hyper-parameters via ``suggest_int`` /
        ``suggest_float``.

    Returns
    -------
    float
        Mean R^2 over ``cv=5`` cross-validation on the module-level
        ``X_train_scaled`` / ``y_train``. The study is expected to maximise it.
    """
    # Tunable part of the configuration. The suggest_* calls are kept in the
    # same order as before so the trial is queried identically.
    search_space = dict(
        n_estimators=trial.suggest_int("n_estimators", 100, 800),
        learning_rate=trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
        max_depth=trial.suggest_int("max_depth", 3, 10),
        min_child_weight=trial.suggest_int("min_child_weight", 1, 100),
        subsample=trial.suggest_float("subsample", 0.5, 1.0),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.3, 1.0),
        gamma=trial.suggest_float("gamma", 0, 1.0),
        reg_alpha=trial.suggest_float("reg_alpha", 0.0, 1.0),
        reg_lambda=trial.suggest_float("reg_lambda", 0.0, 1.0),
    )

    # Settings that are identical for every trial.
    fixed_settings = dict(
        objective="reg:squarederror",
        eval_metric="rmse",
        n_jobs=-1,
        random_state=42,
    )

    regressor = xgb.XGBRegressor(**search_space, **fixed_settings)

    # Cross-validated R^2 on the (already scaled) training data.
    fold_r2 = cross_val_score(
        regressor, X_train_scaled, y_train, cv=5, scoring="r2"
    )
    return fold_r2.mean()

In [10]:
# 5) Optimisation
# -----------------------------------------------------------------------------
# Seeded TPE sampler; the study maximises the value returned by `objective`.
sampler = TPESampler(seed=42)
study = optuna.create_study(sampler=sampler, direction="maximize")
study.optimize(objective, show_progress_bar=True, n_trials=100)

# Report the best hyper-parameters, one per line.
print("\nüéØ En iyi parametreler:")
for param_name, param_value in study.best_params.items():
    print(f"  {param_name}: {param_value}")

# Best objective value found, i.e. the best mean cross-validated score.
print(f"\nOrtalama CV R¬≤: {study.best_value:.4f}")

[I 2025-10-22 14:30:40,491] A new study created in memory with name: no-name-4e7db41e-daef-47e1-aaa4-83de04db6c19


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-10-22 14:30:42,786] Trial 0 finished with value: 0.6170883655548096 and parameters: {'n_estimators': 362, 'learning_rate': 0.07969454818643935, 'max_depth': 8, 'min_child_weight': 60, 'subsample': 0.5780093202212182, 'colsample_bytree': 0.40919616423534183, 'gamma': 0.05808361216819946, 'reg_alpha': 0.8661761457749352, 'reg_lambda': 0.6011150117432088}. Best is trial 0 with value: 0.6170883655548096.
[I 2025-10-22 14:30:46,952] Trial 1 finished with value: 0.08252892494201661 and parameters: {'n_estimators': 596, 'learning_rate': 0.0010994335574766201, 'max_depth': 10, 'min_child_weight': 84, 'subsample': 0.6061695553391381, 'colsample_bytree': 0.42727747704497043, 'gamma': 0.18340450985343382, 'reg_alpha': 0.3042422429595377, 'reg_lambda': 0.5247564316322378}. Best is trial 0 with value: 0.6170883655548096.
[I 2025-10-22 14:30:49,893] Trial 2 finished with value: 0.6468300938606262 and parameters: {'n_estimators': 402, 'learning_rate': 0.0038234752246751854, 'max_depth': 7, 'm

In [5]:
# 6) Refit the best model
# -----------------------------------------------------------------------------
# Start from the best hyper-parameters found by the study and add the settings
# that were not part of the search.
best_params = study.best_params
refit_settings = {
    **best_params,
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "random_state": 42,
}
best_model = xgb.XGBRegressor(**refit_settings)

# Train on the full (scaled) training split.
best_model.fit(X_train_scaled, y_train)

In [11]:
# 7) Test-set performance
# -----------------------------------------------------------------------------
# Predict on the held-out (scaled) test rows and compare with the true targets.
y_pred = best_model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Same three report lines as before, emitted with a single print call.
print(
    "\nüìà Model Performansƒ±:",
    f"  R¬≤ Score: {r2:.4f}",
    f"  Mean Squared Error (MSE): {mse:.4f}",
    sep="\n",
)


üìà Model Performansƒ±:
  R¬≤ Score: 0.8599
  Mean Squared Error (MSE): 0.0145


In [12]:
# 8) Feature importances
# -----------------------------------------------------------------------------
# Pair every feature name with the fitted model's importance score and sort
# from most to least important.
feat_imp_df = pd.DataFrame({
    "Feature": X.columns,
    "Importance": best_model.feature_importances_
}).sort_values(by="Importance", ascending=False)

# Make sure the output directory exists before writing; to_csv does not create
# missing directories and would fail otherwise.
output_dir = "C:/Users/borac/Desktop/LUAD_miRNA/output/feature_selection/xg_boost"
os.makedirs(output_dir, exist_ok=True)
output_stem = f"{output_dir}/LUAD_mirna_XGBoost_SelectedFeatures"

# The same table is written twice: tab-separated (.txt) and comma-separated (.csv).
feat_imp_df.to_csv(f"{output_stem}.txt", sep='\t', index=False)
feat_imp_df.to_csv(f"{output_stem}.csv", index=False)

# NOTE(review): feat_imp_df holds every column of X (no importance threshold
# is applied), so the count printed below is the total number of features,
# not a filtered selection -- confirm this is what downstream steps expect.
print(f"\nToplam se√ßilen √∂zellik sayƒ±sƒ±: {len(feat_imp_df)}")
print(feat_imp_df.head(10))


Toplam se√ßilen √∂zellik sayƒ±sƒ±: 1881
           Feature  Importance
286    hsa-mir-210    0.038528
1848   hsa-mir-9-1    0.025422
1718  hsa-mir-6892    0.024076
1126   hsa-mir-503    0.019109
466   hsa-mir-3200    0.019020
74    hsa-mir-1247    0.018821
100   hsa-mir-1266    0.018553
1447   hsa-mir-615    0.018164
1458   hsa-mir-625    0.017193
1877    hsa-mir-96    0.015611
