In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from boruta import BorutaPy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# NumPy 2.0 uyumluluk yaması
if not hasattr(np, "int"):   np.int   = int
if not hasattr(np, "float"): np.float = float
if not hasattr(np, "bool"):  np.bool  = bool


ModuleNotFoundError: No module named 'boruta'

In [25]:
# 0) RANDOM STATE sabitle
RSEED = 42

In [26]:
# Veriyi oku
data = pd.read_table(r"C:\Users\borac\Desktop\LUAD_miRNA\input\TCGA_LUAD_miRNA_expression_disease_status.txt", header=0, index_col=0)

# Transpoz al
data = data.transpose()

# Özellikleri ve hedef değişkeni ayır
X = data.iloc[:, :-1]  # Son sütun hariç tüm sütunlar
y = data.iloc[:, -1]   # Son sütun

# Y'yi sayısal değere çevir
y = pd.to_numeric(y, errors='coerce')  # Hatalı dönüşüm varsa NaN olur

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)



scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [27]:
# 7) RF Regresör (Boruta için estimator)

model = RandomForestRegressor(
    n_estimators=500, 
    max_depth =  5, 
    random_state=42, 
    n_jobs=16
)

In [28]:
# 8) Boruta (yalnızca TRAIN verisinde fit!)
feat_selector = BorutaPy(
    estimator=model,
    n_estimators='auto',
    max_iter=350,
    random_state=RSEED,
    verbose=2
)
feat_selector.fit(X_train_scaled, y_train.values)

Iteration: 	1 / 350
Confirmed: 	0
Tentative: 	1881
Rejected: 	0
Iteration: 	2 / 350
Confirmed: 	0
Tentative: 	1881
Rejected: 	0
Iteration: 	3 / 350
Confirmed: 	0
Tentative: 	1881
Rejected: 	0
Iteration: 	4 / 350
Confirmed: 	0
Tentative: 	1881
Rejected: 	0
Iteration: 	5 / 350
Confirmed: 	0
Tentative: 	1881
Rejected: 	0
Iteration: 	6 / 350
Confirmed: 	0
Tentative: 	1881
Rejected: 	0
Iteration: 	7 / 350
Confirmed: 	0
Tentative: 	1881
Rejected: 	0
Iteration: 	8 / 350
Confirmed: 	0
Tentative: 	10
Rejected: 	1871
Iteration: 	9 / 350
Confirmed: 	2
Tentative: 	8
Rejected: 	1871
Iteration: 	10 / 350
Confirmed: 	2
Tentative: 	8
Rejected: 	1871
Iteration: 	11 / 350
Confirmed: 	2
Tentative: 	8
Rejected: 	1871
Iteration: 	12 / 350
Confirmed: 	2
Tentative: 	7
Rejected: 	1872
Iteration: 	13 / 350
Confirmed: 	2
Tentative: 	7
Rejected: 	1872
Iteration: 	14 / 350
Confirmed: 	2
Tentative: 	7
Rejected: 	1872
Iteration: 	15 / 350
Confirmed: 	2
Tentative: 	7
Rejected: 	1872
Iteration: 	16 / 350
Confirmed: 	

0,1,2
,estimator,RandomForestR...0x25E120B4A40)
,n_estimators,'auto'
,perc,100
,alpha,0.05
,two_step,True
,max_iter,350
,random_state,RandomState(M... 0x25E120B4A40
,verbose,2

0,1,2
,n_estimators,80
,criterion,'squared_error'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [151]:
# 9) Seçilen özellik maskesi ve isimleri
mask = feat_selector.support_
selected_features = X_train.columns[mask]

print(f"\nSeçilen özellik sayısı: {mask.sum()} / {X_train.shape[1]}")
print("İlk 20 seçili özellik:", list(selected_features[:20]))


Iteration: 	1 / 350
Confirmed: 	0
Tentative: 	1881
Rejected: 	0
Iteration: 	2 / 350
Confirmed: 	0
Tentative: 	1881
Rejected: 	0
Iteration: 	3 / 350
Confirmed: 	0
Tentative: 	1881
Rejected: 	0
Iteration: 	4 / 350
Confirmed: 	0
Tentative: 	1881
Rejected: 	0
Iteration: 	5 / 350
Confirmed: 	0
Tentative: 	1881
Rejected: 	0
Iteration: 	6 / 350
Confirmed: 	0
Tentative: 	1881
Rejected: 	0
Iteration: 	7 / 350
Confirmed: 	0
Tentative: 	1881
Rejected: 	0
Iteration: 	8 / 350
Confirmed: 	0
Tentative: 	14
Rejected: 	1867
Iteration: 	9 / 350
Confirmed: 	2
Tentative: 	12
Rejected: 	1867
Iteration: 	10 / 350
Confirmed: 	2
Tentative: 	12
Rejected: 	1867
Iteration: 	11 / 350
Confirmed: 	2
Tentative: 	12
Rejected: 	1867
Iteration: 	12 / 350
Confirmed: 	5
Tentative: 	9
Rejected: 	1867
Iteration: 	13 / 350
Confirmed: 	5
Tentative: 	9
Rejected: 	1867
Iteration: 	14 / 350
Confirmed: 	5
Tentative: 	9
Rejected: 	1867
Iteration: 	15 / 350
Confirmed: 	5
Tentative: 	9
Rejected: 	1867
Iteration: 	16 / 350
Confirmed

In [None]:
# 10) Seçimi hem train hem test'e uygula (ölçeklenmiş matrislerde aynı indekslerle)
X_train_sel = X_train_scaled[:, mask]
X_test_sel  = X_test_scaled[:, mask]

In [None]:
# 11) Nihai modeli train_selected ile eğit, test_selected üzerinde değerlendir
model = rf.fit(X_train_sel, y_train.values)
pred_test = model.predict(X_test_sel)

rmse = mean_squared_error(y_test.values, pred_test, squared=False)
r2 = r2_score(y_test.values, pred_test)
print(f"\n------TEST METRİKLERİ------\nRMSE: {rmse:.4f} | R²: {r2:.4f}")

In [None]:
# 12) Özellik destek ve sıralamalarını kaydet
out_txt = "C:\Users\borac\Desktop\LUAD_miRNA\output\feature_selection\boruta\10EKIMBORUTASI"
with open(out_txt, "w", encoding="utf-8") as f:
    f.write("------Support and Ranking for each feature------\n")
    for i, col in enumerate(X_train.columns):
        status = "PASS" if feat_selector.support_[i] else "REJECT"
        f.write(f"{status}\t{col}\tRank={feat_selector.ranking_[i]}\n")

    f.write("\n------Selected Feature List------\n")
    for col in selected_features:
        f.write(f"{col}\n")

In [27]:
print("\n------Support and Ranking for each feature------")
for i in range(len(feat_selector.support_)):
    if feat_selector.support_[i]:
        print("Passes the test: ", X.columns[i],
              " - Ranking: ", feat_selector.ranking_[i])
    else:
        print("Doesn't pass the test: ",
              X.columns[i], " - Ranking: ", feat_selector.ranking_[i])


------Support and Ranking for each feature------
Doesn't pass the test:  hsa-let-7a-1  - Ranking:  248
Doesn't pass the test:  hsa-let-7a-2  - Ranking:  383
Doesn't pass the test:  hsa-let-7a-3  - Ranking:  222
Doesn't pass the test:  hsa-let-7b  - Ranking:  78
Doesn't pass the test:  hsa-let-7c  - Ranking:  296
Passes the test:  hsa-let-7d  - Ranking:  1
Doesn't pass the test:  hsa-let-7e  - Ranking:  292
Doesn't pass the test:  hsa-let-7f-1  - Ranking:  252
Doesn't pass the test:  hsa-let-7f-2  - Ranking:  319
Doesn't pass the test:  hsa-let-7g  - Ranking:  172
Doesn't pass the test:  hsa-let-7i  - Ranking:  685
Doesn't pass the test:  hsa-mir-1-1  - Ranking:  571
Doesn't pass the test:  hsa-mir-1-2  - Ranking:  571
Doesn't pass the test:  hsa-mir-100  - Ranking:  475
Doesn't pass the test:  hsa-mir-101-1  - Ranking:  64
Doesn't pass the test:  hsa-mir-101-2  - Ranking:  38
Doesn't pass the test:  hsa-mir-103a-1  - Ranking:  517
Doesn't pass the test:  hsa-mir-103a-2  - Ranking:  13

In [29]:
# features selected by Boruta
X_filtered = feat_selector.transform(np.array(X))

print("\n------Selected Features------\n")
print(X_filtered)

# train the model
model.fit(X_filtered, y)

# compute predictions
predictions = model.predict(X_filtered)

# create a dataframe with real predictions and values
df = pd.DataFrame({'pred': predictions, 'observed': y})

# let's print the dataframe
print("\n------Predictions and real values------\n")
print(df)

# compute RMSE
mse = ((df['pred'] - df['observed']) ** 2).mean()
rmse = np.sqrt(mse)
print("\n------RMSE------\n", round(rmse, 3))


------Selected Features------

[[ 2888   232  1713 ...    27    20   169]
 [ 2779   170  2531 ...    89    88   221]
 [ 3902   808 20632 ...   273   282  7382]
 ...
 [ 2506    48  3507 ...   897   845    21]
 [ 3631    68  6909 ...  1021   988   397]
 [ 3454    16  6197 ...  7340  7423    57]]

------Predictions and real values------

                                  pred  observed
TCGA-97-7937-01A-11H-2169-13  1.000000         1
TCGA-64-5774-01A-01T-1627-13  1.000000         1
TCGA-MP-A4TK-01A-11H-A24S-13  1.000000         1
TCGA-78-7167-01A-11H-2065-13  1.000000         1
TCGA-49-AAR2-01A-11H-A39B-13  1.000000         1
...                                ...       ...
TCGA-78-7163-11A-01H-2065-13  0.128713         0
TCGA-50-5933-11A-01H-2169-13  0.000000         0
TCGA-50-7109-11A-01H-2038-13  0.000000         0
TCGA-44-7667-11A-01H-2065-13  0.250825         0
TCGA-44-7662-11A-01H-2065-13  0.000000         0

[565 rows x 2 columns]

------RMSE------
 0.04


In [31]:
# Özellik seçim sonuçlarını .txt dosyasına kaydet
with open("boruta_selected_features_test03_maxiter500.txt", "w") as f:
    f.write("------Support and Ranking for each feature------\n")
    for i in range(len(feat_selector.support_)):
        if feat_selector.support_[i]:
            f.write(f"Passes the test:  {X.columns[i]}  - Ranking:  {feat_selector.ranking_[i]}\n")
        else:
            f.write(f"Doesn't pass the test:  {X.columns[i]}  - Ranking:  {feat_selector.ranking_[i]}\n")


In [4]:
import sys, platform
print(sys.executable)  # Bu, kullanılan python.exe’nin tam yolu
print(platform.python_version())


C:\Users\borac\anaconda3\envs\boruta_env\python.exe
3.11.13
