# In [None]:
# ----------------------------
# Script: boruta_feature_selection.py
# Purpose: Perform Boruta feature selection on lncRNA expression data
# Author: [Your Name]
# ----------------------------

import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# ----------------------------
# Step 1: Load Data
# ----------------------------

# Expression matrix, indexed by the first CSV column.
# Intended layout: samples as rows, genes as columns (fixed up below if not).
data = pd.read_csv("data/batch_corrected_lncRNA_expression.csv", index_col=0)

# Labels file: binary classification (e.g., tumor = 1, normal = 0),
# indexed by the first CSV column (expected to hold the sample IDs).
labels = pd.read_csv("data/sample_labels.csv", index_col=0)
n_samples = len(labels)

# Put samples on the rows. A row-count mismatch means genes are on the rows.
# When both axes have n_samples entries the shapes cannot tell the layouts
# apart, so check which axis actually carries the sample IDs from the labels.
rows_are_samples = data.index.isin(labels.index).all()
cols_are_samples = data.columns.isin(labels.index).all()
if data.shape[0] != n_samples:
    data = data.T
elif data.shape[1] == n_samples and cols_are_samples and not rows_are_samples:
    data = data.T

# Fail early with a clear message instead of deep inside the Boruta fit.
if data.shape[0] != n_samples:
    raise ValueError(
        f"Expression matrix has shape {data.shape} but the labels file has "
        f"{n_samples} samples; neither axis matches the number of labels."
    )

# Pair every expression row with its own label. Relying on file order alone
# silently mislabels samples when the two CSVs are sorted differently.
if not data.index.equals(labels.index):
    if labels.index.is_unique and data.index.isin(labels.index).all():
        labels = labels.loc[data.index]
    else:
        # IDs cannot be matched: keep the positional pairing, but say so.
        warnings.warn(
            "Sample IDs in the expression matrix and the labels file do not "
            "match; labels are paired with samples by position."
        )

y = labels['label'].values
X = data.values
feature_names = data.columns

# ----------------------------
# Step 2: Preprocessing
# ----------------------------

# Standardize every feature column: learn the per-feature statistics from X,
# then apply them to the same matrix.
# NOTE(review): tree-based models are generally insensitive to per-feature
# rescaling, so this step is probably not required for Boruta - confirm
# before relying on it.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# ----------------------------
# Step 3: Initialize Random Forest and Boruta
# ----------------------------

# Base learner handed to Boruta: a seeded random forest that uses all
# available cores and weights classes inversely to their frequency.
rf = RandomForestClassifier(
    n_estimators=1000,
    random_state=42,
    n_jobs=-1,
    class_weight='balanced',
)

# Boruta wrapper around the forest, seeded for reproducibility and verbose.
# NOTE(review): with n_estimators='auto' Boruta picks the number of trees
# itself, which presumably supersedes the 1000 set on `rf` - confirm.
boruta_selector = BorutaPy(
    estimator=rf,
    n_estimators='auto',
    verbose=2,
    random_state=42,
)

# ----------------------------
# Step 4: Run Boruta
# ----------------------------

# Fit the selector on the standardized matrix and the class labels.
boruta_selector.fit(X=X_scaled, y=y)

# ----------------------------
# Step 5: Get Important Features
# ----------------------------

# Get names of selected features (support_ is used as a mask over the columns)
selected_features = feature_names[boruta_selector.support_]

# Save selected features. Create the output directory first so a missing
# "results/" folder cannot make the script fail after the long Boruta run.
os.makedirs("results", exist_ok=True)
pd.Series(selected_features).to_csv("results/boruta_selected_lncRNAs.csv", index=False)

# Optional: Plot feature rankings
# NOTE: despite its name, `importances` holds Boruta's per-feature ranks
# (ranking_), not importance scores; `sorted_idx` orders the features by
# ascending rank (presumably best first - see the BorutaPy docs).
importances = boruta_selector.ranking_
sorted_idx = np.argsort(importances)

plt.figure(
