In [2]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.2-cp313-cp313-win_amd64.whl (8.7 MB)
   ---------------------------------------- 0.0/8.7 MB ? eta -:--:--
   - -------------------------------------- 0.3/8.7 MB ? eta -:--:--
   --- ------------------------------------ 0.8/8.7 MB 2.0 MB/s eta 0:00:05
   -------- ------------------------------- 1.8/8.7 MB 3.1 MB/s eta 0:00:03
   ------------- -------------------------- 2.9/8.7 MB 3.7 MB/s eta 0:00:02
   ------------------- -------------------- 4.2/8.7 MB 4.2 MB/s eta 0:00:02
   -------------------------- ------------- 5.8/8.7 MB 4.7 MB/s eta 0:00:01
   -------------------------------- ------- 7.1/8.7 MB 5.0 MB/s eta 0:00:01
   ---------


[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

In [12]:
# Read the complete feature table, label column included
df = pd.read_csv("D:\\python_progs\\Final_year_proj\\Datasets\\features_with_target.csv")

# Label vector
y = df['TARGET']

# Feature matrix: everything except the label and the 'Protein' / 'Gene Names'
# columns, with missing values replaced by 0
X = df.drop(columns=['TARGET', 'Protein', 'Gene Names']).fillna(0)

# Standardise the features, then wrap the resulting array back into a
# DataFrame so the column names stay attached
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")

Features (X) shape: (20420, 36)
Target (y) shape: (20420,)


### SelectKBest

In [14]:
# mutual_info_classif draws random numbers internally, so an unseeded scorer can
# return a different top-18 every time this cell is re-run. Pin the seed (42, the
# same value the random forests in this notebook use) to keep the selection stable.
def seeded_mutual_info(features, labels):
    """Score features with mutual_info_classif using a fixed random_state."""
    return mutual_info_classif(features, labels, random_state=42)


# We will select the top 18 features
selector_kbest = SelectKBest(score_func=seeded_mutual_info, k=18)
selector_kbest.fit(X_scaled, y)

# Get the list of the top 18 feature names
kbest_features = X.columns[selector_kbest.get_support()]

print("--- Top 18 Features (SelectKBest) ---")
print(list(kbest_features))

--- Top 18 Features (SelectKBest) ---
['Negative_Residues', 'AAC_Q', 'AAC_M', 'AAC_G', 'AAC_C', 'AAC_P', 'AAC_S', 'AAC_I', 'AAC_K', 'AAC_V', 'AAC_Y', 'AAC_D', 'AAC_N', 'Degree', 'Betweenness_Centrality', 'Closeness_Centrality', 'Eigenvector_Centrality', 'GO_Essential_Score']


### Recursive Feature Elimination (RFE)

In [8]:
# Estimator handed to RFE (seeded so the run is repeatable)
model = RandomForestClassifier(n_estimators=50, random_state=42)

# Eliminate one feature per step until 18 remain
selector_rfe = RFE(estimator=model, n_features_to_select=18, step=1).fit(X_scaled, y)

# Names of the 18 features RFE kept
rfe_features = X.columns[selector_rfe.get_support()]

print("--- Top 18 Features (RFE) ---")
print(rfe_features.tolist())

--- Top 18 Features (RFE) ---
['Molecular_Weight', 'Instability_Index', 'Net_Charge_pH7', 'AAC_Q', 'AAC_R', 'AAC_M', 'AAC_G', 'AAC_L', 'AAC_W', 'AAC_P', 'AAC_S', 'AAC_I', 'AAC_K', 'AAC_A', 'AAC_E', 'AAC_V', 'AAC_D', 'AAC_N']


### Random Forest Importance

In [9]:
# Fit a seeded random forest on all scaled features
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_scaled, y)

# Importance score of every feature, in column order
importances = rf_model.feature_importances_

# Pair each feature name with its score and rank from most to least important
rf_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

# Names of the 18 highest-ranked features
rf_features = rf_importance_df['Feature'].head(18)

print("--- Top 18 Features (Random Forest) ---")
print(rf_features.tolist())

--- Top 18 Features (Random Forest) ---
['AAC_I', 'AAC_K', 'AAC_V', 'AAC_S', 'Instability_Index', 'AAC_P', 'AAC_D', 'Shannon_Entropy', 'AAC_Q', 'AAC_M', 'AAC_T', 'Net_Charge_pH7', 'AAC_N', 'AAC_F', 'AAC_G', 'AAC_W', 'AAC_H', 'AAC_E']


In [10]:
# Get the sets of top features
kbest_set = set(kbest_features)
rfe_set = set(rfe_features)
rf_set = set(rf_features)

# Find features that appear in ALL THREE lists (highest confidence)
common_in_all_3 = kbest_set & rfe_set & rf_set
print("\n--- Features in Top 18 for ALL 3 Methods (High Confidence) ---")
# Sets of strings have no stable iteration order across kernel restarts, so the
# names are sorted before printing to keep the displayed output reproducible.
print(sorted(common_in_all_3))

# Find features that appear in at least TWO lists (strong confidence):
# the union of the three pairwise intersections
common_in_2_or_more = (
    (kbest_set & rfe_set)
    | (kbest_set & rf_set)
    | (rfe_set & rf_set)
)

print("\n--- Features in Top 18 for at Least 2 Methods (Strong Confidence) ---")
print(sorted(common_in_2_or_more))


--- Features in Top 18 for ALL 3 Methods (High Confidence) ---
['AAC_E', 'AAC_Q', 'AAC_K', 'AAC_M', 'AAC_V', 'AAC_G', 'AAC_D', 'AAC_I', 'AAC_N']

--- Features in Top 18 for at Least 2 Methods (Strong Confidence) ---
['AAC_E', 'AAC_Q', 'AAC_M', 'AAC_G', 'AAC_D', 'AAC_I', 'AAC_N', 'AAC_P', 'AAC_S', 'Instability_Index', 'AAC_W', 'AAC_K', 'AAC_V', 'Net_Charge_pH7']


### Feature Selection Rationale

This dataset contains the final 18 features selected for model training. The goal was to create a robust and interpretable model by balancing statistical performance with domain knowledge.

The features were chosen using a hybrid approach based on three selection methods (SelectKBest, RFE, and Random Forest Importance):

1.  **High-Confidence (9 features):** These are 9 Amino Acid Composition (AAC) features (`AAC_E`, `AAC_Q`, `AAC_K`, `AAC_M`, `AAC_V`, `AAC_G`, `AAC_D`, `AAC_I`, `AAC_N`) that ranked in the top 18 under *all three* selection methods. This indicates they are very strong and reliable predictors.
2.  **Domain Knowledge (4 features):** We manually included 4 features that are highly relevant to protein essentiality:
    * `GO_Essential_Score` (Annotation data)
    * `Degree_Centrality` (Network hub status)
    * `Betweenness_Centrality` (Network bottleneck status)
    * `Eigenvector_Centrality` (Network influence)
3.  **Strong-Confidence (5 features):** To complete the set, we included 5 features (`Instability_Index`, `Net_Charge_pH7`, `AAC_P`, `AAC_S`, and `AAC_W`) that were identified by at least two selection methods, balancing out the list with key physicochemical properties.

In [15]:
# Define the final 18 features we selected
final_18_features = [
    # High-Confidence (All 3 Methods)
    'AAC_E', 'AAC_Q', 'AAC_K', 'AAC_M', 'AAC_V', 'AAC_G',
    'AAC_D', 'AAC_I', 'AAC_N',

    # Domain Knowledge (manually included)
    # NOTE(review): the SelectKBest output printed earlier in this notebook lists the
    # degree feature as 'Degree'; confirm the CSV column is really 'Degree_Centrality'.
    'GO_Essential_Score', 'Degree_Centrality', 'Betweenness_Centrality', 'Eigenvector_Centrality',

    # Strong-Confidence (2 Methods)
    'Instability_Index', 'Net_Charge_pH7', 'AAC_P', 'AAC_S', 'AAC_W'
]

# Define the target column
target_column = 'TARGET'

# Sanity-check the hand-written list before anything is written to disk
assert len(final_18_features) == 18, f"expected 18 features, got {len(final_18_features)}"
assert len(set(final_18_features)) == len(final_18_features), "duplicate feature names in final_18_features"

# Fail early, naming every column that is absent from the loaded frame
selected_columns = final_18_features + [target_column]
missing_columns = [col for col in selected_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"Columns not found in df: {missing_columns}")

# Create the final DataFrame
df_model_ready = df[selected_columns]

# Save the new, model-ready dataset
df_model_ready.to_csv("D:\\python_progs\\Final_year_proj\\Datasets\\best_features_dataset.csv", index=False)