In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
"""
Recursive Feature Elimination (RFE) using a Random Forest Regressor. The input is a .csv file with SMILES strings in the first column and, in the
following two columns, whether each molecule inhibits PKM2 or ERK2. The output is a .csv file in which only the most important descriptors remain.
"""
# Read the molecule table from disk
df = pd.read_csv('C:/Users/20192891/Documents/Master/Q4/8CC00/Assignment 3/filtered_molecules222.csv')

# Features (X): every column except the SMILES string and the two inhibition labels.
# Targets (y): one series per inhibition label.
X = df.drop(['SMILES', 'PKM2_inhibition', 'ERK2_inhibition'], axis=1)
y_pkm2, y_erk2 = df['PKM2_inhibition'], df['ERK2_inhibition']

def perform_rfe(X, y, n_features_to_select=30, random_state=None):
    """
    Perform Recursive Feature Elimination (RFE) using a Random Forest Regressor.

    RFE is a feature selection technique that repeatedly fits a model and removes the weakest
    descriptors until the requested number of descriptors is reached. This helps in identifying
    the most important features for predicting the target variable, i.e. whether the molecules
    inhibit PKM2 or ERK2.

    Parameters:
    - X (pd.DataFrame): The input features.
    - y (pd.Series): The target variable.
    - n_features_to_select (int): The number of features to select. Default is 30.
    - random_state (int or None): Seed forwarded to the Random Forest. Pass an integer to make
      the selection reproducible across runs; the default (None) leaves the forest unseeded,
      so the selected features may differ from run to run.

    Returns:
    - selected_features (pd.Index): The names of the selected features.
    """
    # Seeding the forest is what makes the feature ranking (and thus the selection) repeatable.
    rf_regressor = RandomForestRegressor(n_estimators=100, random_state=random_state)
    rfe = RFE(estimator=rf_regressor, n_features_to_select=n_features_to_select)
    rfe.fit(X, y)

    # rfe.support_ is a boolean mask over the columns of X marking the retained features.
    selected_features = X.columns[rfe.support_]
    return selected_features

# Perform RFE for each target separately
selected_features_pkm2 = perform_rfe(X, y_pkm2)
selected_features_erk2 = perform_rfe(X, y_erk2)

# Combine the selected features (union of both selections).
# The set is only used for membership tests; the final list follows the column order of X.
# Iterating over the set directly would yield an arbitrary order that can change between
# interpreter runs (string hashing is randomised per process by default), which would make
# the column order of the output file non-reproducible.
selected_union = set(selected_features_pkm2) | set(selected_features_erk2)
combined_selected_features = [column for column in X.columns if column in selected_union]

# Filter the DataFrame to include only the combined selected features and necessary columns
columns_to_keep = combined_selected_features + ['SMILES', 'PKM2_inhibition', 'ERK2_inhibition']
filtered_df = df[columns_to_keep]

# Save the filtered dataframe to a CSV file
output_file = 'C:/Users/20192891/Documents/Master/Q4/8CC00/Assignment 3/filtered_molecules_top_features.csv'
filtered_df.to_csv(output_file, index=False)

print(f"Filtered data saved to {output_file}.")

# Optionally, print selected features for verification
print("\nSelected features for PKM2_inhibition:")
print(selected_features_pkm2)
print("\nSelected features for ERK2_inhibition:")
print(selected_features_erk2)
