In [1]:
!pip install deap



In [None]:

import os
import random

import numpy as np
import pandas as pd
from deap import base, creator, tools, algorithms
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split


# One seed shared by every source of randomness in this script.
RANDOM_SEED = 42
# DEAP's built-in operators (tools.cxTwoPoint, tools.mutFlipBit,
# tools.selTournament) and algorithms.eaSimple draw from Python's `random`
# module, so it has to be seeded for the GA run to be reproducible.
random.seed(RANDOM_SEED)
# NumPy's global generator backs the `attr_bool` gene initializer
# (np.random.randint) registered on the toolbox.
np.random.seed(RANDOM_SEED)


# Load the feature table. The code below relies on the columns 'SMILES',
# 'Label' and 'ref' being present; every other column is used as a feature.
data_path = "Dataset_feature.csv"
data = pd.read_csv(data_path)

# Rows whose 'ref' equals 'DILIrank' form the held-out test set; all other
# rows are used for training and for the GA's cross-validation.
train_data = data[data['ref'] != 'DILIrank']
test_data = data[data['ref'] == 'DILIrank']

# Fail fast: an empty split would otherwise only show up as an error raised
# from inside scikit-learn, and for the test split only after the whole GA
# run has already finished.
if train_data.empty or test_data.empty:
    raise ValueError(
        f"Empty split from {data_path!r}: {len(train_data)} training rows, "
        f"{len(test_data)} 'DILIrank' test rows"
    )

X_train = train_data.drop(['SMILES', 'Label', 'ref'], axis=1)
y_train = train_data['Label']
X_test = test_data.drop(['SMILES', 'Label', 'ref'], axis=1)
y_test = test_data['Label']

# Sanity check only: both frames are row subsets of `data` with the same
# columns dropped, so their column lists match by construction.
assert list(X_train.columns) == list(X_test.columns), "训练集和测试集特征列不一致"


# Genetic-algorithm settings, consumed by the population setup and the
# algorithms.eaSimple call further down.
N_GENERATIONS = 20  # passed as `ngen`
POP_SIZE = 50  # number of individuals in the population
P_CROSSOVER = 0.8  # passed as `cxpb`
P_MUTATION = 0.1  # passed as `mutpb` (distinct from mutFlipBit's `indpb`)


def evaluate(individual):
    """Score a feature mask by cross-validated random-forest accuracy.

    Args:
        individual: Sequence of genes, one per column of the module-level
            ``X_train``; a gene equal to 1 keeps the column at that position.

    Returns:
        A one-element tuple holding the mean 5-fold accuracy of a
        ``RandomForestClassifier`` on the kept columns of ``X_train`` against
        ``y_train``, or ``(0.0,)`` when no column is kept.
    """
    kept = [idx for idx, gene in enumerate(individual) if gene == 1]
    if not kept:
        # An all-zero mask leaves nothing to train on; give it the worst score.
        return (0.0,)

    fold_scores = cross_val_score(
        RandomForestClassifier(random_state=RANDOM_SEED),
        X_train.iloc[:, kept],
        y_train,
        cv=5,
        scoring='accuracy',
    )
    return (np.mean(fold_scores),)

# Single-objective fitness with a positive weight, i.e. the accuracy returned
# by `evaluate` is maximised. An individual is a plain list carrying that
# fitness.
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()

# Fitness function and genetic operators.
toolbox.register("evaluate", evaluate)
toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)

# Initialisers: each gene is drawn with np.random.randint(2), an individual
# holds one gene per feature column, and a population is a list of
# individuals.
toolbox.register("attr_bool", np.random.randint, 2)
toolbox.register(
    "individual",
    tools.initRepeat,
    creator.Individual,
    toolbox.attr_bool,
    n=X_train.shape[1],
)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)


# Random initial population of POP_SIZE feature masks.
population = toolbox.population(n=POP_SIZE)

# eaSimple replaces the whole population every generation (no elitism), so
# the best mask found can be lost to later crossover/mutation. The hall of
# fame keeps a copy of the best individual seen in any generation.
hall_of_fame = tools.HallOfFame(1)
population, logbook = algorithms.eaSimple(
    population,
    toolbox,
    cxpb=P_CROSSOVER,
    mutpb=P_MUTATION,
    ngen=N_GENERATIONS,
    halloffame=hall_of_fame,
    verbose=True,
)

# Best mask over the entire run, not just the final generation.
best_individual = hall_of_fame[0]
# Positions of the genes set to 1, then the matching column labels.
selected_features = [i for i, bit in enumerate(best_individual) if bit == 1]
selected_columns = X_train.columns[selected_features]




# Restrict both splits to the columns chosen by the GA.
X_train_selected = X_train.loc[:, selected_columns]
X_test_selected = X_test.loc[:, selected_columns]

# Fit the final classifier on the full training split with the same seed
# that is used inside `evaluate`.
model = RandomForestClassifier(random_state=RANDOM_SEED).fit(
    X_train_selected, y_train
)

# Accuracy on the held-out test split.
y_pred = model.predict(X_test_selected)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)



# Identifier/label columns first, followed by the GA-selected features.
output_columns = ['SMILES', 'Label', 'ref', *selected_columns.tolist()]

final_train_data = train_data[output_columns]
final_test_data = test_data[output_columns]
train_data_selected = train_data[output_columns]
test_data_selected = test_data[output_columns]

# Stack the training rows above the test rows; the 'ref' column still
# distinguishes the two splits in the combined table.
combined_data = pd.concat([train_data_selected, test_data_selected])

# Write the reduced dataset without the DataFrame index.
save_path_combined = "Features.csv"
combined_data.to_csv(save_path_combined, index=False)


gen	nevals
0  	50    
1  	40    
2  	40    
3  	38    
4  	40    
5  	43    
6  	34    
7  	42    
8  	37    
9  	43    
10 	46    
11 	40    
12 	43    
13 	45    
14 	39    
15 	38    
16 	44    
17 	42    
18 	41    
19 	47    
20 	47    
最佳特征子集: Index(['nhyd', 'nhet', 'ncof', 'ncocl', 'ncobr', 'ncoi', 'ncarb', 'nphos',
       'nsulph', 'nring',
       ...
       'Estate59', 'Estate61', 'Estate63', 'Estate66', 'Estate67', 'Estate69',
       'Estate70', 'Estate71', 'Estate72', 'Estate76'],
      dtype='object', length=218)
选择的特征数量: 218
测试集准确率: 0.9093
筛选后的数据集（训练+测试）已保存到: /Users/ggcl7/Desktop/药机器学习特征/selected_combined_features.csv
