In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import datasets
from sklearn.metrics import accuracy_score
from random import choices, randint, randrange, random, uniform
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from operator import itemgetter

# Candidate population sizes; main() runs one search per (population, generation) pair.
population_size = [5, 7]
# Candidate generation counts, combined with every population size above.
generation_size = [10, 20]
# Probability handed to mutation() for each child; read as a global by genetic_algo().
mutation_rate = 0.02

# Generate a list of 0s and 1s whose length equals the number of columns (genes) in the dataset
def generate_chromosome(x_df):
    """Return a random chromosome: one 0/1 gene for every column of ``x_df``."""
    genes = []
    for _ in range(x_df.shape[1]):
        genes.append(randint(0, 1))
    return genes

def init_population(population_size, x_df):
    """Create ``population_size`` random chromosomes, each sized to the columns of ``x_df``."""
    return [generate_chromosome(x_df) for _ in range(population_size)]

def select_feature(chromosome, x_df):
    """Return the columns of ``x_df`` whose gene in ``chromosome`` equals 1.

    Parameters
    ----------
    chromosome : sequence of int
        Gene ``i`` controls column ``i`` of ``x_df``; only genes equal to 1 select.
    x_df : pandas.DataFrame
        Full feature table.

    Returns
    -------
    pandas.DataFrame
        A copy holding the selected columns in their original order. When no
        gene is 1 the result keeps the row index of ``x_df`` and has no columns.

    Raises
    ------
    IndexError
        If a gene equal to 1 sits at a position beyond the last column.
    """
    # One positional selection instead of inserting columns one at a time:
    # this avoids building a fragmented frame and also works when column
    # labels are duplicated (label lookup would return several columns).
    selected_positions = [pos for pos, gene in enumerate(chromosome) if gene == 1]
    return x_df.iloc[:, selected_positions].copy()

In [None]:
def fitness_function(chromosome, x_df, y_df, random_state=None):
    """Score a chromosome by the hold-out accuracy of a decision tree.

    Parameters
    ----------
    chromosome : list of int
        0/1 feature mask. An all-zero mask is repaired IN PLACE by setting its
        first gene to 1, because a model cannot be trained on zero features;
        the caller's list is therefore modified in that case.
    x_df : pandas.DataFrame
        Full feature table; columns are picked with ``select_feature``.
    y_df : pandas.DataFrame or pandas.Series
        Target values aligned with ``x_df``.
    random_state : int or None, optional
        Forwarded to ``train_test_split`` and ``DecisionTreeClassifier``.
        The default ``None`` keeps the original behaviour (a fresh random
        split and tree on every call, so repeated scores differ); pass an
        integer to make the score reproducible.

    Returns
    -------
    float
        Accuracy of the fitted tree on the held-out part of the split.
    """
    # Repair an empty selection so that at least one feature is used.
    if all(gene == 0 for gene in chromosome):
        chromosome[0] = 1

    selected_df = select_feature(chromosome, x_df)
    xtrain, xtest, ytrain, ytest = train_test_split(
        selected_df, y_df, random_state=random_state
    )

    clf = DecisionTreeClassifier(random_state=random_state)
    clf.fit(xtrain, ytrain)
    return accuracy_score(ytest, clf.predict(xtest))

In [None]:
def selection(population, x_df, y_df):
    """Keep the better-scoring half of ``population``.

    Every chromosome is scored with ``fitness_function``; the chromosomes are
    then returned from highest to lowest score, truncated to
    ``int(len(population) / 2)`` entries.
    """
    scores = [fitness_function(member, x_df, y_df) for member in population]
    ranked = np.argsort(scores)[::-1]  # indices ordered from best to worst score
    keep = int(len(population) / 2)
    return [population[idx] for idx in ranked[:keep]]

# Roulette-wheel selection of the two parents used for crossover
def select_parent(population, x_df, y_df):
    """Draw two parents with probability proportional to their fitness.

    Parameters
    ----------
    population : list of list of int
        Candidate chromosomes; each one is scored with ``fitness_function``.
    x_df, y_df
        Feature table and targets forwarded to ``fitness_function``.

    Returns
    -------
    list
        Two chromosomes drawn with replacement (the same chromosome may be
        returned twice).

    Raises
    ------
    ValueError
        If ``population`` is empty.
    """
    if not population:
        raise ValueError("select_parent requires a non-empty population")

    fit_score = [fitness_function(chromosome, x_df, y_df) for chromosome in population]

    # random.choices accepts relative weights, so the scores need no explicit
    # normalisation (which divided by zero when every score was 0).
    if sum(fit_score) > 0:
        return choices(population, weights=fit_score, k=2)

    # Every candidate scored 0: no candidate is preferable, so draw uniformly.
    return choices(population, k=2)


def crossover(parent1, parent2):
    """Single-point crossover of two chromosomes.

    A cut position between 1 and ``len(parent1) - 1`` (inclusive) is drawn and
    the tails of the parents are swapped.

    Parameters
    ----------
    parent1, parent2 : list
        Parent chromosomes; the cut position is derived from ``parent1``.

    Returns
    -------
    tuple of list
        ``(offspring1, offspring2)``, always new lists. Parents with fewer than
        two genes have no valid cut position, so plain copies are returned
        instead of letting ``randint(1, 0)`` raise ``ValueError``.
    """
    if len(parent1) < 2:
        return parent1[:], parent2[:]

    cross_over_point = randint(1, len(parent1) - 1)
    offspring1 = parent1[:cross_over_point] + parent2[cross_over_point:]
    offspring2 = parent2[:cross_over_point] + parent1[cross_over_point:]
    return offspring1, offspring2

# Mutation: flip at most one gene of a child
def mutation(child1, mut_prob):
    """With probability ``mut_prob`` flip one randomly chosen gene of ``child1``.

    The list is modified in place and also returned. A gene equal to 1 becomes
    0; any other value becomes 1.
    """
    # The position is drawn before the probability check, so one randint() call
    # is consumed on every invocation whether or not a flip happens.
    position = randint(0, len(child1) - 1)
    if random() < mut_prob:
        child1[position] = 0 if child1[position] == 1 else 1
    return child1

# Main algorithm: produce the next generation from the current one
def genetic_algo(population, generation, population_size, x_df, y_df):
    """Build the next generation by selection, crossover and mutation.

    Parameters
    ----------
    population : list of list of int
        Current chromosomes.
    generation : int
        Not used by this function; kept so existing callers keep working.
    population_size : int
        Number of chromosomes the returned generation must contain.
    x_df, y_df
        Feature table and targets forwarded to the fitness evaluation.

    Returns
    -------
    list of list of int
        Exactly ``population_size`` children. Children are produced in pairs,
        so for an odd size the surplus child of the last pair is dropped
        (previously an odd size silently lost one individual).

    Notes
    -----
    The mutation probability is read from the module-level ``mutation_rate``.
    """
    popul = selection(population, x_df, y_df)  # better-scoring half acts as the mating pool
    new_population = []
    while len(new_population) < population_size:
        parent1, parent2 = select_parent(popul, x_df, y_df)
        child1, child2 = crossover(parent1, parent2)
        new_population.append(mutation(child1, mutation_rate))
        new_population.append(mutation(child2, mutation_rate))
    return new_population[:population_size]

In [None]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

def multiobjective_fitness_function(chromosome, x_df, y_df):
    """Evaluate a feature mask on two objectives.

    Returns a tuple ``(accuracy, inverse_feature_count)`` where ``accuracy`` is
    the hold-out accuracy of a soft-voting ensemble trained on the selected
    columns and ``inverse_feature_count`` is ``1 / (number of selected genes)``.
    An all-zero ``chromosome`` is repaired in place by switching on one
    randomly chosen gene before evaluation.
    """
    # Make sure at least one feature is selected.
    if not sum(chromosome):
        chromosome[randrange(len(chromosome))] = 1

    features = select_feature(chromosome, x_df)
    xtrain, xtest, ytrain, ytest = train_test_split(features, y_df)

    # Members of the ensemble.
    members = [
        ('rf', RandomForestClassifier(n_estimators=100, random_state=1)),
        ('svc', make_pipeline(StandardScaler(), SVC(probability=True, random_state=1))),
        ('lgbm', lgb.LGBMClassifier()),
        ('dt', DecisionTreeClassifier()),
    ]
    # 'soft' voting averages the predicted class probabilities of the members.
    ensemble = VotingClassifier(estimators=members, voting='soft')

    # Fit on the training part and score on the held-out part.
    ensemble.fit(xtrain, ytrain)
    accuracy = accuracy_score(ytest, ensemble.predict(xtest))

    # Second objective: reciprocal of the number of selected features.
    n_selected = sum(chromosome)
    if n_selected:
        inverse_feature_count = 1 / n_selected
    else:
        inverse_feature_count = 0

    return accuracy, inverse_feature_count

In [3]:
def start_genatics(population_size, generation, x_df, y_df):
    """Run the genetic search and record the best chromosome of every generation.

    Parameters
    ----------
    population_size : int
        Number of chromosomes in the initial population.
    generation : int
        Number of generations to evolve.
    x_df, y_df
        Feature table and targets forwarded to the fitness evaluation.

    Returns
    -------
    list of [chromosome, score]
        ``generation + 1`` entries: the best of the initial population followed
        by the best of each evolved generation. ``score`` is the value that
        ranked the chromosome first; it is no longer recomputed on a different
        random split, which used to record a number unrelated to the ranking
        and cost an extra model fit per generation.
    """
    population = init_population(population_size, x_df)
    list_ans = [_best_scored(population, x_df, y_df)]  # best of the initial population

    for _ in range(generation):
        population = genetic_algo(population, generation, population_size, x_df, y_df)
        list_ans.append(_best_scored(population, x_df, y_df))

    return list_ans


def _best_scored(candidates, x_df, y_df):
    """Score every candidate once and return ``[best_chromosome, best_score]``."""
    scores = [fitness_function(member, x_df, y_df) for member in candidates]
    best = int(np.argsort(scores)[::-1][0])  # index of the highest score
    return [candidates[best], scores[best]]

In [4]:
def main(x_df, y_df):
    """Run the genetic search for every (population size, generation count) pair.

    The pairs come from the module-level ``population_size`` and
    ``generation_size`` lists. For each pair the best recorded chromosome is
    re-scored once with ``fitness_function`` and a row
    ``[population, generations, chromosome, total features, selected features, accuracy]``
    is appended to the returned list.
    """
    res = []
    for pop_size in population_size:
        for n_generations in generation_size:
            history = pd.DataFrame(start_genatics(pop_size, n_generations, x_df, y_df))
            # Column 0 holds the chromosome, column 1 its score.
            ranked = history.sort_values(by=1, ascending=False)
            selected_ft = ranked.iloc[0, 0]

            count_OfSelected_ft = selected_ft.count(1)
            print("selected features=", count_OfSelected_ft)

            accuracyScore = fitness_function(selected_ft, x_df, y_df)
            print("accuracy=", accuracyScore)

            res.append([
                pop_size,
                n_generations,
                selected_ft,
                x_df.shape[1],
                count_OfSelected_ft,
                accuracyScore,
            ])
    return res

In [None]:
from deap import base, creator, tools, algorithms

# Two maximised objectives: accuracy and 1 / (number of selected features).
# Maximising the inverse count is what steers the search towards fewer features.
# The hasattr guards stop a re-run of this cell from re-creating the classes.
if not hasattr(creator, "FitnessMulti"):
    creator.create("FitnessMulti", base.Fitness, weights=(1.0, 1.0))
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMulti)


def build_nsga2_toolbox(x_df, y_df):
    """Return a DEAP toolbox wired for feature selection on ``x_df`` / ``y_df``.

    Individuals are 0/1 lists with one gene per column of ``x_df`` and are
    evaluated with ``multiobjective_fitness_function``.
    """
    toolbox = base.Toolbox()
    # `random` in this notebook is the function imported from the random
    # module, so `random.randint` does not exist; use the imported `randint`.
    toolbox.register("attr_bool", randint, 0, 1)
    toolbox.register("individual", tools.initRepeat, creator.Individual,
                     toolbox.attr_bool, n=x_df.shape[1])
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)

    toolbox.register("evaluate", multiobjective_fitness_function, x_df=x_df, y_df=y_df)

    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)

    toolbox.register("select", tools.selNSGA2)
    return toolbox


def nsga2_main(x_df, y_df, n_individuals=100, n_generations=50, cxpb=0.5, mutpb=0.2):
    """Run the NSGA-II feature search and return the final population.

    The data is passed in explicitly because it is loaded in a later cell, and
    the function is deliberately not called ``main`` so it does not replace the
    single-objective ``main(x_df, y_df)`` defined earlier in the notebook.

    Parameters
    ----------
    x_df, y_df
        Feature table and targets.
    n_individuals : int
        Population size; also used as the number of offspring per generation.
    n_generations : int
        Number of generations.
    cxpb, mutpb : float
        Crossover and mutation probabilities; their sum must not exceed 1.

    Returns
    -------
    list
        Individuals of the last generation.
    """
    toolbox = build_nsga2_toolbox(x_df, y_df)
    population = toolbox.population(n=n_individuals)

    # selNSGA2 has to choose survivors from parents + offspring; with eaSimple
    # it was asked to pick all n of n individuals, i.e. no selection pressure.
    population, _logbook = algorithms.eaMuPlusLambda(
        population, toolbox,
        mu=n_individuals, lambda_=n_individuals,
        cxpb=cxpb, mutpb=mutpb, ngen=n_generations,
        stats=None, halloffame=None, verbose=True,
    )
    return population


# Call this after the data-loading cell has run, for example:
#     final_population = nsga2_main(x1_df, y1_df)

**Global Search**

In [5]:
# Load './data/spinels_ff.csv'; the first CSV column becomes the row index.
df1 = pd.read_csv('./data/spinels_ff.csv',index_col=0)

# Label encoding of a 'class' column is currently disabled (left commented out).
# from sklearn import preprocessing
# en=preprocessing.LabelEncoder()            
# df1['class']=en.fit_transform(df1['class'])

# Feature matrix: every column of df1 except the last three.
# NOTE(review): presumably the last three columns are targets/labels — confirm against the CSV.
x1_df = df1.iloc[:,:-3]
x1_df

Unnamed: 0,V_a,R_b1,R_b2,A1_v,B1_v,C1_v,D_b,Ve_b,Ve_c,En_b,...,maxdiff_spheat,Comp_L3Norm,fwtmean_enthalpyAtomization,mode_enthalpyAtomization,max_mp,mode_eden,meandiff_NpValence,maxdiff_elaff,min_elaff,maxdiff_zungerad
0,1.0,0.65,0.65,1.0,1.0,1.0,7.44,7.0,6.0,1.55,...,751.0,0.868,237.846,249.0,1246.0,1.37,3.142,141.00,0.00,2.18
1,2.0,0.67,0.67,1.0,1.0,1.0,12.40,9.0,6.0,2.28,...,689.0,0.868,317.211,249.0,1964.0,1.37,3.142,141.00,0.00,2.05
2,2.0,0.80,0.80,1.0,1.0,1.0,7.31,3.0,6.0,1.78,...,686.0,0.868,237.131,249.0,842.0,1.37,2.928,138.63,2.37,2.53
3,4.0,0.69,0.69,1.0,1.0,1.0,8.91,10.0,6.0,1.91,...,474.0,0.868,330.653,249.0,1455.0,1.37,2.857,29.00,112.00,1.71
4,4.0,0.72,0.72,1.0,1.0,1.0,1.74,2.0,6.0,1.31,...,571.0,0.868,243.280,249.0,1538.0,1.37,3.142,141.00,0.00,1.64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306,2.0,0.87,0.87,1.0,1.0,1.0,6.97,3.0,6.0,1.10,...,866.0,0.868,193.967,227.0,819.0,1.37,3.142,195.00,0.00,2.30
307,2.0,0.62,0.62,1.0,1.0,1.0,7.15,6.0,6.0,1.66,...,247.0,0.868,266.784,197.0,1907.0,1.37,3.142,125.90,64.30,0.77
308,2.0,0.65,0.65,1.0,1.0,1.0,7.44,7.0,6.0,1.55,...,278.0,0.868,211.586,197.0,1246.0,1.37,3.142,190.20,0.00,0.55
309,2.0,0.91,0.91,1.0,1.0,1.0,8.55,3.0,6.0,1.22,...,63.0,0.868,211.443,197.0,1412.0,1.37,3.142,190.20,0.00,2.00


In [7]:
# Target: the last column of df1, kept as a one-column DataFrame
# (the same frame the previous drop-all-other-columns call produced).
y1_df = df1.iloc[:, [-1]]

# One row per (population size, generation count) run of the genetic search.
# The misspelled label 'total_selected_faetues' is kept on purpose so the
# column names match the previously recorded results.
res = pd.DataFrame(
    main(x1_df, y1_df),
    columns=['Population', 'Generation', 'selected_features', 'total_features', 'total_selected_faetues', 'Accuracy'],
)
res

selected features= 39
accuracy= 0.8589743589743589
selected features= 43
accuracy= 0.8076923076923077
selected features= 43
accuracy= 0.9102564102564102
selected features= 37
accuracy= 0.8461538461538461


Unnamed: 0,Population,Generation,selected_features,total_features,total_selected_faetues,Accuracy
0,5,10,"[1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, ...",88,39,0.858974
1,5,20,"[1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, ...",88,43,0.807692
2,7,10,"[1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, ...",88,43,0.910256
3,7,20,"[1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, ...",88,37,0.846154


In [8]:
# Order the runs from highest to lowest reported accuracy; row 0 is the best run.
result=res.sort_values(by='Accuracy',ascending=False) 

In [9]:
# Report the best run (first row after sorting by accuracy).
# Fields are read by column label: integer keys on a string-labelled Series
# rely on a positional fallback that recent pandas versions deprecate.
best_run = result.iloc[0]
s1_ft = best_run['selected_features']
print("selected features is:",s1_ft)
print("Total number of features ",best_run['total_features'] )
print("total selected Features",best_run['total_selected_faetues'])
best_score = best_run['Accuracy']
print("accuracy=",best_score)

selected features is: [1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1]
Total number of features  88
total selected Features 43
accuracy= 0.9102564102564102
