In [1]:
import numpy as np
import pandas as pd
from deap import base, creator, tools, algorithms
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [3]:
# Load the standardized Capstone dataset into a DataFrame.
df = pd.read_csv(
    'C:/Users/Imran/OneDrive - Ontario Tech University/Documents/Capstone/standardized.csv'
)

In [4]:
# 'rar' is the regression target; every other column of df is used as a feature.
# Both are extracted as plain NumPy arrays.
y = df['rar'].values
X = df.drop(columns=['rar']).values

In [5]:
# Down-sample: keep a reproducible random 50% of the rows for the GA experiment.
# train_test_split returns [X_a, X_b, y_a, y_b]; the [1::2] slice keeps only the
# "test" half (X_b, y_b). The other half is deliberately not bound to
# X_train / y_train, so re-running this cell can never clobber the real
# training split created in the next cell.
X_sampled, y_sampled = train_test_split(X, y, test_size=0.5, random_state=42)[1::2]

In [6]:
# Split the sampled rows: 80% are used for fitting, 20% are held out for scoring.
X_train, X_test, y_train, y_test = train_test_split(
    X_sampled,
    y_sampled,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Genetic Algorithm setup
# NOTE: a later cell rebinds `toolbox` and re-registers "attr_bool", "individual"
# and "population", so this configuration is superseded when the notebook is run
# top to bottom.

# creator.create() overwrites an existing class of the same name (with a
# RuntimeWarning), so only create each type once per kernel session.
if not hasattr(creator, "FitnessMin"):
    # A single objective with weight -1.0 means the fitness value is minimised.
    creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMin)

toolbox = base.Toolbox()
# Each gene is 1 with probability 0.86, i.e. roughly 61 of 71 features switched on.
toolbox.register("attr_bool", np.random.choice, [0, 1], p=[0.14, 0.86])  # Roughly select 61 out of 71 features
# One gene per feature column; derived from the data instead of hard-coding 71.
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=X_train.shape[1])
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

In [13]:
import numpy as np
from deap import base, creator, tools, algorithms
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from joblib import Parallel, delayed

In [15]:
# Number of feature columns in the training matrix; used as the GA chromosome length.
num_features = X_train.shape[1]

In [17]:
# Define evaluation function
def evalIndividual(individual, max_features=61, penalty=float("inf")):
    """Score a feature mask by the hold-out MSE of a small random forest.

    Parameters
    ----------
    individual : sequence of 0/1
        Bit mask over the columns of ``X_train`` / ``X_test``; a 1 at
        position ``i`` keeps column ``i``.
    max_features : int, optional
        Exclusive upper bound on the number of selected columns. Masks that
        select ``max_features`` or more columns are rejected (default 61).
    penalty : float, optional
        Fitness returned for rejected masks. It must be worse (larger) than
        any achievable MSE; a small finite value such as 1000 can undercut
        real MSE values and make rejected masks look optimal to the
        minimising GA. Defaults to positive infinity.

    Returns
    -------
    tuple of float
        One-element tuple ``(mse,)``, matching the single-objective fitness
        this function is registered for.
    """
    selected_features = [i for i, bit in enumerate(individual) if bit == 1]

    # Reject masks with too many columns, and empty masks: a model cannot be
    # fitted on zero columns, so treat "nothing selected" as infeasible too.
    if not selected_features or len(selected_features) >= max_features:
        return penalty,

    X_train_sel = X_train[:, selected_features]
    X_test_sel = X_test[:, selected_features]

    # Few trees keep each evaluation cheap; the fixed random_state makes the
    # score of a given mask repeatable.
    model = RandomForestRegressor(n_estimators=10, random_state=42, n_jobs=-1)
    model.fit(X_train_sel, y_train)
    predictions = model.predict(X_test_sel)
    mse = mean_squared_error(y_test, predictions)
    return mse,

# Create the DEAP toolbox and register the necessary functions
toolbox = base.Toolbox()

# Create types
# creator.create() overwrites an existing class of the same name and emits a
# RuntimeWarning. These names are already created by an earlier cell (and again
# whenever this cell is re-run), so create them only when they are missing.
if not hasattr(creator, "FitnessMin"):
    # Single objective with weight -1.0: the GA minimises the fitness value.
    creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMin)

# Attribute generator: each gene is drawn uniformly from {0, 1}.
toolbox.register("attr_bool", np.random.randint, 2)

# Structure initializers: an individual is a list of num_features genes,
# a population is a list of individuals.
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=num_features)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Register operators
toolbox.register("evaluate", evalIndividual)
toolbox.register("mate", tools.cxTwoPoint)  # two-point crossover
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)  # flip each bit with probability 0.05
toolbox.register("select", tools.selTournament, tournsize=3)  # tournaments of 3

def main(pop_size=100, ngen=10, cxpb=0.5, mutpb=0.2, seed=None):
    """Run the genetic algorithm and return the selected feature indices.

    Parameters
    ----------
    pop_size : int, optional
        Number of individuals in the population (default 100).
    ngen : int, optional
        Number of generations to evolve (default 10).
    cxpb : float, optional
        Probability of mating two individuals (default 0.5).
    mutpb : float, optional
        Probability of mutating an individual (default 0.2).
    seed : int or None, optional
        When given, seeds both the stdlib ``random`` module and NumPy's global
        RNG before the population is created so the run is repeatable.
        ``None`` (default) leaves the RNG state untouched.

    Returns
    -------
    list of int
        Column indices whose bit is 1 in the best individual evaluated during
        the whole run.
    """
    import random  # local import: only needed for optional seeding

    if seed is not None:
        # The population initialiser draws from np.random, while DEAP's
        # variation and selection operators draw from the stdlib RNG.
        random.seed(seed)
        np.random.seed(seed)

    population = toolbox.population(n=pop_size)
    # Tournament selection may drop the best individual of a generation, so
    # track the best one ever evaluated separately.
    hall_of_fame = tools.HallOfFame(1)

    for gen in range(ngen):
        offspring = algorithms.varAnd(population, toolbox, cxpb=cxpb, mutpb=mutpb)

        # varAnd invalidates the fitness of every individual it changes;
        # untouched clones keep a valid fitness and need no re-evaluation
        # (this relies on the registered evaluation being deterministic).
        pending = [ind for ind in offspring if not ind.fitness.valid]

        # Parallel evaluation of the fitness
        fits = Parallel(n_jobs=-1)(delayed(toolbox.evaluate)(ind) for ind in pending)

        for fit, ind in zip(fits, pending):
            ind.fitness.values = fit

        hall_of_fame.update(offspring)
        gen_min = min(ind.fitness.values[0] for ind in offspring)
        population = toolbox.select(offspring, k=len(population))
        print(f"Gen {gen}: Min MSE: {gen_min}")

    # With ngen == 0 nothing was evaluated; fall back to the raw population.
    if len(hall_of_fame) > 0:
        best_ind = hall_of_fame[0]
    else:
        best_ind = tools.selBest(population, k=1)[0]
    print("Best individual is %s, %s" % (best_ind, best_ind.fitness.values))

    selected_features = [i for i, bit in enumerate(best_ind) if bit == 1]
    print(f"Selected features: {selected_features}")

    return selected_features

# Run the GA and keep the chosen column indices in a module-level variable;
# later cells use `selected_features` to build the reduced dataset.
if __name__ == "__main__":
    selected_features = main()



Gen 0: Min MSE: (227258.5829753653,)
Gen 1: Min MSE: (214344.18580086503,)
Gen 2: Min MSE: (213343.58795869217,)
Gen 3: Min MSE: (212867.7862209041,)
Gen 4: Min MSE: (212746.63428168918,)
Gen 5: Min MSE: (212111.95898387782,)
Gen 6: Min MSE: (211888.5567965375,)
Gen 7: Min MSE: (210496.47520282282,)
Gen 8: Min MSE: (210496.47520282282,)
Gen 9: Min MSE: (210021.72873743792,)
Best individual is [0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1], (210021.72873743792,)
Selected features: [3, 4, 6, 8, 9, 13, 14, 15, 16, 17, 18, 19, 21, 24, 27, 29, 30, 33, 34, 37, 38, 39, 40, 43, 44, 45, 49, 50, 54, 63, 65, 67, 68, 69, 70]


In [23]:
# Build the reduced dataset: only the GA-selected columns of the full feature
# matrix, labelled by their original column position, plus the target vector.
X_new = X[:, selected_features]
df_new = pd.DataFrame(
    X_new,
    columns=[f'feature_{idx}' for idx in selected_features],
).assign(target=y)

In [25]:
# Display the reduced frame (pandas elides the middle rows/columns of large frames).
df_new

Unnamed: 0,feature_3,feature_4,feature_6,feature_8,feature_9,feature_13,feature_14,feature_15,feature_16,feature_17,...,feature_49,feature_50,feature_54,feature_63,feature_65,feature_67,feature_68,feature_69,feature_70,target
0,-0.124223,-0.239391,-0.361869,0.566677,1.187780,1.162989,0.351855,-1.203647,0.157068,-1.391322,...,-0.347194,-0.812037,-0.017717,-0.082367,0.0,1.008811,1.676534,-0.452861,1.255257,3677.0976
1,-0.124223,-0.239391,0.777414,0.040877,1.187780,2.403230,3.476626,1.462750,-0.262605,0.267194,...,-0.347194,-0.812037,-0.017717,-0.082367,0.0,1.008811,-0.596469,-0.452861,1.255257,27199.4424
2,3.801723,-0.239391,-0.361869,0.172327,1.187780,-0.697372,-0.689735,-0.019571,-0.516507,0.328621,...,-0.347194,-0.812037,-0.017717,-0.082367,0.0,-0.991266,-0.596469,-0.452861,1.255257,22156.0000
3,-0.124223,-0.239391,0.777414,-0.090572,-0.830823,0.542869,2.435036,-0.565470,1.932284,0.328621,...,-0.347194,-0.812037,-0.017717,-0.082367,0.0,1.008811,-0.596469,-0.452861,1.255257,12734.8000
4,-0.124223,-0.239391,-0.361869,-1.536520,-0.578497,-1.317492,-0.689735,-0.124659,0.041658,1.004312,...,-0.347194,1.231470,-0.017717,-0.082367,0.0,-0.991266,-0.596469,-0.452861,1.255257,13556.0800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1348054,-0.124223,-0.239391,-0.361869,0.698126,-0.326172,-0.697372,0.351855,-0.228791,-0.787196,0.390047,...,2.880233,-0.812037,-0.017717,-0.082367,0.0,-0.991266,-0.596469,-0.452861,1.255257,19708.2000
1348055,-0.124223,-0.239391,-0.361869,0.435227,0.683129,-0.077252,0.351855,0.941364,0.157068,0.512900,...,-0.347194,-0.812037,-0.017717,-0.082367,0.0,-0.991266,-0.596469,-0.452861,1.255257,33329.7216
1348056,-0.124223,-0.239391,-0.361869,1.618275,-0.830823,-1.317492,-0.689735,1.204466,0.261986,-2.189866,...,-0.347194,-0.812037,-0.017717,-0.082367,0.0,-0.991266,-0.596469,-0.452861,1.255257,16681.6000
1348057,-0.124223,-0.239391,-0.361869,-0.222022,1.187780,-0.697372,-0.689735,-0.288486,-0.094736,0.082915,...,-0.347194,1.231470,-0.017717,-0.082367,0.0,1.008811,1.676534,2.208183,1.255257,18046.4000


In [28]:
# Persist the reduced dataset as CSV, without writing the row index.
df_new.to_csv(
    'C:/Users/Imran/OneDrive - Ontario Tech University/Documents/Capstone/0.5_test_GA.csv',
    index=False,
)