In [58]:
from sklearn import linear_model
import pandas as pd
import GA as GA
import numpy as np
import statsmodels.api as sm

### 1. Graduate admission rate prediction
Data source: https://www.kaggle.com/datasets/mohansacharya/graduate-admissions/

Type of regression: Linear Regression

In [49]:
# Load the graduate-admissions data set and normalise its column names.
file_path = 'data/Admission_Predict.csv'
try:
    data = pd.read_csv(file_path)
except Exception as e:
    # Report the problem, then re-raise: continuing would either hit a
    # NameError below or silently reuse a stale `data` from a previous run.
    print(f"Error loading {file_path}:", e)
    raise

# prepare data
# Drop the row identifier without mutating the freshly loaded frame in place.
data = data.drop(columns='Serial No.')
# Replace spaces with underscores and trim the underscores that leading or
# trailing spaces in a raw header would otherwise leave behind.
data.columns = ['_'.join(name.split(' ')).strip('_') for name in data.columns]

target_variable = 'Chance_of_Admit'
# Guard against a header change that would make the renamed target disappear.
assert target_variable in data.columns, f"missing target column {target_variable!r}"

In [50]:
# Variable selection for the admissions data: build the genetic-algorithm
# search, attach the data/target/estimator, then run it.
model1 = GA.GeneticAlgorithm(
    num_generations=1000,
    population_size=500,
    selection_method='tournament',
    mutation_rate=0.02,
)
model1.fit(
    data,
    target_variable,
    FUN=linear_model.LinearRegression,
)

# run() yields two values: the chosen variables and the best fitness score.
selected_variables, best_fitness = model1.run()

print("Selected Variables:", selected_variables)
print("Best Fitness (AIC):", best_fitness)

Selected Variables: ['GRE_Score', 'TOEFL_Score', 'LOR', 'CGPA', 'Research']
Best Fitness (AIC): -2196.375447155633


### 2. Wine quality
Data source: https://archive.ics.uci.edu/dataset/186/wine+quality

Type of regression: Linear Regression

In [51]:
# Load the red-wine quality data set; this CSV is semicolon-delimited.
file_path = 'data/winequality-red.csv'
try:
    data = pd.read_csv(file_path, delimiter=';')
except Exception as e:
    # Report the problem, then re-raise: continuing would leave `data`
    # undefined or silently pointing at a previously loaded data set.
    print(f"Error loading {file_path} with ';' as delimiter:", e)
    raise

target_variable = 'quality'
# Fail early if the file was parsed with the wrong delimiter or schema.
assert target_variable in data.columns, f"missing target column {target_variable!r}"

In [52]:
# Variable selection for the wine-quality data: build the genetic-algorithm
# search, attach the data/target/estimator, then run it.
model2 = GA.GeneticAlgorithm(
    num_generations=1000,
    population_size=500,
    selection_method='tournament',
    mutation_rate=0.02,
)
model2.fit(
    data,
    target_variable,
    FUN=linear_model.LinearRegression,
)

# run() yields two values: the chosen variables and the best fitness score.
selected_variables, best_fitness = model2.run()

print("Selected Variables:", selected_variables)
print("Best Fitness (AIC):", best_fitness)

Selected Variables: ['volatile acidity', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'pH', 'sulphates', 'alcohol']
Best Fitness (AIC): -1380.7885490606827


### 3. Date Fruit Datasets
Data source: https://www.muratkoklu.com/datasets/

Type of regression: Logistic Regression


In [59]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [60]:
# Load the date-fruit data set, encode the class label as integers and
# standardise every feature column.
file_path = 'data/Date_Fruit_Datasets.xlsx'
try:
    data = pd.read_excel(file_path)
except Exception as e:
    # Report the problem, then re-raise: continuing would leave `data`
    # undefined or silently pointing at a previously loaded data set.
    print(f"Error loading Excel file {file_path}:", e)
    raise

scaler = StandardScaler()
label_encoder = LabelEncoder()
target_variable = 'Class'
assert target_variable in data.columns, f"missing target column {target_variable!r}"
data[target_variable] = label_encoder.fit_transform(data[target_variable])

# Select the features by name rather than by position, so the encoded target
# is never scaled even if it is not the last column of the sheet.
feature_columns = [col for col in data.columns if col != target_variable]
data[feature_columns] = scaler.fit_transform(data[feature_columns])

In [64]:
# Variable selection for the date-fruit data: build the genetic-algorithm
# search, attach the data/target/estimator, then run it.
model3 = GA.GeneticAlgorithm(
    num_generations=1000,
    population_size=500,
    selection_method='tournament',
    mutation_rate=0.02,
)
# NOTE(review): FUN is an estimator *instance* here, whereas the linear
# regression sections pass the class itself - confirm GA supports both forms.
model3.fit(
    data,
    target_variable,
    FUN=linear_model.LogisticRegression(max_iter=1000),
)

# run() yields two values: the chosen variables and the best fitness score.
selected_variables, best_fitness = model3.run()

print("Selected Variables:", selected_variables)
print("Best Fitness (AIC):", best_fitness)

Selected Variables: ['PERIMETER', 'MAJOR_AXIS', 'MINOR_AXIS', 'ECCENTRICITY', 'EQDIASQ', 'SOLIDITY', 'CONVEX_AREA', 'ASPECT_RATIO', 'ROUNDNESS', 'COMPACTNESS', 'SHAPEFACTOR_1', 'SHAPEFACTOR_2', 'SHAPEFACTOR_3', 'SHAPEFACTOR_4', 'MeanRR', 'MeanRG', 'MeanRB', 'StdDevRR', 'StdDevRG', 'StdDevRB', 'SkewRG', 'SkewRB', 'KurtosisRR', 'KurtosisRB', 'EntropyRR', 'EntropyRG', 'EntropyRB', 'ALLdaub4RR', 'ALLdaub4RB']
Best Fitness (AIC): 343.1428550807006


### 4. Simulation

In [53]:
# Simulate a linear-regression problem in which only the first
# `num_informative` features carry signal; the remaining coefficients are zero.
SEED = 42
# Seed NumPy's global generator so the simulated data are identical on every
# run. NOTE(review): this only makes the GA search itself repeatable if GA
# draws from NumPy's global generator - confirm against the GA module.
np.random.seed(SEED)

num_points = 500
num_features = 20
num_informative = 10  # features with a non-zero true coefficient

# Column labels are the default integers 0..num_features-1.
X = pd.DataFrame(np.random.rand(num_points, num_features))

true_coefficients = np.random.uniform(-5, 5, size=num_features)
# Zero out every coefficient past the informative ones; slicing by
# `num_informative` keeps this valid if `num_features` is changed.
true_coefficients[num_informative:] = 0.0
noise = np.random.normal(0, 0.5, size=(num_points,))

# Compute the response before adding it to X so it is not used as a feature.
y = X @ true_coefficients + noise

X['y'] = y

target_variable = 'y'

In [54]:
# Variable selection on the simulated data, using a larger population, more
# generations and a higher mutation rate than the earlier sections.
model4 = GA.GeneticAlgorithm(
    num_generations=2000,
    population_size=2000,
    selection_method='tournament',
    mutation_rate=0.03,
)
model4.fit(
    X,
    target_variable,
    FUN=linear_model.LinearRegression,
)

# run() yields two values: the chosen variables and the best fitness score.
selected_variables, best_fitness = model4.run()

print("GA(Linear Regression): Selected Variables:", selected_variables)
print("Best Fitness (AIC):", best_fitness)

GA(Linear Regression): Selected Variables: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 19]
Best Fitness (AIC): -734.6262024158393
