In [3]:
# Importing libraries
import json
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA

# Configuration loading and parsing
def load_config(file_path):
    """Read the JSON configuration file at ``file_path`` and return the parsed object.

    The file is opened explicitly as UTF-8 instead of with the platform's
    locale encoding, so a configuration containing non-ASCII text is parsed
    the same way on every operating system.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` (a dict for an object document).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def parse_config(config):
    """Extract the pipeline settings from a loaded configuration dictionary.

    Everything is read from ``config['design_state_data']``.

    Args:
        config: The parsed configuration (as returned by ``load_config``).

    Returns:
        A 9-tuple, in this order:

        * ``dataset`` - value of ``session_info['dataset']``
        * ``target`` - name of the column to predict
        * ``prediction_type`` - ``target['prediction_type']`` lower-cased
        * ``numerical_features`` - selected features typed ``'numerical'``
        * ``categorical_features`` - selected features typed ``'text'``
        * ``imputation`` - ``{feature: 'mean' | 'median' | custom value}`` for
          selected features whose ``feature_details`` has a
          ``'missing_values'`` entry
        * ``feature_generation``, ``feature_reduction``, ``algorithms`` - the
          matching sub-dictionaries, or ``{}`` when absent

        The target column is never returned as a feature (nor in
        ``imputation``), even when ``feature_handling`` lists it as selected;
        otherwise the model would be trained on the value it has to predict.

    Raises:
        KeyError: If a mandatory key (``design_state_data``, ``session_info``,
            ``dataset``, ``target``, ``prediction_type``, ``feature_handling``,
            ``is_selected``, ``feature_variable_type``, or ``impute_value``
            for a ``'custom'`` imputation) is missing.
    """
    design_data = config['design_state_data']
    dataset = design_data['session_info']['dataset']

    target_info = design_data['target']
    target = target_info['target']
    prediction_type = target_info['prediction_type'].lower()

    # Keep only the selected features and drop the target itself: using the
    # target as an input feature leaks the answer into the model.
    selected = {
        feature: details
        for feature, details in design_data['feature_handling'].items()
        if details['is_selected'] and feature != target
    }

    numerical_features = [
        feature for feature, details in selected.items()
        if details['feature_variable_type'] == 'numerical'
    ]
    categorical_features = [
        feature for feature, details in selected.items()
        if details['feature_variable_type'] == 'text'
    ]

    imputation = {}
    for feature, details in selected.items():
        feature_details = details.get('feature_details', {})
        if 'missing_values' not in feature_details:
            continue
        # .get(): an entry without 'impute_with' takes the same 'median'
        # fallback as an unrecognised value instead of raising KeyError.
        impute_with = feature_details.get('impute_with')
        if impute_with == 'Average of values':
            imputation[feature] = 'mean'
        elif impute_with == 'custom':
            imputation[feature] = feature_details['impute_value']
        else:
            imputation[feature] = 'median'

    feature_generation = design_data.get('feature_generation', {})
    feature_reduction = design_data.get('feature_reduction', {})
    algorithms = design_data.get('algorithms', {})

    return (
        dataset,
        target,
        prediction_type,
        numerical_features,
        categorical_features,
        imputation,
        feature_generation,
        feature_reduction,
        algorithms,
    )

# Dataset
def load_data(file_path, features, target):
    """Read the CSV at ``file_path`` and split it into inputs and target.

    Args:
        file_path: Location of the CSV file, passed to ``pandas.read_csv``.
        features: Column labels to keep as model inputs.
        target: Label of the column to predict.

    Returns:
        ``(X, y)`` where ``X`` is the frame indexed by ``features`` and ``y``
        is the frame indexed by ``target``.
    """
    frame = pd.read_csv(file_path)
    inputs = frame[features]
    labels = frame[target]
    return inputs, labels

# Preprocessing pipeline
def build_preprocessor(numerical_features, categorical_features, feature_generation, feature_reduction):
    """Build the (unfitted) preprocessing transformer for the model pipeline.

    Numerical columns are mean-imputed and standardised; when
    ``feature_generation['linear_interactions']`` is non-empty a degree-2
    polynomial expansion (no bias column) is applied to them as well.
    Categorical columns are imputed with the most frequent value and one-hot
    encoded, ignoring categories unseen during fitting. If
    ``feature_reduction['pca']['is_selected']`` is truthy, a PCA step is
    chained after the column transformer.

    Note: numerical imputation is always ``'mean'`` here; this function takes
    no per-feature imputation settings.

    Args:
        numerical_features: Column labels handled by the numerical branch.
        categorical_features: Column labels handled by the categorical branch.
        feature_generation: Dict; only ``'linear_interactions'`` is read.
        feature_reduction: Dict; only the ``'pca'`` sub-dict is read
            (``'is_selected'`` and optional ``'n_components'``).

    Returns:
        A ``ColumnTransformer``, or a ``Pipeline`` of that transformer
        followed by ``PCA`` when PCA is selected.
    """
    numerical_steps = [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]

    # Feature generation. One degree-2 expansion already contains the product
    # of every pair of numerical columns, so it is added exactly once: adding
    # a step per configured pair would create several steps all named 'poly',
    # and Pipeline requires step names to be unique.
    if feature_generation.get('linear_interactions'):
        numerical_steps.append(('poly', PolynomialFeatures(degree=2, include_bias=False)))
    numerical_pipeline = Pipeline(numerical_steps)

    categorical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    transformers = []
    if numerical_features:
        transformers.append(('num', numerical_pipeline, numerical_features))
    if categorical_features:
        transformers.append(('cat', categorical_pipeline, categorical_features))

    pca_config = feature_reduction.get('pca', {})
    use_pca = pca_config.get('is_selected', False)

    # The one-hot encoder emits sparse matrices; sparse_threshold=0 forces the
    # combined output to be dense when it is going to be fed into PCA.
    # 0.3 is ColumnTransformer's own default.
    preprocessor = ColumnTransformer(
        transformers=transformers,
        sparse_threshold=0 if use_pca else 0.3,
    )

    # Feature reduction.
    if use_pca:
        # With no numerical features the historical default would be 0
        # components; fall back to None (PCA keeps all components) instead.
        default_components = min(len(numerical_features), 10) or None
        n_components = pca_config.get('n_components', default_components)
        preprocessor = Pipeline([
            ('column_transformer', preprocessor),
            ('pca', PCA(n_components=n_components))
        ])

    return preprocessor

# Model and hyperparameters
def get_model_and_params(algorithm_name, algorithm_config):
    """Return the estimator and hyper-parameter grid for one algorithm.

    Args:
        algorithm_name: Name used to look the algorithm up (for example
            ``'RandomForestRegressor'``).
        algorithm_config: Settings for that algorithm. ``'is_selected'``
            decides whether it is used at all; the random-forest grid also
            reads the ``min_*`` / ``max_*`` bounds from it.

    Returns:
        ``(estimator, param_grid)``, or ``(None, None)`` when the algorithm is
        not selected or its name is not one of the supported ones.
    """
    selected = algorithm_config.get('is_selected', False)
    if not selected:
        return None, None

    def inclusive_range(low_key, low_default, high_key, high_default):
        # Every integer from the configured lower bound up to and including
        # the configured upper bound.
        start = algorithm_config.get(low_key, low_default)
        stop = algorithm_config.get(high_key, high_default)
        return list(range(start, stop + 1))

    forest_grid = {
        'n_estimators': inclusive_range('min_trees', 10, 'max_trees', 20),
        'max_depth': inclusive_range('min_depth', 20, 'max_depth', 25),
        'min_samples_leaf': inclusive_range(
            'min_samples_per_leaf_min_value', 5,
            'min_samples_per_leaf_max_value', 10,
        ),
    }
    svr_grid = {
        'kernel': ['linear', 'rbf'],
        'C': [1, 10, 100],
        'gamma': ['scale', 'auto'],
    }
    boosting_grid = {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1],
        'max_depth': [3, 5],
    }

    catalogue = {
        'RandomForestRegressor': (RandomForestRegressor(), forest_grid),
        'LinearRegression': (LinearRegression(), {}),
        'SVR': (SVR(), svr_grid),
        'GradientBoostingRegressor': (GradientBoostingRegressor(), boosting_grid),
        'DecisionTreeRegressor': (DecisionTreeRegressor(), {}),
    }

    if algorithm_name not in catalogue:
        return None, None
    return catalogue[algorithm_name]

# Model Evaluation
def evaluate_model(model, X_test, y_test):
    """Score ``model`` on held-out data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Inputs to predict on.
        y_test: True target values for ``X_test``.

    Returns:
        ``(mae, mse, r2)``: mean absolute error, mean squared error and
        R-squared of the predictions against ``y_test``.
    """
    predictions = model.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    return mae, mse, r2

# running main pipeline
def run_pipeline(json_file_path):
    """Train, tune, evaluate and save every selected algorithm.

    Loads the configuration and the dataset, builds the preprocessing
    transformer, then for each selected algorithm runs a 5-fold grid search
    (scored by negative mean squared error) on the training split, evaluates
    the best pipeline on the held-out 20% and prints the metrics.

    Side effects: prints progress and metrics to stdout and writes the best
    pipeline of each algorithm to ``<algorithm name>_model.pkl`` in the
    current working directory.

    Args:
        json_file_path: Path of the JSON configuration file.
    """
    config = load_config(json_file_path)
    # The prediction type and imputation settings are parsed but not used by
    # this function.
    (dataset, target, _prediction_type, numerical_features, categorical_features,
     _imputation, feature_generation, feature_reduction, algorithms) = parse_config(config)

    X, y = load_data(dataset, numerical_features + categorical_features, target)
    preprocessor = build_preprocessor(numerical_features, categorical_features, feature_generation, feature_reduction)

    # The split does not depend on the algorithm and is seeded, so compute it
    # once instead of once per loop iteration; every model sees the same data.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    for algo_name, algo_config in algorithms.items():
        model, param_grid = get_model_and_params(algo_name, algo_config)
        if model is None:
            # Not selected, or not a supported algorithm name.
            continue

        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('model', model)
        ])

        # Grid keys must be prefixed with the pipeline step name of the model.
        full_param_grid = {f'model__{param}': values for param, values in param_grid.items()}

        grid_search = GridSearchCV(pipeline, full_param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error', verbose=1)
        grid_search.fit(X_train, y_train)

        best_pipeline = grid_search.best_estimator_
        mae, mse, r2 = evaluate_model(best_pipeline, X_test, y_test)

        # Print output
        print(f"Model: {algo_name}")
        print(f"Best Parameters: {grid_search.best_params_}")
        print(f"Mean Absolute Error (MAE): {mae}")
        print(f"Mean Squared Error (MSE): {mse}")
        print(f"R-squared (R2): {r2}")
        print()

        joblib.dump(best_pipeline, f"{algo_name}_model.pkl")
        print(f"Saved best model to {algo_name}_model.pkl")


if __name__ == "__main__":
    # Script entry point: run the full pipeline with the configuration file
    # named below, resolved relative to the current working directory.
    run_pipeline(r'algoparams_from_ui.json')  # path of the JSON configuration file


Fitting 5 folds for each of 396 candidates, totalling 1980 fits
Model: RandomForestRegressor
Best Parameters: {'model__max_depth': 25, 'model__min_samples_leaf': 5, 'model__n_estimators': 20}
Mean Absolute Error (MAE): 0.025948368480934233
Mean Squared Error (MSE): 0.001035195458975586
R-squared (R2): 0.9983714521961964

Saved best model to RandomForestRegressor_model.pkl
