In [2]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.base import BaseEstimator, TransformerMixin
import joblib

# Custom Transformer for Feature Generation (Dummy Example)
class FeatureGenerator(BaseEstimator, TransformerMixin):
    def __init__(self, config):
        self.config = config

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Implement feature generation based on config
        # This is a placeholder. Implement actual feature generation logic as needed.
        return X

# Function to load JSON configuration from a file
def load_config_from_file(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

# Corrected parse_config function
def parse_config(config):
    try:
        design_data = config['design_state_data']
        
        # Extract target information
        target_info = design_data['target']
        target = target_info['target']
        prediction_type = target_info['prediction_type'].lower()  # e.g., 'regression'
        
        # Extract feature handling information
        feature_handling = design_data['feature_handling']
        features = list(feature_handling.keys())
        
        # Determine feature types and imputation strategies
        numerical_features = []
        categorical_features = []
        imputation = {}
        
        for feature, details in feature_handling.items():
            feature_type = details.get('feature_variable_type', 'numerical')
            if feature_type == 'numerical':
                numerical_features.append(feature)
                feature_details = details.get('feature_details', {})
                missing_strategy = feature_details.get('missing_values', 'mean')
                imputation_strategy = 'mean' if missing_strategy == 'Impute' else 'median'
                imputation[feature] = imputation_strategy
            elif feature_type == 'text':
                categorical_features.append(feature)
            else:
                print(f"Unknown feature type for {feature}: {feature_type}")
        
        # Extract feature reduction details
        feature_reduction = design_data.get('feature_reduction', {})
        
        # Extract algorithms
        algorithms = design_data.get('algorithms', {})
        
        return {
            'target': target,
            'prediction_type': prediction_type,
            'numerical_features': numerical_features,
            'categorical_features': categorical_features,
            'imputation': imputation,
            'feature_reduction': feature_reduction,
            'algorithms': algorithms
        }
    except KeyError as e:
        print(f"Missing key in JSON configuration: {e}")
        raise

# Function to load data
def load_data(csv_file, features, target):
    df = pd.read_csv(csv_file)
    
    # Verify that all features and target exist in the DataFrame
    missing_features = [f for f in features if f not in df.columns]
    if missing_features:
        raise KeyError(f"The following features are missing in the CSV file: {missing_features}")
    
    if target not in df.columns:
        raise KeyError(f"The target column '{target}' is missing in the CSV file.")
    
    X = df[features]
    y = df[target]
    return X, y

# Function to build preprocessing pipelines
def build_preprocessor(numerical_features, categorical_features, imputation):
    # Numerical pipeline
    numerical_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),  # Default strategy; will be overridden
        ('scaler', StandardScaler())
    ])
    
    # Update imputation strategy for numerical features
    # Create a list of tuples for imputation strategies
    transformers = []
    if numerical_features:
        # If different features have different imputation strategies, use a custom imputer
        # Otherwise, use a single SimpleImputer
        if len(set(imputation.values())) > 1:
            # Different imputation strategies; need to handle individually
            # This is complex; for simplicity, we'll assume a single strategy
            # Alternatively, use separate pipelines for each strategy
            numerical_pipeline = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='mean')),
                ('scaler', StandardScaler())
            ])
        else:
            # Single imputation strategy
            numerical_pipeline = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy=list(set(imputation.values())).pop())),
                ('scaler', StandardScaler())
            ])
        
        transformers.append(('num', numerical_pipeline, numerical_features))
    
    # Categorical pipeline
    categorical_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])
    
    if categorical_features:
        transformers.append(('cat', categorical_pipeline, categorical_features))
    
    preprocessor = ColumnTransformer(transformers=transformers)
    return preprocessor

# Function to perform feature reduction
def apply_feature_reduction(X, y, feature_reduction_config):
    method = feature_reduction_config.get('feature_reduction_method', 'No Reduction')
    
    if method == 'No Reduction':
        return X, None
    
    elif method == 'Corr with Target':
        correlation = X.apply(lambda col: col.corr(y))
        selected_features = correlation[correlation.abs() > 0.1].index.tolist()
        X_reduced = X[selected_features]
        return X_reduced, None
    
    elif method == 'Tree-based':
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X, y)
        selector = SelectFromModel(model, threshold="mean", prefit=True)
        X_reduced = selector.transform(X)
        return X_reduced, selector
    
    elif method == 'PCA':
        pca = PCA(n_components=0.95, random_state=42)
        X_reduced = pca.fit_transform(X)
        return X_reduced, pca
    
    else:
        print(f"Unknown feature reduction method: {method}")
        return X, None

# Function to get model and its hyperparameter grid based on configuration
def get_model_and_params(algorithm_name, algorithm_config, prediction_type):
    if not algorithm_config.get('is_selected', False):
        return None, None
    
    if prediction_type == 'regression':
        if 'RandomForestRegressor' in algorithm_name:
            model = RandomForestRegressor(
                n_estimators=algorithm_config.get('min_trees', 100),
                max_depth=algorithm_config.get('max_depth', None),
                min_samples_leaf=algorithm_config.get('min_samples_per_leaf_min_value', 1),
                random_state=42
            )
            param_grid = {
                'n_estimators': [algorithm_config.get('min_trees', 100), algorithm_config.get('max_trees', 200)],
                'max_depth': [algorithm_config.get('min_depth', None), algorithm_config.get('max_depth', None)],
                'min_samples_leaf': [algorithm_config.get('min_samples_per_leaf_min_value', 1),
                                     algorithm_config.get('min_samples_per_leaf_max_value', 4)]
            }
            return model, param_grid
        
        elif 'LinearRegression' in algorithm_name:
            model = LinearRegression()
            param_grid = {
                'fit_intercept': [True, False],
                'normalize': [True, False]
            }
            return model, param_grid
        
        elif 'SVR' in algorithm_name:
            model = SVR()
            param_grid = {
                'kernel': ['linear', 'rbf'],
                'C': [1, 10, 100],
                'gamma': ['scale', 'auto']
            }
            return model, param_grid
        
        elif 'GradientBoostingRegressor' in algorithm_name:
            model = GradientBoostingRegressor(random_state=42)
            param_grid = {
                'n_estimators': [100, 200],
                'learning_rate': [0.05, 0.1],
                'max_depth': [3, 5],
                'subsample': [0.8, 1.0]
            }
            return model, param_grid
        
        elif 'DecisionTreeRegressor' in algorithm_name:
            model = DecisionTreeRegressor(random_state=42)
            param_grid = {
                'max_depth': [None, 10, 20],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]
            }
            return model, param_grid
        
        else:
            print(f"Unsupported algorithm for regression: {algorithm_name}")
            return None, None
    
    elif prediction_type == 'classification':
        # Implement classification models similarly
        pass
    
    else:
        print(f"Unsupported prediction type: {prediction_type}")
        return None, None

# Function to perform hyperparameter tuning
def perform_grid_search(pipeline, param_grid, X_train, y_train):
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error', verbose=1)
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_, grid_search.best_params_

# Function to evaluate the model
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Model Evaluation Metrics:")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"R-squared (R2): {r2}")
    print("-" * 30)

# Main function to run the pipeline
def run_pipeline(json_file_path, csv_file):
    # Load configuration from the JSON file
    config = load_config_from_file(json_file_path)
    
    # Parse configuration
    parsed_config = parse_config(config)
    target = parsed_config['target']
    prediction_type = parsed_config['prediction_type']
    numerical_features = parsed_config['numerical_features']
    categorical_features = parsed_config['categorical_features']
    imputation = parsed_config['imputation']
    feature_reduction = parsed_config['feature_reduction']
    algorithms = parsed_config['algorithms']
    
    print("Parsed Configuration:")
    print(f"Target: {target}")
    print(f"Prediction Type: {prediction_type}")
    print(f"Numerical Features: {numerical_features}")
    print(f"Categorical Features: {categorical_features}")
    print(f"Imputation Strategies: {imputation}")
    print(f"Feature Reduction: {feature_reduction}")
    print(f"Algorithms: {algorithms}")
    print("=" * 50)
    
    # Load Data
    X, y = load_data(csv_file, numerical_features + categorical_features, target)
    
    # Build Preprocessor
    preprocessor = build_preprocessor(numerical_features, categorical_features, imputation)
    
    # Create a list to hold all trained models
    trained_models = []
    
    # Iterate over each selected algorithm
    for algo_name, algo_config in algorithms.items():
        if not algo_config.get('is_selected', False):
            continue  # Skip if not selected
        
        print(f"Processing Algorithm: {algo_name}")
        
        # Get model and hyperparameter grid
        model, param_grid = get_model_and_params(algo_name, algo_config, prediction_type)
        if model is None:
            continue  # Unsupported or not selected
        
        # Create a pipeline with preprocessor and model
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('feature_generator', FeatureGenerator(config)),  # Placeholder for feature generation
            ('model', model)
        ])
        
        # Define full parameter grid with pipeline steps
        full_param_grid = {}
        for param, values in param_grid.items():
            full_param_grid[f'model__{param}'] = values
        
        # Perform Grid Search
        print(f"Starting Grid Search for {algo_name}...")
        best_pipeline, best_params = perform_grid_search(pipeline, full_param_grid, X, y)
        print(f"Best Parameters for {algo_name}: {best_params}")
        
        # Train-Test Split for Evaluation
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
        # Fit the best pipeline on training data
        best_pipeline.fit(X_train, y_train)
        
        # Evaluate the model
        evaluate_model(best_pipeline, X_test, y_test)
        
        # Save the trained model
        model_filename = f"{algo_name}_model.pkl"
        joblib.dump(best_pipeline, model_filename)
        print(f"Saved best model to {model_filename}")
        print("=" * 50)
        
        # Append to trained models list
        trained_models.append((algo_name, best_pipeline))
    
    print("All selected models have been processed and evaluated.")

# Example usage
if __name__ == "__main__":
    json_file_path = r'C:\Users\ASUS\Desktop\algoparams_from_ui.json'  # Replace with your JSON file path
    csv_file = 'iris.csv'  # Replace with your CSV file path
    run_pipeline(json_file_path, csv_file)


Parsed Configuration:
Target: petal_width
Prediction Type: regression
Numerical Features: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
Categorical Features: ['species']
Imputation Strategies: {'sepal_length': 'mean', 'sepal_width': 'mean', 'petal_length': 'mean', 'petal_width': 'mean'}
Feature Reduction: {'feature_reduction_method': 'Tree-based', 'num_of_features_to_keep': '4', 'num_of_trees': '5', 'depth_of_trees': '6'}
Algorithms: {'RandomForestClassifier': {'model_name': 'Random Forest Classifier', 'is_selected': False, 'min_trees': 10, 'max_trees': 30, 'feature_sampling_statergy': 'Default', 'min_depth': 20, 'max_depth': 30, 'min_samples_per_leaf_min_value': 5, 'min_samples_per_leaf_max_value': 50, 'parallelism': 0}, 'RandomForestRegressor': {'model_name': 'Random Forest Regressor', 'is_selected': True, 'min_trees': 10, 'max_trees': 20, 'feature_sampling_statergy': 'Default', 'min_depth': 20, 'max_depth': 25, 'min_samples_per_leaf_min_value': 5, 'min_samples_per_l