In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.svm import SVR, SVC
from sklearn.linear_model import SGDRegressor, SGDClassifier
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.ensemble import ExtraTreesRegressor, ExtraTreesClassifier
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.metrics import r2_score, accuracy_score
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
import xgboost as xgb
import joblib
import warnings
warnings.filterwarnings("ignore")

In [2]:
print("Loading data and JSON configuration...")
# Load the UI-exported configuration; everything used by this notebook lives
# under the 'design_state_data' key.
with open('algoparams_from_ui.json', 'r') as f:
    config = json.load(f)
design_state = config['design_state_data']

# Load the dataset named in the configuration.
dataset_path = design_state['session_info']['dataset']
df = pd.read_csv(dataset_path)

# Target column and task type as declared in the configuration.
target = design_state['target']['target']
prediction_type = design_state['target']['prediction_type']

# Keep only the columns the configuration explicitly marks as selected.
# A column that exists in the CSV but has no 'feature_handling' entry is
# skipped rather than aborting the whole cell with a KeyError.
feature_handling = design_state['feature_handling']
features = [
    column for column in df.columns
    if column != target and feature_handling.get(column, {}).get('is_selected', False)
]

X = df[features]
y = df[target]

Loading data and JSON configuration...


In [10]:
print("Setting up preprocessing...")
# Buckets filled by the categorisation loop later in this cell.
numerical_features, categorical_features = [], []

# Define named function for picklability
def to_1d_array(x):
    """Flatten the first column of a 2-D input into a 1-D array of text documents.

    Parameters
    ----------
    x : pandas.DataFrame or 2-D array-like
        Input whose first column holds the raw text values.

    Returns
    -------
    numpy.ndarray
        1-D object array with one entry per row. ``str``/``bytes`` values are
        passed through unchanged, missing values (``None``/NaN) become ``''``
        and any other value is converted with ``str()``, so a downstream text
        vectorizer only ever receives string-like documents.
    """
    if hasattr(x, 'iloc'):
        column = x.iloc[:, 0]
    else:
        # Plain arrays (no .iloc) are supported as well.
        column = np.asarray(x)[:, 0]

    documents = []
    for value in column:
        if isinstance(value, (str, bytes)):
            documents.append(value)
        elif pd.isna(value):
            documents.append('')
        else:
            documents.append(str(value))
    return np.array(documents, dtype=object)

# Bucket every selected feature by the variable type declared in the config.
# Types other than 'numerical' and 'text' land in neither bucket.
for name in features:
    variable_type = design_state['feature_handling'][name]['feature_variable_type']
    if variable_type == 'numerical':
        numerical_features.append(name)
    elif variable_type == 'text':
        categorical_features.append(name)

# Numerical columns: one imputing pipeline per column, because the imputation
# strategy is configured per feature.
numerical_transformers = []
for name in numerical_features:
    details = design_state['feature_handling'][name]['feature_details']
    if details['impute_with'] == 'Average of values':
        strategy = 'mean'
    else:
        strategy = 'constant'
    imputer = SimpleImputer(strategy=strategy, fill_value=details.get('impute_value', 0))
    numerical_transformers.append((name, Pipeline([('imputer', imputer)]), [name]))

# Text columns: flatten to 1-D, then hash into a fixed number of columns.
categorical_transformers = []
for name in categorical_features:
    hash_width = design_state['feature_handling'][name]['feature_details'].get('hash_columns', 10)
    if hash_width <= 0:
        hash_width = 10  # fall back to 10 when the configured width is not positive
    text_steps = [
        ('to_1d', FunctionTransformer(to_1d_array, validate=False)),  # named function, not a lambda
        ('hasher', HashingVectorizer(n_features=hash_width, alternate_sign=False)),
    ]
    categorical_transformers.append((name, Pipeline(text_steps), [name]))

# Combine everything; columns not covered by a transformer are passed through.
preprocessor = ColumnTransformer(
    transformers=[*numerical_transformers, *categorical_transformers],
    remainder='passthrough',
)

Setting up preprocessing...


In [8]:
print("Generating features...")
# Redundant: numpy is already imported as `np` in the notebook's first cell.
import numpy as np

def generate_interactions(X, feature_indices=None):
    # If X is sparse, convert to dense for simplicity
    if hasattr(X, 'toarray'):
        X = X.toarray()

    # Default to first two columns if no indices provided
    if feature_indices is None:
        feature_indices = [(0, 1)]  # Pairwise product of first two features

    result = X.copy()
    for i, (idx1, idx2) in enumerate(feature_indices):
        new_feature = X[:, idx1] * X[:, idx2]
        result = np.column_stack((result, new_feature))
    return result

def generate_polynomial(X, feature_indices=None, eps=1e-10):
    """Append ratio columns to a feature matrix.

    Despite the name, each generated column is the quotient
    ``X[:, i] / (X[:, j] + eps)`` of two existing columns, not a polynomial
    term.

    Parameters
    ----------
    X : 2-D array-like or object exposing ``toarray()``
        Feature matrix. Objects with a ``toarray`` method (e.g. sparse
        matrices) are densified first; anything else goes through
        ``np.asarray``, so DataFrames and nested lists are accepted too.
    feature_indices : iterable of (int, int), optional
        ``(numerator, denominator)`` column-index pairs. Defaults to
        ``[(0, 1)]``.
    eps : float, optional
        Constant added to the denominator so that a zero entry does not
        divide by zero. Defaults to ``1e-10``.

    Returns
    -------
    numpy.ndarray
        A new array holding the columns of ``X`` followed by one ratio column
        per pair, in the order given. ``X`` itself is not modified.
    """
    # If X is sparse, convert to dense
    if hasattr(X, 'toarray'):
        X = X.toarray()
    X = np.asarray(X)

    # Default to first two columns
    if feature_indices is None:
        feature_indices = [(0, 1)]

    # Every ratio is computed from the input columns, then stacked in one go.
    ratios = [X[:, idx1] / (X[:, idx2] + eps) for idx1, idx2 in feature_indices]
    return np.column_stack([X, *ratios])

# Column-index pairs consumed by each generation stage. They are bound to
# module-level names so that other cells can refer to how many columns each
# stage adds. Indices are positions in the matrix the stage receives, so later
# stages also see the columns appended by earlier ones.
linear_pairs = [(0, 1)]
poly_pairs = [(0, 1)]
pairwise_pairs = [(0, 1)]

# Feature generation pipeline
feature_gen = Pipeline([
    ('linear', FunctionTransformer(generate_interactions, kw_args={'feature_indices': linear_pairs})),
    ('poly', FunctionTransformer(generate_polynomial, kw_args={'feature_indices': poly_pairs})),
    ('pairwise', FunctionTransformer(generate_interactions, kw_args={'feature_indices': pairwise_pairs}))
])

Generating features...


In [5]:
print("Setting up feature reduction...")
# Count the columns appended by the feature-generation pipeline directly from
# its steps, so this cell does not depend on names that only existed in an
# earlier kernel session. A step without explicit pairs adds one column
# (the generators default to the single pair (0, 1)).
n_generated = 0
for _, gen_step in feature_gen.steps:
    step_pairs = (gen_step.kw_args or {}).get('feature_indices')
    n_generated += 1 if step_pairs is None else len(step_pairs)

# NOTE(review): each input feature is counted as one column here, although a
# hashed text feature expands to several columns after preprocessing - confirm
# that this is the intended base for "n - 1".
n_features = len(features) + n_generated
n_features_to_keep = max(1, n_features - 1)  # n-1 features
num_trees = int(design_state['feature_reduction']['num_of_trees'])
depth = int(design_state['feature_reduction']['depth_of_trees'])

# Use a forest that matches the task: a regressor cannot be fitted on
# non-numeric class labels.
selector_estimator = RandomForestRegressor if prediction_type == 'Regression' else RandomForestClassifier

# NOTE(review): max_features is only an upper bound; the selector's default
# importance threshold still applies, so fewer than n-1 columns may be kept.
feature_selector = SelectFromModel(
    selector_estimator(n_estimators=num_trees, max_depth=depth, random_state=1),
    max_features=n_features_to_keep
)

Setting up feature reduction...


In [6]:
print("Configuring models...")
algo_settings = design_state['algorithms']
is_regression = prediction_type == 'Regression'


def _int_grid(settings, low_key, high_key, step=1):
    """Integer grid from settings[low_key] up to and including settings[high_key]."""
    return list(range(settings[low_key], settings[high_key] + 1, step))


def _float_grid(settings, low_key, high_key, num=3):
    """`num` evenly spaced values between settings[low_key] and settings[high_key]."""
    return list(np.linspace(settings[low_key], settings[high_key], num))


def _as_grid(value):
    """Return `value` as a list: sequences are copied, a scalar becomes [scalar].

    GridSearchCV needs a sequence of candidates for every grid entry, so config
    values that may be stored as a bare number are normalised here.
    """
    if isinstance(value, (list, tuple, np.ndarray)):
        return list(value)
    return [value]


# Maps each algorithm key used in the JSON configuration to its estimator
# class ('model') and the hyper-parameter grid built from that algorithm's
# settings ('params'). Keys without a Classifier/Regressor suffix pick the
# estimator flavour from prediction_type.
model_configs = {
    'RandomForestClassifier': {
        'model': RandomForestClassifier,
        'params': {
            'n_estimators': _int_grid(algo_settings['RandomForestClassifier'], 'min_trees', 'max_trees', 10),
            'max_depth': _int_grid(algo_settings['RandomForestClassifier'], 'min_depth', 'max_depth', 5),
            'min_samples_leaf': _int_grid(algo_settings['RandomForestClassifier'],
                                          'min_samples_per_leaf_min_value', 'min_samples_per_leaf_max_value', 5)
        }
    },
    'RandomForestRegressor': {
        'model': RandomForestRegressor,
        'params': {
            'n_estimators': _int_grid(algo_settings['RandomForestRegressor'], 'min_trees', 'max_trees', 5),
            'max_depth': _int_grid(algo_settings['RandomForestRegressor'], 'min_depth', 'max_depth', 5),
            'min_samples_leaf': _int_grid(algo_settings['RandomForestRegressor'],
                                          'min_samples_per_leaf_min_value', 'min_samples_per_leaf_max_value', 5)
        }
    },
    'GBTClassifier': {
        'model': GradientBoostingClassifier,
        'params': {
            'n_estimators': _as_grid(algo_settings['GBTClassifier']['num_of_BoostingStages']),
            'max_depth': _int_grid(algo_settings['GBTClassifier'], 'min_depth', 'max_depth'),
            'learning_rate': _float_grid(algo_settings['GBTClassifier'], 'min_stepsize', 'max_stepsize')
        }
    },
    'GBTRegressor': {
        'model': GradientBoostingRegressor,
        'params': {
            'n_estimators': _as_grid(algo_settings['GBTRegressor']['num_of_BoostingStages']),
            'max_depth': _int_grid(algo_settings['GBTRegressor'], 'min_depth', 'max_depth'),
            'learning_rate': _float_grid(algo_settings['GBTRegressor'], 'min_stepsize', 'max_stepsize')
        }
    },
    'LinearRegression': {
        'model': LinearRegression,
        'params': {}
    },
    'LogisticRegression': {
        'model': LogisticRegression,
        'params': {
            'max_iter': _int_grid(algo_settings['LogisticRegression'], 'min_iter', 'max_iter', 10),
            'C': _float_grid(algo_settings['LogisticRegression'], 'min_regparam', 'max_regparam')
        }
    },
    'RidgeRegression': {
        'model': Ridge,
        'params': {
            'alpha': _float_grid(algo_settings['RidgeRegression'], 'min_regparam', 'max_regparam')
        }
    },
    'LassoRegression': {
        'model': Lasso,
        'params': {
            'alpha': _float_grid(algo_settings['LassoRegression'], 'min_regparam', 'max_regparam')
        }
    },
    'ElasticNetRegression': {
        'model': ElasticNet,
        'params': {
            'alpha': _float_grid(algo_settings['ElasticNetRegression'], 'min_regparam', 'max_regparam'),
            'l1_ratio': _float_grid(algo_settings['ElasticNetRegression'], 'min_elasticnet', 'max_elasticnet')
        }
    },
    'xg_boost': {
        'model': xgb.XGBRegressor if is_regression else xgb.XGBClassifier,
        'params': {
            # A non-positive tree count falls back to 100.
            'n_estimators': [algo_settings['xg_boost']['max_num_of_trees']]
                            if algo_settings['xg_boost']['max_num_of_trees'] > 0 else [100],
            'max_depth': _as_grid(algo_settings['xg_boost']['max_depth_of_tree']),
            'learning_rate': _as_grid(algo_settings['xg_boost']['learningRate'])
        }
    },
    'DecisionTreeRegressor': {
        'model': DecisionTreeRegressor,
        'params': {
            'max_depth': _int_grid(algo_settings['DecisionTreeRegressor'], 'min_depth', 'max_depth'),
            'min_samples_leaf': _as_grid(algo_settings['DecisionTreeRegressor']['min_samples_per_leaf'])
        }
    },
    'DecisionTreeClassifier': {
        'model': DecisionTreeClassifier,
        'params': {
            'max_depth': _int_grid(algo_settings['DecisionTreeClassifier'], 'min_depth', 'max_depth'),
            'min_samples_leaf': _as_grid(algo_settings['DecisionTreeClassifier']['min_samples_per_leaf'])
        }
    },
    'SVM': {
        'model': SVR if is_regression else SVC,
        'params': {
            'C': _as_grid(algo_settings['SVM']['c_value']),
            'kernel': ['linear', 'rbf', 'poly', 'sigmoid'] if algo_settings['SVM']['linear_kernel'] else ['rbf']
        }
    },
    'SGD': {
        'model': SGDRegressor if is_regression else SGDClassifier,
        'params': {
            'alpha': _as_grid(algo_settings['SGD']['alpha_value']),
            'max_iter': [1000]
        }
    },
    'KNN': {
        'model': KNeighborsRegressor if is_regression else KNeighborsClassifier,
        'params': {
            'n_neighbors': _as_grid(algo_settings['KNN']['k_value'])
        }
    },
    'extra_random_trees': {
        'model': ExtraTreesRegressor if is_regression else ExtraTreesClassifier,
        'params': {
            'n_estimators': _as_grid(algo_settings['extra_random_trees']['num_of_trees']),
            'max_depth': _as_grid(algo_settings['extra_random_trees']['max_depth']),
            'min_samples_leaf': _as_grid(algo_settings['extra_random_trees']['min_samples_per_leaf'])
        }
    },
    'neural_network': {
        'model': MLPRegressor if is_regression else MLPClassifier,
        'params': {
            # The whole layer layout is a single candidate, hence the one-element list.
            'hidden_layer_sizes': [tuple(algo_settings['neural_network']['hidden_layer_sizes'])],
            # A non-positive iteration count falls back to 200.
            'max_iter': [algo_settings['neural_network']['max_iterations']]
                        if algo_settings['neural_network']['max_iterations'] > 0 else [200]
        }
    }
}

# Filter selected model: the first algorithm flagged 'is_selected' wins.
# A generator expression is used so that no loop variable leaks into the
# notebook namespace (the previous loop rebound the JSON `config` object).
selected_model = next(
    (
        algo_name
        for algo_name, algo_config in design_state['algorithms'].items()
        if algo_config.get('is_selected')
    ),
    None,
)

if not selected_model:
    raise ValueError("No model selected in JSON configuration.")

# Fail here with a clear message instead of a bare KeyError at training time.
if selected_model not in model_configs:
    raise ValueError(f"Selected model '{selected_model}' has no entry in model_configs.")

Configuring models...


In [11]:
print(f"Training pipeline with {selected_model}...")
# Train-test split; a configured ratio below 0.6 is raised to 0.6.
train_ratio = max(design_state['train']['train_ratio'], 0.6)
random_seed = design_state['train']['random_seed']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_ratio, random_state=random_seed)

# Sample weights: used only when the configured weight variable is one of the
# training columns.
weight_var = design_state['weighting_stratergy']['weighting_stratergy_weight_variable']
sample_weight = X_train[weight_var] if weight_var in X_train.columns else None

# Pipeline: preprocess -> generate features -> select features -> estimator
model_class = model_configs[selected_model]['model']
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_gen', feature_gen),
    ('feature_select', feature_selector),
    ('model', model_class())
])

# Grid search over the selected model's grid; the 'model__' prefix routes each
# parameter to the final pipeline step.
param_grid = {f"model__{k}": v for k, v in model_configs[selected_model]['params'].items()}
parallelism = design_state['hyperparameters']['parallelism']
is_regression = prediction_type == 'Regression'
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=design_state['hyperparameters']['num_of_folds'],
    scoring='r2' if is_regression else 'accuracy',
    n_jobs=parallelism if parallelism > 0 else None
)

# Fit pipeline. sample_weight is forwarded only when weights exist: passing
# sample_weight=None still raises TypeError for estimators whose fit() has no
# such parameter.
fit_params = {} if sample_weight is None else {'model__sample_weight': sample_weight}
grid_search.fit(X_train, y_train, **fit_params)

# Evaluate on the held-out split
y_pred = grid_search.predict(X_test)
score = r2_score(y_test, y_pred) if is_regression else accuracy_score(y_test, y_pred)
print(f"Final evaluation score ({'R²' if is_regression else 'Accuracy'}): {score:.4f}")

# Save the fitted search object (it wraps the refitted best pipeline)
joblib.dump(grid_search, 'trained_pipeline.joblib')

Training pipeline with RandomForestRegressor...
Final evaluation score (R²): 0.8979


['trained_pipeline.joblib']