In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import json
import re
from striprtf.striprtf import rtf_to_text
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Parse RTF and Extract JSON
with open("algoparams_from_ui.json.rtf", "r", encoding="utf-8") as f:
    rtf_content = f.read()

plain_text = rtf_to_text(rtf_content)
json_candidates = re.findall(r'\{.*\}', plain_text, re.DOTALL)
if not json_candidates:
    raise ValueError("No JSON-like block found in RTF file")
try:
    config = json.loads(json_candidates[0])
except json.JSONDecodeError as e:
    print("Failed JSON Text Preview:", json_candidates[0][:300])
    raise ValueError("Failed to decode JSON.") from e

In [3]:
# Extract info from JSON
cfg = config['design_state_data']
target_col = cfg['target']['target']
prediction_type = cfg['target']['prediction_type'].lower()
features_cfg = cfg['feature_handling']
dataset_name = cfg['session_info']['dataset']

In [4]:
# Override with correct file
dataset_name = "iris.csv"

df = pd.read_csv(dataset_name)

In [5]:
# Feature handling
num_features = []
cat_features = []
impute_values = {}

for feat_name, feat_info in features_cfg.items():
    if feat_info['is_selected']:
        ftype = feat_info['feature_variable_type']
        details = feat_info['feature_details']
        if ftype == 'numerical':
            num_features.append(feat_name)
            if details['impute_with'] == 'Average of values':
                impute_values[feat_name] = ('mean', None)
            elif details['impute_with'] == 'custom':
                impute_values[feat_name] = ('constant', details['impute_value'])
        elif ftype == 'text':
            cat_features.append(feat_name)
            impute_values[feat_name] = ('most_frequent', None)

In [6]:
# Create transformers
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features)
])

In [7]:
# Feature Reduction
reduction_cfg = cfg.get('feature_reduction', {})
reduction_method = reduction_cfg.get('feature_reduction_method', 'No Reduction')
reduction = None

if reduction_method == 'PCA':
    reduction = ('reduction', PCA(n_components=int(reduction_cfg.get('num_of_features_to_keep', 4))))
elif reduction_method == 'Tree-based':
    tree_model = RandomForestRegressor(n_estimators=int(reduction_cfg.get('num_of_trees', 5)))
    reduction = ('reduction', SelectFromModel(tree_model, max_features=int(reduction_cfg.get('num_of_features_to_keep', 4)), threshold=-np.inf))

In [8]:
# Model Configurations
models = []
algos = cfg['algorithms']

for model_key, model_cfg in algos.items():
    if not model_cfg['is_selected'] or prediction_type != 'regression':
        continue

    if model_key == 'RandomForestRegressor':
        model = RandomForestRegressor()
        param_grid = {
            'model__n_estimators': list(range(model_cfg['min_trees'], model_cfg['max_trees'] + 1, 5)),
            'model__max_depth': list(range(model_cfg['min_depth'], model_cfg['max_depth'] + 1, 5)),
            'model__min_samples_leaf': list(range(model_cfg['min_samples_per_leaf_min_value'], model_cfg['min_samples_per_leaf_max_value'] + 1, 5))
        }
    elif model_key == 'GBTRegressor':
        model = GradientBoostingRegressor()
        param_grid = {
            'model__n_estimators': list(range(model_cfg['min_iter'], model_cfg['max_iter'] + 1, 5)),
            'model__learning_rate': [model_cfg['min_stepsize'], model_cfg['max_stepsize']],
            'model__max_depth': list(range(model_cfg['min_depth'], model_cfg['max_depth'] + 1))
        }
    elif model_key == 'LinearRegression':
        model = LinearRegression()
        param_grid = {}
    elif model_key == 'RidgeRegression':
        model = Ridge()
        param_grid = {
            'model__alpha': np.linspace(model_cfg['min_regparam'], model_cfg['max_regparam'], 3),
            'model__max_iter': list(range(model_cfg['min_iter'], model_cfg['max_iter'] + 1, 10))
        }
    elif model_key == 'LassoRegression':
        model = Lasso()
        param_grid = {
            'model__alpha': np.linspace(model_cfg['min_regparam'], model_cfg['max_regparam'], 3),
            'model__max_iter': list(range(model_cfg['min_iter'], model_cfg['max_iter'] + 1, 10))
        }
    elif model_key == 'ElasticNetRegression':
        model = ElasticNet()
        param_grid = {
            'model__alpha': np.linspace(model_cfg['min_regparam'], model_cfg['max_regparam'], 3),
            'model__l1_ratio': np.linspace(model_cfg['min_elasticnet'], model_cfg['max_elasticnet'], 3),
            'model__max_iter': list(range(model_cfg['min_iter'], model_cfg['max_iter'] + 1, 10))
        }
    elif model_key == 'DecisionTreeRegressor':
        model = DecisionTreeRegressor()
        param_grid = {
            'model__max_depth': list(range(model_cfg['min_depth'], model_cfg['max_depth'] + 1)),
            'model__min_samples_leaf': model_cfg['min_samples_per_leaf']
        }
    elif model_key == 'extra_random_trees':
        model = ExtraTreesRegressor()
        param_grid = {
            'model__n_estimators': model_cfg['num_of_trees'],
            'model__max_depth': model_cfg['max_depth'],
            'model__min_samples_leaf': model_cfg['min_samples_per_leaf']
        }
    else:
        continue

    models.append((model_key, model, param_grid))

In [9]:
# Train/test split
X = df[num_features + cat_features]
y = df[target_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Fit models and evaluate
for model_name, model_obj, grid_params in models:
    steps = [('preprocessor', preprocessor)]
    if reduction:
        steps.append(reduction)
    steps.append(('model', model_obj))
    
    pipe = Pipeline(steps=steps)
    grid = GridSearchCV(pipe, param_grid=grid_params, cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    
    y_pred = grid.predict(X_test)
    print(f"Model: {model_name}")
    print(f"Best Params: {grid.best_params_}")
    print(f"R2 Score: {r2_score(y_test, y_pred):.4f}")
    print(f"MSE: {mean_squared_error(y_test, y_pred):.4f}")

Model: RandomForestRegressor
Best Params: {'model__max_depth': 25, 'model__min_samples_leaf': 5, 'model__n_estimators': 10}
R2 Score: 0.9956
MSE: 0.0028
