In [39]:
import json
import os

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import (
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectFromModel, VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge, SGDRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor



In [40]:
def read_json_file(file_name="algoparams_from_ui.json"):
    """
    Reads and parses a JSON file. Converts RTF to text if necessary.

    The file type is chosen from the extension, compared case-insensitively:
    ``.json`` is parsed directly, ``.rtf`` is converted to plain text with
    ``striprtf`` first and the resulting text is parsed as JSON.

    On any failure an error message is printed and ``SystemExit(1)`` is raised.
    ``SystemExit`` is raised explicitly (rather than calling the interactive
    ``exit()`` helper) so that the failure carries a non-zero status.

    :param file_name: Name of the JSON file to read.
    :return: Parsed JSON data.
    :raises SystemExit: If the file is missing, has an unsupported extension,
        or cannot be read/parsed.
    """
    if not os.path.exists(file_name):
        print(f"Error: File '{file_name}' not found.")
        raise SystemExit(1)

    _, ext = os.path.splitext(file_name)
    kind = ext.lower()  # accept ".JSON" / ".Rtf" as well

    if kind not in (".json", ".rtf"):
        print(f"Error: Unsupported file extension '{ext}'. Only .json and .rtf are supported.")
        raise SystemExit(1)

    try:
        if kind == ".json":
            # JSON is UTF-8 by specification; do not depend on the platform default.
            with open(file_name, "r", encoding="utf-8") as file:
                return json.load(file)

        # Imported lazily so plain .json usage does not require striprtf.
        from striprtf.striprtf import rtf_to_text  # Requires striprtf module
        with open(file_name, "r") as file:
            text_content = rtf_to_text(file.read())
        return json.loads(text_content)

    except json.JSONDecodeError:
        print(f"Error: The file '{file_name}' is not a valid JSON file.")
        raise SystemExit(1)
    except Exception as e:
        print(f"Error: An unexpected error occurred: {e}")
        raise SystemExit(1)

# Example usage: load the configuration from the default file name
# ("algoparams_from_ui.json") and echo the parsed dictionary.
config = read_json_file()
print(config)

{'session_name': 'test', 'session_description': 'test', 'design_state_data': {'session_info': {'project_id': '1', 'experiment_id': 'kkkk-11', 'dataset': 'iris_modified.csv', 'session_name': 'test', 'session_description': 'test'}, 'target': {'prediction_type': 'Regression', 'target': 'petal_width', 'type': 'regression', 'partitioning': True}, 'train': {'policy': 'Split the dataset', 'time_variable': 'sepal_length', 'sampling_method': 'No sampling(whole data)', 'split': 'Randomly', 'k_fold': False, 'train_ratio': 0.2, 'random_seed': 0}, 'metrics': {'optomize_model_hyperparameters_for': 'AUC', 'optimize_threshold_for': 'F1 Score', 'compute_lift_at': 0, 'cost_matrix_gain_for_true_prediction_true_result': 1, 'cost_matrix_gain_for_true_prediction_false_result': 0, 'cost_matrix_gain_for_false_prediction_true_result': 0, 'cost_matrix_gain_for_false_prediction_false_result': 0}, 'feature_handling': {'sepal_length': {'feature_name': 'sepal_length', 'is_selected': True, 'feature_variable_type': '

In [41]:
# Show the parsed configuration dictionary as the cell output.
config

{'session_name': 'test',
 'session_description': 'test',
 'design_state_data': {'session_info': {'project_id': '1',
   'experiment_id': 'kkkk-11',
   'dataset': 'iris_modified.csv',
   'session_name': 'test',
   'session_description': 'test'},
  'target': {'prediction_type': 'Regression',
   'target': 'petal_width',
   'type': 'regression',
   'partitioning': True},
  'train': {'policy': 'Split the dataset',
   'time_variable': 'sepal_length',
   'sampling_method': 'No sampling(whole data)',
   'split': 'Randomly',
   'k_fold': False,
   'train_ratio': 0.2,
   'random_seed': 0},
  'metrics': {'optomize_model_hyperparameters_for': 'AUC',
   'optimize_threshold_for': 'F1 Score',
   'compute_lift_at': 0,
   'cost_matrix_gain_for_true_prediction_true_result': 1,
   'cost_matrix_gain_for_true_prediction_false_result': 0,
   'cost_matrix_gain_for_false_prediction_true_result': 0,
   'cost_matrix_gain_for_false_prediction_false_result': 0},
  'feature_handling': {'sepal_length': {'feature_nam

In [42]:
# Load the dataset into a DataFrame and print it.
# NOTE(review): the configuration's session_info names the dataset
# 'iris_modified.csv', while this cell reads "iris.csv" - confirm which file is intended.
data=pd.read_csv("iris.csv")
print(data)

     sepal_length  sepal_width  petal_length  petal_width         species
0             5.1          3.5           1.4          0.2     Iris-setosa
1             4.9          3.0           1.4          0.2     Iris-setosa
2             4.7          3.2           1.3          0.2     Iris-setosa
3             4.6          3.1           1.5          0.2     Iris-setosa
4             5.0          3.6           1.4          0.2     Iris-setosa
..            ...          ...           ...          ...             ...
145           6.7          3.0           5.2          2.3  Iris-virginica
146           6.3          2.5           5.0          1.9  Iris-virginica
147           6.5          3.0           5.2          2.0  Iris-virginica
148           6.2          3.4           5.4          2.3  Iris-virginica
149           5.9          3.0           5.1          1.8  Iris-virginica

[150 rows x 5 columns]


In [43]:
def datapreprocessing(config,data):
    """
    Create a preprocessing pipeline for numerical and text features based on feature_handling.

    Steps:
      1. Keep the features flagged ``is_selected`` in the configuration.
      2. Split predictors and target into train/test parts (fixed 80/20 split,
         seeded with ``train.random_seed``).
      3. Build one ColumnTransformer entry per predictor:
         * numerical + ``missing_values == "Impute"`` -> SimpleImputer
           (mean for "Average of values", constant for "custom");
         * numerical without imputation -> ``"passthrough"``, so the column is
           not silently removed by ColumnTransformer's default ``remainder='drop'``;
         * text + ``text_handling == 'Tokenize and hash'`` -> HashingVectorizer
           with ``2 ** hash_columns`` output features.

    NOTE(review): the configuration also carries ``train.train_ratio``; it is not
    consulted here and the test share stays hard-coded at 0.2 - confirm the intent.

    :param config: Parsed configuration dictionary (``design_state_data`` layout).
    :param data: DataFrame holding the selected features and the target column.
    :return: Tuple ``(preprocessor, preproFeatures, X_train, X_test, y_train, y_test, target)``
        where ``preprocessor`` is an unfitted ColumnTransformer and
        ``preproFeatures`` lists the predictor column names.
    :raises ValueError: If a numerical feature requests imputation with an
        unsupported ``impute_with`` value.
    """
    design = config["design_state_data"]
    feature_handling = design["feature_handling"]
    target = design["target"]["target"]

    feature_to_select = [feature for feature, desc in feature_handling.items() if desc["is_selected"]]

    # Predictors are the selected features minus the target; the target is read
    # straight from `data` so it does not have to be flagged as selected itself.
    predictors = [feature for feature in feature_to_select if feature != target]

    X_train, X_test, y_train, y_test = train_test_split(
        data[predictors],
        data[target],
        test_size=0.2,
        random_state=design["train"]["random_seed"],
    )

    preproFeatures = list(X_train.columns)

    numerical_features = [
        feat for feat in preproFeatures
        if feature_handling[feat]['feature_variable_type'] == 'numerical'
    ]
    text_features = [
        feat for feat in preproFeatures
        if feature_handling[feat]['feature_variable_type'] == 'text'
    ]

    num_tfs = []
    text_tfs = []

    for feature in numerical_features:
        details = feature_handling[feature]["feature_details"]
        if details["missing_values"] == "Impute":
            impute_with = details["impute_with"]
            if impute_with == "Average of values":
                imputer = SimpleImputer(strategy="mean")
            elif impute_with == "custom":
                imputer = SimpleImputer(strategy="constant", fill_value=details["impute_value"])
            else:
                # Previously this fell through and reused the imputer of the
                # preceding feature (or failed with an unbound variable).
                raise ValueError(
                    f"Unsupported impute_with value '{impute_with}' for feature '{feature}'."
                )
            num_tfs.append((feature, imputer, [feature]))
        else:
            # Keep the column untouched instead of letting it be dropped.
            num_tfs.append((feature, "passthrough", [feature]))

    for feature in text_features:
        details = feature_handling[feature]["feature_details"]
        if details["text_handling"] == 'Tokenize and hash':
            hasher = HashingVectorizer(n_features=2**details['hash_columns'])
            # The column is given as a bare name (not a list) so the vectorizer
            # receives a 1-D sequence of strings.
            text_tfs.append((feature, hasher, feature))

    transformers = num_tfs + text_tfs

    return ColumnTransformer(transformers),preproFeatures,X_train, X_test, y_train, y_test, target

# Build the (unfitted) preprocessor and the train/test split from the loaded config and data.
preprocessor,preproFeatures,X_train, X_test, y_train, y_test, target= datapreprocessing(config,data)
    
    

In [44]:
# Show the ColumnTransformer returned by datapreprocessing as the cell output.
preprocessor

In [45]:
def y_preprocessing(y_train, y_test, target, config):
    """
    Impute missing values in the target according to its feature_handling entry.

    When the target's ``missing_values`` setting is not ``'Impute'`` the two
    series are returned unchanged (previously the function returned ``None``,
    which broke the caller's tuple unpacking).

    The imputer is fitted on the training target only and then applied to the
    test target, so no statistic is learned from the held-out data.

    :param y_train: Training target values (pandas Series).
    :param y_test: Test target values (pandas Series).
    :param target: Name of the target column; key into ``feature_handling``.
    :param config: Parsed configuration dictionary.
    :return: Tuple ``(y_train, y_test)``; after imputation both are new Series
        with a fresh default index.
    :raises ValueError: If imputation is requested with an unsupported
        ``impute_with`` value.
    """
    details = config["design_state_data"]["feature_handling"][target]['feature_details']

    if details['missing_values'] != 'Impute':
        return y_train, y_test

    impute_with = details['impute_with']
    if impute_with == "Average of values":
        m_imputer = SimpleImputer(strategy="mean")
    elif impute_with == "custom":
        m_imputer = SimpleImputer(strategy="constant", fill_value=details['impute_value'])
    else:
        raise ValueError(
            f"Unsupported impute_with value '{impute_with}' for target '{target}'."
        )

    # SimpleImputer works on 2-D input, hence the reshape to a single column.
    train_filled = m_imputer.fit_transform(y_train.values.reshape(-1, 1))
    test_filled = m_imputer.transform(y_test.values.reshape(-1, 1))
    return pd.Series(train_filled.flatten()), pd.Series(test_filled.flatten())
            
# Replace the raw target splits with their preprocessed versions.
y_train, y_test = y_preprocessing(y_train, y_test, target, config)

In [46]:
# Positional indices 0..len(preproFeatures)-1, one per predictor column.
No_of_columns = list(range(len(preproFeatures)))


In [47]:
def feature_reduction(config, columns=None):
    """
    Build the feature-reduction step described by the configuration.

    The chosen reducer is wrapped in a ColumnTransformer that applies it to the
    given column positions of the preprocessor output.

    Supported ``feature_reduction_method`` values:
      * ``"PCA"``              - PCA keeping ``num_of_features_to_keep`` components.
      * ``"No Reduction"``     - columns are passed through unchanged.
      * ``"Corr With Target"`` - implemented with ``VarianceThreshold(threshold=0.1)``.
        NOTE(review): a variance filter does not look at the target; confirm this
        is the intended stand-in for a correlation-based selection.
      * ``"Tree-based"``       - SelectFromModel over a RandomForestRegressor,
        keeping at most ``num_of_features_to_keep`` features.

    :param config: Parsed configuration dictionary.
    :param columns: Column positions the reducer operates on. Defaults to the
        notebook-level ``No_of_columns`` list when omitted.
    :return: An unfitted ColumnTransformer containing the single reduction step.
    :raises ValueError: If the configured method is not one of the above.
    """
    settings = config["design_state_data"]["feature_reduction"]
    method = settings['feature_reduction_method']

    if method == "PCA":
        step_name = 'pca'
        reducer = PCA(n_components=int(settings['num_of_features_to_keep']))

    elif method == "No Reduction":
        step_name = 'no_reduction'
        reducer = 'passthrough'

    elif method == "Corr With Target":
        step_name = 'var_thresh'
        reducer = VarianceThreshold(threshold=0.1)

    elif method == "Tree-based":
        step_name = 'feature_selector'
        reducer = SelectFromModel(
            RandomForestRegressor(
                n_estimators=int(settings['num_of_trees']),
                max_depth=int(settings['depth_of_trees'])
            ),
            max_features=int(settings["num_of_features_to_keep"]),
            # With the threshold disabled, max_features alone decides how many
            # features are kept.
            threshold=-np.inf
        )

    else:
        raise ValueError(f"Unknown feature reduction method: {method}")

    # Resolved only after the method is validated, so an unknown method is still
    # reported even if the notebook-level column list has not been defined yet.
    selected_columns = list(No_of_columns if columns is None else columns)

    # Columns outside `selected_columns` are dropped (ColumnTransformer default).
    return ColumnTransformer(transformers=[(step_name, reducer, selected_columns)])

# Build the reduction step.
# NOTE: this rebinds the name `feature_reduction` from the function to the
# transformer it returns, so the function cannot be called again until its
# definition is re-executed.
feature_reduction = feature_reduction(config) 

In [48]:
def _resolve_n_jobs(parallelism):
    """Translate the UI ``parallelism`` setting into ``n_jobs``; 0 means "use every core" (-1)."""
    return -1 if parallelism == 0 else parallelism


def _build_random_forest(details):
    """Build a RandomForestRegressor from its configuration entry."""
    return RandomForestRegressor(
        n_estimators=details['max_trees'],
        max_depth=details['max_depth'],
        min_samples_leaf=details['min_samples_per_leaf_min_value'],
        n_jobs=_resolve_n_jobs(details['parallelism'])
    )


def _build_gradient_boosting(details):
    """Build a GradientBoostingRegressor; an out-of-range subsample falls back to 1.0."""
    subsample = details['max_subsample']
    # The original test used `and` (> 1 and < 0) and therefore never triggered.
    if not (0 < subsample <= 1):
        subsample = 1.0
    return GradientBoostingRegressor(
        n_estimators=details['num_of_BoostingStages'][1],
        max_depth=details['max_depth'],
        learning_rate=details['max_stepsize'],
        subsample=subsample,
        max_features=details['fixed_number']
    )


def _build_decision_tree(details):
    """Build a DecisionTreeRegressor; the splitter is 'random' only if use_best is off and use_random is on."""
    if details["use_best"] or not details["use_random"]:
        splitter = 'best'
    else:
        splitter = 'random'
    return DecisionTreeRegressor(
        max_depth=details['max_depth'],
        min_samples_leaf=details['min_samples_per_leaf'][0],
        splitter=splitter
    )


def _build_linear_regression(details):
    """Build a LinearRegression model."""
    return LinearRegression(n_jobs=_resolve_n_jobs(details['parallelism']))


def _build_ridge(details):
    """Build a Ridge model using the lower bound of the regularisation range."""
    return Ridge(
        alpha=details['min_regparam'],
        max_iter=details['max_iter']
    )


def _build_lasso(details):
    """Build a Lasso model using the lower bound of the regularisation range."""
    return Lasso(
        alpha=details['min_regparam'],
        max_iter=details['max_iter']
    )


def _build_elastic_net(details):
    """Build an ElasticNet model from the minimum regularisation and mixing values."""
    return ElasticNet(
        alpha=details['min_regparam'],
        l1_ratio=details['min_elasticnet'],
        max_iter=details['max_iter']
    )


def _build_xgboost(details):
    """Build an XGBRegressor (xgboost is imported lazily, only when this model is selected)."""
    try:
        from xgboost import XGBRegressor  # optional dependency
    except ImportError as exc:
        raise ImportError(
            "The 'xg_boost' algorithm requires the xgboost package to be installed."
        ) from exc
    return XGBRegressor(
        n_estimators=details['max_num_of_trees'],
        max_depth=details['max_depth_of_tree'][1],
        learning_rate=details['learningRate'][0],
        # reg_alpha / reg_lambda are the scikit-learn wrapper's names for the
        # L1 / L2 penalties; `lambda_` is not a recognised parameter.
        reg_alpha=details['l1_regularization'][0],
        reg_lambda=details['l2_regularization'][0],
        gamma=details['gamma'][0],
        min_child_weight=details['min_child_weight'][0],
        subsample=details['sub_sample'][0],
        colsample_bytree=details['col_sample_by_tree'][0],
        random_state=details['random_state'],
        n_jobs=_resolve_n_jobs(details['parallelism'])
    )


def _build_svr(details):
    """Build an SVR; kernel and gamma are chosen from the boolean flags in the entry."""
    if details["linear_kernel"]:
        kernel = 'linear'
    elif details["sigmoid_kernel"]:
        kernel = 'sigmoid'
    elif details["poly_kernel"]:
        kernel = 'poly'
    else:
        kernel = 'rbf'

    if details["scale"]:
        gamma = 'scale'
    elif details["custom_gamma_values"]:
        # NOTE(review): the custom value is asked for interactively, which makes a
        # run non-reproducible; no configuration key for it is visible here.
        # Parsed as float because gamma is a real-valued coefficient.
        gamma = float(input("Enter the custom gamma value: "))
    else:
        gamma = 'auto'

    return SVR(
        kernel=kernel,
        C=details['c_value'][0],
        epsilon=0.1,
        gamma=gamma,
        tol=details['tolerance'],
        max_iter=details['max_iterations']
    )


def _build_sgd(details):
    """Build an SGDRegressor; the first enabled regularisation flag sets the penalty (default 'l2')."""
    if details["use_l1_regularization"]:
        penalty = "l1"
    elif details["use_l2_regularization"]:
        penalty = "l2"
    elif details["use_elastic_net_regularization"]:
        penalty = "elasticnet"
    else:
        penalty = "l2"

    return SGDRegressor(
        max_iter=details['max_iterations'],
        tol=details['tolerance'],
        alpha=details['alpha_value'][0],
        penalty=penalty
    )


def _build_knn(details):
    """Build a KNeighborsRegressor with uniform or distance weighting."""
    weight = 'distance' if details["distance_weighting"] else 'uniform'
    return KNeighborsRegressor(
        n_neighbors=details['k_value'][0],
        weights=weight,
        algorithm=details['neighbour_finding_algorithm'],
        p=details['p_value']
    )


def _build_extra_trees(details):
    """Build an ExtraTreesRegressor, mapping the sampling strategy label to max_features."""
    strategy = details["feature_sampling_statergy"]
    if strategy == "Square root":
        max_features = "sqrt"
    elif strategy == "Logarithm":
        max_features = "log2"
    else:
        max_features = 1.0

    return ExtraTreesRegressor(
        n_estimators=details['num_of_trees'][1],
        max_depth=details['max_depth'][1],
        min_samples_leaf=details['min_samples_per_leaf'][1],
        max_features=max_features,
        n_jobs=_resolve_n_jobs(details['parallelism'])
    )


def _build_mlp(details):
    """Build an MLPRegressor; batch size is 'auto' when automatic batching is on, else 200."""
    batch_size = 'auto' if details['automatic_batching'] else 200
    return MLPRegressor(
        hidden_layer_sizes=details['hidden_layer_sizes'],
        activation='relu',
        solver=details['solver'],
        alpha=details['alpha_value'],
        max_iter=details['max_iterations'],
        batch_size=batch_size,
        learning_rate_init=details['initial_learning_rate'],
        tol=details['convergence_tolerance'],
        early_stopping=details['early_stopping'],
        shuffle=details['shuffle_data'],
        beta_1=details['beta_1'],
        beta_2=details['beta_2'],
        epsilon=details['epsilon'],
        power_t=details['power_t'],
        momentum=details['momentum'],
        nesterovs_momentum=details['use_nesterov_momentum']
    )


# Configuration algorithm name -> builder for the matching regression estimator.
_MODEL_BUILDERS = {
    "RandomForestRegressor": _build_random_forest,
    "GBTRegressor": _build_gradient_boosting,
    "DecisionTreeRegressor": _build_decision_tree,
    "LinearRegression": _build_linear_regression,
    "RidgeRegression": _build_ridge,
    "LassoRegression": _build_lasso,
    "ElasticNetRegression": _build_elastic_net,
    "xg_boost": _build_xgboost,
    "SVM": _build_svr,
    "SGD": _build_sgd,
    "KNN": _build_knn,
    "extra_random_trees": _build_extra_trees,
    "neural_network": _build_mlp,
}


def models_init(config):
    """
    Instantiate the regression model selected in the configuration.

    The ``algorithms`` section is scanned in order and an estimator is built for
    the first entry that is both flagged ``is_selected`` and one of the supported
    regression algorithms. Selected entries with an unsupported name are skipped
    (previously they caused an unbound-variable error at the return statement).

    :param config: Parsed configuration dictionary.
    :return: An unfitted estimator configured from the entry's settings.
    :raises ValueError: If no supported algorithm is selected.
    :raises ImportError: If ``xg_boost`` is selected but xgboost is not installed.
    """
    for algo, details in config["design_state_data"]["algorithms"].items():
        if not details["is_selected"]:
            continue

        builder = _MODEL_BUILDERS.get(algo)
        if builder is None:
            continue

        return builder(details)

    # If no algorithm is selected, raise an error
    raise ValueError("No valid algorithm is selected in the JSON configuration.")
            
# Instantiate the estimator chosen in the configuration.
algo_object = models_init(config)            
            

In [49]:
# Chain preprocessing -> feature reduction -> regressor. The step name "regressor"
# is the prefix used by the "regressor__*" keys of the hyper-parameter grid.
steps=[("preprocessor", preprocessor), ("feature_reduction", feature_reduction), ("regressor", algo_object)]
pipeline = Pipeline(steps)
# Fit every step on the training split.
pipeline.fit(X_train, y_train)

In [50]:
    
# Score the fitted pipeline on the held-out split. The separate predict() call
# that used to precede this line was dropped: its result was discarded, and
# score() already predicts internally.
pipeline.score(X_test, y_test)

0.9115443494805675

In [51]:
def my_grid_search_cv(estimator=None):
    """
    Return the GridSearchCV parameter grid that matches the given estimator.

    Grids are looked up by class name along the estimator's method resolution
    order, most specific class first. This keeps the behaviour of an
    ``isinstance`` chain (subclasses match their parent's grid unless they have
    their own) without requiring every candidate estimator class to be imported.

    All keys carry the ``regressor__`` prefix, i.e. they address the pipeline
    step named "regressor".

    :param estimator: Estimator instance to pick the grid for. Defaults to the
        notebook-level ``algo_object`` when omitted.
    :return: Dict mapping parameter names to candidate value lists; empty when
        the estimator type has no predefined grid.
    """
    if estimator is None:
        estimator = algo_object

    grids = {
        "RandomForestRegressor": {
            "regressor__n_estimators": [100, 300, 500],
            "regressor__max_depth": [None, 10, 30],
            "regressor__min_samples_split": [2, 5, 10],
        },
        "GradientBoostingRegressor": {
            "regressor__n_estimators": [100, 200, 300],
            "regressor__learning_rate": [0.05, 0.1, 0.3],
            "regressor__subsample": [0.7, 0.85, 1.0],
            "regressor__max_depth": [3, 7, 15],
        },
        # `normalize` is no longer a LinearRegression parameter in current
        # scikit-learn releases, so it is not searched over.
        "LinearRegression": {
            "regressor__fit_intercept": [True, False],
        },
        "Ridge": {
            "regressor__alpha": [0.01, 0.1, 1.0, 10.0],
            "regressor__solver": ["auto", "svd", "cholesky"],
        },
        "Lasso": {
            "regressor__alpha": [0.001, 0.01, 0.1, 1.0],
            "regressor__selection": ["cyclic", "random"],
        },
        "ElasticNet": {
            "regressor__alpha": [0.001, 0.01, 0.1],
            "regressor__l1_ratio": [0.2, 0.5, 0.8],
            "regressor__selection": ["cyclic", "random"],
        },
        "XGBRegressor": {
            "regressor__n_estimators": [50, 150, 300],
            "regressor__max_depth": [4, 8, 12],
            "regressor__learning_rate": [0.01, 0.05, 0.2],
            "regressor__colsample_bytree": [0.7, 0.9, 1.0],
            "regressor__reg_lambda": [0.5, 1.0, 2.0],
        },
        "DecisionTreeRegressor": {
            "regressor__max_depth": [None, 15, 25],
            "regressor__min_samples_split": [2, 5, 10],
            "regressor__min_samples_leaf": [1, 3, 7],
        },
        "SVR": {
            "regressor__kernel": ["linear", "poly", "rbf"],
            "regressor__C": [0.5, 1.0, 5.0],
            "regressor__epsilon": [0.01, 0.1, 0.5],
        },
        "SGDRegressor": {
            "regressor__alpha": [0.0001, 0.001, 0.01],
            "regressor__penalty": ["l2", "elasticnet"],
            "regressor__learning_rate": ["constant", "adaptive"],
        },
        "KNeighborsRegressor": {
            "regressor__n_neighbors": [2, 5, 15],
            "regressor__weights": ["uniform", "distance"],
            "regressor__metric": ["euclidean", "manhattan"],
        },
        "ExtraTreesRegressor": {
            "regressor__n_estimators": [100, 300, 500],
            "regressor__max_depth": [None, 10, 30],
            "regressor__min_samples_split": [2, 5, 10],
        },
        "MLPRegressor": {
            "regressor__hidden_layer_sizes": [(50, 50), (100,), (50, 100, 50)],
            "regressor__activation": ["relu", "logistic"],
            "regressor__solver": ["adam", "sgd"],
            "regressor__learning_rate": ["constant", "adaptive"],
        },
    }

    # Walk from the concrete class towards `object`; the first class with a
    # registered grid wins.
    for cls in type(estimator).__mro__:
        grid = grids.get(cls.__name__)
        if grid is not None:
            return grid

    return {}

# Parameter grid for the estimator currently held in `algo_object`.
para_grid = my_grid_search_cv()


In [52]:
# Exhaustive search over `para_grid`, refitting the whole pipeline for each candidate.
# The worker count and the number of CV folds are read from the configuration's
# 'hyperparameters' section; candidates are ranked by R2.
grid_search = GridSearchCV(
    estimator=pipeline, 
    param_grid=para_grid,
    n_jobs=config['design_state_data']['hyperparameters']['parallelism'], 
    cv=config['design_state_data']['hyperparameters']['num_of_folds'], 
    scoring='r2', 
    verbose=2
)

# Fit the model
grid_search.fit(X_train, y_train)


# Report the winning parameter combination and its cross-validated score.
print("Best Parameters:", grid_search.best_params_)
print("Best R2 Score:", grid_search.best_score_)

Fitting 6 folds for each of 27 candidates, totalling 162 fits
Best Parameters: {'regressor__max_depth': 30, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 100}
Best R2 Score: 0.9385959239074141


In [53]:
# Show the fitted GridSearchCV object as the cell output.
grid_search

In [54]:
# Pipeline refitted by the grid search with the best parameter combination.
best_model = grid_search.best_estimator_

In [55]:
# Predictions of the tuned pipeline on the held-out test split.
y_pred = best_model.predict(X_test)

In [56]:
# Test-set regression metrics for the tuned pipeline.
MSE = mean_squared_error(y_test, y_pred)
MAE = mean_absolute_error(y_test, y_pred)
R2 = r2_score(y_test, y_pred)
# RMSE is derived from MSE so it is in the same units as the target.
RMSE = np.sqrt(MSE)

In [57]:
# Print the test-set metrics, one per line.
print("Mean Squared Error (MSE):", MSE)
print("Mean Absolute Error (MAE):", MAE)
print("R-squared (R2):", R2)
print("Root Mean Squared Error (RMSE):", RMSE)

Mean Squared Error (MSE): 0.041367372697637524
Mean Absolute Error (MAE): 0.1414629408617412
R-squared (R2): 0.9138359243956727
Root Mean Squared Error (RMSE): 0.20338970646922505
