In [254]:
import os
import json
from striprtf.striprtf import rtf_to_text
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel, VarianceThreshold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [255]:
def json_file_data(file_name="algoparams_from_ui.json"):
    """Load the pipeline configuration from a JSON (or RTF-wrapped JSON) file.

    Parameters
    ----------
    file_name : str, optional
        Path of the configuration file. A ``.json`` file is parsed directly;
        a ``.rtf`` file is first converted to plain text with ``rtf_to_text``
        and that text is then parsed as JSON. The extension check is
        case-sensitive.

    Returns
    -------
    The decoded JSON document.

    Raises
    ------
    FileNotFoundError
        If ``file_name`` does not exist.
    ValueError
        If the extension is neither ``.json`` nor ``.rtf``, or if the content
        is not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    # Fail loudly instead of print + exit(): every later cell needs the config,
    # and exit() inside a notebook tears down the kernel rather than reporting.
    if not os.path.exists(file_name):
        raise FileNotFoundError(f"File not found: {file_name}")

    _, ext = os.path.splitext(file_name)

    if ext == ".json":
        with open(file_name, "r") as f:
            return json.load(f)

    if ext == ".rtf":
        with open(file_name, "r") as f:
            required_data = rtf_to_text(f.read())
        try:
            return json.loads(required_data)
        except json.JSONDecodeError as err:
            raise ValueError(f"File is not JSON: {file_name}") from err

    # Previously this path fell through and hit `return d` with `d` unbound.
    raise ValueError(
        f"Unsupported file type {ext!r}: expected a .json or .rtf file"
    )

# Load the pipeline configuration once; the later cells read it through `d`.
d = json_file_data()




In [256]:
# Read the dataset from the working directory and display it as the cell output.
df =pd.read_csv("iris.csv")
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [257]:
def data_preprocessing(df,d):
    """Split the data and build the (unfitted) feature preprocessor from the config.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset; must contain every feature marked ``is_selected`` in
        ``d["design_state_data"]["feature_handling"]``.
    d : dict
        Decoded configuration. Reads ``feature_handling``, ``target.target``
        and ``train.train_ratio`` / ``train.random_seed`` under
        ``design_state_data``.

    Returns
    -------
    tuple
        ``(preprocessor, f_list_for_preproc, X_train, X_test, y_train, y_test,
        target_column_name)`` where ``preprocessor`` is an unfitted
        ``ColumnTransformer`` and ``f_list_for_preproc`` lists the columns of
        ``X_train`` (the selected features minus the target).

    Raises
    ------
    ValueError
        If a numerical feature asks for imputation with an ``impute_with``
        value other than ``"Average of values"`` or ``"custom"``.
    """
    design = d["design_state_data"]
    feature_handling = design["feature_handling"]

    select_features = {
        fname: fdescrip
        for fname, fdescrip in feature_handling.items()
        if fdescrip["is_selected"] == True
    }

    target_column_name = design['target']['target']

    select_features_list = list(select_features.keys())
    ndf = df[select_features_list]

    train_cfg = design['train']
    # NOTE(review): the configured "train_ratio" is passed as `test_size`,
    # i.e. it is used as the *test* fraction -- confirm this is intended.
    X_train, X_test, y_train, y_test = train_test_split(
        ndf.drop(columns=[target_column_name]),
        ndf[target_column_name],
        test_size=train_cfg["train_ratio"],
        random_state=train_cfg["random_seed"],
    )

    # X_train is built from the selected columns only, so its columns are
    # exactly the features that go through preprocessing.
    f_list_for_preproc = list(X_train.columns)

    num_features = []
    text_features = []
    for col in f_list_for_preproc:
        variable_type = select_features[col]['feature_variable_type']
        if variable_type == 'numerical':
            num_features.append(col)
        if variable_type == 'text':
            text_features.append(col)

    num_transformers = []
    for col in num_features:
        details = select_features[col]['feature_details']
        # NOTE(review): numerical columns that are not marked 'Impute' get no
        # transformer and are therefore dropped by the ColumnTransformer
        # (default remainder='drop') -- confirm this is intended.
        if details['missing_values'] != 'Impute':
            continue

        impute_with = details['impute_with']
        if impute_with == "Average of values":
            imputer = SimpleImputer(strategy="mean")
        elif impute_with == "custom":
            imputer = SimpleImputer(strategy="constant", fill_value=details['impute_value'])
        else:
            # Previously this fell through and either raised UnboundLocalError
            # or silently reused the imputer built for the previous column.
            raise ValueError(
                f"Unsupported impute_with value {impute_with!r} for feature {col!r}"
            )
        num_transformers.append((col, imputer, [col]))

    text_transformers = []
    for col in text_features:
        details = select_features[col]["feature_details"]
        if details["text_handling"] == 'Tokenize and hash':
            hasher = HashingVectorizer(n_features=2**details['hash_columns'])
            # The column is given as a bare name (not a list) so the
            # vectorizer receives a 1-D sequence of documents.
            text_transformers.append((col, hasher, col))

    # Numerical transformers come first, then text, as in the original order.
    transformers = num_transformers + text_transformers

    return ColumnTransformer(transformers),f_list_for_preproc,X_train, X_test, y_train, y_test, target_column_name


# Build the unfitted preprocessor and the train/test split from the loaded configuration.
preprocessor,f_list_for_preproc,X_train, X_test, y_train, y_test, target_column_name= data_preprocessing(df,d)
        

In [258]:
def y_train_test_preprocessing(y_train, y_test, target_column_name, d):
    """Impute missing target values as configured for the target column.

    Parameters
    ----------
    y_train, y_test : pandas.Series
        Target values of the train and test splits.
    target_column_name : str
        Key of the target inside ``d["design_state_data"]["feature_handling"]``.
    d : dict
        Decoded configuration.

    Returns
    -------
    tuple of pandas.Series
        ``(y_train, y_test)``. When the target's ``missing_values`` setting is
        not ``'Impute'`` the inputs are returned unchanged; otherwise two new
        Series (with a fresh default index) holding the imputed values.

    Raises
    ------
    ValueError
        If imputation is requested with an ``impute_with`` value other than
        ``"Average of values"`` or ``"custom"``.
    """
    details = d["design_state_data"]["feature_handling"][target_column_name]['feature_details']

    # Nothing to do: hand the inputs back so the caller's tuple-unpacking
    # still works (the function used to fall off the end and return None).
    if details['missing_values'] != 'Impute':
        return y_train, y_test

    impute_with = details['impute_with']
    if impute_with == "Average of values":
        S_imputer = SimpleImputer(strategy="mean")
    elif impute_with == "custom":
        S_imputer = SimpleImputer(strategy="constant", fill_value=details['impute_value'])
    else:
        raise ValueError(
            f"Unsupported impute_with value {impute_with!r} for target {target_column_name!r}"
        )

    # Fit on the training target only and reuse those statistics for the test
    # target; re-fitting on y_test would leak test-set information.
    y_train_imputed = S_imputer.fit_transform(y_train.values.reshape(-1, 1))
    y_test_imputed = S_imputer.transform(y_test.values.reshape(-1, 1))
    return pd.Series(y_train_imputed.flatten()), pd.Series(y_test_imputed.flatten())
            
# Apply the configured target imputation. This rebinds y_train / y_test, so
# re-running the cell feeds the function its own previous output.
y_train, y_test = y_train_test_preprocessing(y_train, y_test, target_column_name, d)
            



In [259]:
# Inspect which imputation strategy the configuration sets for the target column.
d["design_state_data"]["feature_handling"][target_column_name]['feature_details']['impute_with']

'custom'

In [260]:


# Positional indices 0..n-1, one per feature listed in f_list_for_preproc.
# NOTE(review): my_feature_reduction uses these to index the preprocessor's
# output, which presumes that output has exactly n columns -- confirm.
col_no_list = list(range(len(f_list_for_preproc)))

In [261]:
def my_feature_reduction(d, columns=None):
    """Build the feature-reduction step selected in the configuration.

    Parameters
    ----------
    d : dict
        Decoded configuration; reads ``design_state_data.feature_reduction``.
    columns : list of int, optional
        Positional column indices the reduction step is applied to. Defaults
        to the notebook-level ``col_no_list``.

    Returns
    -------
    sklearn.compose.ColumnTransformer
        An unfitted transformer wrapping the chosen reduction method.

    Raises
    ------
    ValueError
        If ``feature_reduction_method`` is not one of the handled methods.
    """
    f_r = d["design_state_data"]["feature_reduction"]
    f_r_method = f_r['feature_reduction_method']

    if columns is None:
        # Fall back to the notebook global so existing one-argument calls keep working.
        columns = col_no_list

    if f_r_method == 'No Reduction':
        feature_reduction = ColumnTransformer(
            transformers=[('no_reduction', 'passthrough', columns)]
        )

    elif f_r_method == 'Correlation with target':
        # NOTE(review): this branch applies VarianceThreshold(0.1), which never
        # looks at the target -- confirm it is the intended stand-in for a
        # correlation-based filter.
        feature_reduction = ColumnTransformer(
            transformers=[('var_thresh', VarianceThreshold(threshold=0.1), columns)],
            remainder='drop'
        )

    elif f_r_method == 'Tree-based':
        selector = SelectFromModel(
            RandomForestRegressor(
                n_estimators=int(f_r['num_of_trees']),
                max_depth=int(f_r['depth_of_trees'])
            ),
            max_features=int(f_r["num_of_features_to_keep"]),
            # With the threshold at -inf, only max_features limits the selection.
            threshold=-np.inf
        )
        feature_reduction = ColumnTransformer(
            transformers=[('feature_selector', selector, columns)]
        )

    elif f_r_method == 'Principal Component Analysis':
        num_components = int(f_r['num_of_features_to_keep'])
        feature_reduction = ColumnTransformer(
            transformers=[('pca', PCA(n_components=num_components), columns)]
        )

    else:
        # Previously an unknown method reached `return feature_reduction`
        # with the name unbound (UnboundLocalError).
        raise ValueError(f"Unsupported feature_reduction_method: {f_r_method!r}")

    return feature_reduction
# Build the feature-reduction step chosen in the configuration.
feature_reduction = my_feature_reduction(d)

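Regression algorithms supported by `my_model_selection`, in the order they are checked (the first one marked as selected is used):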

1. RandomForestRegressor  
2. GBTRegressor  
3. LinearRegression  
4. RidgeRegression  
5. LassoRegression  
6. ElasticNetRegression  
7. xg_boost  
8. DecisionTreeRegressor  
9. SVM (Support Vector Regression - SVR)  
10. SGD (Stochastic Gradient Descent)  
11. KNN (k-Nearest Neighbors)  
12. Extra Random Trees  
13. Neural Network

In [262]:
def _resolve_n_jobs(parallelism):
    """Translate the config's ``parallelism`` value to ``n_jobs`` (0 becomes -1, i.e. all cores)."""
    return -1 if parallelism == 0 else parallelism


def _build_random_forest(algo):
    """Build a RandomForestRegressor from its config section."""
    return RandomForestRegressor(
        n_estimators=algo['max_trees'],
        max_depth=algo['max_depth'],
        min_samples_leaf=algo['min_samples_per_leaf_min_value'],
        n_jobs=_resolve_n_jobs(algo['parallelism'])
    )


def _build_gradient_boosting(algo):
    """Build a GradientBoostingRegressor from its config section."""
    subsample = algo['max_subsample']
    # The old check (`> 1 and < 0`) could never be true; scikit-learn needs
    # subsample in (0, 1], so anything outside that range falls back to 1.0.
    if not (0 < subsample <= 1):
        subsample = 1.0
    return GradientBoostingRegressor(
        n_estimators=algo['num_of_BoostingStages'][1],
        max_depth=algo['max_depth'],
        learning_rate=algo['max_stepsize'],
        subsample=subsample,
        max_features=algo['fixed_number']
    )


def _build_linear_regression(algo):
    """Build a LinearRegression from its config section."""
    return LinearRegression(n_jobs=_resolve_n_jobs(algo['parallelism']))


def _build_ridge(algo):
    """Build a Ridge regressor from its config section."""
    return Ridge(
        alpha=algo['min_regparam'],
        max_iter=algo['max_iter']
    )


def _build_lasso(algo):
    """Build a Lasso regressor from its config section."""
    return Lasso(
        alpha=algo['min_regparam'],
        max_iter=algo['max_iter']
    )


def _build_elastic_net(algo):
    """Build an ElasticNet regressor from its config section."""
    return ElasticNet(
        alpha=algo['min_regparam'],
        l1_ratio=algo['min_elasticnet'],
        max_iter=algo['max_iter']
    )


def _build_xgboost(algo):
    """Build an XGBRegressor from its config section."""
    return XGBRegressor(
        n_estimators=algo['max_num_of_trees'],
        max_depth=algo['max_depth_of_tree'][1],
        learning_rate=algo['learningRate'][0],
        # Use the scikit-learn wrapper's own names; the former `lambda_`
        # keyword is not a recognised parameter, so L2 regularisation was
        # not being applied.
        reg_alpha=algo['l1_regularization'][0],
        reg_lambda=algo['l2_regularization'][0],
        gamma=algo['gamma'][0],
        min_child_weight=algo['min_child_weight'][0],
        subsample=algo['sub_sample'][0],
        colsample_bytree=algo['col_sample_by_tree'][0],
        random_state=algo['random_state'],
        n_jobs=_resolve_n_jobs(algo['parallelism'])
    )


def _build_decision_tree(algo):
    """Build a DecisionTreeRegressor from its config section."""
    if algo["use_best"] == True:
        splitter = 'best'
    elif algo["use_random"] == True:
        splitter = 'random'
    else:
        splitter = 'best'
    return DecisionTreeRegressor(
        max_depth=algo['max_depth'],
        min_samples_leaf=algo['min_samples_per_leaf'][0],
        splitter=splitter
    )


def _build_svr(algo):
    """Build an SVR from its config section (may prompt for a custom gamma)."""
    if algo["linear_kernel"] == True:
        kernel = 'linear'
    elif algo["sigmoid_kernel"] == True:
        kernel = 'sigmoid'
    elif algo["poly_kernel"] == True:
        kernel = 'poly'
    else:
        kernel = 'rbf'

    if algo["scale"] == True:
        gamma = 'scale'
    elif algo["custom_gamma_values"] == True:
        # gamma is a real-valued coefficient; int() rejected inputs like "0.1".
        # NOTE(review): interactive input makes this path non-reproducible
        # under "Run All"; consider reading the value from the config.
        gamma = float(input("Enter the custom gamma value: "))
    else:
        gamma = 'auto'

    return SVR(
        kernel=kernel,
        C=algo['c_value'][0],
        epsilon=0.1,
        gamma=gamma,
        tol=algo['tolerance'],
        max_iter=algo['max_iterations']
    )


def _build_sgd(algo):
    """Build an SGDRegressor from its config section."""
    if algo["use_l1_regularization"] == True:
        penalty = "l1"
    elif algo["use_l2_regularization"] == True:
        penalty = "l2"
    elif algo["use_elastic_net_regularization"] == True:
        penalty = "elasticnet"
    else:
        penalty = "l2"

    return SGDRegressor(
        max_iter=algo['max_iterations'],
        tol=algo['tolerance'],
        alpha=algo['alpha_value'][0],
        penalty=penalty
    )


def _build_knn(algo):
    """Build a KNeighborsRegressor from its config section."""
    weight = 'distance' if algo["distance_weighting"] == True else 'uniform'
    return KNeighborsRegressor(
        n_neighbors=algo['k_value'][0],
        weights=weight,
        algorithm=algo['neighbour_finding_algorithm'],
        p=algo['p_value']
    )


def _build_extra_trees(algo):
    """Build an ExtraTreesRegressor from its config section."""
    strategy = algo["feature_sampling_statergy"]
    if strategy == "Square root":
        max_features = "sqrt"
    elif strategy == "Logarithm":
        max_features = "log2"
    else:
        max_features = 1.0

    return ExtraTreesRegressor(
        n_estimators=algo['num_of_trees'][1],
        max_depth=algo['max_depth'][1],
        min_samples_leaf=algo['min_samples_per_leaf'][1],
        max_features=max_features,
        # n_jobs=0 is rejected when the forest is fitted, so map 0 to "all
        # cores" exactly as the random-forest and linear branches do.
        n_jobs=_resolve_n_jobs(algo['parallelism'])
    )


def _build_neural_network(algo):
    """Build an MLPRegressor from its config section."""
    batch_size = 'auto' if algo['automatic_batching'] == True else 200
    return MLPRegressor(
        hidden_layer_sizes=algo['hidden_layer_sizes'],
        activation='relu',
        solver=algo['solver'],
        alpha=algo['alpha_value'],
        max_iter=algo['max_iterations'],
        batch_size=batch_size,
        learning_rate_init=algo['initial_learning_rate'],
        tol=algo['convergence_tolerance'],
        early_stopping=algo['early_stopping'],
        shuffle=algo['shuffle_data'],
        beta_1=algo['beta_1'],
        beta_2=algo['beta_2'],
        epsilon=algo['epsilon'],
        power_t=algo['power_t'],
        momentum=algo['momentum'],
        nesterovs_momentum=algo['use_nesterov_momentum']
    )


# Config key -> builder, in the order the sections are checked.
_MODEL_BUILDERS = (
    ('RandomForestRegressor', _build_random_forest),
    ('GBTRegressor', _build_gradient_boosting),
    ('LinearRegression', _build_linear_regression),
    ('RidgeRegression', _build_ridge),
    ('LassoRegression', _build_lasso),
    ('ElasticNetRegression', _build_elastic_net),
    ('xg_boost', _build_xgboost),
    ('DecisionTreeRegressor', _build_decision_tree),
    ('SVM', _build_svr),
    ('SGD', _build_sgd),
    ('KNN', _build_knn),
    ('extra_random_trees', _build_extra_trees),
    ('neural_network', _build_neural_network),
)


def my_model_selection(d):
    """Instantiate the first regression algorithm marked as selected in the config.

    Parameters
    ----------
    d : dict
        Decoded configuration; reads ``design_state_data.algorithms``. Sections
        are checked in the order of ``_MODEL_BUILDERS`` and only the first one
        whose ``is_selected`` is true is used.

    Returns
    -------
    An unfitted regressor configured from the selected section.

    Raises
    ------
    KeyError
        If a checked algorithm section or one of its settings is missing.
    ValueError
        If no algorithm is marked as selected (this used to surface as an
        ``UnboundLocalError`` on the final ``return``).
    """
    algorithms = d["design_state_data"]["algorithms"]
    for key, build in _MODEL_BUILDERS:
        algo = algorithms[key]
        if algo['is_selected'] == True:
            return build(algo)

    raise ValueError("No regression algorithm is selected in the configuration")

# Instantiate the regressor selected in the configuration; my_grid_search_cv reads this global.
algo_object = my_model_selection(d)


In [263]:
# Chain preprocessing -> feature reduction -> regressor. The step name
# "regressor" is what the "regressor__*" keys of the parameter grid refer to.
steps=[("preprocessor", preprocessor), ("feature_reduction", feature_reduction), ("regressor", algo_object)]
pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)

In [264]:
# Only the last expression is displayed: the predictions computed on the first
# line are discarded, and the cell output is the pipeline's score on the test split.
pipeline.predict(X_test)
pipeline.score(X_test, y_test)

0.927191433323976

In [265]:
def my_grid_search_cv():
    """Return the hyper-parameter grid matching the type of the global ``algo_object``.

    Every key carries the ``regressor__`` prefix. The type checks run in a
    fixed order and the first match wins; an unrecognised estimator type
    yields an empty dict.
    """
    if isinstance(algo_object, RandomForestRegressor):
        return {
            "regressor__n_estimators": [50, 100, 200],
            "regressor__max_depth": [None, 10, 20],
            "regressor__min_samples_leaf": [1, 2, 5],
        }

    if isinstance(algo_object, GradientBoostingRegressor):
        return {
            "regressor__n_estimators": [50, 100, 200],
            "regressor__learning_rate": [0.01, 0.1, 0.2],
            "regressor__max_depth": [3, 5, 10],
        }

    if isinstance(algo_object, LinearRegression):
        return {
            "regressor__fit_intercept": [True, False],
        }

    if isinstance(algo_object, Ridge):
        return {
            "regressor__alpha": [0.1, 1.0, 10.0],
            "regressor__max_iter": [1000, 2000],
        }

    # Lasso is tested before ElasticNet, keeping the original check order.
    if isinstance(algo_object, Lasso):
        return {
            "regressor__alpha": [0.01, 0.1, 1.0],
            "regressor__max_iter": [1000, 2000],
        }

    if isinstance(algo_object, ElasticNet):
        return {
            "regressor__alpha": [0.01, 0.1, 1.0],
            "regressor__l1_ratio": [0.1, 0.5, 0.9],
            "regressor__max_iter": [1000, 2000],
        }

    if isinstance(algo_object, XGBRegressor):
        return {
            "regressor__n_estimators": [50, 100, 200],
            "regressor__max_depth": [3, 5, 10],
            "regressor__learning_rate": [0.01, 0.1, 0.2],
            "regressor__subsample": [0.8, 1.0],
            "regressor__colsample_bytree": [0.8, 1.0],
        }

    if isinstance(algo_object, DecisionTreeRegressor):
        return {
            "regressor__max_depth": [None, 10, 20],
            "regressor__min_samples_leaf": [1, 2, 5],
            "regressor__splitter": ["best", "random"],
        }

    if isinstance(algo_object, SVR):
        return {
            "regressor__kernel": ["linear", "poly", "rbf", "sigmoid"],
            "regressor__C": [0.1, 1.0, 10.0],
            "regressor__gamma": ["scale", "auto"],
        }

    if isinstance(algo_object, SGDRegressor):
        return {
            "regressor__alpha": [0.0001, 0.001, 0.01],
            "regressor__penalty": ["l1", "l2", "elasticnet"],
            "regressor__max_iter": [1000, 2000, 3000],
            "regressor__tol": [1e-3, 1e-4],
        }

    if isinstance(algo_object, KNeighborsRegressor):
        return {
            "regressor__n_neighbors": [3, 5, 10],
            "regressor__weights": ["uniform", "distance"],
            "regressor__p": [1, 2],
        }

    if isinstance(algo_object, ExtraTreesRegressor):
        return {
            "regressor__n_estimators": [50, 100, 200],
            "regressor__max_depth": [None, 10, 20],
            "regressor__min_samples_leaf": [1, 2, 5],
            "regressor__max_features": ["sqrt", "log2", None],
        }

    if isinstance(algo_object, MLPRegressor):
        return {
            "regressor__hidden_layer_sizes": [(50,), (100,), (50, 50)],
            "regressor__activation": ["relu", "tanh", "logistic"],
            "regressor__solver": ["adam", "sgd", "lbfgs"],
            "regressor__learning_rate_init": [0.001, 0.01, 0.1],
            "regressor__max_iter": [200, 500, 1000],
        }

    # No known estimator type matched: an empty grid.
    return {}

# Parameter grid for the estimator currently bound to the global `algo_object`.
para_grid = my_grid_search_cv()

In [266]:


# Search `para_grid` with the whole pipeline as the estimator, scoring each
# candidate by R^2 over the configured number of folds.
grid_search = GridSearchCV(
    estimator=pipeline, 
    param_grid=para_grid,
    # NOTE(review): worker count is hard-coded; the configured value is the commented-out line below.
    n_jobs = 20,
    #n_jobs = d['design_state_data']['hyperparameters']['parallelism'], 
    cv=d['design_state_data']['hyperparameters']['num_of_folds'], 
    scoring='r2', 
    verbose=2
    )


grid_search.fit(X_train, y_train)


# Report the winning parameter combination and its cross-validated score.
print("Best Parameters:", grid_search.best_params_)
print("Best R2 Score:", grid_search.best_score_)


Fitting 6 folds for each of 243 candidates, totalling 1458 fits
Best Parameters: {'regressor__activation': 'relu', 'regressor__hidden_layer_sizes': (50, 50), 'regressor__learning_rate_init': 0.1, 'regressor__max_iter': 200, 'regressor__solver': 'adam'}
Best R2 Score: 0.9424233275806858


In [267]:
# Display the fitted search object.
grid_search

In [268]:
# Keep the best pipeline found by the grid search.
best_model = grid_search.best_estimator_

In [269]:
# Predict the held-out test split with the best pipeline.
y_pred = best_model.predict(X_test)

In [270]:
# Test-set regression metrics for the best pipeline; RMSE is derived from MSE.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

In [271]:
# Print the test-set metrics computed in the previous cell.
print("Mean Squared Error (MSE) -->", mse)
print("Mean Absolute Error (MAE) -->", mae)
print("Root Mean Squared Error (RMSE) -->", rmse)
print("R^2 (Coefficient of Determination) -->", r2)

Mean Squared Error (MSE) --> 0.06407782182384301
Mean Absolute Error (MAE) --> 0.17571499819128858
Root Mean Squared Error (RMSE) --> 0.2531359749696653
R^2 (Coefficient of Determination) --> 0.8991941134411391
