In [2]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import json

In [3]:
csv_data = pd.read_csv("C://Users//91814//Desktop//atharva//iris.csv")

In [4]:
file_path='C://Users//91814//Desktop//atharva//algoparams_from_ui.json.json'
with open(file_path, 'r') as file:
    algoparams_from_ui = json.load(file)

In [5]:
algoparams_from_ui

{'\ufeff{\n    "session_name": "test",\n    "session_description": "test",\n    "design_state_data": {\n\n      "session_info" : {\n        "project_id": "1",\n        "experiment_id": "kkkk-11",\n        "dataset":"iris_modified.csv",\n        "session_name": "test",\n        "session_description": "test"\n        },\n\n      "target": {\n        "prediction_type": "Regression",\n        "target": "petal_width",\n        "type":"regression",\n        "partitioning": true\n      },\n      "train": {\n        "policy": "Split the dataset",\n        "time_variable": "sepal_length",\n        "sampling_method": "No sampling(whole data)",\n        "split": "Randomly",\n        "k_fold": false,\n        "train_ratio": 0,\n        "random_seed": 0\n      },\n      "metrics": {\n        "optomize_model_hyperparameters_for": "AUC",\n        "optimize_threshold_for": "F1 Score",\n        "compute_lift_at": 0,\n        "cost_matrix_gain_for_true_prediction_true_result": 1,\n        "cost_matrix_g

In [None]:
# 1) Read the target and type of regression to be run
target = algoparams_from_ui['design_state_data']['target']['target']
prediction_type = algoparams_from_ui['design_state_data']['target']['prediction_type']

# 2) Read the features and apply missing value imputation
feature_handling = algoparams_from_ui['design_state_data']['feature_handling']
selected_features = [feature_name for feature_name, details in feature_handling.items() if details['is_selected']]
numeric_features = [feature_name for feature_name in selected_features if feature_handling[feature_name]['feature_variable_type'] == 'numerical']
categorical_features = [feature_name for feature_name in selected_features if feature_handling[feature_name]['feature_variable_type'] == 'text']

# Construct the ColumnTransformer for feature handling
transformer_steps = []
for feature_name in selected_features:
    details = feature_handling[feature_name]['feature_details']
    if details['missing_values'] == 'Impute':
        imputer = SimpleImputer(strategy='mean' if details['impute_with'] == 'Average of values' else 'constant', fill_value=details['impute_value'])
        transformer_steps.append((feature_name + '_impute', imputer, [feature_name]))

preprocessor = ColumnTransformer(transformer_steps)

In [None]:
# 3) Compute feature reduction based on input
feature_reduction_method = algoparams_from_ui['design_state_data']['feature_reduction']['feature_reduction_method']
num_of_features_to_keep = int(algoparams_from_ui['design_state_data']['feature_reduction']['num_of_features_to_keep'])

if feature_reduction_method == 'Correlation with target':
    feature_reduction_model = SelectKBest(k=num_of_features_to_keep)
elif feature_reduction_method == 'Principal Component Analysis':
    feature_reduction_model = PCA(n_components=num_of_features_to_keep)
else:
    feature_reduction_model = None

# 4) Parse the JSON and make the model objects
models_to_run = [model_name for model_name, details in algoparams_from_ui['design_state_data']['algorithms'].items() if details['is_selected']]
model_objects = {}

if prediction_type == 'Regression':
    for model_name in models_to_run:
        model_details = algoparams_from_ui['design_state_data']['algorithms'][model_name]
        if model_name == 'LinearRegression':
            model = LinearRegression()
            model_objects[model_name] = model
        elif model_name == 'LogisticRegression':
            model = LogisticRegression(max_iter=model_details['max_iter'], C=(model_details['min_regparam'] + model_details['max_regparam']) / 2)
            model_objects[model_name] = model
        elif model_name == 'RandomForestRegressor':
            model = RandomForestRegressor(n_estimators=(model_details['min_trees'] + model_details['max_trees']) / 2,
                                           max_depth=(model_details['min_depth'] + model_details['max_depth']) / 2,
                                           min_samples_leaf=(model_details['min_samples_per_leaf_min_value'] + model_details['min_samples_per_leaf_max_value']) / 2)
            model_objects[model_name] = model
        
# 5) Run the fit and predict on each model with hyperparameter tuning
results = []

for model_name, model in model_objects.items():
    pipeline_steps = []
    if feature_reduction_model:
        pipeline_steps.append(('feature_reduction', feature_reduction_model))
    
    pipeline_steps.append(('preprocessor', preprocessor))
    pipeline_steps.append(('model', model))
    
    pipeline = Pipeline(pipeline_steps)
    
    X = csv_data[selected_features]
    y = csv_data[target]
    
    
    grid_search = GridSearchCV(pipeline, param_grid, scoring='neg_mean_squared_error', cv=5)
    grid_search.fit(X, y)
    
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X)
    
    rmse = mean_squared_error(y, y_pred, squared=False)
    r2 = r2_score(y, y_pred)
    
    results.append({
        'model_name': model_name,
        'rmse': rmse,
        'r2': r2
    })

# 6) Print results
for result in results:
    print(f"Model: {result['model_name']}")
    print(f"RMSE: {result['rmse']:.4f}")
    print(f"R2 Score: {result['r2']:.4f}")
     print()