In [1]:
import json
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# read JSON file
with open('algoparams_from_ui.json') as f:
    algoparams = json.load(f)

In [2]:
print(algoparams)

{'session_name': 'test', 'session_description': 'test', 'design_state_data': {'session_info': {'project_id': '1', 'experiment_id': 'kkkk-11', 'dataset': 'iris_modified.csv', 'session_name': 'test', 'session_description': 'test'}, 'target': {'prediction_type': 'Regression', 'target': 'petal_width', 'type': 'regression', 'partitioning': True}, 'train': {'policy': 'Split the dataset', 'time_variable': 'sepal_length', 'sampling_method': 'No sampling(whole data)', 'split': 'Randomly', 'k_fold': False, 'train_ratio': 0, 'random_seed': 0}, 'metrics': {'optomize_model_hyperparameters_for': 'AUC', 'optimize_threshold_for': 'F1 Score', 'compute_lift_at': 0, 'cost_matrix_gain_for_true_prediction_true_result': 1, 'cost_matrix_gain_for_true_prediction_false_result': 0, 'cost_matrix_gain_for_false_prediction_true_result': 0, 'cost_matrix_gain_for_false_prediction_false_result': 0}, 'feature_handling': {'sepal_length': {'feature_name': 'sepal_length', 'is_selected': True, 'feature_variable_type': 'nu

In [7]:
# Step 2: Read the CSV file and apply missing value imputation
df = pd.read_csv(algoparams['design_state_data']['session_info']['dataset'])
y_name=algoparams['design_state_data']['target']['target']

In [13]:
print(algoparams['design_state_data']['algorithms']['RandomForestClassifier'])

{'model_name': 'Random Forest Classifier', 'is_selected': False, 'min_trees': 10, 'max_trees': 30, 'feature_sampling_statergy': 'Default', 'min_depth': 20, 'max_depth': 30, 'min_samples_per_leaf_min_value': 5, 'min_samples_per_leaf_max_value': 50, 'parallelism': 0}


In [None]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=10,      # number of trees in the forest
                                criterion='gini',   # splitting criterion
                                min_samples_split=2, # minimum number of samples required to split an internal node
                                min_samples_leaf=5,  # minimum number of samples required to be at a leaf node
                                min_weight_fraction_leaf=0.0,  # minimum weighted fraction of the sum total of weights required to be at a leaf node
                                max_depth=30,       # maximum depth of the tree
                                min_impurity_decrease=0.0,  # minimum impurity decrease required for a split
                                bootstrap=True,     # whether bootstrap samples are used when building trees
                                oob_score=False,    # whether to use out-of-bag samples to estimate the generalization error
                                n_jobs=-1,          # number of jobs to run in parallel (-1 means use all available processors)
                                random_state=None,  # random number generator seed
                                verbose=0,          # controls the verbosity of the tree building process
                                warm_start=False,   # whether to reuse the solution of the previous call to fit and add more estimators to the ensemble
                                class_weight=None,  # weights associated with classes in the form {class_label: weight}
                                max_samples=None)   # maximum number of samples to use when training each estimator

# set the model parameters
model.set_params(min_samples_leaf=50,   # minimum number of samples required to be at a leaf node
                 n_estimators=30,      # number of trees in the forest
                 max_depth=30,         # maximum depth of the tree
                 random_state=42)      # random number generator seed

# print the model parameters
print(model.get_params())

In [8]:
print(y_name)

petal_width


In [9]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [6]:
import json
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel, SelectKBest, chi2
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

features = algoparams['features']
missing_imputation = algoparams['missing_imputation']

if missing_imputation == 'mean':
    imputer = SimpleImputer(strategy='mean')
elif missing_imputation == 'median':
    imputer = SimpleImputer(strategy='median')
else:
    imputer = SimpleImputer(strategy='most_frequent')

df[features] = imputer.fit_transform(df[features])

# Step 3: Compute feature reduction
feature_reduction = algoparams['feature_reduction']

if feature_reduction == 'No Reduction':
    pass
elif feature_reduction == 'Corr with Target':
    target = algoparams['target']
    corr_with_target = df.corr()[target].sort_values(ascending=False)
    features = corr_with_target[corr_with_target.abs() > 0.5].index.tolist()
elif feature_reduction == 'Tree-based':
    target = algoparams['target']
    X = df[features]
    y = df[target]
    dt = DecisionTreeRegressor()
    dt.fit(X, y)
    model = SelectFromModel(dt, prefit=True)
    features = X.columns[model.get_support()].tolist()
elif feature_reduction == 'PCA':
    n_components = algoparams['pca_n_components']
    pca = PCA(n_components=n_components)
    X = df[features]
    X_pca = pca.fit_transform(X)
    df_pca = pd.DataFrame(X_pca, columns=['PCA_'+str(i+1) for i in range(n_components)])
    df = pd.concat([df, df_pca], axis=1)
    features = df_pca.columns.tolist()

# Step 4: Make the model objects
prediction_type = algoparams['prediction_type']

if prediction_type == 'regression':
    regression_type = algoparams['type']
    if regression_type == 'linear':
        model = LinearRegression()
        param_grid = {'fit_intercept': [True, False]}
    elif regression_type == 'ridge':
        model = Ridge()
        param_grid = {'alpha': [0.1, 1, 10]}
    elif regression_type == 'lasso':
        model = Lasso()
        param_grid = {'alpha': [0.1, 1, 10]}
    elif regression_type == 'random_forest':
        model = RandomForestRegressor()
        param_grid = {'n_estimators': [100, 200, 300]}
    else:
        raise ValueError('Invalid regression type specified.')
else:
    raise ValueError('Invalid prediction type specified.')

# Step 5: Run GridSearchCV for hyperparameter tuning
X = df[features]
y = df[target]
grid = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid.fit(X, y)

KeyError: 'features'