In [1]:
!pip install striprtf

Collecting striprtf
  Downloading striprtf-0.0.26-py3-none-any.whl (6.9 kB)
Installing collected packages: striprtf
Successfully installed striprtf-0.0.26


In [2]:
# Importing required libraries
from striprtf.striprtf import rtf_to_text
import json
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression, f_classif
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier, ExtraTreesRegressor, ExtraTreesClassifier
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor, LogisticRegression, RidgeClassifier, Lasso, SGDClassifier
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import LabelBinarizer

In [3]:
# Read the RTF-wrapped JSON configuration and convert it to plain text
rtf_file = '/content/algoparams_from_ui.json.rtf'
try:
    with open(rtf_file, 'r') as file:  # Open the file in text mode
        rtf_content = file.read()  # Read the content as a string
except FileNotFoundError:
    # Every later cell needs `plain_text_content`; report the problem and stop
    # here instead of failing further down with an unrelated NameError.
    print("File not found.")
    raise
# Strip the RTF markup once the file handle has been released
plain_text_content = rtf_to_text(rtf_content)

In [4]:
# Load the JSON data
try:
    json_data = json.loads(plain_text_content)
except json.JSONDecodeError as e:
    # `json_data` drives every later cell, so surface the parse error and stop
    # instead of continuing with the name undefined.
    print("Error parsing JSON:", e)
    raise

In [5]:
# Pull the prediction target and the task type out of the configuration
target_config = json_data["design_state_data"]["target"]
target = target_config["target"]
regression_type = target_config["prediction_type"]
# Echo both values so the reader can confirm what will be modelled
print("Target:", target)
print("Regression Type:", regression_type)

Target: petal_width
Regression Type: Regression


In [6]:
# Location of the dataset (CSV), relative to the notebook's working directory
file_path = 'iris.csv'

# Load the CSV contents into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

In [7]:
# Apply feature handling
feature_handling = json_data["design_state_data"]["feature_handling"]
# Walk every configured feature and impute its missing values as the JSON requests
for feature_name, feature_details in feature_handling.items():
    details = feature_details["feature_details"]
    if details.get("missing_values") != "Impute":
        continue  # nothing to impute for this feature
    impute_with = details["impute_with"]
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # `df[feature_name]`: the in-place form works on an intermediate object and
    # does not update `df` under pandas Copy-on-Write.
    if impute_with == "Average of values":
        df[feature_name] = df[feature_name].fillna(df[feature_name].mean())
    elif impute_with == "custom":
        # The custom value is only needed (and only read) for this strategy
        df[feature_name] = df[feature_name].fillna(details["impute_value"])

In [8]:
# Find the feature reduction method from JSON
# Two layouts are accepted:
#   * nested: one sub-dict per method name, each with an optional "is_selected" flag
#   * flat:   "feature_reduction_method" plus its parameters directly in the section
feature_reduction_method = None  # stays None when the JSON has no "feature_reduction" section
if "feature_reduction" in json_data['design_state_data']:
    reduction_config = json_data['design_state_data']['feature_reduction']
    nested_method_names = (
        "No Reduction",
        "Tree-based",
        "Correlation with target",
        "Principal Component Analysis",
    )
    # First nested method that is present and selected ("is_selected" defaults to True)
    selected_method = next(
        (
            name for name in nested_method_names
            if name in reduction_config and reduction_config[name].get("is_selected", True)
        ),
        None,
    )
    if selected_method is not None:
        method_config = reduction_config[selected_method]
        feature_reduction_method = selected_method
    else:
        # Flat layout: the section itself carries the method name and parameters
        method_config = reduction_config
        feature_reduction_method = reduction_config["feature_reduction_method"]
    num_of_features_to_keep = int(method_config["num_of_features_to_keep"])
    if feature_reduction_method == "Tree-based":
        # Forest size/depth are only meaningful (and only required) for this method
        num_of_trees = int(method_config["num_of_trees"])
        depth_of_trees = int(method_config["depth_of_trees"])
    print("Feature Reduction Method:", feature_reduction_method)
else:
    print("Feature reduction not specified")

Feature Reduction Method: Tree-based


In [9]:
# Apply feature reduction method
def _split_features_and_target(frame, task_type, target_column):
    """Return ``(features, y)`` for the given task type.

    ``features`` is ``frame`` without the label/target columns and ``y`` is the
    name of the column to predict. Classification always predicts the
    hard-coded ``'species'`` column; regression predicts ``target_column`` and
    also drops ``'species'`` from the features.

    Raises:
        ValueError: if ``task_type`` is neither 'Regression' nor 'Classification'.
    """
    if task_type == "Classification":
        return frame.drop(columns=['species']), 'species'
    if task_type == "Regression":
        return frame.drop(columns=[target_column, 'species']), target_column
    raise ValueError("Invalid regression_type. Please specify 'Regression' or 'Classification'.")


# Every branch leaves `X` holding the retained feature columns plus the target
# column `y`; later cells split `X` and drop `y` to obtain the model inputs.
if feature_reduction_method == "No Reduction":
    # No feature reduction: keep every feature column together with the target
    features, y = _split_features_and_target(df, regression_type, target)
    X = df[features.columns.tolist() + [y]]
elif feature_reduction_method == "Tree-based":
    features, y = _split_features_and_target(df, regression_type, target)
    # Rank the features with a random forest sized from the JSON configuration
    if regression_type == "Classification":
        feature_reduction_model = RandomForestClassifier(n_estimators=num_of_trees, max_depth=depth_of_trees)
    else:
        feature_reduction_model = RandomForestRegressor(n_estimators=num_of_trees, max_depth=depth_of_trees)
    feature_reduction_model.fit(features, df[y])
    feature_importances = feature_reduction_model.feature_importances_
    # Indices of the `num_of_features_to_keep` most important features, best first
    # (the slice simply returns every feature when fewer are available)
    top_features_indices = feature_importances.argsort()[-num_of_features_to_keep:][::-1]
    top_features = features.columns[top_features_indices]
    # Keep only the top features and the target variable
    X = df[top_features.tolist() + [y]]
elif feature_reduction_method == "Correlation with target":
    features, y = _split_features_and_target(df, regression_type, target)
    # Never ask SelectKBest for more columns than the feature frame has
    k_best = min(num_of_features_to_keep, features.shape[1])
    score_func = f_classif if regression_type == "Classification" else f_regression
    feature_reduction_model = SelectKBest(score_func=score_func, k=k_best)
    feature_reduction_model.fit(features, df[y])
    # Positions of the selected columns within `features`
    top_features_indices = feature_reduction_model.get_support(indices=True)
    top_features = features.columns[top_features_indices]
    # Keep only the top features and the target variable
    X = df[top_features.tolist() + [y]]
elif feature_reduction_method == "Principal Component Analysis":
    features, y = _split_features_and_target(df, regression_type, target)
    # PCA cannot produce more components than min(n_samples, n_features)
    n_components = min(num_of_features_to_keep, *features.shape)
    # Standardize the features before projecting them
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(features)
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(scaled_features)
    # One column per component (PC1, PC2, ...), aligned to the original rows
    top_features = pd.DataFrame(
        data=principal_components,
        columns=[f'PC{i}' for i in range(1, n_components + 1)],
        index=features.index,
    )
    # The components are new columns, so attach the target column to them
    # rather than selecting columns out of `df`
    X = pd.concat([top_features, df[[y]]], axis=1)
else:
    print("feature_reduction method not specified")


In [10]:
# Define the train_ratio and random_seed based on JSON
train_config = json_data['design_state_data']["train"]
# Parse the ratio as a float: int() would truncate any fractional value
# (e.g. 0.7) to 0 and silently replace it with the default below.
train_ratio = float(train_config["train_ratio"])
if train_ratio == 0:
    train_ratio = 0.8  # default
random_seed = train_config["random_seed"]

# Split the dataset based on the configuration
if train_config['k_fold']:
    kf = KFold(n_splits=5, shuffle=True, random_state=random_seed)
    # NOTE: each iteration overwrites the previous split, so only the last of
    # the five folds is kept as the train/test pair used downstream.
    for train_index, test_index in kf.split(X):
        train_data, test_data = X.iloc[train_index], X.iloc[test_index]
else:
    train_data, test_data = train_test_split(X, test_size=1 - train_ratio, random_state=random_seed)


In [11]:
# Names of the models available for each task type.
# These five names are offered for both regression and classification.
_shared_model_names = ["SVM", "SGD", "KNN", "extra_random_trees", "neural_network"]

regression_models = [
    "RandomForestRegressor",
    "GBTRegressor",
    "LinearRegression",
    "RidgeRegression",
    "LassoRegression",
    "ElasticNetRegression",
] + _shared_model_names

classification_models = [
    "RandomForestClassifier",
    "GBTClassifier",
    "LogisticRegression",
    "DecisionTreeClassifier",
] + _shared_model_names

In [12]:
# Lookup tables from the model names used in the JSON configuration to the
# scikit-learn estimator class implementing them. Insertion order is the order
# in which models are later instantiated and evaluated.

# Estimators for regression tasks
regression_model_map = dict(
    RandomForestRegressor=RandomForestRegressor,
    GBTRegressor=GradientBoostingRegressor,
    LinearRegression=LinearRegression,
    RidgeRegression=Ridge,
    LassoRegression=Lasso,
    ElasticNetRegression=ElasticNet,
    KNN=KNeighborsRegressor,
    SVM=SVR,
    SGD=SGDRegressor,
    extra_random_trees=ExtraTreesRegressor,
    neural_network=MLPRegressor,
)

# Estimators for classification tasks
classification_model_map = dict(
    RandomForestClassifier=RandomForestClassifier,
    GBTClassifier=GradientBoostingClassifier,
    LogisticRegression=LogisticRegression,
    DecisionTreeClassifier=DecisionTreeClassifier,
    SVM=SVC,
    SGD=SGDClassifier,
    KNN=KNeighborsClassifier,
    extra_random_trees=ExtraTreesClassifier,
    neural_network=MLPClassifier,
)

# Per-model settings (selection flag and hyperparameter bounds) from the JSON
algorithm_data = json_data['design_state_data']['algorithms']

def _as_grid(value):
    """Return ``value`` as a list so it is a valid parameter-grid entry.

    Lists/tuples are returned as lists; any other value is wrapped in a
    one-element list.
    """
    if isinstance(value, (list, tuple)):
        return list(value)
    return [value]


def _n_jobs_grid(parallelism):
    """Translate the configuration's parallelism into an ``n_jobs`` grid (0 is mapped to 1)."""
    return [1] if parallelism == 0 else [parallelism]


def generate_hyperparameters(model_name, model_config=None):
    """Build the parameter grid for ``model_name`` from its JSON settings.

    Args:
        model_name: Key of the model in the configuration, e.g.
            ``'RandomForestRegressor'``, ``'SVM'`` or ``'KNN'``.
        model_config: Settings dict for the model. When omitted, it is looked
            up as ``algorithm_data[model_name]``.

    Returns:
        dict mapping estimator parameter names to non-empty lists of candidate
        values. Min/max pairs from the configuration become two-value grids
        (the two endpoints, not a range). Scalars are wrapped in a list and
        parameters whose candidate list would be empty are left out so the
        estimator default applies.

    Raises:
        KeyError: if ``model_name`` is not in ``algorithm_data`` (when
            ``model_config`` is omitted) or a required setting is missing.
        ValueError: if no grid is defined for ``model_name``.
    """
    if model_config is None:
        model_config = algorithm_data[model_name]

    if model_name in ('RandomForestClassifier', 'RandomForestRegressor'):
        hyperparameters = {
            "n_estimators": [int(model_config["min_trees"]), int(model_config["max_trees"])],
            "max_depth": [int(model_config["min_depth"]), int(model_config["max_depth"])],
            "min_samples_leaf": [
                int(model_config["min_samples_per_leaf_min_value"]),
                int(model_config["min_samples_per_leaf_max_value"]),
            ],
            "max_features": [1.0],
            "n_jobs": _n_jobs_grid(int(model_config["parallelism"])),
        }
    elif model_name in ('GBTClassifier', 'GBTRegressor'):
        hyperparameters = {
            "n_estimators": _as_grid(model_config['num_of_BoostingStages']),
            "max_depth": [model_config["min_depth"], model_config["max_depth"]],
            "subsample": [0.1, 1.0],
            "max_features": [1.0],
        }
    elif model_name == 'LinearRegression':
        hyperparameters = {
            "n_jobs": _n_jobs_grid(int(model_config["parallelism"])),
        }
    elif model_name == 'LogisticRegression':
        hyperparameters = {
            "C": [float(model_config["min_regparam"]), float(model_config["max_regparam"])],
            "max_iter": [int(model_config["min_iter"]), int(model_config["max_iter"])],
            "n_jobs": _n_jobs_grid(int(model_config["parallelism"])),
            "l1_ratio": [float(model_config["min_elasticnet"]), float(model_config["max_elasticnet"])],
        }
    elif model_name in ('RidgeRegression', 'LassoRegression'):
        hyperparameters = {
            "alpha": [float(model_config["min_regparam"]), float(model_config["max_regparam"])],
            "max_iter": [int(model_config["min_iter"]), int(model_config["max_iter"])],
        }
    elif model_name == 'ElasticNetRegression':
        hyperparameters = {
            "alpha": [float(model_config["min_regparam"]), float(model_config["max_regparam"])],
            "l1_ratio": [float(model_config["min_elasticnet"]), float(model_config["max_elasticnet"])],
            "max_iter": [int(model_config["min_iter"]), int(model_config["max_iter"])],
        }
    elif model_name == 'DecisionTreeClassifier':
        hyperparameters = {
            "max_depth": [model_config["min_depth"], model_config["max_depth"]],
            # Gini wins whenever it is enabled; entropy is used otherwise
            "criterion": ["gini" if model_config["use_gini"] else "entropy"],
            # Optional setting: an absent/empty value is dropped below
            "min_samples_leaf": _as_grid(model_config.get("min_samples_per_leaf", [])),
            "splitter": ["best" if model_config["use_best"] else "random"],
        }
    elif model_name == 'SVM':
        hyperparameters = {
            "C": _as_grid(model_config["c_value"]),
            # All four kernels are searched when "linear_kernel" is set;
            # otherwise the list is empty and the key is dropped below.
            "kernel": ["linear", "rbf", "poly", "sigmoid"] if model_config["linear_kernel"] else [],
            "gamma": [0.001, 1.0],
            "tol": [model_config["tolerance"]],
            "max_iter": [model_config["max_iterations"]],
            # NOTE(review): the "auto" flag is passed to `shrinking`; confirm
            # it is not meant to configure `gamma` instead.
            "shrinking": [model_config["auto"]],
        }
    elif model_name == 'SGD':
        l1_ratio = 0.5 if model_config["use_l1_regularization"] == "on" else 0.0
        if model_config["use_elastic_net_regularization"]:
            penalty = "elasticnet"
        elif l1_ratio > 0.0:
            penalty = "l1"
        else:
            penalty = "l2"
        hyperparameters = {
            "loss": ['huber'],
            "penalty": [penalty],
            "alpha": _as_grid(model_config["alpha_value"]),
            "l1_ratio": [l1_ratio],
            "tol": [model_config["tolerance"]],
            "max_iter": [1, 50],
        }
    elif model_name == 'KNN':
        p_value = model_config["p_value"]
        hyperparameters = {
            "n_neighbors": _as_grid(model_config["k_value"]),
            # Without distance weighting only uniform weights are searched
            "weights": ["uniform", "distance"] if model_config["distance_weighting"] else ["uniform"],
            "algorithm": ["auto"],
            # A p_value of 0 selects the default pair of Minkowski powers
            "p": [1.0, 2.0] if p_value == 0 else _as_grid(p_value),
        }
    elif model_name == 'extra_random_trees':
        feature_sampling_statergy = model_config["feature_sampling_statergy"]
        hyperparameters = {
            "n_estimators": _as_grid(model_config["num_of_trees"]),
            # NOTE(review): "auto" is not accepted by recent scikit-learn
            # releases; confirm the intended fallback for other strategies.
            "max_features": ["sqrt", "log2"] if feature_sampling_statergy == "Square root and Logarithm" else ["auto"],
            "max_depth": _as_grid(model_config["max_depth"]),
            "min_samples_leaf": _as_grid(model_config["min_samples_per_leaf"]),
            "n_jobs": _n_jobs_grid(model_config["parallelism"]),
        }
    elif model_name == 'neural_network':
        hyperparameters = {
            "hidden_layer_sizes": _as_grid(model_config["hidden_layer_sizes"]),
            "activation": ["relu", "tanh", "logistic"],
            "solver": ['adam'],
            "max_iter": [1, 50],
            "tol": [model_config["convergence_tolerance"]],
            "early_stopping": [model_config["early_stopping"]],
            "learning_rate_init": [0.1, 2.0],
            # NOTE(review): "none" is kept from the original mapping; confirm
            # which batch size is intended when automatic batching is off.
            "batch_size": ["auto" if model_config["automatic_batching"] else "none"],
        }
    else:
        raise ValueError(f"No hyperparameter grid is defined for model '{model_name}'.")

    # A parameter grid must not contain empty candidate lists; dropping such
    # keys lets the estimator fall back to its default for that parameter.
    return {name: values for name, values in hyperparameters.items() if len(values) > 0}

In [13]:
# Create models based on the selected regression_type
if regression_type == "Regression":
    model_map = regression_model_map
elif regression_type == "Classification":
    model_map = classification_model_map
else:
    raise ValueError("Invalid regression_type. Please specify 'Regression' or 'Classification'.")
# One default-constructed estimator per candidate model name
models = {model_name: model_class() for model_name, model_class in model_map.items()}

# Define hyperparameter tuning strategy based on JSON
hyperparameter_config = json_data['design_state_data']['hyperparameters']
hyperparameter_strategy = hyperparameter_config['stratergy']

# Perform hyperparameter tuning and fit/predict on each model
for model_name, model in models.items():
    # Remove this guard to evaluate all models regardless of the is_selected parameter
    if not json_data['design_state_data']["algorithms"][model_name]["is_selected"]:
        continue
    if hyperparameter_strategy != "Grid Search":
        raise ValueError("Invalid hyperparameter tuning strategy.")

    # Define the hyperparameters to search for this model
    param_grid = generate_hyperparameters(model_name)
    jobs = int(hyperparameter_config['parallelism'])
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=KFold(n_splits=6),
        # A parallelism of 0 is not a valid worker count; run with one worker,
        # matching how the per-model n_jobs grids treat 0.
        n_jobs=jobs if jobs != 0 else 1,
    )

    train_features, train_target = train_data.drop(columns=[y]), train_data[y]
    test_features = test_data.drop(columns=[y])

    # With the default refit=True the search refits the best configuration on
    # the whole training set, so `best_estimator_` is ready for prediction.
    grid_search.fit(train_features, train_target)
    best_model = grid_search.best_estimator_
    # Make predictions on the test set
    predictions = best_model.predict(test_features)

    if regression_type == "Regression":
        actual_values = test_data[y]
        # Calculate Mean Squared Error
        mse = mean_squared_error(actual_values, predictions)
        print(f"Model: {model_name}, Mean Squared Error: {mse}")
        # Calculate R-squared (Coefficient of determination)
        r2 = r2_score(actual_values, predictions)
        print(f"Model: {model_name}, R-squared: {r2}")
        # Calculate Mean Absolute Error
        mae = mean_absolute_error(actual_values, predictions)
        print(f"Model: {model_name}, Mean Absolute Error: {mae}")
    else:
        # Classification: per-class scores (average=None) for each metric
        actual_labels = test_data[y]
        # Calculate Accuracy
        accuracy = accuracy_score(actual_labels, predictions)
        print(f"Model: {model_name}, Accuracy: {accuracy}")
        # Calculate Precision
        precision = precision_score(actual_labels, predictions, average=None)
        print(f"Model: {model_name}, Precision: {precision}")
        # Calculate Recall
        recall = recall_score(actual_labels, predictions, average=None)
        print(f"Model: {model_name}, Recall: {recall}")
        # Calculate F1 Score
        f1 = f1_score(actual_labels, predictions, average=None)
        print(f"Model: {model_name}, F1 Score: {f1}")
        # Calculate ROC AUC Score
        # Use LabelBinarizer to convert string labels to binary format; the
        # score is computed from hard predictions, not class probabilities.
        label_binarizer = LabelBinarizer()
        actual_binary = label_binarizer.fit_transform(actual_labels)
        predicted_binary = label_binarizer.transform(predictions)
        roc_auc = roc_auc_score(actual_binary, predicted_binary, average=None)
        print(f"Model: {model_name}, ROC AUC: {roc_auc}")

Model: RandomForestRegressor, Mean Squared Error: 0.049255435240653504
Model: RandomForestRegressor, R-squared: 0.8974058836895366
Model: RandomForestRegressor, Mean Absolute Error: 0.15047679850106324
