In [110]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [111]:
!pip install striprtf

Defaulting to user installation because normal site-packages is not writeable


In [112]:
#reading rich text file
import json
from striprtf.striprtf import rtf_to_text

# Open the file and read the content
with open("algoparams_from_ui.json.rtf", "r") as file:
    rtf_content = file.read()

# Convert RTF to plain text
plain_text = rtf_to_text(rtf_content)

# Try to parse the resulting text as JSON
try:
    data = json.loads(plain_text)
except json.JSONDecodeError:
    print("The text could not be parsed as JSON.")

In [113]:
#Reading the target key
data['design_state_data']["target"]

{'prediction_type': 'Regression',
 'target': 'petal_width',
 'type': 'regression',
 'partitioning': True}

In [114]:
#read the data
df=pd.read_csv("iris.csv")

In [115]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [116]:
#Handling features
data['design_state_data']['feature_handling']

{'sepal_length': {'feature_name': 'sepal_length',
  'is_selected': True,
  'feature_variable_type': 'numerical',
  'feature_details': {'numerical_handling': 'Keep as regular numerical feature',
   'rescaling': 'No rescaling',
   'make_derived_feats': False,
   'missing_values': 'Impute',
   'impute_with': 'Average of values',
   'impute_value': 0}},
 'sepal_width': {'feature_name': 'sepal_width',
  'is_selected': True,
  'feature_variable_type': 'numerical',
  'feature_details': {'numerical_handling': 'Keep as regular numerical feature',
   'rescaling': 'No rescaling',
   'make_derived_feats': False,
   'missing_values': 'Impute',
   'impute_with': 'custom',
   'impute_value': -1}},
 'petal_length': {'feature_name': 'petal_length',
  'is_selected': True,
  'feature_variable_type': 'numerical',
  'feature_details': {'numerical_handling': 'Keep as regular numerical feature',
   'rescaling': 'No rescaling',
   'make_derived_feats': False,
   'missing_values': 'Impute',
   'impute_with': '

In [117]:
#Handling missing values on sepal_length column
df["sepal_length"]=df["sepal_length"].fillna(data['design_state_data']['feature_handling']['sepal_length']['feature_details']['impute_value'])

In [118]:
#Handling missing values on sepal_width column
df["sepal_width"]=df["sepal_width"].fillna(data['design_state_data']['feature_handling']['sepal_width']['feature_details']['impute_value'])

In [119]:
#Handling missing values on petal_length column
df["petal_length"]=df["petal_length"].fillna(data['design_state_data']['feature_handling']['petal_length']['feature_details']['impute_value'])

In [120]:
#Handling missing values on petal_length column
df["petal_width"]=df["petal_width"].fillna(data['design_state_data']['feature_handling']['petal_width']['feature_details']['impute_value'])

In [121]:
feature_reduction_params=data['design_state_data']['feature_reduction']
feature_reduction_params

{'feature_reduction_method': 'Tree-based',
 'num_of_features_to_keep': '4',
 'num_of_trees': '5',
 'depth_of_trees': '6'}

In [122]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()



# Encode the 'species' column
df['species'] = label_encoder.fit_transform(df['species'])
# Load the Iris dataset
# Separate features and target variable
X = df.drop('petal_width', axis=1)
y = df['petal_width']

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3)

# Create a Random Forest Classifier
rf_classifier = RandomForestRegressor(n_estimators=int(feature_reduction_params['num_of_trees']),max_depth=int(feature_reduction_params['depth_of_trees']))

# Fit the classifier on the data
rf_classifier.fit(X_train, y_train)

# Get the feature importances
feature_importances = rf_classifier.feature_importances_

# Create a DataFrame to store feature importances
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})

# Sort the DataFrame in descending order of importance
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Print the feature importances
print("Feature Importances:")
print(feature_importance_df)

# Select the top n features (e.g., n=2) based on importance
n_top_features = 2
selected_features = feature_importance_df.head(n_top_features)['Feature'].tolist()

# Create a new DataFrame with only the selected features and target variable
reduced_df = df[selected_features + ['petal_width']]

important_features=reduced_df.columns


Feature Importances:
        Feature  Importance
2  petal_length    0.544372
3       species    0.421926
1   sepal_width    0.020962
0  sepal_length    0.012740


In [123]:
#fetching only regression models
regression_model_names=[names for names in data['design_state_data']['algorithms'].keys() if "regressor" in names.lower()]

In [124]:
regression_model_names

['RandomForestRegressor', 'GBTRegressor', 'DecisionTreeRegressor']

In [125]:
data['design_state_data']['algorithms']["GBTRegressor"]['max_stepsize']+1

1.5

In [126]:
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

rf_param_grid = {
    'n_estimators': range(data['design_state_data']['algorithms']["RandomForestRegressor"]['min_trees'], data['design_state_data']['algorithms']["RandomForestRegressor"]['max_trees']+1),  # This represents the number of trees
    'max_depth': range(data['design_state_data']['algorithms']["RandomForestRegressor"]['min_depth'], data['design_state_data']['algorithms']["RandomForestRegressor"]['max_depth']+1),
    'min_samples_leaf': range(data['design_state_data']['algorithms']["RandomForestRegressor"]['min_samples_per_leaf_min_value'], data['design_state_data']['algorithms']["RandomForestRegressor"]['min_samples_per_leaf_max_value']+1)}

gbt_param_grid = {
    'n_estimators': range(data['design_state_data']['algorithms']["GBTRegressor"]['num_of_BoostingStages'][0], data['design_state_data']['algorithms']["GBTRegressor"]['num_of_BoostingStages'][1]),  # This represents the number of boosting stages
    'max_depth': range(data['design_state_data']['algorithms']["GBTRegressor"]['min_depth'], data['design_state_data']['algorithms']["GBTRegressor"]['max_depth']+1),
    'min_samples_leaf': range(data['design_state_data']['algorithms']["GBTRegressor"]['min_subsample'], data['design_state_data']['algorithms']["GBTRegressor"]['max_subsample']+1),
    'learning_rate': [data['design_state_data']['algorithms']["GBTRegressor"]['min_stepsize'],data['design_state_data']['algorithms']["GBTRegressor"]['max_stepsize']],  # This represents the step size
}

dt_param_grid = {
    'max_depth': range(data['design_state_data']['algorithms']["DecisionTreeRegressor"]['min_depth'], data['design_state_data']['algorithms']["DecisionTreeRegressor"]['max_depth']+1),
    'min_samples_leaf': data['design_state_data']['algorithms']["DecisionTreeRegressor"]['min_samples_per_leaf'],
    'criterion': ['friedman_mse']  # This represents the function to measure the quality of a split
}


hyperparameters=data['design_state_data']['hyperparameters']

rf=RandomForestRegressor()
kfold = KFold(n_splits=hyperparameters['num_of_folds'], shuffle=hyperparameters["shuffle_grid"], random_state=hyperparameters['random_state'])
grid_search_rf = GridSearchCV(estimator=rf, param_grid=rf_param_grid, cv=hyperparameters['max_iterations'], n_jobs=hyperparameters['parallelism'])

In [127]:
gb=GradientBoostingRegressor(criterion='friedman_mse')
kfold = KFold(n_splits=hyperparameters['num_of_folds'], shuffle=hyperparameters["shuffle_grid"], random_state=hyperparameters['random_state'])
grid_search_gb = GridSearchCV(estimator=gb, param_grid=gbt_param_grid, cv=kfold, n_jobs=hyperparameters['parallelism'])

In [128]:
dt=DecisionTreeRegressor(criterion='friedman_mse')
kfold = KFold(n_splits=hyperparameters['num_of_folds'], shuffle=hyperparameters["shuffle_grid"], random_state=hyperparameters['random_state'])
grid_search_dt = GridSearchCV(estimator=dt, param_grid=dt_param_grid, cv=kfold, n_jobs=hyperparameters['parallelism'])

In [129]:
grid_search_rf.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search_rf.best_params_
print(best_params)

{'max_depth': 21, 'min_samples_leaf': 5, 'n_estimators': 10}


In [130]:
grid_search_gb.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search_gb.best_params_
print(best_params)

{'learning_rate': 0.1, 'max_depth': 5, 'min_samples_leaf': 2, 'n_estimators': 70}


In [131]:
grid_search_dt.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search_dt.best_params_
print(best_params)

{'criterion': 'friedman_mse', 'max_depth': 5, 'min_samples_leaf': 6}


In [132]:
#prediction on decsion tree
grid_search_dt.predict(X_test)

array([0.15714286, 1.31666667, 1.1125    , 0.34444444, 1.88333333,
       0.2       , 0.34444444, 1.45714286, 0.2625    , 2.12857143,
       1.37142857, 0.34444444, 1.1125    , 1.1125    , 2.03333333,
       2.12857143, 0.2       , 1.71428571, 2.03333333, 2.03333333,
       0.2       , 0.2625    , 1.71428571, 2.12857143, 1.1125    ,
       0.2625    , 0.2       , 1.88333333, 1.1125    , 1.45714286,
       1.31666667, 1.1125    , 0.2       , 1.1125    , 1.45714286,
       1.37142857, 2.03333333, 1.1125    , 2.12857143, 1.37142857,
       0.34444444, 0.2       , 2.03333333, 1.88333333, 1.71428571])

In [133]:
#prediction on gradient boosting
grid_search_gb.predict(X_test)

array([0.15202951, 1.26570348, 1.2659667 , 0.50624662, 1.85295993,
       0.24285921, 0.39746648, 1.4612412 , 0.27740085, 2.24777129,
       1.32164539, 0.33338882, 1.22038031, 1.2659667 , 1.93721318,
       2.01373495, 0.19477688, 1.71547755, 1.88995112, 1.84764112,
       0.21250224, 0.33964822, 1.71547755, 2.17806779, 1.22629729,
       0.20894693, 0.21904717, 2.03947867, 1.08228383, 1.42976716,
       1.27576046, 1.29346507, 0.16739521, 1.14542791, 1.40660104,
       1.60007716, 1.80316836, 1.27322021, 2.29573895, 1.33655987,
       0.34395926, 0.27252348, 1.86067082, 2.12671863, 1.7220968 ])

In [134]:
#prediction on random forest
grid_search_rf.predict(X_test)

array([0.18263095, 1.25666667, 1.1652381 , 0.29497222, 1.79772423,
       0.20280739, 0.2967904 , 1.46283304, 0.26038004, 1.89067878,
       1.32771055, 0.30561797, 1.2042381 , 1.1652381 , 2.06888095,
       2.05888095, 0.20462557, 1.88562698, 2.07915873, 1.96848413,
       0.22264144, 0.24161416, 1.88562698, 2.08706277, 1.2042381 ,
       0.23843235, 0.20742857, 1.78503376, 1.1652381 , 1.45637272,
       1.46444071, 1.1882381 , 0.19959524, 1.1652381 , 1.40150344,
       1.42726611, 2.07915873, 1.1652381 , 1.92721724, 1.35348016,
       0.25923268, 0.20462557, 1.95265079, 1.77336709, 1.80990555])