In [None]:
# !pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading bayesian_optimization-1.4.3-py3-none-any.whl (18 kB)
Collecting colorama>=0.4.6 (from bayesian-optimization)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama, bayesian-optimization
Successfully installed bayesian-optimization-1.4.3 colorama-0.4.6


In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.metrics import r2_score, accuracy_score
import time

In [None]:
# data = pd.read_csv('Zboson_decay_data.csv')

# print("Column Names with Indexes:")
# for idx, col_name in enumerate(data.columns):
#     print(f"Index {idx}: {col_name}")
# target_col_idx = int(input("Enter the index of the target variable column: "))

# X = data.drop(data.columns[target_col_idx], axis=1)
# y = data.iloc[:, target_col_idx]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=11 )

# print()
# print("Target Variable (y):", data.columns[target_col_idx])

In [3]:
import numpy as np
import pandas as pd

# Seed NumPy's global generator so the synthetic dataset is reproducible.
np.random.seed(42)

num_samples = 50000

# Ten uniformly distributed features: even-numbered ones span [0, 100),
# odd-numbered ones span [0, 1000). Columns are drawn in order 1..10.
features = {}
for i in range(1, 11):
    feature_name = f'feature_{i}'
    upper = 100 if i % 2 == 0 else 1000
    features[feature_name] = np.random.uniform(0, upper, num_samples)

# Assemble the feature table and append a uniformly random target in
# [100, 1000); the target is drawn after all features.
df = pd.DataFrame(features).assign(
    target=np.random.uniform(100, 1000, num_samples)
)

# Persist the dataset (without the index column) for the loading cell.
df.to_csv('regression_data.csv', index=False)



In [6]:
# Load the dataset and keep only the int64/float64 columns.
data = pd.read_csv('regression_data.csv')
num_data = data.select_dtypes(include=['int64', 'float64'])

# List the numeric columns so the target can be chosen by position.
print("Column Names with Indexes:")
for idx, col_name in enumerate(num_data.columns):
    print(f"Index {idx}: {col_name}")
target_col_idx = int(input("Enter the index of the target variable column: "))

# The index refers to num_data (the list shown above), not to data: the two
# differ as soon as data contains a non-numeric column, so the name must be
# resolved against num_data.
target_col_name = num_data.columns[target_col_idx]

X = num_data.drop(target_col_name, axis=1)
y = num_data.iloc[:, target_col_idx]

# Hold out 25% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=11)

print()
print("Target Variable (y):", target_col_name)

Column Names with Indexes:
Index 0: feature_1
Index 1: feature_2
Index 2: feature_3
Index 3: feature_4
Index 4: feature_5
Index 5: feature_6
Index 6: feature_7
Index 7: feature_8
Index 8: feature_9
Index 9: feature_10
Index 10: target
Enter the index of the target variable column: 10

Target Variable (y): target


In [10]:
def algorithm_type(x_var, y_var):
    """Decide whether the task is 'Classification' or 'Regression'.

    The decision is based on the dtype of ``y_var`` and on scikit-learn's
    ``type_of_target``:

    * object dtype or a binary target   -> 'Classification'
    * continuous target                 -> 'Regression'
    * other int64/float64 or multiclass -> fit a default random-forest
      classifier and regressor on a split of the data and pick whichever
      reports the higher ``score`` on the held-out part
    * anything else                     -> ask the user

    Parameters
    ----------
    x_var : feature table passed to ``train_test_split`` and the forests.
    y_var : target; must expose ``.dtype`` (e.g. a pandas Series).

    Returns
    -------
    str
        Either 'Classification' or 'Regression'.

    Raises
    ------
    ValueError
        If the user is prompted and answers with anything other than
        'r' or 'c'.
    """
    from sklearn.utils.multiclass import type_of_target

    dtype = y_var.dtype
    target_type = type_of_target(y_var)

    if dtype == 'object' or target_type == 'binary':
        problem_type = 'Classification'
        print('Object or Binary target variable detected !')

    elif target_type == 'continuous':
        problem_type = 'Regression'
        print('Continuous target variable detected !')

    elif dtype in ['int64', 'float64'] or target_type in ['multiclass']:
        # Ambiguous numeric target: let the data decide. Split the arguments
        # that were passed in (not the notebook-level X / y) so the function
        # works for any caller.
        x_fit, x_eval, y_fit, y_eval = train_test_split(
            x_var, y_var, test_size=0.23, random_state=42
        )
        rf_classifier = RandomForestClassifier()
        rf_regressor = RandomForestRegressor()
        rf_classifier.fit(x_fit, y_fit)
        rf_regressor.fit(x_fit, y_fit)
        classifier_score = rf_classifier.score(x_eval, y_eval)
        regressor_score = rf_regressor.score(x_eval, y_eval)
        if classifier_score > regressor_score:
            problem_type = 'Classification'
        else:
            problem_type = 'Regression'
        print(f'CS:{classifier_score} , RS:{regressor_score}')

    else:
        user_input = input('''Specify problem type manually -
        (r for Regression / c for Classification) : ''')

        if user_input == 'r':
            problem_type = 'Regression'
        elif user_input == 'c':
            problem_type = 'Classification'
        else:
            # Fail with a clear error instead of reaching the return below
            # with problem_type unbound.
            raise ValueError('Select either Regression or Classification !')

    return problem_type


def model_analysis(ptype):
    """Return the candidate estimators for the given problem type.

    ``ptype`` is matched case-insensitively: 'regression' / 'r' selects the
    regressors, 'classification' / 'c' selects the classifiers.

    Returns a list of ``(class name, default-constructed estimator)`` pairs
    in the order decision tree, random forest, gradient boosting.

    Raises ValueError for any other ``ptype``.
    """
    kind = ptype.lower()

    if kind in ('regression', 'r'):
        estimator_classes = (
            DecisionTreeRegressor,
            RandomForestRegressor,
            GradientBoostingRegressor,
        )
    elif kind in ('classification', 'c'):
        estimator_classes = (
            DecisionTreeClassifier,
            RandomForestClassifier,
            GradientBoostingClassifier,
        )
    else:
        raise ValueError('Invalid problem type specified.')

    # Each entry is labelled with the estimator's own class name.
    return [(cls.__name__, cls()) for cls in estimator_classes]


def model_selection(models, X, y, problem_type, cv=2):
    """Pick the candidate with the best mean cross-validation score.

    Parameters
    ----------
    models : iterable of ``(name, estimator)`` pairs, as produced by
        ``model_analysis``.
    X, y : features and target handed to ``cross_val_score``.
    problem_type : 'Regression' / 'Classification'; matched
        case-insensitively and also accepted as 'r' / 'c', mirroring
        ``model_analysis``.
    cv : number of folds (or any value ``cross_val_score`` accepts as
        ``cv``). Defaults to 2, the value previously hard-coded.

    Returns
    -------
    The estimator object with the highest mean score (R^2 for regression,
    accuracy for classification), or ``None`` when ``models`` is empty.

    Raises
    ------
    ValueError
        If ``problem_type`` is not a recognised spelling.
    """
    kind = str(problem_type).lower()
    if kind in ('regression', 'r'):
        scoring = 'r2'
    elif kind in ('classification', 'c'):
        scoring = 'accuracy'
    else:
        raise ValueError('Invalid problem type specified.')

    cv_start = time.perf_counter()

    best_model = None
    # Start below every possible score for both problem types: R^2 can be
    # negative, and an accuracy of exactly 0 must still select a model.
    best_score = float('-inf')

    for _name, model in models:
        mean_score = cross_val_score(model, X, y, scoring=scoring, cv=cv).mean()
        if mean_score > best_score:
            best_score = mean_score
            best_model = model

    print(f'CV time : {time.perf_counter() - cv_start}')
    return best_model


# Infer the task type from the target, then build the matching candidates.
algorithm = algorithm_type(X, y)
print(f'Algorithm selected : {algorithm}')

models = model_analysis(algorithm)
print(models)

# Rank the candidates on the training split only, so the held-out rows
# (X_test / y_test) do not influence which model is chosen.
best_model = model_selection(models, X_train, y_train, algorithm)
print(f'Model selected : {best_model}')


def optimize_model(algorithm, model):
    """Print which optimisation routine applies to ``model``.

    Echoes ``algorithm`` and ``model``, then prints a message for the first
    estimator type that ``model`` is an instance of. Nothing is tuned here.
    If ``algorithm`` is neither a regression nor a classification label,
    'No model selected !!' is printed instead. Always returns ``None``.
    """
    print(algorithm)
    print(model)

    if algorithm in ('Regression', 'regression'):
        labelled_types = (
            (DecisionTreeRegressor, 'Optimize DTR !!'),
            (RandomForestRegressor, 'Optimize RFR !!'),
            (GradientBoostingRegressor, 'Optimize GBR !!'),
        )
    elif algorithm in ('Classification', 'classification'):
        labelled_types = (
            (DecisionTreeClassifier, 'Optimize DTC !!'),
            (RandomForestClassifier, 'Optimize RFC !!'),
            (GradientBoostingClassifier, 'Optimize GBC !!'),
        )
    else:
        print('No model selected !!')
        return

    # Report only the first matching estimator type; stay silent if none match.
    for estimator_type, message in labelled_types:
        if isinstance(model, estimator_type):
            print(message)
            break

# Report which tuning routine matches the selected model; the two-argument
# optimize_model defined in this cell only prints, it does not tune anything.
optimize_model(algorithm, best_model)

Continuous target variable detected !
Algorithm selected : Regression
[('DecisionTreeRegressor', DecisionTreeRegressor()), ('RandomForestRegressor', RandomForestRegressor()), ('GradientBoostingRegressor', GradientBoostingRegressor())]


KeyboardInterrupt: 

In [None]:
# Random Forest Regressor Optimization
from bayes_opt import BayesianOptimization
def optimize_rfr(X_train, X_test, y_train, y_test, init_points=10, n_iter=10, random_state=42):
    """Tune a RandomForestRegressor with Bayesian optimisation.

    For each hyper-parameter proposal a forest is fitted on
    ``X_train``/``y_train`` and scored with R^2 on ``X_test``/``y_test``;
    the optimiser maximises that score.

    Parameters
    ----------
    X_train, X_test, y_train, y_test : training and evaluation data.
    init_points : number of random exploration steps (default 10).
    n_iter : number of Bayesian optimisation steps (default 10).
    random_state : seed used for both the optimiser and every forest, so
        that repeated runs propose and score the same configurations.

    Returns
    -------
    tuple
        ``(optimizer, best_score)``: the ``BayesianOptimization`` object
        after maximisation and the best R^2 it recorded. The best
        parameters are available as ``optimizer.max['params']``.

    Notes
    -----
    The score is measured on the same ``X_test`` the search is tuned
    against, so it is an optimistic estimate of generalisation.
    """
    def optimize_rfr_inner(n_estimators, max_depth, min_samples_split, min_samples_leaf):
        # The optimiser proposes floats; the forest needs integer settings.
        rfr = RandomForestRegressor(
            n_estimators=int(n_estimators),
            max_depth=int(max_depth),
            min_samples_split=int(min_samples_split),
            min_samples_leaf=int(min_samples_leaf),
            # Seeding the forest makes the objective deterministic; without
            # it identical parameters yield different scores and the
            # optimiser would be fitting noise.
            random_state=random_state,
        )
        rfr.fit(X_train, y_train)
        y_pred = rfr.predict(X_test)
        return r2_score(y_test, y_pred)

    rfr_bounds = {
        'n_estimators': (10, 100),
        'max_depth': (1, 50),
        'min_samples_split': (2, 20),
        'min_samples_leaf': (1, 20),
    }

    bayes_rfr = BayesianOptimization(f=optimize_rfr_inner, pbounds=rfr_bounds, random_state=random_state)
    bayes_rfr.maximize(init_points=init_points, n_iter=n_iter)

    return bayes_rfr, bayes_rfr.max['target']

def optimize_model(algorithm, model, X_train, X_test, y_train, y_test):
    """Dispatch ``model`` to the tuning routine for its estimator type.

    Parameters
    ----------
    algorithm : 'Regression'/'regression' or 'Classification'/'classification'.
    model : estimator instance whose type selects the routine.
    X_train, X_test, y_train, y_test : forwarded unchanged to the routine.

    Returns
    -------
    Whatever the selected ``optimize_*`` routine returns, or ``None`` when
    neither the algorithm label nor the model type matches a routine
    ('No model selected !!' is printed in that case).

    Notes
    -----
    The ``optimize_*`` helpers are looked up only when their branch is
    taken; apart from ``optimize_rfr`` they are expected to be defined
    elsewhere -- TODO confirm they exist before relying on those branches.
    """
    print(algorithm)
    print(model)

    # Default result: an unmatched model type yields None instead of
    # reaching the return with the variable unbound.
    optimized_model = None
    matched = True

    if algorithm in ['Regression', 'regression']:
        if isinstance(model, DecisionTreeRegressor):
            optimized_model = optimize_dtr(X_train, X_test, y_train, y_test)
        elif isinstance(model, RandomForestRegressor):
            optimized_model = optimize_rfr(X_train, X_test, y_train, y_test)
        elif isinstance(model, GradientBoostingRegressor):
            optimized_model = optimize_gbr(X_train, X_test, y_train, y_test)
        else:
            matched = False

    elif algorithm in ['Classification', 'classification']:
        if isinstance(model, DecisionTreeClassifier):
            optimized_model = optimize_dtc(X_train, X_test, y_train, y_test)
        elif isinstance(model, RandomForestClassifier):
            optimized_model = optimize_rfc(X_train, X_test, y_train, y_test)
        elif isinstance(model, GradientBoostingClassifier):
            optimized_model = optimize_gbc(X_train, X_test, y_train, y_test)
        else:
            matched = False

    else:
        matched = False

    if not matched:
        print('No model selected !!')

    return optimized_model

# optimize_model returns None when no tuning routine applies; unpacking None
# into two names would raise TypeError, so handle that case explicitly.
optimization_result = optimize_model(algorithm, best_model, X_train, X_test, y_train, y_test)
if optimization_result is None:
    final_model, performance = None, None
else:
    final_model, performance = optimization_result
print(f"Model : {best_model}")
print(f"Performance : {performance}")

Regression
RandomForestRegressor()
|   iter    |  target   | max_depth | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [0m1        [0m | [0m0.9897   [0m | [0m19.35    [0m | [0m19.06    [0m | [0m15.18    [0m | [0m63.88    [0m |
| [95m2        [0m | [95m0.9999   [0m | [95m8.645    [0m | [95m3.964    [0m | [95m3.046    [0m | [95m87.96    [0m |
| [0m3        [0m | [0m0.9976   [0m | [0m30.45    [0m | [0m14.45    [0m | [0m2.371    [0m | [0m97.29    [0m |
| [0m4        [0m | [0m0.9997   [0m | [0m41.79    [0m | [0m5.034    [0m | [0m5.273    [0m | [0m26.51    [0m |
| [0m5        [0m | [0m0.9989   [0m | [0m15.91    [0m | [0m10.97    [0m | [0m9.775    [0m | [0m36.21    [0m |
| [0m6        [0m | [0m0.9999   [0m | [0m30.98    [0m | [0m3.65     [0m | [0m7.259    [0m | [0m42.97    [0m |
| [0m7        [0m | [0m0.9971   [0m | [0m23.35    [0m | [0m15.92    [0m