In [None]:
# Optuna with all columns
import optuna
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# Load the dataset and strip every character that is neither a word character
# nor whitespace from the column names (e.g. '[', ']', ':', '(' and ')').
df = pd.read_csv('df.csv')
df.columns = df.columns.str.replace(r'[^\w\s]', '', regex=True)

# Features are every column except the target ('theft') and 'Class'.
# NOTE(review): 'Class' is presumably a non-numeric label column -- confirm.
X = df.drop(['theft', 'Class'], axis=1)
y = df['theft']

# Encode the target labels as integers for multiclass classification
le = LabelEncoder()
y = le.fit_transform(y)

# 'balanced' class weights (inversely proportional to class frequency), kept
# both as an array ordered like np.unique(y) and as a {label: weight} mapping.
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weight_dict = dict(zip(np.unique(y), class_weights))

# Hold out 20% of the rows for evaluation. The split is stratified on the
# target so that both parts keep the class proportions of the full dataset;
# with an imbalanced target an unstratified split can leave rare classes
# under-represented (or missing) on one side.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Define an objective function for Optuna
def objective(trial):
    """Optuna objective: sample a model family plus hyperparameters and score it.

    The trial first picks one of four model families ('lgbm', 'xgboost',
    'logreg', 'rf'), then samples that family's hyperparameters, fits the model
    on the module-level ``X_train`` / ``y_train`` and evaluates it on
    ``X_test`` / ``y_test``.

    Args:
        trial: The Optuna trial object used to sample the search space.

    Returns:
        Accuracy of the fitted model on the held-out split (the study
        maximises this value).

    Raises:
        ValueError: If the sampled model name is not one of the handled families.
    """
    model_name = trial.suggest_categorical('model', ['lgbm', 'xgboost', 'logreg', 'rf'])
    n_classes = len(np.unique(y))

    # Extra keyword arguments for ``model.fit``; only some families need them.
    fit_kwargs = {}

    # Define hyperparameters for each model.
    # `suggest_float` replaces the deprecated `suggest_uniform` /
    # `suggest_loguniform`; `log=True` samples on a logarithmic scale.
    if model_name == 'lgbm':
        param = {
            'objective': 'multiclass',
            'num_class': n_classes,
            'metric': 'multi_logloss',
            'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
            'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True),
            'num_leaves': trial.suggest_int('num_leaves', 20, 300),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 100),
            'max_bin': trial.suggest_int('max_bin', 100, 500),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
            'class_weight': class_weight_dict
        }
        model = lgb.LGBMClassifier(**param)

    elif model_name == 'xgboost':
        param = {
            'objective': 'multi:softmax',
            'num_class': n_classes,
            'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'n_estimators': trial.suggest_int('n_estimators', 50, 200)
        }
        model = xgb.XGBClassifier(**param)
        # `scale_pos_weight` only applies to binary objectives, so it cannot
        # balance a multiclass problem. Weight every training row by its
        # class weight instead, mirroring what the other models do.
        fit_kwargs['sample_weight'] = np.array(
            [class_weight_dict[label] for label in y_train]
        )

    elif model_name == 'logreg':
        # NOTE(review): `multi_class` is deprecated in recent scikit-learn
        # releases -- confirm against the installed version before upgrading.
        param = {
            'solver': trial.suggest_categorical('solver', ['lbfgs', 'saga']),
            'max_iter': trial.suggest_int('max_iter', 100, 1000),
            'C': trial.suggest_float('C', 1e-5, 1e5, log=True),
            'multi_class': 'ovr',
            'class_weight': 'balanced'  # Use balanced class weights for Logistic Regression
        }
        model = LogisticRegression(**param)

    elif model_name == 'rf':
        param = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 200),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
            'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
            'class_weight': 'balanced'  # Use balanced class weights for RandomForest
        }
        model = RandomForestClassifier(**param)

    else:
        # Unreachable with the categorical above; guards against the choice
        # list and the branches drifting apart.
        raise ValueError(f"Unknown model: {model_name}")

    # Train
    model.fit(X_train, y_train, **fit_kwargs)

    # Make predictions and calculate accuracy.
    # NOTE(review): the score being optimised comes from the same hold-out
    # split that is the only evaluation data here, so the best value is an
    # optimistic estimate of generalisation.
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy

# Search the joint model / hyperparameter space for 30 trials, keeping the
# trial with the highest value returned by `objective`.
study = optuna.create_study(direction='maximize')
study.optimize(func=objective, n_trials=30)

# Report the winning trial, then only its sampled parameter values.
best_trial = study.best_trial
print(f"Best trial: {best_trial}")
print(f"Best parameters: {best_trial.params}")

In [None]:
# Without Gas columns
import optuna
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# Load the dataset
df = pd.read_csv('df.csv')

# Drop the columns related to gas. This happens before the column names are
# sanitized because the names below still contain '[', ']', ':' and '('.
df = df.drop(columns=['Gas:Facility [kW](Hourly)', 'Heating:Gas [kW](Hourly)',
                      'InteriorEquipment:Gas [kW](Hourly)', 'Water Heater:WaterSystems:Gas [kW](Hourly)'])

# Sanitize column names to avoid characters like '[' or ']': everything that
# is neither a word character nor whitespace is removed.
df.columns = df.columns.str.replace(r'[^\w\s]', '', regex=True)

# Features are every remaining column except the target ('theft') and 'Class'.
# NOTE(review): 'Class' is presumably a non-numeric label column -- confirm.
X = df.drop(['theft', 'Class'], axis=1)
y = df['theft']

# Encode the target labels as integers for multiclass classification
le = LabelEncoder()
y = le.fit_transform(y)

# 'balanced' class weights (inversely proportional to class frequency), kept
# both as an array ordered like np.unique(y) and as a {label: weight} mapping.
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weight_dict = dict(zip(np.unique(y), class_weights))

# Hold out 20% of the rows for evaluation. The split is stratified on the
# target so that both parts keep the class proportions of the full dataset;
# with an imbalanced target an unstratified split can leave rare classes
# under-represented (or missing) on one side.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Define an objective function for Optuna
def objective(trial):
    """Optuna objective: sample a model family plus hyperparameters and score it.

    The trial first picks one of four model families ('lgbm', 'xgboost',
    'logreg', 'rf'), then samples that family's hyperparameters, fits the model
    on the module-level ``X_train`` / ``y_train`` and evaluates it on
    ``X_test`` / ``y_test``.

    Args:
        trial: The Optuna trial object used to sample the search space.

    Returns:
        Accuracy of the fitted model on the held-out split (the study
        maximises this value).

    Raises:
        ValueError: If the sampled model name is not one of the handled families.
    """
    model_name = trial.suggest_categorical('model', ['lgbm', 'xgboost', 'logreg', 'rf'])
    n_classes = len(np.unique(y))

    # Extra keyword arguments for ``model.fit``; only some families need them.
    fit_kwargs = {}

    # Define hyperparameters for each model.
    # `suggest_float` replaces the deprecated `suggest_uniform` /
    # `suggest_loguniform`; `log=True` samples on a logarithmic scale.
    if model_name == 'lgbm':
        param = {
            'objective': 'multiclass',
            'num_class': n_classes,
            'metric': 'multi_logloss',
            'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
            'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True),
            'num_leaves': trial.suggest_int('num_leaves', 20, 300),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 100),
            'max_bin': trial.suggest_int('max_bin', 100, 500),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
            'class_weight': class_weight_dict
        }
        model = lgb.LGBMClassifier(**param)

    elif model_name == 'xgboost':
        param = {
            'objective': 'multi:softmax',
            'num_class': n_classes,
            'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'n_estimators': trial.suggest_int('n_estimators', 50, 200)
        }
        model = xgb.XGBClassifier(**param)
        # `scale_pos_weight` only applies to binary objectives, so it cannot
        # balance a multiclass problem. Weight every training row by its
        # class weight instead, mirroring what the other models do.
        fit_kwargs['sample_weight'] = np.array(
            [class_weight_dict[label] for label in y_train]
        )

    elif model_name == 'logreg':
        # NOTE(review): `multi_class` is deprecated in recent scikit-learn
        # releases -- confirm against the installed version before upgrading.
        param = {
            'solver': trial.suggest_categorical('solver', ['lbfgs', 'saga']),
            'max_iter': trial.suggest_int('max_iter', 100, 1000),
            'C': trial.suggest_float('C', 1e-5, 1e5, log=True),
            'multi_class': 'ovr',
            'class_weight': 'balanced'  # Use balanced class weights for Logistic Regression
        }
        model = LogisticRegression(**param)

    elif model_name == 'rf':
        param = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 200),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
            'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
            'class_weight': 'balanced'  # Use balanced class weights for RandomForest
        }
        model = RandomForestClassifier(**param)

    else:
        # Unreachable with the categorical above; guards against the choice
        # list and the branches drifting apart.
        raise ValueError(f"Unknown model: {model_name}")

    # Train the model
    model.fit(X_train, y_train, **fit_kwargs)

    # Make predictions and calculate accuracy.
    # NOTE(review): the score being optimised comes from the same hold-out
    # split that is the only evaluation data here, so the best value is an
    # optimistic estimate of generalisation.
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy

# Search the joint model / hyperparameter space for 30 trials, keeping the
# trial with the highest value returned by `objective`.
study = optuna.create_study(direction='maximize')
study.optimize(func=objective, n_trials=30)

# Report the winning trial, then only its sampled parameter values.
best_trial = study.best_trial
print(f"Best trial: {best_trial}")
print(f"Best parameters: {best_trial.params}")
