In [None]:
import optuna
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# Read the raw table, then strip every character that is neither a word
# character nor whitespace from the column names.
df = pd.read_csv('df.csv')
df.columns = df.columns.str.replace(pat=r'[^\w\s]', repl='', regex=True)

# 'theft' is the prediction target; 'theft' and 'Class' are both excluded
# from the feature matrix.
y = df['theft']
X = df.drop(columns=['theft', 'Class'])

# Replace the target labels with consecutive integer codes.
le = LabelEncoder()
y = le.fit_transform(y)

# 'balanced' class weights, computed over the full encoded target and kept
# both as an array (ordered like np.unique(y)) and as a {class: weight} map.
encoded_classes = np.unique(y)
class_weights = compute_class_weight('balanced', classes=encoded_classes, y=y)
class_weight_dict = {
    label: weight for label, weight in zip(encoded_classes, class_weights)
}

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Best hyperparameters found for each model, keyed by model name.
best_params = {}

# Names of the models that will be tuned.
models = ['lgbm', 'xgboost', 'logreg', 'rf']

def objective(trial, model_name):
    """Optuna objective: fit one candidate model and score it on the hold-out split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample this run's hyperparameters.
    model_name : str
        Which model family to tune: ``'lgbm'``, ``'xgboost'``, ``'logreg'``
        or ``'rf'``.

    Returns
    -------
    float
        Accuracy of the fitted model on ``X_test`` / ``y_test``.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.

    Notes
    -----
    Reads the module-level ``X_train``, ``X_test``, ``y_train``, ``y_test``,
    ``y``, ``class_weights`` and ``class_weight_dict``.

    NOTE(review): hyperparameters are selected on the same split that is
    named "test", so the best score reported by the study is an optimistic
    estimate. Consider tuning on a separate validation split.
    """
    # Extra keyword arguments forwarded to ``model.fit`` (only XGBoost needs any).
    fit_kwargs = {}

    if model_name == 'lgbm':
        param = {
            'objective': 'multiclass',
            'num_class': len(np.unique(y)),
            'metric': 'multi_logloss',
            'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
            # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
            'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True),
            'num_leaves': trial.suggest_int('num_leaves', 20, 300),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 100),
            'max_bin': trial.suggest_int('max_bin', 100, 500),
            # suggest_float replaces the deprecated suggest_uniform.
            'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
            'class_weight': class_weight_dict,
            # Fixed seed so a trial's score reflects its parameters, not RNG noise.
            'random_state': 42,
        }
        model = lgb.LGBMClassifier(**param)

    elif model_name == 'xgboost':
        param = {
            'objective': 'multi:softmax',
            'num_class': len(np.unique(y)),
            'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'n_estimators': trial.suggest_int('n_estimators', 50, 200),
            'random_state': 42,
        }
        model = xgb.XGBClassifier(**param)
        # scale_pos_weight only applies to binary objectives, so class
        # balancing is applied through per-row weights instead. y is
        # label-encoded to 0..K-1 and class_weights is ordered like
        # np.unique(y), so indexing by label picks each row's class weight.
        fit_kwargs['sample_weight'] = class_weights[y_train]

    elif model_name == 'logreg':
        param = {
            'solver': trial.suggest_categorical('solver', ['lbfgs', 'saga']),
            'max_iter': trial.suggest_int('max_iter', 100, 1000),
            'C': trial.suggest_float('C', 1e-5, 1e5, log=True),
            # NOTE(review): `multi_class` is deprecated in recent scikit-learn
            # releases; kept here so the one-vs-rest behaviour is unchanged.
            'multi_class': 'ovr',
            'class_weight': 'balanced',
            # Only affects stochastic solvers such as 'saga'.
            'random_state': 42,
        }
        model = LogisticRegression(**param)

    elif model_name == 'rf':
        param = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 200),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
            'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
            'class_weight': 'balanced',
            'random_state': 42,
        }
        model = RandomForestClassifier(**param)

    else:
        # Fail with a clear message instead of an UnboundLocalError on `model`.
        raise ValueError(f"Unknown model_name: {model_name!r}")

    model.fit(X_train, y_train, **fit_kwargs)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

# Run an independent 30-trial study per model, maximising the value returned
# by `objective`, and remember the winning hyperparameters.
for model_name in models:
    study = optuna.create_study(direction='maximize')

    def _run_trial(trial, _name=model_name):
        """Evaluate one trial for the model currently being tuned."""
        return objective(trial, _name)

    study.optimize(_run_trial, n_trials=30)

    tuned = study.best_trial.params
    best_params[model_name] = tuned
    print(f"Best parameters for {model_name}: {tuned}")

# Summary of the best hyperparameters found for every model.
print("\nFinal Best Parameters for Each Model:")
for name in best_params:
    print(f"{name}: {best_params[name]}")


In [None]:
import optuna
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# Read the raw table.
df = pd.read_csv('df.csv')

# Gas-related columns to exclude. They are dropped before the column names
# are sanitised, so the names below use the original spelling from the CSV.
gass_columns = [
    'Gas:Facility [kW](Hourly)',
    'Heating:Gas [kW](Hourly)',
    'InteriorEquipment:Gas [kW](Hourly)',
    'Water Heater:WaterSystems:Gas [kW](Hourly)',
]
df = df.drop(columns=gass_columns)

# Strip every character that is neither a word character nor whitespace
# from the remaining column names.
df.columns = df.columns.str.replace(pat=r'[^\w\s]', repl='', regex=True)

# 'theft' is the prediction target; 'theft' and 'Class' are both excluded
# from the feature matrix.
y = df['theft']
X = df.drop(columns=['theft', 'Class'])

# Replace the target labels with consecutive integer codes.
le = LabelEncoder()
y = le.fit_transform(y)

# 'balanced' class weights, computed over the full encoded target and kept
# both as an array (ordered like np.unique(y)) and as a {class: weight} map.
encoded_classes = np.unique(y)
class_weights = compute_class_weight('balanced', classes=encoded_classes, y=y)
class_weight_dict = {
    label: weight for label, weight in zip(encoded_classes, class_weights)
}

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Best hyperparameters found for each model, keyed by model name.
best_params = {}

# Names of the models that will be tuned.
models = ['lgbm', 'xgboost', 'logreg', 'rf']

def objective(trial, model_name):
    """Optuna objective: fit one candidate model and score it on the hold-out split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample this run's hyperparameters.
    model_name : str
        Which model family to tune: ``'lgbm'``, ``'xgboost'``, ``'logreg'``
        or ``'rf'``.

    Returns
    -------
    float
        Accuracy of the fitted model on ``X_test`` / ``y_test``.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.

    Notes
    -----
    Reads the module-level ``X_train``, ``X_test``, ``y_train``, ``y_test``,
    ``y``, ``class_weights`` and ``class_weight_dict``.

    NOTE(review): hyperparameters are selected on the same split that is
    named "test", so the best score reported by the study is an optimistic
    estimate. Consider tuning on a separate validation split.
    """
    # Extra keyword arguments forwarded to ``model.fit`` (only XGBoost needs any).
    fit_kwargs = {}

    if model_name == 'lgbm':
        param = {
            'objective': 'multiclass',
            'num_class': len(np.unique(y)),
            'metric': 'multi_logloss',
            'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
            # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
            'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True),
            'num_leaves': trial.suggest_int('num_leaves', 20, 300),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 100),
            'max_bin': trial.suggest_int('max_bin', 100, 500),
            # suggest_float replaces the deprecated suggest_uniform.
            'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
            'class_weight': class_weight_dict,
            # Fixed seed so a trial's score reflects its parameters, not RNG noise.
            'random_state': 42,
        }
        model = lgb.LGBMClassifier(**param)

    elif model_name == 'xgboost':
        param = {
            'objective': 'multi:softmax',
            'num_class': len(np.unique(y)),
            'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'n_estimators': trial.suggest_int('n_estimators', 50, 200),
            'random_state': 42,
        }
        model = xgb.XGBClassifier(**param)
        # scale_pos_weight only applies to binary objectives, so class
        # balancing is applied through per-row weights instead. y is
        # label-encoded to 0..K-1 and class_weights is ordered like
        # np.unique(y), so indexing by label picks each row's class weight.
        fit_kwargs['sample_weight'] = class_weights[y_train]

    elif model_name == 'logreg':
        param = {
            'solver': trial.suggest_categorical('solver', ['lbfgs', 'saga']),
            'max_iter': trial.suggest_int('max_iter', 100, 1000),
            'C': trial.suggest_float('C', 1e-5, 1e5, log=True),
            # NOTE(review): `multi_class` is deprecated in recent scikit-learn
            # releases; kept here so the one-vs-rest behaviour is unchanged.
            'multi_class': 'ovr',
            'class_weight': 'balanced',
            # Only affects stochastic solvers such as 'saga'.
            'random_state': 42,
        }
        model = LogisticRegression(**param)

    elif model_name == 'rf':
        param = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 200),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
            'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
            'class_weight': 'balanced',
            'random_state': 42,
        }
        model = RandomForestClassifier(**param)

    else:
        # Fail with a clear message instead of an UnboundLocalError on `model`.
        raise ValueError(f"Unknown model_name: {model_name!r}")

    model.fit(X_train, y_train, **fit_kwargs)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

# Run an independent 30-trial study per model, maximising the value returned
# by `objective`, and remember the winning hyperparameters.
for model_name in models:
    study = optuna.create_study(direction='maximize')

    def _run_trial(trial, _name=model_name):
        """Evaluate one trial for the model currently being tuned."""
        return objective(trial, _name)

    study.optimize(_run_trial, n_trials=30)

    tuned = study.best_trial.params
    best_params[model_name] = tuned
    print(f"Best parameters for {model_name}: {tuned}")

# Summary of the best hyperparameters found for every model.
print("\nFinal Best Parameters for Each Model:")
for name in best_params:
    print(f"{name}: {best_params[name]}")


In [None]:
import optuna
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.utils.class_weight import compute_class_weight

# Read the raw table, then strip every character that is neither a word
# character nor whitespace from the column names.
df = pd.read_csv('df.csv')
df.columns = df.columns.str.replace(pat=r'[^\w\s]', repl='', regex=True)

# 'theft' is the prediction target; 'theft' and 'Class' are both excluded
# from the feature matrix.
y = df['theft']
X = df.drop(columns=['theft', 'Class'])

# Replace the target labels with consecutive integer codes.
le = LabelEncoder()
y = le.fit_transform(y)

# 'balanced' class weights over the full encoded target, as an array and as
# a {class: weight} map. The objective in this cell passes
# class_weight='balanced' directly and does not read these.
encoded_classes = np.unique(y)
class_weights = compute_class_weight('balanced', classes=encoded_classes, y=y)
class_weight_dict = {
    label: weight for label, weight in zip(encoded_classes, class_weights)
}

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Best hyperparameters found, keyed by model name.
best_params = {}

# Extra Trees Optimization
def objective(trial):
    """Optuna objective for Extra Trees: fit one candidate and return its accuracy.

    Samples the forest's hyperparameters from ``trial``, fits an
    ``ExtraTreesClassifier`` on the module-level ``X_train`` / ``y_train``
    and returns the accuracy of its predictions on ``X_test`` / ``y_test``.
    """
    # Hyperparameters explored by the study (sampled in this order).
    searched = {}
    searched['n_estimators'] = trial.suggest_int('n_estimators', 50, 200)
    searched['max_depth'] = trial.suggest_int('max_depth', 3, 12)
    searched['min_samples_split'] = trial.suggest_int('min_samples_split', 2, 20)
    searched['min_samples_leaf'] = trial.suggest_int('min_samples_leaf', 1, 20)
    searched['max_features'] = trial.suggest_categorical(
        'max_features', ['sqrt', 'log2', None]
    )

    # Settings held constant across every trial.
    classifier = ExtraTreesClassifier(
        **searched,
        class_weight='balanced',
        n_jobs=-1,
        random_state=42,
    )

    predictions = classifier.fit(X_train, y_train).predict(X_test)
    return accuracy_score(y_test, predictions)

# Run a 30-trial study that maximises the value returned by `objective`.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

# Keep the winning hyperparameters and report them.
extra_trees_best = study.best_trial.params
best_params['extra_trees'] = extra_trees_best
print(f"Best parameters for Extra Trees: {extra_trees_best}")

# Final summary: heading line followed by the stored parameters.
print(
    "\nFinal Best Parameters for Extra Trees:",
    best_params['extra_trees'],
    sep="\n",
)


In [None]:
import optuna
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.utils.class_weight import compute_class_weight

# Read the raw table.
df = pd.read_csv('df.csv')

# Gas-related columns to exclude. They are dropped before the column names
# are sanitised, so the names below use the original spelling from the CSV.
gass_columns = [
    'Gas:Facility [kW](Hourly)',
    'Heating:Gas [kW](Hourly)',
    'InteriorEquipment:Gas [kW](Hourly)',
    'Water Heater:WaterSystems:Gas [kW](Hourly)',
]
df = df.drop(columns=gass_columns)

# Strip every character that is neither a word character nor whitespace
# from the remaining column names.
df.columns = df.columns.str.replace(pat=r'[^\w\s]', repl='', regex=True)

# 'theft' is the prediction target; 'theft' and 'Class' are both excluded
# from the feature matrix.
y = df['theft']
X = df.drop(columns=['theft', 'Class'])

# Replace the target labels with consecutive integer codes.
le = LabelEncoder()
y = le.fit_transform(y)

# 'balanced' class weights over the full encoded target, as an array and as
# a {class: weight} map. The objective in this cell passes
# class_weight='balanced' directly and does not read these.
encoded_classes = np.unique(y)
class_weights = compute_class_weight('balanced', classes=encoded_classes, y=y)
class_weight_dict = {
    label: weight for label, weight in zip(encoded_classes, class_weights)
}

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Best hyperparameters found, keyed by model name.
best_params = {}

# Extra Trees Optimization
def objective(trial):
    """Optuna objective for Extra Trees: fit one candidate and return its accuracy.

    Samples the forest's hyperparameters from ``trial``, fits an
    ``ExtraTreesClassifier`` on the module-level ``X_train`` / ``y_train``
    and returns the accuracy of its predictions on ``X_test`` / ``y_test``.
    """
    # Hyperparameters explored by the study (sampled in this order).
    searched = {}
    searched['n_estimators'] = trial.suggest_int('n_estimators', 50, 200)
    searched['max_depth'] = trial.suggest_int('max_depth', 3, 12)
    searched['min_samples_split'] = trial.suggest_int('min_samples_split', 2, 20)
    searched['min_samples_leaf'] = trial.suggest_int('min_samples_leaf', 1, 20)
    searched['max_features'] = trial.suggest_categorical(
        'max_features', ['sqrt', 'log2', None]
    )

    # Settings held constant across every trial.
    classifier = ExtraTreesClassifier(
        **searched,
        class_weight='balanced',
        n_jobs=-1,
        random_state=42,
    )

    predictions = classifier.fit(X_train, y_train).predict(X_test)
    return accuracy_score(y_test, predictions)

# Run a 30-trial study that maximises the value returned by `objective`.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

# Keep the winning hyperparameters and report them.
extra_trees_best = study.best_trial.params
best_params['extra_trees'] = extra_trees_best
print(f"Best parameters for Extra Trees: {extra_trees_best}")

# Final summary: heading line followed by the stored parameters.
print(
    "\nFinal Best Parameters for Extra Trees:",
    best_params['extra_trees'],
    sep="\n",
)
