# Deep Belief Model

In [2]:
import pandas as pd
import numpy as np
from collections import Counter
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, cohen_kappa_score, roc_auc_score, confusion_matrix

# Load dataset
print("📂 Loading dataset...")
df = pd.read_csv("df.csv")  
print(f"✅ Dataset Loaded! Shape: {df.shape}")

# Encode categorical labels
print("\n🔄 Encoding categorical labels...")
label_encoder = LabelEncoder()
df["Class"] = label_encoder.fit_transform(df["Class"])
df["theft"] = label_encoder.fit_transform(df["theft"])
print("✅ Encoding complete!")

# Feature selection
feature_cols = [
    "Electricity:Facility [kW](Hourly)", "Fans:Electricity [kW](Hourly)", "Cooling:Electricity [kW](Hourly)",
    "Heating:Electricity [kW](Hourly)", "InteriorLights:Electricity [kW](Hourly)", "InteriorEquipment:Electricity [kW](Hourly)",
    "Gas:Facility [kW](Hourly)", "Heating:Gas [kW](Hourly)", "InteriorEquipment:Gas [kW](Hourly)",
    "Water Heater:WaterSystems:Gas [kW](Hourly)"
]

X = df[feature_cols].values
y = df["theft"].values

# Check class distribution
print("\n📊 Class Distribution Before Train-Test Split:")
print(Counter(y))

# Split dataset
print("\n✂️ Splitting dataset...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print(f"✅ Split complete! Train size: {X_train.shape}, Test size: {X_test.shape}")

# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define LightGBM model & parameter grid
lgb_params = {
    "n_estimators": [100, 200, 300],  
    "learning_rate": [0.01, 0.05, 0.1],  
    "max_depth": [10, 20, -1],  
    "num_leaves": [31, 50, 100],  
    "boosting_type": ["gbdt"],  
    "objective": ["multiclass"],  
    "num_class": [len(np.unique(y))],  
    "metric": ["multi_logloss"],  
}

# Perform GridSearchCV
print("\n🚀 Starting GridSearchCV for LightGBM...")
lgb_model = lgb.LGBMClassifier()
lgb_grid = GridSearchCV(lgb_model, lgb_params, cv=2, scoring="f1_weighted", n_jobs=-1)
lgb_grid.fit(X_train, y_train)
print("✅ LightGBM GridSearch completed!")
print(f"🎯 Best LightGBM Parameters: {lgb_grid.best_params_}")

# Train final model
best_lgb = lgb_grid.best_estimator_

# Predictions
print("\n📢 Making predictions...")
y_pred = best_lgb.predict(X_test)
y_pred_proba = best_lgb.predict_proba(X_test)

# Evaluation function
def evaluate_model(y_true, y_pred, y_proba, model_name):
    results = {
        "Accuracy": accuracy_score(y_true, y_pred),
        "F1-score": f1_score(y_true, y_pred, average="weighted"),
        "Kappa": cohen_kappa_score(y_true, y_pred),
        "AUC": roc_auc_score(y_true, y_proba, multi_class="ovr")
    }
    print(f"\n🔍 **{model_name} Results:**")
    print(results)
    print(f"\n📊 Confusion Matrix ({model_name}):\n", confusion_matrix(y_true, y_pred))

# Evaluate model
print("\n📊 Evaluating LightGBM...")
evaluate_model(y_test, y_pred, y_pred_proba, "LightGBM")

print("\n✅ **Training & Evaluation Complete!** 🚀")


📂 Loading dataset...
✅ Dataset Loaded! Shape: (560655, 13)

🔄 Encoding categorical labels...
✅ Encoding complete!

📊 Class Distribution Before Train-Test Split:
Counter({0: 331824, 1: 51083, 3: 44349, 4: 41460, 6: 35413, 5: 33553, 2: 22973})

✂️ Splitting dataset...
✅ Split complete! Train size: (448524, 10), Test size: (112131, 10)

🚀 Starting GridSearchCV for LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002398 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2544
[LightGBM] [Info] Number of data points in the train set: 448524, number of used features: 10
[LightGBM] [Info] Start training from score -0.524502
[LightGBM] [Info] Start training from score -2.395639
[LightGBM] [Info] Start training from score -3.194808
[LightGBM] [Info] Start training from score -2.537021
[LightGBM] [Info] Start training from score -2.604377


In [5]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, cohen_kappa_score, roc_auc_score, confusion_matrix
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.utils.class_weight import compute_class_weight

# Load dataset
print("📂 Loading dataset...")
df = pd.read_csv("df.csv")  
print(f"✅ Dataset Loaded! Shape: {df.shape}")

# Encode categorical labels
print("\n🔄 Encoding categorical labels...")
label_encoder = LabelEncoder()
df["Class"] = label_encoder.fit_transform(df["Class"])
df["theft"] = label_encoder.fit_transform(df["theft"])
print("✅ Encoding complete!")

# Feature selection
feature_cols = [
    "Electricity:Facility [kW](Hourly)", "Fans:Electricity [kW](Hourly)", "Cooling:Electricity [kW](Hourly)",
    "Heating:Electricity [kW](Hourly)", "InteriorLights:Electricity [kW](Hourly)", "InteriorEquipment:Electricity [kW](Hourly)",
    "Gas:Facility [kW](Hourly)", "Heating:Gas [kW](Hourly)", "InteriorEquipment:Gas [kW](Hourly)",
    "Water Heater:WaterSystems:Gas [kW](Hourly)"
]

X = df[feature_cols].values
y = df["theft"].values

# Check class distribution before training
print("\n📊 Class Distribution Before Train-Test Split:")
print(Counter(y))

# Split dataset
print("\n✂️ Splitting dataset...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print(f"✅ Split complete! Train size: {X_train.shape}, Test size: {X_test.shape}")

# Feature Selection: Select Best Features
print("\n🔍 Selecting Best Features...")
selector = SelectKBest(score_func=mutual_info_classif, k=8)  # Keep 8 best features
X_train = selector.fit_transform(X_train, y_train)
X_test = selector.transform(X_test)
print(f"✅ Feature Selection Complete! New Shape: {X_train.shape}")

# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Compute Class Weights
print("\n⚖️ Computing Class Weights...")
class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}
print(f"✅ Class Weights: {class_weight_dict}")

# Define LightGBM Model with Class Weights & Regularization
lgb_model = lgb.LGBMClassifier(
    boosting_type="gbdt",
    objective="multiclass",
    num_class=len(np.unique(y)),
    n_estimators=300,  
    learning_rate=0.05,
    max_depth=20,  
    num_leaves=50,  
    min_data_in_leaf=50,  
    reg_lambda=0.5,  
    class_weight="balanced",  # ✅ Using class weights correctly
    random_state=42
)

# Train LightGBM with Early Stopping
print("\n🚀 Training LightGBM Model with Early Stopping...")
lgb_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],  
    eval_metric="multi_logloss",
    callbacks=[
        lgb.early_stopping(30),  # Stop if no improvement in 30 rounds
        lgb.log_evaluation(100)  # Log every 100 iterations
    ]
)

print("✅ LightGBM Training Completed!")

# Predictions
print("\n📢 Making predictions...")
y_pred = lgb_model.predict(X_test)
y_pred_proba = lgb_model.predict_proba(X_test)

# Evaluation function
def evaluate_model(y_true, y_pred, y_proba, model_name):
    results = {
        "Accuracy": accuracy_score(y_true, y_pred),
        "F1-score": f1_score(y_true, y_pred, average="weighted"),
        "Kappa": cohen_kappa_score(y_true, y_pred),
        "AUC": roc_auc_score(y_true, y_proba, multi_class="ovr")
    }
    print(f"\n🔍 **{model_name} Results:**")
    print(results)
    print(f"\n📊 Confusion Matrix ({model_name}):\n", confusion_matrix(y_true, y_pred))

# Evaluate model
print("\n📊 Evaluating LightGBM...")
evaluate_model(y_test, y_pred, y_pred_proba, "Optimized LightGBM")

print("\n✅ **Training & Evaluation Complete!** 🚀")


📂 Loading dataset...
✅ Dataset Loaded! Shape: (560655, 13)

🔄 Encoding categorical labels...
✅ Encoding complete!

📊 Class Distribution Before Train-Test Split:
Counter({0: 331824, 1: 51083, 3: 44349, 4: 41460, 6: 35413, 5: 33553, 2: 22973})

✂️ Splitting dataset...
✅ Split complete! Train size: (448524, 10), Test size: (112131, 10)

🔍 Selecting Best Features...
✅ Feature Selection Complete! New Shape: (448524, 8)

⚖️ Computing Class Weights...
✅ Class Weights: {0: 0.24137383604570628, 1: 1.5678874677088395, 2: 3.4864978312578705, 3: 1.805993887732381, 4: 1.931827579077941, 5: 2.3870229535766176, 6: 2.2617316322928747}

🚀 Training LightGBM Model with Early Stopping...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010955 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2034
[LightGBM] [Info] Number of data points in the train set: 448524, number of used features: 8
[LightGBM] [Info] Start training f

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, cohen_kappa_score, roc_auc_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

# Load dataset
print("📂 Loading dataset...")
df = pd.read_csv("df.csv")  
print(f"✅ Dataset Loaded! Shape: {df.shape}")

# Encode categorical labels
print("\n🔄 Encoding categorical labels...")
label_encoder = LabelEncoder()
df["Class"] = label_encoder.fit_transform(df["Class"])
df["theft"] = label_encoder.fit_transform(df["theft"])
print("✅ Encoding complete!")

# Feature selection
feature_cols = [
    "Electricity:Facility [kW](Hourly)", "Fans:Electricity [kW](Hourly)", "Cooling:Electricity [kW](Hourly)",
    "Heating:Electricity [kW](Hourly)", "InteriorLights:Electricity [kW](Hourly)", "InteriorEquipment:Electricity [kW](Hourly)",
    "Gas:Facility [kW](Hourly)", "Heating:Gas [kW](Hourly)", "InteriorEquipment:Gas [kW](Hourly)",
    "Water Heater:WaterSystems:Gas [kW](Hourly)"
]

X = df[feature_cols].values
y = df["theft"].values

# Check class distribution before training
print("\n📊 Class Distribution Before Train-Test Split:")
print(Counter(y))

# Split dataset
print("\n✂️ Splitting dataset...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print(f"✅ Split complete! Train size: {X_train.shape}, Test size: {X_test.shape}")

# Feature Selection: Recursive Feature Elimination (RFE)
print("\n🔍 Selecting Best Features using RFE...")
selector = RFECV(RandomForestClassifier(n_estimators=150, random_state=42), step=1, cv=3)
selector.fit(X_train, y_train)
selected_features = np.array(feature_cols)[selector.support_]
X_train = selector.transform(X_train)
X_test = selector.transform(X_test)
print(f"✅ Feature Selection Complete! New Shape: {X_train.shape}")
print(f"Selected Features: {selected_features}")

# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Compute Class Weights
print("\n⚖️ Computing Class Weights...")
class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}
print(f"✅ Class Weights: {class_weight_dict}")

# Custom Focal Loss for LightGBM
def focal_loss_lgb(y_true, y_pred):
    alpha = 0.25  # Adjust as needed
    gamma = 2.0   # Adjust as needed
    y_pred = np.clip(y_pred, 1e-7, 1 - 1e-7)
    loss = -alpha * (1 - y_pred) ** gamma * y_true * np.log(y_pred) - \
           (1 - alpha) * y_pred ** gamma * (1 - y_true) * np.log(1 - y_pred)
    return "focal_loss", np.mean(loss), False

# Hyperparameter Tuning with Optuna
def objective(trial):
    params = {
        "boosting_type": "gbdt",
        "objective": "multiclass",
        "num_class": len(np.unique(y_train)),
        "metric": "multi_logloss",
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.2),
        "num_leaves": trial.suggest_int("num_leaves", 20, 150),
        "max_depth": trial.suggest_int("max_depth", 3, 30),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 10, 200),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.6, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.6, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-3, 10),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-3, 10),
        "random_state": 42,
        "verbosity": -1
    }
    
    model = lgb.LGBMClassifier(**params)
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=30, verbose=0)
    
    y_pred = model.predict(X_test)
    return f1_score(y_test, y_pred, average="weighted")

print("\n🚀 Running Optuna Hyperparameter Tuning...")
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

best_params = study.best_params
print("🎯 Best Parameters Found:", best_params)

# Train Final Optimized LightGBM Model
print("\n🚀 Training Optimized LightGBM Model...")
lgb_model = lgb.LGBMClassifier(**best_params)
lgb_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric=focal_loss_lgb,
    callbacks=[lgb.early_stopping(30)]
)
print("✅ LightGBM Training Completed!")

# Predictions
print("\n📢 Making predictions...")
y_pred = lgb_model.predict(X_test)
y_pred_proba = lgb_model.predict_proba(X_test)

# Evaluation function
def evaluate_model(y_true, y_pred, y_proba, model_name):
    results = {
        "Accuracy": accuracy_score(y_true, y_pred),
        "F1-score": f1_score(y_true, y_pred, average="weighted"),
        "Kappa": cohen_kappa_score(y_true, y_pred),
        "AUC": roc_auc_score(y_true, y_proba, multi_class="ovr")
    }
    print(f"\n🔍 **{model_name} Results:**")
    print(results)
    print(f"\n📊 Confusion Matrix ({model_name}):\n", confusion_matrix(y_true, y_pred))

# Evaluate Model
print("\n📊 Evaluating Optimized LightGBM...")
evaluate_model(y_test, y_pred, y_pred_proba, "Optimized LightGBM")

print("\n✅ **Training & Evaluation Complete!** 🚀")
