In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import joblib


In [21]:
# Replace path with your actual file path
data = pd.read_excel('energy_theft_data.xlsx')

# Features and target
X = data.drop(["customer_id", "theft_flag", "power_bypassed"], axis=1)
y = data["theft_flag"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# Model definitions
models = {
    "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100, max_depth=8),
    "Decision Tree": DecisionTreeClassifier(random_state=42, max_depth=8),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "SVM": SVC(random_state=42),
    "Naive Bayes": GaussianNB()
}

# Train the models
trained_models = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    trained_models[name] = model

print("✅ All models have been trained.")

✅ All models have been trained.


In [23]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import pandas as pd

results = []
for name, model in trained_models.items():
    y_pred = model.predict(X_test)

    results.append({
        "Model": name,
        "Accuracy": round(accuracy_score(y_test, y_pred), 4),
        "Precision": round(precision_score(y_test, y_pred, zero_division=0), 4),
        "Recall": round(recall_score(y_test, y_pred, zero_division=0), 4),
        "F1 Score": round(f1_score(y_test, y_pred, zero_division=0), 4),
    })

results_df = pd.DataFrame(results).sort_values(by="F1 Score", ascending=False).reset_index(drop=True)

# Print the results
print("\n✅ Model Performance Comparison:\n")
print(results_df)

# Identify best model
best_model_name = results_df.iloc[0]["Model"]
best_model = trained_models[best_model_name]
print(f"\n🏆 Best Model: {best_model_name}")


✅ Model Performance Comparison:

                 Model  Accuracy  Precision  Recall  F1 Score
0  K-Nearest Neighbors     0.975     0.9623  0.9444    0.9533
1        Decision Tree     0.950     0.9583  0.8519    0.9020
2        Random Forest     0.945     0.9388  0.8519    0.8932
3                  SVM     0.930     0.9762  0.7593    0.8542
4          Naive Bayes     0.750     0.5455  0.4444    0.4898

🏆 Best Model: K-Nearest Neighbors


In [24]:
model_file = "best_model.pkl"
joblib.dump(best_model, model_file, compress=3)

import os
file_size = os.path.getsize(model_file) / (1024 * 1024)  # in MB
if file_size <= 20:
    print(f"\n✅ Model saved successfully as '{model_file}' ({file_size:.2f} MB) ")
else:
    print(f"⚠️ Model too large ({file_size:.2f} MB). Consider reducing depth or using fewer estimators.")


✅ Model saved successfully as 'best_model.pkl' (0.02 MB) 
