In [1]:
# Import all required libraries
import pandas as pd
import numpy as np
import joblib
import io
import sys
from joblib import dump
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (classification_report, 
                           roc_auc_score, 
                           accuracy_score, 
                           confusion_matrix)
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset
df = pd.read_csv('processed_electricity_theft_data.csv')

# Display basic information
print("Dataset shape:", df.shape)
print("\nFirst 5 rows:")
display(df.head())

# Check for missing values
print("\nMissing values:")
print(df.isnull().sum())

# Check class distribution
print("\nClass distribution:")
print(df['theft_detected'].value_counts(normalize=True))

# Basic statistics
print("\nDescriptive statistics:")
display(df.describe())

Dataset shape: (100000, 22)

First 5 rows:


Unnamed: 0,billed_consumption,actual_consumption,average_voltage,peak_time_consumption,off_peak_consumption,power_factor,payment_history,num_outages,customer_type,location,...,peak_hour,consumption_diff,consumption_ratio,peak_offpeak_ratio,is_evening_peak,is_morning_peak,low_voltage_flag,high_power_factor,frequent_outages,theft_detected
0,1.147445,2.248218,-1.453353,3.088043,1.340971,0.673619,0,-0.297178,2,2,...,22,2.03945,0.839275,1.384486,1,0,1,0,0,1
1,-0.877158,-1.22393,-0.326608,-1.407434,-0.924167,0.337457,0,-1.4695,1,1,...,10,-0.857134,-0.768166,-1.201929,0,1,0,0,0,0
2,0.454615,-0.253382,-0.495405,-0.099987,-0.327165,1.766147,0,0.288983,0,0,...,16,-0.815302,-0.754688,0.177331,0,0,0,1,0,0
3,1.316804,2.171519,0.323512,2.149679,1.886108,-0.082746,1,-0.883339,1,1,...,19,1.760829,0.622884,0.151677,1,0,0,0,0,1
4,-0.99263,-0.837713,1.746335,-0.549776,-0.925876,2.102309,1,1.461305,0,1,...,16,-0.193382,0.027857,0.513772,0,0,0,1,1,1



Missing values:
billed_consumption       0
actual_consumption       0
average_voltage          0
peak_time_consumption    0
off_peak_consumption     0
power_factor             0
payment_history          0
num_outages              0
customer_type            0
location                 0
year                     0
month_num                0
peak_hour                0
consumption_diff         0
consumption_ratio        0
peak_offpeak_ratio       0
is_evening_peak          0
is_morning_peak          0
low_voltage_flag         0
high_power_factor        0
frequent_outages         0
theft_detected           0
dtype: int64

Class distribution:
theft_detected
1    0.5019
0    0.4981
Name: proportion, dtype: float64

Descriptive statistics:


Unnamed: 0,billed_consumption,actual_consumption,average_voltage,peak_time_consumption,off_peak_consumption,power_factor,payment_history,num_outages,customer_type,location,...,peak_hour,consumption_diff,consumption_ratio,peak_offpeak_ratio,is_evening_peak,is_morning_peak,low_voltage_flag,high_power_factor,frequent_outages,theft_detected
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,...,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,-1.85878e-16,3.095124e-16,8.199663e-17,3.156231e-16,1.215028e-16,6.804868e-16,0.45207,2.209788e-17,1.00049,1.00398,...,11.49376,1.867306e-16,2.54694e-16,-1.405454e-16,0.20797,0.20902,0.27087,0.20829,0.33431,0.5019
std,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,0.4977,1.000005,0.815449,0.814832,...,6.913789,1.000005,1.000005,1.000005,0.405857,0.406611,0.444411,0.406087,0.471751,0.499999
min,-1.731648,-2.091232,-1.824234,-1.976114,-2.067588,-2.183762,0.0,-1.4695,0.0,0.0,...,0.0,-1.888574,-1.484377,-1.682435,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.8617616,-0.7263623,-0.86237,-0.7369668,-0.7356124,-0.7550713,0.0,-0.8833386,0.0,0.0,...,6.0,-0.8581567,-0.7767493,-0.7505421,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0004267833,-0.04063619,0.006284366,-0.1051668,-0.0654483,0.001294225,0.0,0.2889832,1.0,1.0,...,11.0,-0.1914522,-0.3106225,-0.1910312,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.8626152,0.6799719,0.8667143,0.6221561,0.6525648,0.7576597,1.0,0.8751441,2.0,2.0,...,17.0,0.8223138,0.6147846,0.5834118,0.0,0.0,1.0,0.0,1.0,1.0
max,1.732502,3.078748,1.853644,4.826848,4.049055,2.522513,1.0,1.461305,2.0,2.0,...,23.0,2.763326,4.832592,3.589848,1.0,1.0,1.0,1.0,1.0,1.0


In [3]:
# Load data
df = pd.read_csv('processed_electricity_theft_data.csv')

# Separate features and target
X = df.drop('theft_detected', axis=1)
y = df['theft_detected']

# Select top 15 most important features (balance between size and performance)
selector = SelectKBest(f_classif, k=15)
X_selected = selector.fit_transform(X, y)

# Get selected feature names
selected_features = X.columns[selector.get_support()]
print("Selected Features:", list(selected_features))

# Update X with selected features
X = pd.DataFrame(X_selected, columns=selected_features)

Selected Features: ['billed_consumption', 'actual_consumption', 'average_voltage', 'peak_time_consumption', 'off_peak_consumption', 'power_factor', 'payment_history', 'num_outages', 'customer_type', 'location', 'month_num', 'consumption_diff', 'consumption_ratio', 'is_morning_peak', 'frequent_outages']


In [4]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

# Verify the shapes
print("Training set after SMOTE:", X_train_smote.shape)
print("Test set:", X_test_scaled.shape)

Training set after SMOTE: (70266, 15)
Test set: (30000, 15)


In [9]:
# Define the models with optimized parameters
models = {
    "Logistic Regression": LogisticRegression(penalty='l1', solver='liblinear', C=0.1, random_state=42),
    "Linear SVM": LinearSVC(C=0.1, random_state=42, dual=False),
    "Random Forest": RandomForestClassifier(n_estimators=50, max_depth=10, min_samples_leaf=5, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=50, max_depth=3, learning_rate=0.1, random_state=42),
    "LightGBM": LGBMClassifier(n_estimators=50, max_depth=5, learning_rate=0.1, random_state=42)
}

# Evaluate each model
results = []
for name, model in models.items():
    print(f"\nEvaluating {name}...")
    
    # Cross-validation
    cv_scores = cross_val_score(model, X_train_smote, y_train_smote, cv=5, scoring='roc_auc')
    
    # Full training
    model.fit(X_train_smote, y_train_smote)
    
    # Predictions
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test_scaled)[:, 1]
    else:  # For LinearSVC which doesn't have predict_proba by default
        y_prob = model.decision_function(X_test_scaled)
    
    y_pred = model.predict_proba(X_test_scaled)
    
    # Calculate metrics
    roc_auc = roc_auc_score(y_test, y_prob)
    accuracy = accuracy_score(y_test, y_pred)
    
    # Calculate model size - CORRECTED APPROACH
    import io
    import sys
    from joblib import dump
    
    # Create in-memory file
    model_buffer = io.BytesIO()
    dump(model, model_buffer)
    model_size = sys.getsizeof(model_buffer)/1024  # Size in KB
    
    # Store results
    results.append({
        'Model': name,
        'CV ROC AUC Mean': np.mean(cv_scores),
        'Test ROC AUC': roc_auc,
        'Accuracy': accuracy,
        'Model Size (KB)': model_size
    })
    
    # Print results
    print(f"CV ROC AUC: {np.mean(cv_scores):.4f} (±{np.std(cv_scores):.4f})")
    print(f"Test ROC AUC: {roc_auc:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Model Size: {model_size:.2f} KB")
    print(classification_report(y_test, y_pred))
    
    # Plot confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(5,4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'{name} Confusion Matrix')
    plt.show()

# Show final comparison
results_df = pd.DataFrame(results).sort_values('Test ROC AUC', ascending=False)
print("\nModel Performance Comparison:")
display(results_df)


Evaluating Logistic Regression...


ValueError: Classification metrics can't handle a mix of binary and continuous-multioutput targets

In [8]:
# Select best model based on Test ROC AUC
best_model_name = results_df.iloc[0]['Model']
best_model = models[best_model_name]
best_model_size = results_df.iloc[0]['Model Size (KB)']

print(f"\nBest Model: {best_model_name}")
print(f"Model Size: {best_model_size:.2f} KB")

# Create model package with minimal components
model_package = {
    'model': best_model,
    'scaler': scaler,
    'feature_selector': selector,
    'features': list(selected_features),
    'metadata': {
        'performance': results_df.iloc[0].to_dict(),
        'creation_date': pd.Timestamp.now().strftime('%Y-%m-%d')
    }
}

# Save with maximum compression
joblib.dump(model_package, 'best_theft_model.pkl', compress=3)

# Verify saved model size
import os
saved_size = os.path.getsize('best_theft_model.pkl')/1024
print(f"Saved model size: {saved_size:.2f} KB")

# Model size optimization with size limit
MAX_SIZE_KB = 500  # Set your maximum acceptable size (500KB in this example)

if saved_size > MAX_SIZE_KB:
    print(f"\nBest model exceeds size limit of {MAX_SIZE_KB}KB. Finding alternative...")
    
    # Filter models under size limit, sorted by performance
    size_constrained = results_df[results_df['Model Size (KB)'] <= MAX_SIZE_KB]
    
    if not size_constrained.empty:
        best_small_model_name = size_constrained.iloc[0]['Model']
        best_small_model = models[best_small_model_name]
        small_model_size = size_constrained.iloc[0]['Model Size (KB)']
        
        print(f"Best model under size limit: {best_small_model_name}")
        print(f"Size: {small_model_size:.2f} KB")
        print(f"Test ROC AUC: {size_constrained.iloc[0]['Test ROC AUC']:.4f}")
        
        # Update and save the smaller model
        model_package['model'] = best_small_model
        model_package['metadata']['performance'] = size_constrained.iloc[0].to_dict()
        
        joblib.dump(model_package, 'best_compact_theft_model.pkl', compress=3)
        
        # Verify new size
        new_size = os.path.getsize('best_compact_theft_model.pkl')/1024
        print(f"New model size: {new_size:.2f} KB")
        
        # Recommendation
        print("\nRecommendation: Use the compact model for deployment")
    else:
        print("\nNo models under size limit. Try these options:")
        print("1. Reduce the number of features (currently using 15)")
        print("2. Use simpler models like Logistic Regression")
        print("3. Increase the size limit if possible")
else:
    print("\nModel successfully saved within size limits.")


Best Model: LightGBM
Model Size: 179.97 KB
Saved model size: 68.34 KB

Model successfully saved within size limits.
