In [None]:
# Decision Tree Classifier for RECS 2020 Dataset
# ============================================
# This notebook trains a decision tree classifier (max_depth=4) to predict energy efficiency
# classes (High, Moderate, Low) and compares them with the fuzzy-refined classes (FINAL_CLASS).
#
# Inputs:
# - Processed dataset (data/processed/merged_with_efficiency.csv or merged_cleaned.csv)
# Outputs:
# - Trained model (models/decision_tree_model.pkl)
# - Visualizations (confusion matrix, decision tree)
# - Metrics (accuracy, classification report, cross-validation)
#
# Dependencies: pandas, matplotlib, seaborn, scikit-learn, joblib

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Resolve the project directories from the current working directory.
# The project root is taken to be the parent of the working directory.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
DATA_DIR, MODELS_DIR = (os.path.join(BASE_DIR, sub) for sub in ("data", "models"))
PROCESSED_DIR = os.path.join(DATA_DIR, "processed")

In [None]:
# Read the processed dataset (the file that carries the fuzzy scores)
efficiency_csv = os.path.join(PROCESSED_DIR, "merged_with_efficiency.csv")
df = pd.read_csv(efficiency_csv)

In [None]:
# Candidate predictor columns for the decision tree
features = [
    'ENERGY_CONSUMPTION_PER_SQFT', 'Pct_INCOME_MORE_THAN_150K',
    'Pct_HOUSING_SINGLE_FAMILY_HOME_DETACHED', 'Pct_HOUSING_APT_MORE_THAN_5_UNITS',
    'CLIMATE_Cold', 'CLIMATE_Hot-Humid', 'CLIMATE_Mixed-Humid', 'CLIMATE_Very-Cold',
    'Pct_BUILT_BEFORE_1950', 'Pct_MAIN_AC_AGE_OLDER_THAN_20',
    'Pct_MAIN_HEAT_AGE_OLDER_THAN_20', 'Pct_MAIN_WATER_HEAT_AGE_OLDER_THAN_20'
]

# Keep only the candidates present in the loaded dataset, but report any that
# are missing instead of dropping them silently.
available_features = [col for col in features if col in df.columns]
missing_features = [col for col in features if col not in df.columns]
if missing_features:
    print("Warning: features missing from dataset and skipped:", missing_features)
if not available_features:
    raise ValueError("None of the expected feature columns are present in the dataset.")

# Fail early with a clear message when the label columns are absent
# (the plain column selection below would raise a less specific KeyError).
label_columns = ['Efficiency_Class', 'FINAL_CLASS']
missing_labels = [col for col in label_columns if col not in df.columns]
if missing_labels:
    raise KeyError(f"Required label columns missing from dataset: {missing_labels}")

# Prepare model DataFrame: selected features plus both label columns
df_model = df[available_features + label_columns].copy()

In [None]:
# Separate predictors and target, then hold out 20% of the rows (fixed seed)
X, y = df_model[available_features], df_model['Efficiency_Class']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a depth-limited (max_depth=4) decision tree on the training partition
clf = DecisionTreeClassifier(max_depth=4, random_state=42).fit(X_train, y_train)

# Score the held-out partition
y_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Decision Tree Accuracy:", test_accuracy)
test_report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", test_report)

In [None]:
# Compare the fuzzy-refined labels (FINAL_CLASS) with Efficiency_Class on the test rows
y_fuzzy = df_model['FINAL_CLASS'].loc[X_test.index]
fuzzy_accuracy = accuracy_score(y_test, y_fuzzy)
print("\nFuzzy-Refined Accuracy:", fuzzy_accuracy)
fuzzy_report = classification_report(y_test, y_fuzzy)
print("\nFuzzy-Refined Classification Report:\n", fuzzy_report)

In [None]:
# Accuracy over 5 cross-validation folds of the full feature/target data
cv_scores = cross_val_score(clf, X, y, scoring='accuracy', cv=5)
mean_cv_accuracy = cv_scores.mean()
print("Cross-Validation Scores:", cv_scores)
print("Average CV Accuracy:", mean_cv_accuracy)

In [None]:
# Draw the test-set confusion matrix as an annotated heatmap
class_labels = clf.classes_
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
plt.figure(figsize=(6, 4))
heatmap_ax = sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
heatmap_ax.set_xlabel("Predicted")
heatmap_ax.set_ylabel("Actual")
heatmap_ax.set_title("Decision Tree Confusion Matrix")
plt.show()

In [None]:
# Decision tree visualization
# plot_tree documents class_names as a list of str, so convert the fitted
# classes (a NumPy array, possibly of non-string labels) before passing them.
tree_class_names = [str(label) for label in clf.classes_]
plt.figure(figsize=(18, 10))
plot_tree(clf, feature_names=available_features, class_names=tree_class_names,
          filled=True, rounded=True)
plt.title("Decision Tree Visualization")
plt.show()

In [None]:
# Rank the features by the fitted tree's importance scores, highest first
importances = clf.feature_importances_
feature_imp = (
    pd.DataFrame({'Feature': available_features, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)
print("\nFeature Importance:\n", feature_imp)

In [None]:
# Save the model
# Create the models directory first: joblib.dump does not create missing
# parent folders and would fail if the directory does not exist yet.
os.makedirs(MODELS_DIR, exist_ok=True)
model_path = os.path.join(MODELS_DIR, "decision_tree_model.pkl")
joblib.dump(clf, model_path)