In [6]:
# Optional one-time setup: uncomment if scikit-learn is missing from this kernel's environment.
# %pip install scikit-learn

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

In [8]:

# 1. Load Data
# The CSV has to provide a 'label' column plus every column named in `features` below.
df = pd.read_csv('../scripts/csv/training_dataset_dvfs_all.csv')

# 2. Data Cleaning
# Map +/-inf to NaN, then drop every row that still contains a NaN in any column
# (per the original note, such rows are common in Likwid startup/shutdown samples).
df = df.replace([np.inf, -np.inf], np.nan).dropna()

# 3. Feature Selection
# `features` lists the model inputs in training order; 'label' is the target.
features = [
    'CPI',
    'Math_Intensity',
    'Stall_Ratio',
    'System_BW_Proxy [GB/s]',
    'Branch_MPKI',
    'GFLOPS_Approx',
    'Clock_Ratio',
]

X = df[features]
y = df['label']

# 4. Split Data (Stratified)
# 80/20 split; stratifying on y keeps the label proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# 5. Model Training (Decision Tree)
# Depth is capped at 5 so the learned rules stay small enough to port to C later.
clf = DecisionTreeClassifier(
    class_weight='balanced',
    max_depth=5,
    random_state=42,
).fit(X_train, y_train)

# 6. Evaluation
y_pred = clf.predict(X_test)
print("=== Classification Report ===", classification_report(y_test, y_pred), sep="\n")
print("\n=== Confusion Matrix ===", confusion_matrix(y_test, y_pred), sep="\n")

# 7. Feature Importance Analysis
# Ranks the counters by how much each one contributes to the tree's splits.
importances = pd.DataFrame(
    {'feature': features, 'importance': clf.feature_importances_}
)
print("\n=== Feature Importances ===")
print(importances.sort_values('importance', ascending=False))

# 8. Export Logic for C/C++
# export_text yields an indented textual rule listing (not C source); it is printed
# here as a human-readable reference for the logic to port.
tree_rules = export_text(clf, feature_names=features)
print("\n=== C-Code Logic (Copy this to your Daemon) ===", tree_rules, sep="\n")

=== Classification Report ===
              precision    recall  f1-score   support

    HighFreq       0.99      1.00      0.99      1618
     LowFreq       0.99      0.99      0.99       945
     MedFreq       1.00      1.00      1.00       601

    accuracy                           0.99      3164
   macro avg       0.99      0.99      0.99      3164
weighted avg       0.99      0.99      0.99      3164


=== Confusion Matrix ===
[[1610    8    0]
 [  12  933    0]
 [   0    0  601]]

=== Feature Importances ===
                  feature  importance
2             Stall_Ratio    0.411494
1          Math_Intensity    0.353368
0                     CPI    0.218385
6             Clock_Ratio    0.010753
3  System_BW_Proxy [GB/s]    0.005999
4             Branch_MPKI    0.000000
5           GFLOPS_Approx    0.000000

=== C-Code Logic (Copy this to your Daemon) ===
|--- Math_Intensity <= 0.08
|   |--- CPI <= 0.88
|   |   |--- System_BW_Proxy [GB/s] <= 0.00
|   |   |   |--- CPI <= 0.35
|   

In [None]:
from sklearn.tree import _tree
import numpy as np

def generate_model_c_file(clf, feature_names, filename="../dvfs/model.c"):
    """Write a fitted decision tree to ``filename`` as a C function of nested if/else.

    The generated file includes ``model.h`` and defines
    ``FreqLevel predict_phase_level(double <feature>, ...)`` with one ``double``
    parameter per entry of ``feature_names`` (same order). Each leaf returns the
    ``FREQ_LEVEL_*`` enum mapped from the class with the largest value at that leaf.
    ``FreqLevel`` and the ``FREQ_LEVEL_*`` constants are presumably declared in
    ``model.h``, which is not part of this notebook.

    Args:
        clf: Fitted tree classifier exposing ``tree_`` and ``classes_``.
            Assumes a single-output classifier (the arg-max is taken over the
            whole per-node value array).
        feature_names: Feature names in the column order the tree was trained on.
            Names are converted to valid C identifiers: a bracketed unit suffix
            such as ``" [GB/s]"`` is removed and any other illegal character
            becomes ``_``.
        filename: Destination path of the generated C source.

    Raises:
        ValueError: If two feature names collapse to the same C identifier.
    """
    import re  # function-scope import: the notebook's import cells do not provide it

    def to_c_identifier(raw_name):
        """Return ``raw_name`` as a valid C identifier (unit suffix stripped)."""
        without_unit = re.sub(r"\s*\[[^\]]*\]", "", str(raw_name)).strip()
        ident = re.sub(r"[^0-9A-Za-z_]", "_", without_unit)
        if not ident or ident[0].isdigit():
            ident = "_" + ident
        return ident

    tree_ = clf.tree_

    # Mapping: Ensure these keys match your training labels exactly!
    label_to_enum = {
        "LowFreq":  "FREQ_LEVEL_LOW",
        "MedFreq":  "FREQ_LEVEL_MED",
        "HighFreq": "FREQ_LEVEL_HIGH"
    }

    model_classes = list(clf.classes_)

    # Validate the names before touching the output file so a bad call cannot
    # truncate an existing model.
    c_names = [to_c_identifier(name) for name in feature_names]
    if len(set(c_names)) != len(c_names):
        raise ValueError(f"feature_names map to duplicate C identifiers: {c_names}")

    # The signature is derived from the same names the body uses, so the two can
    # never disagree; an empty parameter list is spelled (void) as C requires.
    params = ", ".join(f"double {name}" for name in c_names) or "void"

    # The whole source is assembled in memory and written once at the end, so an
    # exception while walking the tree cannot leave a half-written file behind.
    lines = [
        "/* Auto-generated by Python ML Exporter */\n",
        "#include \"model.h\"\n",
        "#include <stdlib.h>\n\n",
        f"FreqLevel predict_phase_level({params}) {{\n",
    ]

    # Recursive Tree Walker
    def recurse(node, depth):
        indent = "    " * depth
        left = tree_.children_left[node]
        right = tree_.children_right[node]
        if left != right:
            # Decision Node (a leaf has identical left/right child ids)
            feat_name = c_names[tree_.feature[node]]
            # repr() of a Python float round-trips exactly; a fixed "%.6f" would
            # print tiny thresholds as 0.000000 and move the decision boundary.
            threshold = repr(float(tree_.threshold[node]))
            lines.append(f"{indent}if ({feat_name} <= {threshold}) {{\n")
            recurse(left, depth + 1)
            lines.append(f"{indent}}} else {{\n")
            recurse(right, depth + 1)
            lines.append(f"{indent}}}\n")
        else:
            # Leaf Node
            class_idx = int(np.argmax(tree_.value[node]))
            label_str = model_classes[class_idx]
            enum_val = label_to_enum.get(label_str, "FREQ_LEVEL_HIGH")  # Default fallback

            lines.append(f"{indent}// Prediction: {label_str}\n")
            lines.append(f"{indent}return {enum_val};\n")

    # Start Recursion (node 0 is the root; body is indented one level)
    recurse(0, 1)

    # Close Function
    lines.append("}\n")

    with open(filename, "w") as f:
        f.writelines(lines)

    print(f"Successfully generated {filename}")

# --- USAGE EXAMPLE ---
# Requires `clf` and `X_train` from the training cell.
# The C parameter names are derived from the columns the tree was actually fitted on,
# so their order cannot drift from the feature indices stored in the model. A trailing
# unit suffix such as " [GB/s]" is dropped because it is not valid in a C identifier.
# NOTE: this rebinds the notebook-level `features` to the C-friendly names; re-run the
# training cell before using `features` for DataFrame column selection again.
features = [str(column).split(' [')[0] for column in X_train.columns]

generate_model_c_file(clf, features)

Successfully generated ../ctrl/model.c
