
### **Cell 1: Imports and Setup**

This cell imports the necessary libraries for data manipulation, machine learning, and plotting. It also suppresses warnings for cleaner output.



In [None]:
import os
import pandas as pd
import re
import csv
import json
import warnings
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Silence every warning category for the rest of the session so that cell
# output stays compact. Note this is global: library warnings are hidden too.
warnings.simplefilter("ignore")



### **Cell 2: Configuration of Algorithm Results**

This cell defines a dictionary that maps algorithm names to their corresponding result files. This centralized configuration makes it easy to manage file paths.



In [None]:
# Dictionary mapping algorithm names to their result file paths.
#
# Paths are assembled with os.path.join so they use the host platform's
# separator instead of hard-coded Windows backslashes (on Windows the
# resulting strings are identical to the previous literals).
_results_root = os.path.join('..', '..', '..', 'results', 'decision_tree', 'algorithms')

# CATEGORICAL ALGORITHMS
# Layout: <root>/<name>/<NAME>.csv
_categorical_algorithms = ['AVF', 'CBRW', 'CompreX', 'FPOF', 'POP', 'SCAN', 'SDRW', 'Zero++']

# NUMERICAL ALGORITHMS WITH ENCODINGS
# Layout: <root>/<name>/<encoding_dir>/<ENCODING_DIR>.csv
_numerical_algorithms = ['LOF', 'KNN', 'iForest', 'FastABOD', 'DeepSVDD', 'McCatch']

# Key suffix -> directory name of the encoding. Only one-hot differs
# ('onehot' in the dictionary key, 'one_hot' on disk).
_encodings = {'ca': 'ca', 'idf': 'idf', 'onehot': 'one_hot', 'pivot': 'pivot'}

algorithm_results = {
    name: os.path.join(_results_root, name, f'{name.upper()}.csv')
    for name in _categorical_algorithms
}

# Encodings form the outer loop so the key order stays grouped by encoding
# (all *_ca entries, then *_idf, *_onehot, *_pivot).
algorithm_results.update({
    f'{name}_{suffix}': os.path.join(_results_root, name, folder, f'{folder.upper()}.csv')
    for suffix, folder in _encodings.items()
    for name in _numerical_algorithms
})



### **Cell 3: Load and Prepare Dataset Summary**

This cell loads a summary of all datasets, sorts them by file name, and manually annotates the best-performing algorithm for each one in a new `best` column. This annotated DataFrame will be used as the basis for training the decision tree.



In [None]:
# Load the dataset summary file (semicolon-separated). The path is built with
# os.path.join so it resolves with the host platform's separator rather than
# hard-coded Windows backslashes.
df_summary = pd.read_csv(os.path.join('..', '..', 'resume_datasets.csv'), sep=';')

# Sort by file name: the labels below are assigned purely by position, so the
# row order must be deterministic.
df_summary = df_summary.sort_values(by='file').reset_index(drop=True)

# Manually annotate the best performing algorithm for each dataset.
# One label per row of the sorted summary, stored in the 'best' column.
# NOTE(review): the one-hot labels here use the suffix '_one' (e.g. 'KNN_one'),
# whereas the result-file mapping uses '_onehot' keys (e.g. 'KNN_onehot') --
# confirm which spelling downstream consumers expect before joining the two.
best_algorithms = [
    'SDRW', 'SDRW', 'SCAN', 'iForest_pivot', 'SCAN', 'SDRW', 'iForest_pivot',
    'POP', 'KNN_one', 'iForest_idf', 'SCAN', 'AVF', 'SDRW',
    'AVF', 'CompreX', 'CBRW', 'CompreX', 'iForest_one', 'FastABOD_one', 'SCAN', 'SCAN',
    'POP', 'SCAN', 'iForest_idf', 'POP', 'KNN_ca', 'KNN_one', 'KNN_one',
    'iForest_one', 'iForest_one', 'iForest_one', 'iForest_one',
    'iForest_one', 'CBRW', 'CBRW', 'CBRW', 'CBRW', 'CBRW', 'CBRW', 'CBRW',
    'CBRW', 'CBRW', 'CBRW', 'KNN_one', 'SCAN', 'AVF', 'POP'
]

# Fail early with an actionable message if the summary file gained or lost
# rows since the labels were written; otherwise the positional assignment
# would only surface as a generic pandas length error.
if len(best_algorithms) != len(df_summary):
    raise ValueError(
        f"best_algorithms has {len(best_algorithms)} labels but the dataset "
        f"summary has {len(df_summary)} rows; update the annotation list."
    )
df_summary['best'] = best_algorithms

df_summary.head()



### **Cell 4: Create and Save the Decision Tree Input File**

Here, we select the relevant features from the summary DataFrame, add placeholder columns for 'context' and 'class', and save the result to a CSV file that will be used to train the model.



In [None]:
# Summary columns that are left out of the decision tree input
unused_columns = [
    'outliers', '%_outliers', 'type',
    'attr_numerics', 'attr_categorics', 'attr_binaries',
]

# Keep the remaining columns and append two empty placeholder columns,
# 'context' and 'class' (to be filled manually or by another process;
# 'class' is intended to hold the target variable).
df_for_tree = df_summary.drop(columns=unused_columns).assign(**{'context': '', 'class': ''})

# Write the prepared table to disk without the index column
output_path = 'decision_tree.csv'
df_for_tree.to_csv(output_path, index=False)

print(f"Data for decision tree saved to {output_path}")



### **Cell 5: Train, Evaluate, and Visualize the Decision Tree**

This is the main block for the machine learning task. It loads one of the prepared datasets, splits it into training and testing sets, trains a `DecisionTreeClassifier`, evaluates its performance, and visualizes the resulting tree.



In [None]:
# Train, evaluate, and plot a decision tree for each entry of
# `tree_configurations`.
#
# The loop is set up to process multiple tree configurations, but the 'break'
# statement at the end causes it to run only for the first item. This is
# useful for testing one configuration at a time. The names assigned inside
# the loop (notably `model`) remain available to later cells.

tree_configurations = [
    {'name': 'Binary Tree', 'file': 'decision_tree_binary.csv'},
    #{'name': 'Binary Tree (Majority)', 'file': 'decision_tree_majority.csv'},
    #{'name': 'Algorithm Tree', 'file': 'decision_tree_algorithm.csv'},
]

for config in tree_configurations:
    tree_name = config['name']
    file_name = config['file']
    file_path = file_name

    print(f"--- Processing: {tree_name} ---")

    # 1. Load the dataset and discard the 'file' and 'source' columns,
    #    which are not used as features
    df = pd.read_csv(file_path)
    df = df.drop(columns=['file', 'source'])

    # 2. Separate features (X) and target (y); the target is the last column
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    # 3. Split data into training and testing sets (70% train, 30% test)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # 4. Create and train the Decision Tree model
    model = DecisionTreeClassifier(max_depth=4, random_state=42)
    model.fit(X_train, y_train)

    # 5. Evaluate the model on the test and train set
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    accuracy_test = accuracy_score(y_test, y_pred_test)
    accuracy_train = accuracy_score(y_train, y_pred_train)
    print(f"Training Accuracy: {accuracy_train:.2f}")
    print(f"Test Accuracy: {accuracy_test:.2f}")

    # 6. Perform 10-fold cross-validation for a more robust evaluation.
    #    cross_val_score fits clones of the estimator, so `model` keeps the
    #    fit from step 4.
    cv_scores = cross_val_score(model, X, y, cv=10)
    print(f"Cross-validation accuracies: {cv_scores}")
    print(f"Average CV Accuracy: {cv_scores.mean():.2f}")

    # 7. Visualize and save the decision tree.
    #    - class_names comes from model.classes_ (the classes actually seen
    #      during fit, in the order the tree indexes them) rather than from
    #      all of y: a class absent from the training split would otherwise
    #      shift or mislabel the node captions. Labels are cast to str because
    #      plot_tree concatenates them into the node text.
    #    - feature_names is passed as a plain list, which every scikit-learn
    #      version accepts (some releases reject a pandas Index).
    plt.figure(figsize=(20, 12))
    plot_tree(
        model,
        feature_names=list(X.columns),
        class_names=[str(label) for label in model.classes_],
        filled=True,
        rounded=True,
    )
    plt.title(tree_name, fontsize=20)

    # Save the figure next to the notebook, named after the input file
    output_image_path = f"{file_name.replace('.csv', '')}.png"
    plt.savefig(output_image_path, dpi=300, bbox_inches='tight')
    plt.show()

    # The break statement ensures only the first configuration is run.
    # Remove it to run for all configurations in the list.
    break



### **Cell 6: Validation on New Data**

This final section demonstrates how to use the trained model to make predictions on a new, unseen validation dataset.



In [None]:
# Ground-truth labels for the validation rows, listed in file order
y_validation_true = [
    'numeric', 'numeric', 'categorical', 'categorical', 'numeric',
    'numeric', 'categorical', 'numeric', 'numeric', 'numeric',
]

# Read the validation set; every column after the first is used as a feature.
# NOTE(review): assumes the first column is an identifier and the remaining
# columns match the features the model was trained on -- confirm against the
# layout of the training file.
df_validation = pd.read_csv('decision_tree_validation.csv')
X_validation = df_validation.iloc[:, 1:]

print("Validation Features (X):", X_validation, sep="\n")

# Predict with the model fitted in the previous cell and show the predictions
# side by side with the expected labels
validation_predictions = model.predict(X_validation)
print(f"\nModel Predictions: {validation_predictions}")
print(f"True Labels:       {y_validation_true}")

# Fraction of validation rows whose prediction matches the expected label
validation_accuracy = accuracy_score(y_validation_true, validation_predictions)
print(f"\nValidation Accuracy: {validation_accuracy:.2f}")