Final RuleFit rule extraction code with no feature duplications or redundant conditions

In [1]:
import os
import pandas as pd
import numpy as np
from rulefit import RuleFit

def simplify_rule(rule):
    """
    Simplify a rule string by merging logically redundant conditions.

    The rule is expected to be a conjunction of ``feature operator value``
    conditions joined by " & ". Bounds are tracked per (feature, operator)
    pair: for ``>`` the largest threshold is kept, for ``<=`` the smallest.
    A feature constrained from both sides therefore keeps both of its bounds
    instead of losing one of them.

    Args:
        rule: Rule text, e.g. ``"a > 1.0 & a > 2.0 & a <= 5.0"``.

    Returns:
        The simplified rule text, with conditions in first-seen order.
        The input is returned unchanged when any condition does not have the
        ``feature operator value`` shape (for example a bare feature name),
        because dropping a condition would change the meaning of the rule.
        It is also returned unchanged (after printing a message) when an
        error occurs while parsing.
    """
    try:
        # (feature, operator) -> tightest threshold seen so far.
        # Dicts keep insertion order, so the output order is stable.
        bounds = {}
        for cond in rule.split(" & "):
            # Split from the right so a feature name containing spaces
            # stays in one piece.
            parts = cond.rsplit(None, 2)
            if len(parts) < 3:
                # Not a threshold condition: leave the whole rule untouched
                # rather than silently discarding part of it.
                return rule
            feature, operator, value = parts
            value = float(value)

            key = (feature, operator)
            if key not in bounds:
                bounds[key] = value
            elif operator == ">":
                bounds[key] = max(bounds[key], value)
            elif operator == "<=":
                bounds[key] = min(bounds[key], value)
            # Any other operator: the first occurrence is kept as-is.

        return " & ".join(
            f"{feature} {operator} {value}"
            for (feature, operator), value in bounds.items()
        )
    except Exception as e:
        # Best effort: a rule that cannot be parsed is passed through as-is.
        print(f"Error simplifying rule: {rule}. Error: {e}")
        return rule

# Load Data
input_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\26 features results\3 best features'
train_df = pd.read_csv(os.path.join(input_dir, "3_training_selected_features.csv"))
val_df = pd.read_csv(os.path.join(input_dir, "3_validation_selected_features.csv"))
test_df = pd.read_csv(os.path.join(input_dir, "3_testing_selected_features.csv"))

# Combine Training and Validation Data
train_val_df = pd.concat([train_df, val_df], ignore_index=True)
# Take the feature matrix and the feature names from the same frame so the
# names always line up with the columns of X_train_val, wherever the "label"
# column happens to sit in the CSV.
feature_df = train_val_df.drop(columns=["label"])
X_train_val = feature_df.values
y_train_val = train_val_df["label"].values
feature_names = feature_df.columns.tolist()

# Train RuleFit Model
rulefit_model = RuleFit(tree_size=4, sample_fract=0.7, max_rules=200, random_state=42)
rulefit_model.fit(X_train_val, y_train_val, feature_names=feature_names)

# Extract Rules
rules = rulefit_model.get_rules()
rules = rules[rules.coef != 0]  # Filter rules with non-zero coefficients

if rules.empty:
    print("No rules were generated. Check your data or model configuration.")
else:
    # Simplify Rules
    # `rules` is the result of a boolean filter; assign() builds a new frame
    # instead of writing a column into that filtered result, which avoids
    # pandas' SettingWithCopyWarning.
    rules = rules.assign(rule=rules["rule"].apply(simplify_rule))

    # Get Unique Classes
    unique_classes = sorted(set(y_train_val))

    # Display and Save Rules
    print("\nSimplified Top Rules in If-Then Format for All Classes:")
    if_then_rules = []

    for label in unique_classes:
        # Filter top rules for each class by importance scores and class association
        # NOTE(review): when `coef` is a scalar the predicate below is just
        # `coef > 0` and does not depend on `label`, so every class is given
        # the same set of rules. Confirm whether that is intended.
        class_rules = rules[rules.apply(lambda x: np.argmax(x['coef']) == label if isinstance(x['coef'], np.ndarray) else x['coef'] > 0, axis=1)]
        class_rules = class_rules.sort_values(by="importance", ascending=False).head(20)  # Top N rules per class

        print(f"\nClass {label} Rules:")
        for _, row in class_rules.iterrows():
            rule_str = f"If ({row['rule']}) then Class = {label} (Importance: {row['importance']:.4f})"
            print(rule_str)
            if_then_rules.append(rule_str)

    # Save Rules to File (one rule per line, relative to the working directory)
    output_file_path = "simplified_rulefit_top_if_then_rules_for_all_classes.txt"
    with open(output_file_path, 'w') as f:
        for rule in if_then_rules:
            f.write(rule + "\n")
    print(f"\nSimplified If-Then rules for all classes have been saved to {output_file_path}.")



Simplified Top Rules in If-Then Format for All Classes:

Class 0 Rules:
If (coef_of_var > 1.3878175020217896 & skewness > 1.0956262350082397) then Class = 0 (Importance: 0.0752)
If (skewness > 0.9522108733654022) then Class = 0 (Importance: 0.0675)
If (coef_of_var > 1.3185044527053833) then Class = 0 (Importance: 0.0648)
If (coef_of_var > 1.3949972987174988 & skewness > 1.080114722251892) then Class = 0 (Importance: 0.0545)
If (coef_of_var > 1.3344648480415344 & skewness > 0.9358093738555908) then Class = 0 (Importance: 0.0296)
If (coef_of_var > 1.3088043332099915) then Class = 0 (Importance: 0.0270)
If (coef_of_var > 1.3264262080192566) then Class = 0 (Importance: 0.0245)
If (coef_of_var > 1.38550865650177 & skewness > 1.0445439219474792) then Class = 0 (Importance: 0.0237)
If (coef_of_var > 1.3514490723609924) then Class = 0 (Importance: 0.0217)
If (coef_of_var > 1.3378830552101135 & skewness > 0.8769373893737793) then Class = 0 (Importance: 0.0204)
If (coef_of_var > 1.2904673814773

Final Decision Tree rule extraction code with no feature duplications or redundant conditions

In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

# Define the directory containing the CSV files
stat_feature_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\26 features results\3 best features'

# Load the CSV files
train_df = pd.read_csv(os.path.join(stat_feature_dir, "3_training_selected_features.csv"))
val_df = pd.read_csv(os.path.join(stat_feature_dir, "3_validation_selected_features.csv"))
test_df = pd.read_csv(os.path.join(stat_feature_dir, "3_testing_selected_features.csv"))

# Separate features and labels
train_stat_features = train_df.drop(columns=['label']).values
train_labels = train_df['label'].values

val_stat_features = val_df.drop(columns=['label']).values
val_labels = val_df['label'].values

test_stat_features = test_df.drop(columns=['label']).values
test_labels = test_df['label'].values

# Combine training and validation data for final training
combined_features = np.vstack([train_stat_features, val_stat_features])
combined_labels = np.hstack([train_labels, val_labels])

# Train Decision Tree Classifier with specified hyperparameters
clf = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=4,
    random_state=0
)
clf.fit(combined_features, combined_labels)

# Extract rules in the form of "If ... then ..." with importance
# The names are taken from the same drop(columns=['label']) selection used to
# build the feature matrices, so they match the matrix columns even when
# 'label' is not the last column of the CSV.
feature_names = list(train_df.drop(columns=['label']).columns)
rules = []

def traverse_tree(tree, feature_names, feature_importances, node=0, conditions="", path_importance=0):
    """
    Walk the tree depth-first and record one "If ... then ..." rule per leaf.

    Each finished rule is appended to the module-level ``rules`` list; nothing
    is returned.

    Args:
        tree: Object exposing ``children_left``, ``children_right``,
            ``feature``, ``threshold`` and ``value``, each indexed by node id.
        feature_names: Names indexed by the feature ids stored in ``tree.feature``.
        feature_importances: Importance values indexed by the same feature ids.
        node: Id of the node currently being visited.
        conditions: Conditions accumulated on the path so far, each one
            followed by " & ".
        path_importance: Sum of the importances of the features split on
            along the path so far.
    """
    left_child = tree.children_left[node]
    right_child = tree.children_right[node]

    # Leaf: emit the rule for the path that led here and stop.
    if left_child == -1 and right_child == -1:
        predicted_class = np.argmax(tree.value[node][0])
        path_text = conditions.rstrip(' & ')
        rules.append(
            f"If ({path_text}) then Class = {predicted_class} (Importance: {path_importance:.4f})"
        )
        return

    split_feature = tree.feature[node]
    # A negative feature id cannot be used as an index; nothing to expand.
    if split_feature < 0:
        return

    split_name = feature_names[split_feature]
    cutoff = tree.threshold[node]
    # Both branches share the importance accumulated up to this split.
    accumulated = path_importance + feature_importances[split_feature]

    # Left branch is "feature <= threshold", right branch is "feature > threshold".
    for child, comparison in ((left_child, "<="), (right_child, ">")):
        branch_conditions = conditions + f"{split_name} {comparison} {cutoff:.3f} & "
        traverse_tree(tree, feature_names, feature_importances, child, branch_conditions, accumulated)

# Walk the fitted tree from its root; the extracted rules accumulate in `rules`.
# Any failure is reported on stdout and the script carries on.
try:
    fitted_tree = clf.tree_
    importances = clf.feature_importances_
    traverse_tree(fitted_tree, feature_names, importances)
except Exception as exc:
    print(f"Error during tree traversal: {exc}")

# Simplify a single rule by merging redundant conditions
def simplify_rule(rule):
    """
    Merge redundant conditions in an "If (...) then ..." rule string.

    For every feature the largest ``>`` threshold and the smallest ``<=``
    threshold are kept, giving the tightest range implied by the conditions.
    Conditions using any other operator are not emitted. Thresholds are
    written back with three decimals.

    Args:
        rule: Text of the form
            ``"If (f <= 0.500 & f <= 0.300) then Class = 1 (Importance: 2.0000)"``.
            The condition list may be empty (``"If () then ..."``), and
            feature names may contain spaces.

    Returns:
        The rule with one ``>`` and/or one ``<=`` condition per feature;
        everything after ``") then"`` is preserved unchanged.

    Raises:
        ValueError: If the rule has no ``") then"`` part, or a non-empty
            condition is not of the form ``feature operator threshold``.
    """
    conditions_part, separator, result_part = rule.partition(") then")
    if not separator:
        raise ValueError(f"Rule is not in 'If (...) then ...' form: {rule!r}")
    conditions = conditions_part.replace("If (", "", 1).split(" & ")
    simplified_conditions = {}

    # Extract feature and threshold for each condition
    for condition in conditions:
        # A rule with no conditions yields a single empty string here;
        # there is nothing to parse.
        if not condition.strip():
            continue
        # Split from the right so a feature name containing spaces stays whole.
        feature, operator, threshold = condition.rsplit(" ", 2)
        threshold = float(threshold)

        # Simplify by keeping the most restrictive range
        bounds = simplified_conditions.setdefault(
            feature, {"<=": float('inf'), ">": float('-inf')}
        )
        if operator == "<=":
            bounds["<="] = min(bounds["<="], threshold)
        elif operator == ">":
            bounds[">"] = max(bounds[">"], threshold)

    # Reconstruct simplified conditions; the infinities mean "no bound seen".
    final_conditions = []
    for feature, thresholds in simplified_conditions.items():
        if thresholds[">"] != float('-inf'):
            final_conditions.append(f"{feature} > {thresholds['>']:.3f}")
        if thresholds["<="] != float('inf'):
            final_conditions.append(f"{feature} <= {thresholds['<=']:.3f}")

    return f"If ({' & '.join(final_conditions)}) then{result_part}"

# Run the per-rule simplification over every extracted rule, keeping their order
simplified_rules = list(map(simplify_rule, rules))

# Extract importance scores from simplified rules and sort them in descending order
def extract_importance(rule):
    """
    Return the numeric importance embedded in a rule string.

    The value is read from the last ``"(Importance: <number>)"`` group in the
    rule, so text earlier in the rule cannot be mistaken for it.

    Args:
        rule: Rule text ending in ``"(Importance: <number>)"``.

    Returns:
        The importance as a float.

    Raises:
        ValueError: If the marker or its closing parenthesis is missing, or
            the text between them is not a number.
    """
    marker = "(Importance: "
    _, found, tail = rule.rpartition(marker)
    if not found:
        # Without this check a missing marker would silently slice
        # unrelated text out of the rule.
        raise ValueError(f"No importance marker in rule: {rule!r}")
    number_text, closing, _ = tail.partition(")")
    if not closing:
        raise ValueError(f"Unterminated importance marker in rule: {rule!r}")
    return float(number_text)

# Order the rules from most to least important (ties keep their original order)
sorted_rules = list(simplified_rules)
sorted_rules.sort(key=extract_importance, reverse=True)

# Show the ordered rules on stdout
print("Simplified and Sorted Rules:")
for rule in sorted_rules:
    print(rule)

# Write the same rules to disk, one per line
output_file_path = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\26 features results\3 best features\simplified_sorted_rules.txt'
with open(output_file_path, 'w') as f:
    f.writelines(rule_line + "\n" for rule_line in sorted_rules)

print(f"Simplified and sorted rules have been saved to {output_file_path}.")


Simplified and Sorted Rules:
If (skewness > 0.667 & skewness <= 0.672 & coef_of_var > 1.216 & coef_of_var <= 1.220) then Class = 1 (Importance: 5.8019)
If (skewness > 0.672 & skewness <= 0.673 & coef_of_var > 1.216 & coef_of_var <= 1.220) then Class = 0 (Importance: 5.8019)
If (skewness > 0.819 & skewness <= 0.857 & coef_of_var > 1.184 & coef_of_var <= 1.220) then Class = 1 (Importance: 5.8019)
If (skewness > 0.857 & skewness <= 0.858 & coef_of_var > 1.184 & coef_of_var <= 1.220) then Class = 0 (Importance: 5.8019)
If (skewness > 0.858 & skewness <= 0.860 & coef_of_var > 1.184 & coef_of_var <= 1.220) then Class = 1 (Importance: 5.8019)
If (skewness > 0.860 & skewness <= 0.861 & coef_of_var > 1.184 & coef_of_var <= 1.220) then Class = 2 (Importance: 5.8019)
If (skewness > 0.869 & skewness <= 0.884 & coef_of_var > 1.184 & coef_of_var <= 1.220) then Class = 2 (Importance: 5.8019)
If (skewness > 0.884 & skewness <= 0.886 & coef_of_var > 1.184 & coef_of_var <= 1.220) then Class = 0 (Importa