In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import pathlib

def split_and_save_dataset(csv_file, dataset_name, test_size=0, n_splits=10, random_state=42):
    """Split a CSV dataset once and write numbered train/test copies of that split.

    The CSV is read, split into a train and a test frame, and the *same* pair
    of frames is written ``n_splits`` times as
    ``../src/resources/datasets/<dataset_name>/<dataset_name>.train<i>.csv`` and
    ``.../<dataset_name>.test<i>.csv`` for ``i`` in ``1..n_splits``.

    Parameters
    ----------
    csv_file : str
        Path of the source CSV file.
    dataset_name : str
        Name used for the output sub-directory and as the file-name prefix.
    test_size : int or float, default 0
        Forwarded to ``train_test_split`` when non-zero. With the default of 0
        every row goes to the train file and the test file holds only the
        header row.
    n_splits : int, default 10
        Number of numbered train/test file pairs to write.
    random_state : int, default 42
        Seed used for shuffling / splitting.

    Notes
    -----
    All ``n_splits`` file pairs contain identical data, because the split is
    computed once before the loop.
    """
    df = pd.read_csv(csv_file)

    if test_size == 0:
        # train_test_split in current scikit-learn rejects test_size=0 with a
        # ValueError, so the "no held-out rows" case is handled here directly:
        # shuffle the full frame for training and keep an empty frame (same
        # columns) for testing.
        df_train = df.sample(frac=1, random_state=random_state)
        df_test = df.iloc[0:0]
    else:
        df_train, df_test = train_test_split(
            df, test_size=test_size, random_state=random_state
        )

    respath = '../src/resources/datasets/' + dataset_name
    os.makedirs(respath, exist_ok=True)

    for i in range(1, n_splits + 1):
        train_filename = respath + f'/{dataset_name}.train{i}.csv'
        test_filename = respath + f'/{dataset_name}.test{i}.csv'

        # index=False: the shuffled row labels are not part of the dataset.
        df_train.to_csv(train_filename, index=False)
        df_test.to_csv(test_filename, index=False)


In [23]:
# Dataset folder / file prefix; this notebook-level name is read again by a later cell.
dataset_name = 'CDC-Diabetes'
csv_file = f'../src/resources/datasets/{dataset_name}/{dataset_name}.csv'
split_and_save_dataset(csv_file, dataset_name)

In [24]:
# Read back the first generated training file and show its column names.
df_example = pd.read_csv(
    "../src/resources/datasets/CDC-Diabetes/CDC-Diabetes.train1.csv"
)
print(df_example.columns)

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


In [21]:
import os
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import _tree

def extract_tree_structure(tree, feature_names):
    """Serialize a fitted decision tree as one text line per node (pre-order).

    Each line has the fields
    ``node type left right feature threshold depth class``:

    * internal node: ``<id> IN <left id> <right id> <feature index> <threshold> <depth> -1``
    * leaf node:     ``<id> LN -1 -1 -1 -1 <depth> <class index>``

    The threshold is printed with six decimals. The class index of a leaf is
    the position of the largest entry in ``tree_.value[node]``.

    Parameters
    ----------
    tree : object exposing a ``tree_`` attribute
        ``tree_`` must provide ``children_left``, ``children_right``,
        ``feature``, ``threshold`` and ``value`` arrays indexed by node id.
    feature_names : sequence
        Accepted for backward compatibility. The feature column index is read
        directly from ``tree_.feature``, so the names themselves are not
        needed (and duplicate names can no longer produce a wrong index).

    Returns
    -------
    str
        The node lines joined with newlines, root first, each left subtree
        before its right subtree.
    """
    tree_ = tree.tree_
    lines = []

    def recurse(node, depth):
        left = int(tree_.children_left[node])
        right = int(tree_.children_right[node])
        # A split node has two distinct children; a leaf stores the same
        # sentinel in both child slots.
        if left != right:
            feature_idx = int(tree_.feature[node])
            threshold = tree_.threshold[node]
            lines.append(
                f"{node} IN {left} {right} "
                f"{feature_idx} {threshold:.6f} {depth} -1"
            )
            recurse(left, depth + 1)
            recurse(right, depth + 1)
        else:
            majority = int(tree_.value[node].argmax())
            lines.append(f"{node} LN -1 -1 -1 -1 {depth} {majority}")

    recurse(0, 0)
    return "\n".join(lines)


def train_and_save_forest(dataset_name, target_var, n_trees=10, max_depth=3,
                          n_splits=10, random_state=42):
    """Train one random forest per numbered training file and dump it as text.

    For every ``i`` in ``1..n_splits`` the file
    ``../src/resources/datasets/<dataset_name>/<dataset_name>.train<i>.csv`` is
    read, a ``RandomForestClassifier`` is fitted on it, and the forest is
    written to ``../src/resources/forests/<dataset_name>/<dataset_name>.RF<i>.txt``.
    The output directory is created if it does not exist.

    Each output file starts with a header (dataset name, ensemble type, number
    of trees / features / classes, configured maximum depth, and a format
    description line) followed by one ``[TREE k]`` section per estimator
    containing the node lines produced by ``extract_tree_structure``.

    Parameters
    ----------
    dataset_name : str
        Dataset folder name and file-name prefix.
    target_var : str
        Name of the label column; all other columns are used as features.
    n_trees : int, default 10
        Number of trees in each forest.
    max_depth : int, default 3
        Maximum tree depth passed to the classifier and written to the header.
    n_splits : int, default 10
        Number of numbered training files to process.
    random_state : int, default 42
        Seed passed to every ``RandomForestClassifier``.
    """
    dataset_path = f"../src/resources/datasets/{dataset_name}"
    save_path = f"../src/resources/forests/{dataset_name}"
    os.makedirs(save_path, exist_ok=True)

    for i in range(1, n_splits + 1):
        df_train = pd.read_csv(f"{dataset_path}/{dataset_name}.train{i}.csv")

        X_train = df_train.drop(target_var, axis=1)
        y_train = df_train[target_var]
        feature_names = list(X_train.columns)

        rf = RandomForestClassifier(
            n_estimators=n_trees, max_depth=max_depth, random_state=random_state
        )
        rf.fit(X_train, y_train)

        # Collect the file content piecewise and join once at the end.
        parts = [
            f"DATASET_NAME: {dataset_name}.train{i}.csv\n",
            "ENSEMBLE: RF\n",
            f"NB_TREES: {n_trees}\n",
            f"NB_FEATURES: {len(feature_names)}\n",
            f"NB_CLASSES: {len(pd.unique(y_train))}\n",
            f"MAX_TREE_DEPTH: {max_depth}\n",
            "Format: node / node type (LN - leave node, IN - internal node) left child / right child / feature / threshold / node_depth / majority class (starts with index 0)\n\n",
        ]

        for t_idx, tree in enumerate(rf.estimators_):
            parts.append(f"[TREE {t_idx}]\nNB_NODES: {tree.tree_.node_count}\n")
            parts.append(extract_tree_structure(tree, feature_names) + "\n\n")

        with open(f"{save_path}/{dataset_name}.RF{i}.txt", 'w') as file:
            file.write("".join(parts))

# Label column of the dataset; `dataset_name` is the notebook-level variable set in the dataset-splitting cell.
target_var = 'Outcome'
train_and_save_forest(dataset_name=dataset_name, target_var=target_var)
