In [8]:
import csv
import json
import pandas as pd
import numpy as np
import os

import c45
import random_forest
import matricies
from matricies import *

# Read the label file: one entry per line, with surrounding whitespace removed.
with open("Data/labels.txt", "r") as file:
    labels = list(map(str.strip, file))

# get_matricies is not defined in this notebook; presumably it is supplied by
# the `from matricies import *` import. Its result is unpacked into two
# objects, named here for the scATAC and scRNA feature sets.
scatac_feat, scrna_feat = get_matricies()



[[60197.  6248.  4661. ... 47172.  4581. 38815.]
 [66853.  5991. 77816. ...  2792. 19038.  1547.]
 [13579.  1527.  1360. ...  2404.  1607.  2412.]
 ...
 [26790.  3789.  5076. ...  1845.  2987.  1953.]
 [10934.  1848. 16540. ...  1156.  2437.   938.]
 [83110. 12502. 45698. ...  7958. 22743.  3810.]]


In [10]:
# Wrap each feature matrix in a DataFrame whose columns are named by position
# (feat_0, feat_1, ...), then attach the shared label list as a "label" column.
scatac_columns = [f"feat_{i}" for i in range(scatac_feat.shape[1])]
df_scatac = pd.DataFrame(scatac_feat, columns=scatac_columns)
df_scatac["label"] = labels

# Every scATAC feature column (i.e. everything except "label") is declared
# numeric.
attribute_types = dict.fromkeys(scatac_columns, "numeric")

scrna_columns = [f"feat_{i}" for i in range(scrna_feat.shape[1])]
df_scrna = pd.DataFrame(scrna_feat, columns=scrna_columns)
df_scrna["label"] = labels


In [None]:
## Test tree
# Smoke test: fit a single C4.5 tree before running the forest grid search.
# The training frame is df_scatac because attribute_types was built from
# df_scatac's feature columns; the previous `df` was never defined in this
# notebook and raised NameError on a fresh kernel.
my_tree = c45.c45(
    split_metric="Gain",
    threshold=0.0,
    attribute_types=attribute_types
)
trained_tree = my_tree.fit(
    training_set=df_scatac,
    truth="label",
    save=False
)

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def grid_search_random_forest(
    df_train, df_test, truth_col,
    threshold_values,
    num_attributes_values,
    num_trees_values,
    num_data_points_values,
    split_metric="Gain"
):
    """
    Exhaustively evaluate random-forest hyperparameter combinations.

    For every combination drawn from the four value lists, a new
    ``random_forest.random_forest`` is fitted on ``df_train`` and scored with
    ``accuracy_score`` on ``df_test``.

    Args:
        df_train: DataFrame (feature columns plus ``truth_col``) handed whole
            to the forest's ``fit``; it is not split into X/y here.
        df_test: DataFrame used for scoring. ``truth_col`` is taken as the
            expected labels and the remaining columns are predicted on.
        truth_col: name of the target/label column.
        threshold_values: candidate values for ``threshold``.
        num_attributes_values: candidate values for ``num_attributes``.
        num_trees_values: candidate values for ``num_trees``.
        num_data_points_values: candidate values for ``num_data_points``.
        split_metric: passed unchanged to every forest (default ``"Gain"``).

    Returns:
        A 4-tuple ``(best_model, best_params, best_accuracy, results_dict)``:
          * best_model: the forest instance with the highest accuracy
          * best_params: dict of that forest's hyperparameters
          * best_accuracy: its accuracy as returned by ``accuracy_score``
          * results_dict: maps ``(threshold, num_attrs, n_trees, n_points)``
            to the accuracy of every combination tried
        Ties keep the earliest combination. When no combination is evaluated
        the result is ``(None, None, -1.0, {})``.
    """
    # Separate the scoring frame into model inputs and expected labels.
    X_test = df_test.drop(columns=[truth_col])
    y_test = df_test[truth_col]

    # All non-label training columns are declared numeric; the same dict is
    # shared by every forest built below.
    attribute_types = dict.fromkeys(
        (name for name in df_train.columns if name != truth_col),
        "numeric",
    )

    # Lazily walk the grid in nested-loop order: threshold varies slowest,
    # num_data_points fastest.
    combinations = (
        (threshold, num_attrs, n_trees, n_points)
        for threshold in threshold_values
        for num_attrs in num_attributes_values
        for n_trees in num_trees_values
        for n_points in num_data_points_values
    )
    param_names = ("threshold", "num_attributes", "num_trees", "num_data_points")

    results_dict = {}
    best_model, best_params, best_accuracy = None, None, -1.0

    for combo in combinations:
        threshold, num_attrs, n_trees, n_points = combo

        forest = random_forest.random_forest(
            num_attributes=num_attrs,
            num_data_points=n_points,
            num_trees=n_trees,
            split_metric=split_metric,
            threshold=threshold,
            attribute_types=attribute_types
        )
        forest.fit(df_train, truth_col)

        acc = accuracy_score(y_test, forest.predict(X_test))
        results_dict[combo] = acc

        # Strictly-greater comparison: the first combination to reach a given
        # score is retained on ties.
        if acc > best_accuracy:
            best_accuracy = acc
            best_params = dict(zip(param_names, combo))
            best_model = forest

    return best_model, best_params, best_accuracy, results_dict

In [None]:

# 80/20 hold-out split with a fixed seed so the partition is reproducible.
train_df, test_df = train_test_split(df_scatac, test_size=0.2, random_state=42)
# The scRNA split is prepared the same way; it is not used by the grid search
# below, which runs on the scATAC frame only.
train_df2, test_df2 = train_test_split(df_scrna, test_size=0.2, random_state=42)

# Hyperparameter grid: 2 * 3 * 2 * 2 = 24 forests are trained in total.
threshold_values = [0.1, 0.2]
num_attributes_values = [2, 4, 6]
num_trees_values = [100, 500]
num_data_points_values = [0.3, 0.5]

# NOTE(review): grid_search_random_forest picks the best combination by its
# accuracy on df_test, so "Best Accuracy" below is measured on the same data
# used for selection and is an optimistic estimate.
best_model, best_params, best_acc, all_results = grid_search_random_forest(
    df_train=train_df,
    df_test=test_df,
    truth_col="label",
    threshold_values=threshold_values,
    num_attributes_values=num_attributes_values,
    num_trees_values=num_trees_values,
    num_data_points_values=num_data_points_values,
    split_metric="Gain"
)

print("Best Parameters:", best_params)
print(f"Best Accuracy: {best_acc * 100:.2f}%")

# One line per combination, keyed by
# (threshold, num_attributes, num_trees, num_data_points).
for params_tuple, accuracy_val in all_results.items():
    print(f"{params_tuple} -> {accuracy_val * 100:.2f}%")