In [None]:
import scipy.io
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn import tree
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Function to load .mat files
def load_mat_file(file_path):
    """Read a MATLAB ``.mat`` file via ``scipy.io.loadmat``.

    Args:
        file_path: Location of the ``.mat`` file; handed to
            ``scipy.io.loadmat`` unchanged.

    Returns:
        Whatever ``scipy.io.loadmat`` returns for that file (indexed by
        variable name by the callers in this notebook).
    """
    mat_contents = scipy.io.loadmat(file_path)
    return mat_contents

# Function to print basic data info
def basic_data_info(expression_df, labels_df):
    """Print the expression table's dimensions and the label frequencies.

    Args:
        expression_df: Two-dimensional table; ``shape[0]`` is reported as the
            number of samples and ``shape[1]`` as the number of genes.
        labels_df: Object exposing ``value_counts()`` (a pandas Series in this
            notebook); its counts are printed under a "Labels information"
            heading.
    """
    row_count = expression_df.shape[0]
    gene_count = expression_df.shape[1]
    summary_lines = (
        f"Number of samples (rows): {row_count}",
        f"Number of genes (columns): {gene_count}",
        "\nLabels information:",
        labels_df.value_counts(),
    )
    # One print call; sep="\n" reproduces the line-per-item layout.
    print(*summary_lines, sep="\n")

# Function to plot confusion matrix with cell names like TP, TN, FP, FN
def plot_confusion_matrix(y_true, y_pred, title='Confusion Matrix'):
    """Draw a 2x2 confusion-matrix heatmap with TN/FP/FN/TP cell tags.

    Each cell shows its tag on one line and its count on the next. The tag
    layout is ``[["TN", "FP"], ["FN", "TP"]]`` over the matrix returned by
    ``confusion_matrix`` (rows = actual, columns = predicted).
    NOTE(review): the tags are only meaningful if the first class in the
    matrix is the negative class -- confirm against the label encoding.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels.
        title: Text placed above the heatmap.

    Raises:
        ValueError: If the labels do not yield a 2x2 confusion matrix (for
            example when only one class is present), since the fixed
            TN/FP/FN/TP tags would not apply.
    """
    cm = confusion_matrix(y_true, y_pred)
    if cm.shape != (2, 2):
        raise ValueError(
            f"Expected a 2x2 confusion matrix for binary labels, got shape {cm.shape}"
        )

    cell_names = np.array([["TN", "FP"], ["FN", "TP"]])
    # Tag and count are stacked on two lines inside each heatmap cell.
    annotations = np.array(
        [f"{name}\n{count}" for name, count in zip(cell_names.ravel(), cm.ravel())]
    ).reshape(cm.shape)

    plt.figure(figsize=(3, 3))
    # fmt="" because the annotations are already formatted strings.
    sns.heatmap(cm, annot=annotations, fmt="", cmap='Blues', cbar=False, square=True,
                xticklabels=["Predicted 0", "Predicted 1"], yticklabels=["Actual 0", "Actual 1"])

    plt.title(title)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()

# Function to split data into train and test sets
def split_data(expression_df, labels_df, test_size=0.2, random_state=42):
    """Split features and labels into train and test portions.

    Args:
        expression_df: Feature table to split.
        labels_df: Labels aligned with ``expression_df``.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    features_train, features_test, labels_train, labels_test = train_test_split(
        expression_df,
        labels_df,
        test_size=test_size,
        random_state=random_state,
    )
    return (features_train, features_test, labels_train, labels_test)

# Function to perform grid search
def perform_grid_search(X_train, y_train):
    """Run a 5-fold, accuracy-scored grid search over decision-tree settings.

    The grid covers ``max_depth``, ``min_samples_split``, ``min_samples_leaf``
    and ``criterion``; the base estimator is a ``DecisionTreeClassifier`` with
    ``random_state=42`` and the search uses all available cores
    (``n_jobs=-1``).

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    search_space = dict(
        max_depth=[3, 5, 10, None],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
        criterion=['gini', 'entropy'],
    )
    searcher = GridSearchCV(
        estimator=DecisionTreeClassifier(random_state=42),
        param_grid=search_space,
        cv=5,
        scoring='accuracy',
        verbose=0,
        n_jobs=-1,
    )
    searcher.fit(X_train, y_train)
    return searcher

# Function to train the model
def train_best_model(grid_search, X_train, y_train):
    """Fit the grid search's best estimator on the given data and return it.

    Args:
        grid_search: Object exposing ``best_estimator_`` (a fitted
            ``GridSearchCV`` in this notebook).
        X_train: Features passed to the estimator's ``fit``.
        y_train: Labels passed to the estimator's ``fit``.

    Returns:
        The same estimator object as ``grid_search.best_estimator_``, after
        its ``fit`` method has been called in place.
    """
    winner = grid_search.best_estimator_
    # fit() is invoked for its side effect; the estimator itself is returned.
    winner.fit(X_train, y_train)
    return winner

# Function to calculate node accuracy
def calculate_node_accuracy(tree_model, X_train, y_train):
    """Compute, for every leaf reached by the data, the share of correct predictions.

    The model is asked once for the leaf of each sample (``apply``) and once
    for its prediction (``predict``); samples are then grouped by leaf.

    Args:
        tree_model: Fitted tree exposing ``apply`` and ``predict``.
        X_train: Samples to route through the tree (DataFrame or array-like).
        y_train: True labels aligned row-by-row with ``X_train`` (Series or
            array-like).

    Returns:
        Dict mapping each leaf node ID that received at least one sample to
        the fraction of its samples whose prediction equals the true label.
    """
    leaf_ids = np.asarray(tree_model.apply(X_train))

    # Predict once for the whole set instead of once per leaf; comparing
    # plain arrays keeps the match positional regardless of any pandas index.
    predictions = np.asarray(tree_model.predict(X_train)).ravel()
    is_correct = np.asarray(y_train).ravel() == predictions

    # np.unique only yields leaves that hold samples, so every mean below is
    # taken over a non-empty selection.
    return {
        node_id: is_correct[leaf_ids == node_id].mean()
        for node_id in np.unique(leaf_ids)
    }

# Function to visualize the decision tree with node accuracy and node IDs
def visualize_tree_with_node_accuracy(tree_model, node_accuracies, feature_names):
    """Plot the decision tree, labelling every node with its ID and leaves with accuracy.

    Args:
        tree_model: Fitted decision tree passed to ``tree.plot_tree``.
        node_accuracies: Dict mapping node ID to accuracy; nodes present in
            this dict get an extra ``Accuracy: x.xx`` line.
        feature_names: Feature names shown in the split conditions.

    NOTE(review): node IDs are taken from the position of each annotation in
    the list returned by ``plot_tree``; this assumes the i-th annotation
    corresponds to tree node i -- confirm for the tree builder in use.
    """
    plt.figure(figsize=(20, 10))
    # Use the annotations plot_tree hands back rather than re-reading them
    # from the current axes.
    node_boxes = tree.plot_tree(
        tree_model,
        filled=True,
        feature_names=feature_names,
        class_names=["0", "1"],
        rounded=True,
    )

    for node_id, box in enumerate(node_boxes):
        label = f"Node {node_id}\n" + box.get_text()
        # Only nodes with a recorded accuracy get the extra line.
        if node_id in node_accuracies:
            label += f"\nAccuracy: {node_accuracies[node_id]:.2f}"
        box.set_text(label)

    plt.title("Decision Tree Visualization with Node IDs and Accuracy")
    plt.show()

# Function to evaluate the model
def evaluate_model(best_model, X_test, y_test):
    """Report test accuracy, the classification report and the confusion matrix.

    Args:
        best_model: Fitted estimator exposing ``predict``.
        X_test: Held-out features.
        y_test: Held-out true labels.

    Prints the accuracy (two decimals) and the classification report, then
    calls ``plot_confusion_matrix`` with the true and predicted labels.
    """
    predictions = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, predictions)
    print(f"Test Accuracy: {test_accuracy:.2f}")

    print("Classification Report:")
    report_text = classification_report(y_test, predictions)
    print(report_text)

    plot_confusion_matrix(y_test, predictions)

# End-to-end baseline run: load the ACES data, fit a grid-searched decision
# tree, evaluate it and plot it. The names defined here (X_train, y_train,
# X_test, y_test, best_model, node_accuracies, ...) are reused by later cells.
# NOTE(review): base_path is an absolute path on one machine; adjust it
# before running elsewhere.
base_path = '/Users/asifahmed/Documents/Codes/MyRecourseProject/datasets/ACES_dataset'

# Load the expression data and labels
expression_data = load_mat_file(f'{base_path}/ACES_RefinedCommunity_AVG.mat')
label_data = load_mat_file(f'{base_path}/ACESLabel.mat')

# Extract the expression matrix and labels (stored under the 'data' and
# 'label' variables of the respective .mat files)
expression_matrix = expression_data['data']
labels = label_data['label'].flatten()  # Flatten to a 1D array if necessary

# Convert to DataFrame for easier handling; columns get default integer
# names, which later cells use as feature indices
expression_df = pd.DataFrame(expression_matrix)
labels_df = pd.Series(labels, name='label')

# Display basic data information
basic_data_info(expression_df, labels_df)

# Split the data into training and test sets (split_data defaults:
# test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_data(expression_df, labels_df)

# Train the decision tree using GridSearch
grid_search = perform_grid_search(X_train, y_train)
print(f"Best Parameters: {grid_search.best_params_}")

# Train the best model (fits the best estimator on the full training split)
best_model = train_best_model(grid_search, X_train, y_train)

# Evaluate the initial model
print("\nInitial Model Evaluation:")
evaluate_model(best_model, X_test, y_test)

# Calculate node accuracy (per-leaf accuracy on the training split)
node_accuracies = calculate_node_accuracy(best_model, X_train, y_train)

# Visualize the initial tree with node accuracy; features are named
# x0, x1, ... by column position
print("\nInitial Decision Tree Visualization with Node Accuracy:")
num_features = X_train.shape[1]
feature_names = [f"x{index}" for index in range(num_features)]
visualize_tree_with_node_accuracy(best_model, node_accuracies, feature_names)


In [None]:
import matplotlib.pyplot as plt

def calculate_feature_ranges(tree_model, X_train, y_train, node_ids, features, target_class=1):
    """Find the min/max of selected features among one class's samples in given leaves.

    Args:
        tree_model: Fitted tree exposing ``apply`` (sample -> leaf node ID).
        X_train: DataFrame of samples; ``features`` index its columns by
            position.
        y_train: Labels aligned row-by-row with ``X_train`` (Series or
            array-like).
        node_ids: Leaf node IDs to summarise.
        features: Positional column indices to compute ranges for.
        target_class: Label value whose samples are considered (default 1).

    Returns:
        Dict ``{node_id: {feature: (min, max)}}``. The min/max are taken over
        the ``target_class`` samples that land in that node; if there are
        none, pandas' result for an empty selection is returned as-is.
    """
    # The leaf assignment does not depend on the node being summarised, so it
    # is computed once rather than once per node.
    leaf_ids = np.asarray(tree_model.apply(X_train))
    is_target = np.asarray(y_train).ravel() == target_class

    ranges = {}
    for node_id in node_ids:
        selected = X_train.iloc[(leaf_ids == node_id) & is_target]
        ranges[node_id] = {
            feature: (selected.iloc[:, feature].min(), selected.iloc[:, feature].max())
            for feature in features
        }
    return ranges

def plot_feature_ranges(ranges, features):
    """Plot, one subplot per feature, the (min, max) range of each node.

    Args:
        ranges: Dict ``{node_id: {feature: (min, max)}}``.
        features: Feature indices to plot; each gets its own subplot titled
            ``Range of x<feature> for class 1``.

    Each node's range is drawn as a horizontal segment with end markers.
    Colours are taken from a two-entry palette and repeat if more than two
    nodes are supplied.
    """
    colors = ["green", "red"]  # Colors for the nodes
    # squeeze=False keeps a 2-D array of axes even for a single feature, so
    # indexing works for any number of features.
    _, axes_grid = plt.subplots(
        len(features), 1, figsize=(13, len(features) * 2), squeeze=False
    )
    axs = axes_grid[:, 0]

    for i, feature in enumerate(features):
        feature_key = f"x{feature}"
        ax = axs[i]
        for j, (node_id, node_ranges) in enumerate(ranges.items()):
            ax.plot(
                node_ranges[feature],
                [i] * 2,
                marker='o',
                color=colors[j % len(colors)],  # wrap around instead of IndexError
                label=f"Node {node_id} {feature_key}",
            )
        ax.set_title(f"Range of {feature_key} for class 1")
        ax.legend()
        ax.set_yticks([])  # Hide y ticks

    plt.tight_layout()
    plt.show()

# Example usage: compare the class-1 ranges of three features between two
# leaves of best_model.
# NOTE(review): the node IDs and feature indices are hard-coded, presumably
# read off the tree plot of one particular fit -- re-check them if the tree
# changes.
features_of_interest = [17, 20, 13]
node_ids = [14, 6]
ranges = calculate_feature_ranges(best_model, X_train, y_train, node_ids, features_of_interest)
plot_feature_ranges(ranges, features_of_interest)

In [None]:
import numpy as np
import pandas as pd

def calculate_feature_ranges(tree_model, X_train, y_train, node_ids, features, target_class=1):
    """Find the min/max of selected features among one class's samples in given leaves.

    Args:
        tree_model: Fitted tree exposing ``apply`` (sample -> leaf node ID).
        X_train: DataFrame of samples; ``features`` index its columns by
            position.
        y_train: Labels aligned row-by-row with ``X_train`` (Series or
            array-like).
        node_ids: Leaf node IDs to summarise.
        features: Positional column indices to compute ranges for.
        target_class: Label value whose samples are considered (default 1).

    Returns:
        Dict ``{node_id: {feature: (min, max)}}``. The min/max are taken over
        the ``target_class`` samples that land in that node; if there are
        none, pandas' result for an empty selection is returned as-is.
    """
    # The leaf assignment does not depend on the node being summarised, so it
    # is computed once rather than once per node.
    leaf_ids = np.asarray(tree_model.apply(X_train))
    is_target = np.asarray(y_train).ravel() == target_class

    ranges = {}
    for node_id in node_ids:
        selected = X_train.iloc[(leaf_ids == node_id) & is_target]
        ranges[node_id] = {
            feature: (selected.iloc[:, feature].min(), selected.iloc[:, feature].max())
            for feature in features
        }
    return ranges

def can_avoid_range(valid_range, avoid_range):
    """Tell whether some part of ``valid_range`` lies outside ``avoid_range``.

    Args:
        valid_range: ``(low, high)`` pair that values may be drawn from.
        avoid_range: ``(low, high)`` pair that should be avoided.

    Returns:
        ``False`` when ``valid_range`` sits entirely inside ``avoid_range``
        (bounds inclusive), ``True`` otherwise.
    """
    valid_low, valid_high = valid_range[0], valid_range[1]
    avoid_low, avoid_high = avoid_range[0], avoid_range[1]
    return not (valid_low >= avoid_low and valid_high <= avoid_high)


def sample_avoiding_range(valid_range, avoid_range):
    """Draw a value from ``valid_range`` that is outside ``avoid_range`` when possible.

    The allowed region (``valid_range`` minus the closed ``avoid_range``) is
    sampled directly instead of by rejection, so the call always finishes
    after a single random draw, even when the allowed region is tiny.

    Args:
        valid_range: ``(low, high)`` pair to draw from; expected ``low <= high``.
        avoid_range: ``(low, high)`` pair to stay out of (bounds inclusive).

    Returns:
        A float in ``valid_range``. It lies outside ``avoid_range`` whenever
        ``can_avoid_range`` says that is possible; otherwise it is a plain
        uniform draw from ``valid_range``.
    """
    valid_low, valid_high = valid_range[0], valid_range[1]
    avoid_low, avoid_high = avoid_range[0], avoid_range[1]

    # NaN or reversed bounds make these comparisons False; such inputs, and
    # the "cannot avoid" case, fall back to a plain draw.
    bounds_ordered = valid_low <= valid_high and avoid_low <= avoid_high
    if not bounds_ordered or not can_avoid_range(valid_range, avoid_range):
        return np.random.uniform(valid_low, valid_high)

    # Allowed region: at most one piece left of the avoid range and one right.
    left_end = min(valid_high, avoid_low)
    right_start = max(valid_low, avoid_high)
    left_len = max(left_end - valid_low, 0.0)
    right_len = max(valid_high - right_start, 0.0)
    total_len = left_len + right_len

    if total_len == 0.0:
        # valid_range is a single point that already lies outside avoid_range.
        return np.random.uniform(valid_low, valid_high)

    # One draw over the combined length keeps the result uniform over the
    # allowed region; the offset is then mapped onto the matching piece.
    offset = np.random.uniform(0.0, total_len)
    if right_len == 0.0 or offset < left_len:
        value = min(valid_low + min(offset, left_len), left_end)
        if value >= avoid_low:
            # Floating-point rounding landed on the excluded boundary.
            value = np.nextafter(avoid_low, -np.inf)
    else:
        value = min(right_start + (offset - left_len), valid_high)
        if value <= avoid_high:
            # The avoid range is closed, so its upper bound is excluded too.
            value = np.nextafter(avoid_high, np.inf)
    return float(value)

def generate_avoiding_samples(node_14_data, node_14_ranges, node_6_ranges, num_samples, features=(13, 17, 20)):
    """Create synthetic class-1 rows whose chosen features steer clear of another node's ranges.

    For each new row, every feature in ``features`` is drawn with
    ``sample_avoiding_range`` from the source node's range while avoiding the
    other node's range; all remaining columns are copied from one randomly
    chosen row of ``node_14_data``.

    Args:
        node_14_data: DataFrame of source-node samples that supplies the
            untouched columns and the output column order.
        node_14_ranges: Dict ``{feature: (min, max)}`` to draw from.
        node_6_ranges: Dict ``{feature: (min, max)}`` to avoid.
        num_samples: Number of rows to generate.
        features: Column labels to re-sample (default ``(13, 17, 20)``).

    Returns:
        DataFrame with the columns of ``node_14_data`` in their original
        order plus a ``label`` column set to 1. With ``num_samples == 0`` an
        empty frame with those columns is returned.
    """
    features = list(features)
    column_order = node_14_data.columns.tolist()
    # Dropping the re-sampled columns is the same for every row, so do it once.
    untouched_columns = node_14_data.drop(columns=features)

    generated_rows = []
    for _ in range(num_samples):
        row = {
            feature: sample_avoiding_range(node_14_ranges[feature], node_6_ranges[feature])
            for feature in features
        }
        # Keep other features from a randomly picked existing sample
        donor = untouched_columns.sample(n=1, replace=True).iloc[0].to_dict()
        row.update(donor)
        generated_rows.append(row)

    # Passing columns= fixes the column order and also yields a well-formed
    # empty frame when no rows were generated.
    generated_samples_df = pd.DataFrame(generated_rows, columns=column_order)
    generated_samples_df['label'] = 1

    return generated_samples_df

# Example usage: build synthetic class-1 samples from leaf 14 that avoid the
# class-1 feature ranges observed in leaf 6.
# NOTE(review): node IDs 14 and 6 are hard-coded and only make sense for the
# particular fitted best_model -- verify against the current tree.
features_of_interest = [13, 17, 20]
node_ids = [14, 6]
ranges = calculate_feature_ranges(best_model, X_train, y_train, node_ids, features_of_interest)

# Extract node-specific data and ranges
node_14_data = X_train[(best_model.apply(X_train) == 14) & (y_train == 1)]  # Class 1 samples from node 14
node_14_ranges = ranges[14]
node_6_ranges = ranges[6]

# Generate as many synthetic rows as there are class-1 samples in node 14
num_class_1_samples_node_14 = len(node_14_data)
generated_samples = generate_avoiding_samples(node_14_data, node_14_ranges, node_6_ranges, num_samples=num_class_1_samples_node_14)

print(f"Number of generated samples: {len(generated_samples)}")

In [None]:
def observe_generated_samples(tree_model, generated_samples):
    """Route generated samples through the tree and record leaf and prediction.

    Args:
        tree_model: Fitted tree exposing ``apply`` and ``predict``.
        generated_samples: DataFrame of samples; a ``label`` column, if
            present, is excluded before the model sees the data.

    Returns:
        A new DataFrame holding the feature columns plus ``Node_Assignment``
        (leaf ID from ``apply``) and ``Predicted_Label`` (from ``predict``).
        The input frame is not modified.
    """
    # errors='ignore' lets frames without a 'label' column pass through.
    features_only = generated_samples.drop(columns=['label'], errors='ignore')

    # assign() returns a fresh frame with both result columns appended.
    return features_only.assign(
        Node_Assignment=tree_model.apply(features_only),
        Predicted_Label=tree_model.predict(features_only),
    )

# Pass the generated samples through the decision tree
observation_results = observe_generated_samples(best_model, generated_samples)

# Print a heading for the summaries below (the per-sample table itself is
# not printed)
print("Observation of Generated Samples:")

# Analyze the distribution of nodes and predicted classes: count how many
# generated samples land in each leaf and how many receive each label
node_distribution = observation_results['Node_Assignment'].value_counts()
predicted_class_distribution = observation_results['Predicted_Label'].value_counts()

print("\nNode Distribution of Generated Samples:")
print(node_distribution)

print("\nPredicted Class Distribution of Generated Samples:")
print(predicted_class_distribution)

In [None]:
# Plot the initial decision tree with node accuracy (same model and
# accuracies as the baseline run, shown again for side-by-side comparison)
print("\nInitial Decision Tree Visualization with Node Accuracy:")
num_features = X_train.shape[1]
feature_names = [f"x{index}" for index in range(num_features)]
visualize_tree_with_node_accuracy(best_model, node_accuracies, feature_names)

# Combine the generated samples with the original training data;
# ignore_index=True renumbers the rows so features and labels stay aligned
X_train_augmented = pd.concat([X_train, generated_samples.drop(columns=['label'])], ignore_index=True)
y_train_augmented = pd.concat([y_train, generated_samples['label']], ignore_index=True)

# Perform grid search on the augmented data
grid_search_augmented = perform_grid_search(X_train_augmented, y_train_augmented)
print(f"Best Parameters for Retrained Model: {grid_search_augmented.best_params_}")

# Retrain the model with the best parameters from grid search
best_retrained_model = train_best_model(grid_search_augmented, X_train_augmented, y_train_augmented)

# Calculate node accuracy for the retrained model (on the augmented
# training data, synthetic rows included)
node_accuracies_retrained = calculate_node_accuracy(best_retrained_model, X_train_augmented, y_train_augmented)

# Plot the retrained decision tree with node accuracy
print("\nVisualization of Retrained Decision Tree with Node Accuracy:")
visualize_tree_with_node_accuracy(best_retrained_model, node_accuracies_retrained, feature_names)

# Evaluate the retrained model on the test set (the original, un-augmented
# X_test / y_test)
print("\nEvaluation of the Retrained Model:")
evaluate_model(best_retrained_model, X_test, y_test)