In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
from tqdm import tqdm
import AA_Import_LCP_Functions as chase_lcc

# Data Import

In [2]:
lccdata_folder = 'lccdata_files'

# Import LCC data files for wild type protein and mutant protein
wt_dict = chase_lcc.import_lcc_data(lccdata_folder, 'w')
D132H_dict = chase_lcc.import_lcc_data(lccdata_folder, 'm')

# Random Forest

In [3]:
def unison_shuffled_copies(a, b):
    assert len(a) == len(b)
    p = np.random.permutation(len(a))
    return a[p], b[p]

def prepare_data_x(wt_dict, D132H_dict, window_size):
    wildtype_data = wt_dict[window_size]
    wildtype_label = np.zeros(len(wildtype_data))
    mutant_data = D132H_dict[window_size]
    mutant_label = np.ones(len(mutant_data))

    lcc_data = np.vstack((wildtype_data, mutant_data))
    label_data = np.hstack((wildtype_label, mutant_label))
    lcc_data, label_data = unison_shuffled_copies(lcc_data, label_data)
    lcc_data /= 100
    upper_training_limit = int(len(lcc_data) * 0.8)
    
    # Splitting the data into training and testing sets
    X_train, X_test = lcc_data[:upper_training_limit], lcc_data[upper_training_limit:]
    y_train, y_test = label_data[:upper_training_limit], label_data[upper_training_limit:]

    return X_train, X_test, y_train, y_test

In [4]:
# Function to run trials and save feature importances
def run_trials(n_trials, classifier, classifier_name, window_sizes, lccdata_folder):
    consistency_counts = {}

    # Wrap the outer loop with tqdm for a progress bar
    for trial in tqdm(range(1, n_trials + 1), desc=f"{classifier_name} Trials"):
        print(f"\nStarting {classifier_name} Trial {trial}")
        trial_dir = f"{classifier_name}_Consistency_Trial_{trial}"
        os.makedirs(trial_dir, exist_ok=True)

        for window_size in window_sizes:
            X_train, X_test, y_train, y_test = prepare_data_x(wt_dict, D132H_dict, window_size)
            clf = classifier()
            clf.fit(X_train, y_train)

            # Saving feature importances
            feature_importances = clf.feature_importances_
            trial_filename = os.path.join(trial_dir, f"Feature_Importance_WS_{window_size}.csv")
            pd.DataFrame(feature_importances).to_csv(trial_filename, index=False)

            # Update consistency counts based on threshold criteria
            threshold = 0.0285 * (68 / (70 - window_size))
            for i, importance in enumerate(feature_importances):
                if importance > threshold:
                    key = (window_size, i)  # (window_size, position)
                    if key not in consistency_counts:
                        consistency_counts[key] = [0] * n_trials
                    consistency_counts[key][trial - 1] = 1

    return consistency_counts

def assess_consistency(consistency_counts, n_trials):
    return {key: sum(values) for key, values in consistency_counts.items() if sum(values) == n_trials}

# Main execution
window_sizes = range(2, 52)
n_trials = 5

# Run trials for RF and XGBoost
rf_consistency_counts = run_trials(n_trials, RandomForestClassifier, 'RF', window_sizes, lccdata_folder)
xgb_consistency_counts = run_trials(n_trials, lambda: XGBClassifier(use_label_encoder=False, eval_metric='logloss'), 'XGB', window_sizes, lccdata_folder)

# Assess consistency
rf_consistent_positions = assess_consistency(rf_consistency_counts, n_trials)
xgb_consistent_positions = assess_consistency(xgb_consistency_counts, n_trials)

print("RF Consistent Positions:", rf_consistent_positions)
print("XGBoost Consistent Positions:", xgb_consistent_positions)

RF Trials:   0%|                                          | 0/5 [00:00<?, ?it/s]


Starting RF Trial 1


RF Trials:  20%|██████                        | 1/5 [42:39<2:50:37, 2559.47s/it]


Starting RF Trial 2


RF Trials:  40%|███████████▏                | 2/5 [1:24:16<2:06:07, 2522.46s/it]


Starting RF Trial 3


RF Trials:  60%|████████████████▊           | 3/5 [2:05:46<1:23:36, 2508.06s/it]


Starting RF Trial 4


RF Trials:  80%|████████████████████████      | 4/5 [2:47:21<41:42, 2502.81s/it]


Starting RF Trial 5


RF Trials: 100%|██████████████████████████████| 5/5 [3:28:58<00:00, 2507.62s/it]
XGB Trials:   0%|                                         | 0/5 [00:00<?, ?it/s]


Starting XGB Trial 1


XGB Trials:  20%|██████▌                          | 1/5 [00:24<01:36, 24.20s/it]


Starting XGB Trial 2


XGB Trials:  40%|█████████████▏                   | 2/5 [00:47<01:11, 23.96s/it]


Starting XGB Trial 3


XGB Trials:  60%|███████████████████▊             | 3/5 [01:11<00:47, 23.92s/it]


Starting XGB Trial 4


XGB Trials:  80%|██████████████████████████▍      | 4/5 [01:35<00:23, 23.88s/it]


Starting XGB Trial 5


XGB Trials: 100%|█████████████████████████████████| 5/5 [01:59<00:00, 23.92s/it]

RF Consistent Positions: {(2, 9): 5, (2, 11): 5, (2, 27): 5, (2, 28): 5, (2, 29): 5, (2, 39): 5, (2, 41): 5, (3, 7): 5, (3, 8): 5, (3, 11): 5, (3, 20): 5, (3, 26): 5, (3, 27): 5, (3, 28): 5, (4, 6): 5, (4, 9): 5, (4, 10): 5, (4, 11): 5, (4, 19): 5, (4, 20): 5, (4, 23): 5, (4, 27): 5, (5, 5): 5, (5, 6): 5, (5, 7): 5, (5, 11): 5, (5, 12): 5, (5, 13): 5, (5, 18): 5, (5, 20): 5, (6, 6): 5, (6, 11): 5, (6, 12): 5, (6, 21): 5, (6, 22): 5, (6, 23): 5, (7, 6): 5, (7, 11): 5, (7, 21): 5, (8, 5): 5, (8, 9): 5, (8, 13): 5, (8, 20): 5, (8, 21): 5, (8, 23): 5, (9, 5): 5, (9, 9): 5, (9, 18): 5, (9, 19): 5, (9, 21): 5, (10, 7): 5, (10, 8): 5, (10, 18): 5, (10, 19): 5, (10, 31): 5, (11, 6): 5, (11, 7): 5, (11, 17): 5, (11, 18): 5, (11, 30): 5, (12, 6): 5, (12, 9): 5, (12, 13): 5, (12, 16): 5, (12, 17): 5, (12, 18): 5, (12, 19): 5, (12, 29): 5, (12, 30): 5, (13, 5): 5, (13, 13): 5, (13, 14): 5, (13, 15): 5, (13, 16): 5, (13, 17): 5, (13, 18): 5, (13, 29): 5, (14, 4): 5, (14, 13): 5, (14, 14): 5, (14, 1




In [5]:
print(len(rf_consistent_positions))
print(len(xgb_consistent_positions))

160
147


In [6]:
# Feature selection consistency comparison
models = ['XGB', 'RF']
n_trials = 5
window_sizes = range(2, 52)
threshold = 0.0285
results = {model: {i: 0 for i in range(1, 6)} for model in models}

def weighted_threshold(window_size):
    return threshold * (68 / (70 - window_size))

def count_positions_above_threshold(model, window_size, threshold):
    positions_above_threshold = []
    for trial in range(1, n_trials + 1):
        folder_name = f'{model}_Consistency_Trial_{trial}'
        file_name = f'Feature_Importance_WS_{window_size}.csv'
        file_path = os.path.join(folder_name, file_name)
        
        if os.path.exists(file_path):
            df = pd.read_csv(file_path)
            positions_above_threshold.append(df[df['0'] > threshold].index.tolist())
        else:
            # Handle missing file if necessary
            pass
    
    # Count positions saved in all 5, 4, 3, 2, and 1 trials
    all_positions = set(sum(positions_above_threshold, []))
    for position in all_positions:
        counts = sum(position in trial_positions for trial_positions in positions_above_threshold)
        results[model][counts] += 1

for model in models:
    for window_size in window_sizes:
        w_threshold = weighted_threshold(window_size)
        count_positions_above_threshold(model, window_size, w_threshold)

# Results
for model in models:
    print(f'{model} Consistency:')
    for k, v in results[model].items():
        print(f'{k}/5 Trials: {v} positions')

XGB Consistency:
1/5 Trials: 36 positions
2/5 Trials: 20 positions
3/5 Trials: 17 positions
4/5 Trials: 6 positions
5/5 Trials: 147 positions
RF Consistency:
1/5 Trials: 17 positions
2/5 Trials: 14 positions
3/5 Trials: 14 positions
4/5 Trials: 19 positions
5/5 Trials: 160 positions


In [7]:
def calculate_stability_index(consistency_counts):
    """
    Calculate the stability index based on the consistency counts for different levels.
    
    Args:
    - consistency_counts (dict): A dictionary where keys are the number of trials (1 to 5)
      a feature appears in, and values are the number of positions that appear in that many trials.
      
    Returns:
    - float: The calculated stability index.
    """
    # Define weights for each consistency level
    weights = {1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
    
    # Calculate the weighted sum of positions for each consistency level
    weighted_sum = sum(weights[key] * count for key, count in consistency_counts.items())
    
    # Calculate the total possible weighted sum, assuming all positions were selected in all trials
    max_weighted_sum = sum(weights[key] * max(consistency_counts.values()) for key in weights)
    
    # Calculate the stability index as the ratio of the weighted sum to the maximum possible weighted sum
    stability_index = weighted_sum / max_weighted_sum
    
    return stability_index

# Consistency data for XGBoost and RF
xgb_consistency = {1: 36, 2: 20, 3: 17, 4: 6, 5: 147}
rf_consistency = {1: 17, 2: 14, 3: 14, 4: 19, 5: 160}

# Calculate stability indexes
xgb_stability_index = calculate_stability_index(xgb_consistency)
rf_stability_index = calculate_stability_index(rf_consistency)

print(f"XGBoost Stability Index: {xgb_stability_index:.4f}")
print(f"RF Stability Index: {rf_stability_index:.4f}")


XGBoost Stability Index: 0.4018
RF Stability Index: 0.4012


In [8]:
def calculate_consistency_score(consistency_counts):
    """
    Calculate the consistency score for feature selection across trials.
    
    Args:
    - consistency_counts (dict): A dictionary where keys are the number of trials (1 to 5)
      a feature appears in, and values are the number of positions that appear in that many trials.
    
    Returns:
    - float: The calculated consistency score.
    """
    # Initialize total counts and weighted sum
    total_counts = sum(consistency_counts.values())
    weighted_sum = sum(key * value for key, value in consistency_counts.items())
    
    # Calculate the consistency score as the weighted sum of positions divided by the total counts
    consistency_score = weighted_sum / total_counts if total_counts else 0
    
    return consistency_score

# Consistency data for XGBoost and RF
xgb_consistency = {1: 36, 2: 20, 3: 17, 4: 6, 5: 147}
rf_consistency = {1: 17, 2: 14, 3: 14, 4: 19, 5: 160}

# Calculate consistency scores
xgb_consistency_score = calculate_consistency_score(xgb_consistency)
rf_consistency_score = calculate_consistency_score(rf_consistency)

print(f"XGBoost Consistency Score: {xgb_consistency_score:.4f}")
print(f"RF Consistency Score: {rf_consistency_score:.4f}")

XGBoost Consistency Score: 3.9204
RF Consistency Score: 4.2991
