In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

In [None]:
# Function for Leave-One-Out Random Forest Classification
def zy_leave_one_out_random_forest(rf_dt, rf_map, zy_sample="zy_RF_temp_ID", group=None, nspecies=None, seed=123, ntree=999):
    """Run leave-one-out random-forest classification over the samples in ``rf_map``.

    ``rf_dt`` is used as a features-by-samples table: its columns are selected
    with the IDs in ``rf_map[zy_sample]`` and the table is transposed before
    it is passed to scikit-learn. For every row of ``rf_map`` a forest is
    fitted on all other samples and used to score the held-out one.

    Args:
        rf_dt: DataFrame whose columns are labelled with sample IDs.
        rf_map: DataFrame with one row per sample; must contain the columns
            named by ``zy_sample`` and ``group``. It is not modified.
        zy_sample: Name of the sample-ID column in ``rf_map``.
        group: Name of the class-label column in ``rf_map``.
        nspecies: If given, keep only the first ``nspecies`` rows of
            ``rf_dt`` and then drop rows whose sum is zero.
        seed: Passed to ``np.random.seed`` (global NumPy RNG state) and used
            as the forest's ``random_state``.
        ntree: Number of trees in the forest.

    Returns:
        dict with two entries:
            ``'pred'``: DataFrame with one row per sample and the columns
            ``group`` (true label), ``zy_sample`` (sample ID) and ``'pred'``
            (second column of ``predict_proba``, i.e. the probability of
            ``model.classes_[1]``).
            ``'importance'``: ``feature_importances_`` of the model fitted in
            the last iteration only.

    Raises:
        IndexError: If a training split contains a single class, because
            ``predict_proba`` then has no second column.
    """
    # If species number is provided, filter the data
    if nspecies is not None:
        rf_dt = rf_dt.iloc[:nspecies, :]
        rf_dt = rf_dt.loc[rf_dt.sum(axis=1) != 0]

    np.random.seed(seed)

    # Convert the group column on a copy so the caller's frame is untouched.
    rf_map = rf_map.copy()
    rf_map[group] = rf_map[group].astype('category')

    # Subset data by sample
    rf_dt = rf_dt.loc[:, rf_map[zy_sample]]

    model = RandomForestClassifier(n_estimators=ntree, random_state=seed, oob_score=True)
    n_samples = len(rf_map)
    positions = np.arange(n_samples)
    results = []

    for i in range(n_samples):
        print(f"Sample: {i+1}")

        # Keep the held-out sample as a one-row frame: selecting columns with
        # a one-element Series yields a 2-D table, which predict_proba needs.
        held_out = rf_map.iloc[[i]]
        test_dt = rf_dt.loc[:, held_out[zy_sample]]

        # Exclude the held-out row by position, so the split is correct
        # whatever index labels rf_map carries.
        train_sample = rf_map.iloc[positions != i]
        train_dt = rf_dt.loc[:, train_sample[zy_sample]]

        # Train RandomForest
        model.fit(train_dt.T, train_sample[group])

        # Probability of the second class in model.classes_
        pred = model.predict_proba(test_dt.T)[:, 1]

        # Store the prediction in its own column next to the sample info;
        # reset_index gives a clean 0-based index for the later concat.
        pred_df = held_out[[group, zy_sample]].reset_index(drop=True)
        pred_df['pred'] = pred
        results.append(pred_df)

    # Combine all prediction results
    final_result = pd.concat(results, ignore_index=True)
    return {'pred': final_result, 'importance': model.feature_importances_}

# Function to format column names to valid Python identifiers
def zy_format_class_name(rf_dt, rf_map, zy_sample):
    """Rewrite sample IDs as valid Python identifiers and align ``rf_dt`` to ``rf_map``.

    The same sanitising rule is applied to the column labels of ``rf_dt`` and
    to the values of ``rf_map[zy_sample]``, so labels that matched before
    still match afterwards. The rule, applied to ``str(label)``:

    * every character that cannot appear inside an identifier becomes ``_``;
    * if the result is still not an identifier (empty, or starts with a
      digit), ``X`` is prepended;
    * if the result is a Python keyword, ``_`` is appended.

    Distinct labels can map to the same identifier (``'a-b'`` and ``'a.b'``
    both become ``'a_b'``); no de-duplication is performed.

    Args:
        rf_dt: DataFrame whose columns are labelled with sample IDs.
        rf_map: DataFrame containing the sample-ID column ``zy_sample``.
        zy_sample: Name of the sample-ID column in ``rf_map``.

    Returns:
        dict with ``'rf_dt'`` (renamed columns, selected and ordered by the
        sanitised IDs of ``rf_map``) and ``'rf_map'`` (sanitised
        ``zy_sample`` column plus a copy of it named ``'zy_RF_temp_ID'``).
        Both are new objects; the inputs are left unchanged.

    Raises:
        KeyError: If a sanitised ID from ``rf_map`` is not among the
            sanitised column labels of ``rf_dt``.
    """
    import keyword

    def _as_identifier(label):
        text = str(label)
        # "a" + ch is an identifier exactly when ch is a legal non-leading
        # identifier character.
        name = "".join(ch if ("a" + ch).isidentifier() else "_" for ch in text)
        if not name.isidentifier():
            name = "X" + name
        if keyword.iskeyword(name):
            name += "_"
        return name

    # Copy first so neither input frame is modified in place.
    rf_dt = rf_dt.copy()
    rf_map = rf_map.copy()

    rf_dt.columns = [_as_identifier(label) for label in rf_dt.columns]
    rf_map[zy_sample] = [_as_identifier(label) for label in rf_map[zy_sample]]
    rf_map['zy_RF_temp_ID'] = rf_map[zy_sample]
    rf_dt = rf_dt.loc[:, rf_map['zy_RF_temp_ID']]
    return {'rf_dt': rf_dt, 'rf_map': rf_map}

# Function for Random Forest Classification (train and test data)
def zy_rf(train_dt, train_map, test_dt, test_map, ntree=999, seed=123, zy_sample="zy_RF_temp_ID", group=None):
    """Fit a random forest on a training set and score a separate test set.

    Both data tables are used as features-by-samples: columns are selected
    with the IDs from the matching map frame and the tables are transposed
    before being passed to scikit-learn.
    NOTE(review): the rows of ``train_dt`` and ``test_dt`` are assumed to list
    the same features in the same order; nothing here aligns them - confirm
    against callers.

    Args:
        train_dt: Training DataFrame, columns labelled with sample IDs.
        train_map: Training metadata with the columns ``zy_sample`` and
            ``group``. It is not modified.
        test_dt: Test DataFrame, columns labelled with sample IDs.
        test_map: Test metadata with the columns ``zy_sample`` and ``group``.
            It is not modified.
        ntree: Number of trees in the forest.
        seed: ``random_state`` of the forest.
        zy_sample: Name of the sample-ID column in both map frames.
        group: Name of the class-label column in both map frames.

    Returns:
        A copy of ``test_map[[zy_sample, group]]`` (``group`` as a
        categorical) with an added ``'pred'`` column holding the second
        column of ``predict_proba``, i.e. the probability of
        ``model.classes_[1]``.

    Raises:
        KeyError: If ``zy_sample`` or ``group`` is not a column of the map
            frames (including when either is ``None``).
        IndexError: If the training labels contain a single class, because
            ``predict_proba`` then has no second column.
    """
    # Work on copies so the categorical conversion below does not leak into
    # the caller's frames.
    train_map = train_map.copy()
    test_map = test_map.copy()

    # Category levels: labels in order of first appearance, test set first,
    # then any label that occurs only in the training set. Taking levels from
    # the test set alone would turn train-only labels into NaN.
    group_fact = pd.concat([test_map[group], train_map[group]], ignore_index=True).unique()

    # Set group as factor (category)
    test_map[group] = pd.Categorical(test_map[group], categories=group_fact)
    train_map[group] = pd.Categorical(train_map[group], categories=group_fact)

    # Subset data by sample
    train_dt = train_dt.loc[:, train_map[zy_sample]]
    test_dt = test_dt.loc[:, test_map[zy_sample]]

    # Train RandomForest
    model = RandomForestClassifier(n_estimators=ntree, random_state=seed)
    model.fit(train_dt.T, train_map[group])

    # Probability of the second class in model.classes_
    pred = model.predict_proba(test_dt.T)[:, 1]

    # Merge prediction results
    result = test_map[[zy_sample, group]].copy()
    result['pred'] = pred
    return result

# Function for Random Forest with 2-Class Cross-Validation
def zy_rf_two_class(rf_dt, rf_map, zy_sample="zy_RF_temp_ID", group=None, ntree=999, cross_n=10, nspecies=None, seed=123):
    """Cross-validated two-class random-forest prediction.

    ``rf_dt`` is used as a features-by-samples table: columns are selected
    with the IDs in ``rf_map[zy_sample]`` and the table is transposed before
    being passed to scikit-learn.

    Only the first two groups returned by ``rf_map.groupby(group)`` are used;
    rows belonging to any other group are dropped. Within each of the two
    groups, fold numbers ``1..cross_n`` are assigned cyclically in row order.
    No shuffling is done, so ``seed`` only sets the global NumPy RNG state and
    the forest's ``random_state``.

    Args:
        rf_dt: DataFrame whose columns are labelled with sample IDs.
        rf_map: DataFrame with one row per sample containing the columns
            ``zy_sample`` and ``group``. It is not modified.
        zy_sample: Name of the sample-ID column in ``rf_map``.
        group: Name of the class-label column in ``rf_map``.
        ntree: Number of trees per forest.
        cross_n: Number of cross-validation folds.
        nspecies: If given, keep only the first ``nspecies`` rows of
            ``rf_dt`` and then drop rows whose sum is zero.
        seed: Passed to ``np.random.seed`` and used as ``random_state``.

    Returns:
        DataFrame with one row per evaluated sample and the columns ``group``
        (true label), ``zy_sample`` (sample ID) and ``'pred'`` (second column
        of ``predict_proba``, i.e. the probability of ``model.classes_[1]``).

    Raises:
        IndexError: If ``rf_map`` has fewer than two groups, or a training
            split contains a single class.
    """
    # If species number is provided, filter the data
    if nspecies is not None:
        rf_dt = rf_dt.iloc[:nspecies, :]
        rf_dt = rf_dt.loc[rf_dt.sum(axis=1) != 0]

    np.random.seed(seed)

    # Group by the 'group' column and assign cross-validation group numbers
    gs = rf_map.groupby(group).size().reset_index(name='value')

    fold_parts = []
    for label in (gs.iloc[0, 0], gs.iloc[1, 0]):
        part = rf_map[rf_map[group] == label].copy()
        # Cycle 1..cross_n down the rows so every class is spread over folds.
        part['rf_temp_cross_n'] = np.tile(np.arange(1, cross_n+1), len(part) // cross_n + 1)[:len(part)]
        fold_parts.append(part)

    rf_map = pd.concat(fold_parts)
    rf_map[group] = rf_map[group].astype('category')

    results = []

    for i in range(1, cross_n + 1):
        # Split data into train and test based on cross-validation group
        test_sample = rf_map[rf_map['rf_temp_cross_n'] == i]
        if test_sample.empty:
            # More folds than samples in both groups: nothing to score here.
            continue
        test_dt = rf_dt.loc[:, test_sample[zy_sample]]

        train_sample = rf_map[rf_map['rf_temp_cross_n'] != i]
        train_dt = rf_dt.loc[:, train_sample[zy_sample]]

        # Train RandomForest
        model = RandomForestClassifier(n_estimators=ntree, random_state=seed)
        model.fit(train_dt.T, train_sample[group])

        # Probability of the second class in model.classes_
        pred = model.predict_proba(test_dt.T)[:, 1]

        # Store predictions next to the sample info. reset_index drops
        # rf_map's index labels so rows pair with `pred` by position rather
        # than through index alignment.
        pred_df = test_sample[[group, zy_sample]].reset_index(drop=True)
        pred_df['pred'] = pred
        results.append(pred_df)

    # Combine all prediction results
    final_result = pd.concat(results, ignore_index=True)
    return final_result
