In [None]:
import pandas as pd
from IPython.display import Image
# Load the data set from a CSV in the working directory. The functions defined
# later in this notebook read the first column as the sample id and match it
# against the patterns C<digits> / NC<digits>.
data = pd.read_csv("mutationc.csv")

: 

# Part 1 - Feature Table

In [None]:
def compute_phi(group:pd.DataFrame,t:str):
    """Score the binary split of ``group`` on column ``t``.

    Rows with ``group[t] == 1`` form the left child (t_L) and rows with
    ``group[t] == 0`` form the right child (t_R). Class membership is read
    from the first column of the frame: ids of the form ``C<digits>`` are
    counted as C, ids of the form ``NC<digits>`` as NC.

    Args:
        group: Frame whose first column holds the string sample ids and which
            contains the column ``t``.
        t: Name of the column to split on.

    Returns:
        dict with the raw counts (``n(t)``, ``n(t,C)``, ``n(t,NC)``,
        ``n(t_L)``, ``n(t_R)``, ``n(t_L,C)``, ``n(t_R,C)``, ``n(t_L,NC)``,
        ``n(t_R,NC)``), the derived proportions (``P_L``, ``P_R``,
        ``P(C|t_L)``, ``P(C|t_R)``, ``P(NC|t_L)``, ``P(NC|t_R)``) and the
        split scores ``2P_LP_R``, ``Q``, ``ɸ(s,t)`` and ``Balance``.
        Any proportion whose denominator is zero is reported as 0, so an
        empty ``group`` yields all-zero scores instead of raising.
    """
    c_pattern = r"^C\d+$"
    nc_pattern = r"^NC\d+$"

    def count_matching(frame, pattern):
        # Number of rows in `frame` whose id (first column) matches `pattern`.
        return frame[frame.iloc[:,0].str.contains(pattern,regex=True)].shape[0]

    # Children of the split.
    tL = group[(group[t] == 1)]
    tR = group[(group[t] == 0)]

    n_t = group.shape[0]
    n_t_C = count_matching(group, c_pattern)
    n_t_NC = count_matching(group, nc_pattern)
    n_tL = tL.shape[0]
    n_tR = tR.shape[0]
    n_tL_C = count_matching(tL, c_pattern)
    n_tL_NC = count_matching(tL, nc_pattern)
    n_tR_C = count_matching(tR, c_pattern)
    n_tR_NC = count_matching(tR, nc_pattern)

    # Guard the parent-size division the same way the child-size divisions
    # are guarded: an empty node has no split, so every proportion is 0.
    P_L = n_tL / n_t if n_t != 0 else 0
    P_R = n_tR / n_t if n_t != 0 else 0
    P_C_tL = n_tL_C / n_tL if n_tL != 0 else 0
    P_NC_tL = n_tL_NC / n_tL if n_tL != 0 else 0
    P_C_tR = n_tR_C / n_tR if n_tR != 0 else 0
    P_NC_tR = n_tR_NC / n_tR if n_tR != 0 else 0

    # Signed difference in C-rate: positive when the left child is richer in C.
    balance = P_C_tL - P_C_tR
    Q = abs(P_C_tL - P_C_tR) + abs(P_NC_tL - P_NC_tR)
    PLPR = (2 * P_L * P_R)
    phi = PLPR * Q

    return {"n(t)":n_t,"n(t,C)":n_t_C,"n(t,NC)":n_t_NC,
            "n(t_L)":n_tL,"n(t_R)":n_tR,"n(t_L,C)":n_tL_C,"n(t_R,C)":n_tR_C,
            "n(t_L,NC)":n_tL_NC,"n(t_R,NC)":n_tR_NC,
            "P_L":P_L,"P_R":P_R,"P(C|t_L)":P_C_tL,"P(C|t_R)":P_C_tR,
            "P(NC|t_L)":P_NC_tL,"P(NC|t_R)":P_NC_tR,
            "2P_LP_R":PLPR,"Q":Q,"ɸ(s,t)":phi,"Balance":balance}

def create_feature_table(group:pd.DataFrame):
    """Tabulate the split statistics of every column of ``group``.

    ``compute_phi`` is called once per column of ``group`` (all columns are
    visited, the first one included) and the child-level counts,
    proportions and scores it reports become one row of the result.

    Args:
        group: Frame accepted by ``compute_phi``.

    Returns:
        DataFrame indexed by column name of ``group`` with one column per
        reported statistic.
    """
    reported = [
        "n(t_L)","n(t_R)","n(t_L,C)","n(t_R,C)",
        "n(t_L,NC)","n(t_R,NC)",
        "P_L","P_R","P(C|t_L)","P(C|t_R)",
        "P(NC|t_L)","P(NC|t_R)",
        "2P_LP_R","Q","ɸ(s,t)","Balance",
    ]
    table = pd.DataFrame(columns=reported)

    for feature in group.columns:
        stats = compute_phi(group, feature)
        # Keep only the reported statistics, in table-column order.
        table.loc[feature] = {name: stats[name] for name in reported}

    return table

def create_feature_table_simple(group:pd.DataFrame):
    """Tabulate only ``ɸ(s,t)`` and ``Balance`` for every column of ``group``.

    Args:
        group: Frame accepted by ``compute_phi``.

    Returns:
        DataFrame indexed by column name of ``group`` with the two columns
        ``ɸ(s,t)`` and ``Balance``.
    """
    wanted = ['ɸ(s,t)', 'Balance']
    table = pd.DataFrame(columns=wanted)

    for feature in group.columns:
        stats = compute_phi(group, feature)
        table.loc[feature] = {name: stats[name] for name in wanted}

    return table

In [None]:
# Build the full statistics table for the whole data set.
ft = create_feature_table(data)
# Show the ten columns with the highest ɸ(s,t), best first.
ft.sort_values("ɸ(s,t)",ascending=False).head(10)

# Part 2 - Evaluating Decision Tree 

In [None]:
def compute_metrics(testing:pd.DataFrame,classification:pd.Series):
    """Compare predicted labels with the labels encoded in ``testing``.

    The true label of each row comes from the first column of ``testing``:
    ids of the form ``C<digits>`` become 1 and ids of the form ``NC<digits>``
    become 0.

    Args:
        testing: Frame whose first column holds the sample ids.
        classification: Predicted 0/1 label per row; combined element-wise
            with the true labels.

    Returns:
        One-row DataFrame (index ``[0]``) with the columns Accuracy,
        Sensitivity, Specificity, Precision, Miss Rate, False Discovery Rate
        and False Omission Rate.
    """
    # Decode the ids into integer class labels.
    truth = testing.iloc[:,0]
    truth = truth.replace(r'^C\d+$','1',regex=True)
    truth = truth.replace(r'^NC\d+$','0',regex=True)
    truth = truth.astype(int)

    truth_pos, truth_neg = (truth == 1), (truth == 0)
    pred_pos, pred_neg = (classification == 1), (classification == 0)

    # Confusion-matrix cells.
    tp = (truth_pos & pred_pos).sum(axis=0)
    fp = (truth_neg & pred_pos).sum(axis=0)
    fn = (truth_pos & pred_neg).sum(axis=0)
    tn = (truth_neg & pred_neg).sum(axis=0)

    scores = {
        "Accuracy": (tp + tn) / (tp + tn + fp + fn),
        "Sensitivity": tp / (tp + fn),
        "Specificity": tn / (tn + fp),
        "Precision": tp / (tp + fp),
        "Miss Rate": fn / (tp + fn),
        "False Discovery Rate": fp / (tp + fp),
        "False Omission Rate": fn / (fn + tn),
    }
    return pd.DataFrame(scores, index=[0])

In [None]:
def testing_process(training:pd.DataFrame,testing:pd.DataFrame):
    """Fit a two-level decision tree on ``training`` and score it on ``testing``.

    The root feature ``F`` is the column of ``training`` with the highest
    ``ɸ(s,t)``. ``training`` is then split into the rows with ``F == 1``
    (group A) and ``F == 0`` (group B), and the highest-``ɸ(s,t)`` column of
    each group (``alpha`` and ``beta``) becomes the second-level split. A
    leaf predicts 1 when the sign of the chosen split's ``Balance`` says that
    side of the split has the larger share of C samples, otherwise 0.

    The chosen features and balances are printed, each prefixed with a tab.

    Args:
        training: Frame used to choose the three split features.
        testing: Frame whose rows are classified and scored.

    Returns:
        The one-row metrics DataFrame produced by ``compute_metrics``.
    """
    # Rank features on every training row. Slicing off row 0 here would
    # discard a real sample (not a header) and make the root split use
    # different data than the child splits below.
    root_ranked = create_feature_table_simple(training).sort_values("ɸ(s,t)",ascending=False)
    F = root_ranked.index[0]

    group_A = training[(training[F] == 1)]
    group_B = training[(training[F] == 0)]

    # Only ɸ(s,t) and Balance are needed for the children; sort each table once.
    gA_ranked = create_feature_table_simple(group_A).sort_values("ɸ(s,t)",ascending=False)
    gB_ranked = create_feature_table_simple(group_B).sort_values("ɸ(s,t)",ascending=False)

    alpha = gA_ranked.index[0]
    alpha_balance = gA_ranked.Balance.head(1).sum()
    beta = gB_ranked.index[0]
    beta_balance = gB_ranked.Balance.head(1).sum()

    def classify(sample):
        # Pick the second-level split that applies to this sample.
        if sample[F] == 1:
            feature, balance = alpha, alpha_balance
        else:
            feature, balance = beta, beta_balance
        # Positive balance: the ==1 side is richer in C; negative: the other side.
        if sample[feature] == 1:
            return 1 if balance > 0 else 0
        return 1 if balance < 0 else 0

    classification = testing.apply(classify, axis=1)
    print(f"\t{F}")
    print(f"\t{alpha} Balance:{alpha_balance}")
    print(f"\t{beta} Balance:{beta_balance}")
    return compute_metrics(testing, classification)

In [None]:
# Fit and evaluate on the same frame: `data` is passed as both the training
# and the testing argument.
results = testing_process(data,data)

In [None]:
# Render a remotely hosted screenshot at 600 px width.
# NOTE(review): the URL carries signed query parameters (ex/is/hm), which looks
# like a time-limited link — confirm it still resolves, or ship a local copy
# of the image with the notebook.
Image(url="https://media.discordapp.net/attachments/1314802602042720320/1432749366812217384/Screenshot_2025-10-28_at_11.14.15_AM.png?ex=69022f54&is=6900ddd4&hm=ccffbd88ec70a70d2cbfe61dd99eb32a839681dd1459d71f94aa7d6c34d0ccad&=&format=webp&quality=lossless&width=2304&height=1308", width=600)

# Part 3 - 3-Fold Cross-Validation

In [None]:
def create_training_testing_df(group: pd.DataFrame, folds: int, random_state=42):
    """Randomly partition ``group`` into ``folds`` disjoint subsets.

    Subsets are drawn one after another: with ``k`` subsets still to
    produce, a fraction ``1/k`` of the rows not yet assigned is sampled, and
    the final subset is whatever remains. Rows are removed from the pool by
    index label.

    Args:
        group: Frame to partition.
        folds: Number of subsets to produce; must be at least 1.
        random_state: Seed forwarded to ``DataFrame.sample`` for every draw.
            Defaults to 42, the value that was previously hard-coded.

    Returns:
        List of ``folds`` DataFrames. With ``folds == 1`` the list holds
        ``group`` itself.

    Raises:
        ValueError: If ``folds`` is smaller than 1.
    """
    if folds < 1:
        raise ValueError(f"folds must be at least 1, got {folds}")

    subsets = []
    remaining = group
    # Count down so each draw takes an equal share of what is left.
    for still_to_make in range(folds, 1, -1):
        subset = remaining.sample(frac=1/still_to_make, random_state=random_state)
        subsets.append(subset)
        remaining = remaining.drop(subset.index)
    subsets.append(remaining)
    return subsets

sets = create_training_testing_df(data,3)

# Run one trial per subset: that subset is passed as the training argument and
# the other two, concatenated in their original order, as the testing argument.
trial_results = []
for trial, training_set in enumerate(sets, start=1):
    held_out = [s for position, s in enumerate(sets, start=1) if position != trial]
    print(f"Trial {trial}:")
    trial_results.append(testing_process(training_set, pd.concat(held_out)))

results_1, results_2, results_3 = trial_results


In [None]:
# Element-wise mean of the three one-row metric tables.
avg_metrics = (results_1 + results_2 + results_3) / 3
print(avg_metrics)