In [4]:
import pandas as pd
import numpy as np
# Load the mutation table from the working directory.
# The code below reads the FIRST column as the sample label (matched against
# "C<digits>" / "NC<digits>") and compares the other columns to 0/1.
# NOTE(review): confirm the CSV actually follows that layout.
data = pd.read_csv("mutationc.csv")

## Compute Phi

In [15]:
def compute_phi(group:pd.DataFrame,t:str):
    """Score the binary split of ``group`` on column ``t``.

    The first column of ``group`` is used as the class label: a value that is
    ``C`` followed only by digits counts as class C, ``NC`` followed only by
    digits counts as class NC. Rows with ``group[t] == 1`` form the left
    child (t_L); rows with ``group[t] == 0`` form the right child (t_R).

    Parameters
    ----------
    group : pd.DataFrame
        Samples to split. Column 0 must hold string labels.
    t : str
        Name of the column to split on.

    Returns
    -------
    dict
        Counts (``n(...)`` keys), proportions (``P_L``, ``P_R``, ``P(.|.)``
        keys), ``2P_LP_R``, ``Q``, ``ɸ(s,t)`` (= ``2P_LP_R * Q``) and
        ``Balance`` (= ``P(C|t_L) - P(C|t_R)``).
        Any proportion whose denominator is zero is reported as 0, including
        ``P_L`` / ``P_R`` for an empty ``group``.
    """
    # Class masks from the label column, computed once. na=False keeps a
    # missing label from producing NaN in the mask (it simply matches neither
    # class), and astype(bool) guarantees a plain boolean mask even when the
    # group is empty.
    labels = group.iloc[:, 0]
    is_C = labels.str.contains(r"^C\d+$", regex=True, na=False).astype(bool)
    is_NC = labels.str.contains(r"^NC\d+$", regex=True, na=False).astype(bool)

    # Child membership masks; rows whose value is neither 1 nor 0 fall in
    # neither child.
    in_left = group[t] == 1
    in_right = group[t] == 0

    # Counts (plain Python ints so the ratios below stay plain floats).
    n_t = group.shape[0]
    n_t_C = int(is_C.sum())
    n_t_NC = int(is_NC.sum())
    n_tL = int(in_left.sum())
    n_tR = int(in_right.sum())
    n_tL_C = int((in_left & is_C).sum())
    n_tL_NC = int((in_left & is_NC).sum())
    n_tR_C = int((in_right & is_C).sum())
    n_tR_NC = int((in_right & is_NC).sum())

    # Proportions; every denominator is guarded so an empty group or an empty
    # child yields 0 instead of raising ZeroDivisionError.
    P_L = n_tL / n_t if n_t != 0 else 0
    P_R = n_tR / n_t if n_t != 0 else 0
    P_C_tL = n_tL_C / n_tL if n_tL != 0 else 0
    P_NC_tL = n_tL_NC / n_tL if n_tL != 0 else 0
    P_C_tR = n_tR_C / n_tR if n_tR != 0 else 0
    P_NC_tR = n_tR_NC / n_tR if n_tR != 0 else 0

    # Signed difference of the C proportion between the two children.
    balance = P_C_tL - P_C_tR
    Q = abs(P_C_tL - P_C_tR) + abs(P_NC_tL - P_NC_tR)
    PLPR = (2 * P_L * P_R)
    phi = PLPR * Q

    return {
        "n(t)":n_t,"n(t,C)":n_t_C,"n(t,NC)":n_t_NC,
        "n(t_L)":n_tL,"n(t_R)":n_tR,"n(t_L,C)":n_tL_C,"n(t_R,C)":n_tR_C,
        "n(t_L,NC)":n_tL_NC,"n(t_R,NC)":n_tR_NC,
        "P_L":P_L,"P_R":P_R,"P(C|t_L)":P_C_tL,"P(C|t_R)":P_C_tR,
        "P(NC|t_L)":P_NC_tL,"P(NC|t_R)":P_NC_tR,
        "2P_LP_R":PLPR,"Q":Q,"ɸ(s,t)":phi,"Balance":balance,
    }

def create_feature_table(group:pd.DataFrame):
    """Evaluate every column of ``group`` as a candidate split.

    Calls ``compute_phi(group, t)`` once per column ``t`` and gathers the
    split statistics into a table with one row per column (indexed by the
    column name, in ``group.columns`` order).

    Parameters
    ----------
    group : pd.DataFrame
        Samples to evaluate; passed unchanged to ``compute_phi``.

    Returns
    -------
    pd.DataFrame
        One row per column of ``group`` with the child counts, proportions,
        ``2P_LP_R``, ``Q``, ``ɸ(s,t)`` and ``Balance``. The parent-level
        counts returned by ``compute_phi`` (``n(t)``, ``n(t,C)``,
        ``n(t,NC)``) are not included.
    """
    stat_columns = [
        "n(t_L)","n(t_R)","n(t_L,C)","n(t_R,C)",
        "n(t_L,NC)","n(t_R,NC)",
        "P_L","P_R","P(C|t_L)","P(C|t_R)",
        "P(NC|t_L)","P(NC|t_R)",
        "2P_LP_R","Q","ɸ(s,t)","Balance",
    ]

    # Collect one record per candidate column and build the frame in a single
    # step: growing a DataFrame row-by-row with .loc re-allocates on every
    # insert, and starting from an empty frame can leave the columns
    # object-typed instead of numeric.
    feature_names = list(group.columns)
    records = []
    for t in feature_names:
        results = compute_phi(group, t)
        records.append({name: results[name] for name in stat_columns})

    return pd.DataFrame(records, index=feature_names, columns=stat_columns)


In [None]:
# Score every column as a root split (all rows except the first one), then
# read the Balance of the top-ranked split.
# Note: F holds a Balance value here; the next cell rebinds F to a feature name.
ft = create_feature_table(data.iloc[1:])
F = (
    ft.sort_values(by="ɸ(s,t)", ascending=False)
      .head(1)["Balance"]
      .sum()
)

np.int64(0)

In [None]:
# Build the two-level tree: choose the root split F, then the best split
# inside each child (alpha where F == 1, beta where F == 0).
#
# Work from `data` WITHOUT the `classification` column. The last statement of
# this cell adds that column to `data`, so on a re-run it would otherwise be
# scored as a candidate split feature and leak the previous predictions.
features = data.drop(columns=["classification"], errors="ignore")

# NOTE(review): the root table skips the first row (iloc[1:]) while the child
# groups below are cut from all rows — confirm which one is intended.
ft = create_feature_table(features.iloc[1:,:])
F = ft.sort_values("ɸ(s,t)",ascending=False).index[0]

group_A = features[(features[F] == 1)]
group_B = features[(features[F] == 0)]
gA_ft = create_feature_table(group_A)
gB_ft = create_feature_table(group_B)

# Rank each child table once, then read both the winning feature name and its
# Balance from the same ranking.
gA_ranked = gA_ft.sort_values("ɸ(s,t)",ascending=False)
gB_ranked = gB_ft.sort_values("ɸ(s,t)",ascending=False)

alpha = gA_ranked.index[0]
alpha_balance = gA_ranked.Balance.head(1).sum()
beta = gB_ranked.index[0]
beta_balance = gB_ranked.Balance.head(1).sum()

print(F)
print(alpha)
print(alpha_balance)
print(beta)

def classify(sample):
    """Predict 1 or 0 for one sample using the two-level tree.

    Reads the module-level names ``F``, ``alpha``, ``beta``,
    ``alpha_balance`` and ``beta_balance``. The root feature ``F`` selects
    which second-level feature/balance pair applies; the sample's value for
    that feature and the sign of the balance then decide the result. A
    balance of exactly 0 always yields 0.
    """
    # Pick the second-level split that applies to this sample.
    if sample[F] == 1:
        feature, balance = alpha, alpha_balance
    else:
        feature, balance = beta, beta_balance

    # Feature present: predict 1 only for a positive balance.
    if sample[feature] == 1:
        return 1 if balance > 0 else 0
    # Feature absent: predict 1 only for a negative balance.
    return 1 if balance < 0 else 0
        
# Store the tree's prediction for every row in a new `classification` column.
data["classification"] = data.apply(classify, axis="columns")

BRAF_GRCh37_7:140453136-140453136_Missense-Mutation_SNP_A-A-T
RTL1_GRCh37_14:101347507-101347507_Frame-Shift-Del_DEL_G-G--
-0.6810344827586207
KRAS_GRCh37_12:25398284-25398284_Missense-Mutation_SNP_C-C-A_C-C-T_C-C-G


0      0
1      0
2      1
3      0
4      1
      ..
196    0
197    0
198    1
199    0
200    0
Name: classification, Length: 201, dtype: int64

In [None]:
# calculate root feature from compute_phi(data)
# calculate feature A from compute_phi(group_A)
# calculate feature B from compute_phi(group_B)