In [2]:
import pandas as pd
import math
# Load the mutation table; the first CSV column becomes the row index.
# Later cells read each sample's class from that index (labels of the form
# C<digits> or NC<digits>) and compare the feature columns against 1 and 0.
data = pd.read_csv("mutationc.csv",index_col=0)

In [3]:
def compute_h(P_C_t,P_NC_t):
    """Return the entropy, in bits, of a two-class distribution.

    ``P_C_t`` and ``P_NC_t`` are the two class proportions. A proportion that
    is not strictly positive contributes nothing to the total, which also
    keeps ``log2`` from ever being evaluated at zero.
    """
    entropy = 0.0
    for proportion in (P_C_t, P_NC_t):
        if proportion > 0:
            entropy -= proportion * math.log2(proportion)
    return entropy

def compute_phi(group:pd.DataFrame,t:str):
    """Score the binary split of ``group`` on feature column ``t``.

    Rows where ``group[t] == 1`` form the left child (t_L) and rows where
    ``group[t] == 0`` form the right child (t_R). A row's class is read from
    its index label: ``C`` followed only by digits is class C, ``NC``
    followed only by digits is class NC.

    Parameters
    ----------
    group : pd.DataFrame
        Samples of the node being split; the index holds the class labels.
    t : str
        Name of the feature column to split on.

    Returns
    -------
    dict
        Child sizes and per-class counts, the child proportions ``P_L`` and
        ``P_R``, the node entropy ``H(T)``, the weighted child entropy
        ``H(s,t)``, their difference ``gain(s)``, and ``balance`` (share of
        class C in the left child minus its share in the right child). An
        empty child contributes zero class proportions, and an empty
        ``group`` yields an all-zero result instead of dividing by zero.
    """
    n_t = group.shape[0]
    if n_t == 0:
        # Nothing to split: report a neutral row rather than raising
        # ZeroDivisionError on the proportions below.
        return {"n(t_L)":0,"n(t_R)":0,
                "n(t_L,C)":0,"n(t_L,NC)":0,
                "n(t_R,C)":0,"n(t_R,NC)":0,
                "P_L":0.0,"P_R":0.0,
                "H(s,t)":0.0,"H(T)":0.0,"gain(s)":0.0,"balance":0.0}

    # Class and side membership as boolean masks, each computed once. Counting
    # mask overlaps avoids copying two sub-frames and re-scanning the index
    # with a regex for every count.
    labels = group.index.to_series()
    is_C = labels.str.contains(r"^C\d+$",regex=True,na=False).to_numpy(dtype=bool)
    is_NC = labels.str.contains(r"^NC\d+$",regex=True,na=False).to_numpy(dtype=bool)
    feature = group[t]
    in_L = (feature == 1).to_numpy(dtype=bool)
    in_R = (feature == 0).to_numpy(dtype=bool)

    n_t_C = int(is_C.sum())
    n_t_NC = int(is_NC.sum())
    n_tL = int(in_L.sum())
    n_tR = int(in_R.sum())
    n_tL_C = int((in_L & is_C).sum())
    n_tL_NC = int((in_L & is_NC).sum())
    n_tR_C = int((in_R & is_C).sum())
    n_tR_NC = int((in_R & is_NC).sum())

    P_L = n_tL / n_t
    P_R = n_tR / n_t
    P_C_t = n_t_C / n_t
    P_NC_t = n_t_NC / n_t
    # An empty child has no class distribution; treat its proportions as 0.
    P_C_tL = n_tL_C / n_tL if n_tL != 0 else 0
    P_NC_tL = n_tL_NC / n_tL if n_tL != 0 else 0
    P_C_tR = n_tR_C / n_tR if n_tR != 0 else 0
    P_NC_tR = n_tR_NC / n_tR if n_tR != 0 else 0

    balance = P_C_tL - P_C_tR
    H_t = compute_h(P_C_t, P_NC_t)
    H_st = P_L*compute_h(P_C_tL,P_NC_tL) + P_R*compute_h(P_C_tR,P_NC_tR)
    gain = H_t - H_st

    return {"n(t_L)":n_tL,"n(t_R)":n_tR,
            "n(t_L,C)":n_tL_C,"n(t_L,NC)":n_tL_NC,
            "n(t_R,C)":n_tR_C,"n(t_R,NC)":n_tR_NC,
            "P_L":P_L,"P_R":P_R,
            "H(s,t)":H_st,"H(T)":H_t,"gain(s)":gain,"balance":balance}

def create_feature_table(group:pd.DataFrame):
    """Tabulate the split statistics of every column of ``group``.

    Each column of ``group`` is scored with ``compute_phi``; the result has
    one row per column (indexed by column name) and one column per statistic.
    With no columns in ``group`` the result is an empty frame that still
    carries the statistic columns.
    """
    stat_columns = ["n(t_L)","n(t_R)",
            "n(t_L,C)","n(t_L,NC)",
            "n(t_R,C)","n(t_R,NC)",
            "P_L","P_R",
            "H(s,t)","H(T)","gain(s)","balance"]

    # Collect all records first and build the frame once: enlarging a frame
    # one ``.loc`` row at a time re-copies the whole table for every feature.
    features = list(group.columns)
    records = [compute_phi(group, t) for t in features]
    return pd.DataFrame(records, index=features, columns=stat_columns)

In [4]:
# Score every column of the full data set as a candidate split, then show the
# ten columns with the largest information gain.
ft = create_feature_table(data)
ft.sort_values("gain(s)", ascending=False).iloc[:10]

Unnamed: 0,n(t_L),n(t_R),"n(t_L,C)","n(t_L,NC)","n(t_R,C)","n(t_R,NC)",P_L,P_R,"H(s,t)",H(T),gain(s),balance
BRAF_GRCh37_7:140453136-140453136_Missense-Mutation_SNP_A-A-T,33,168,28,5,76,92,0.164179,0.835821,0.931087,0.999125,0.068038,0.396104
PLEKHA6_GRCh37_1:204228411-204228411_Frame-Shift-Del_DEL_C-C--,7,194,7,0,97,97,0.034826,0.965174,0.965174,0.999125,0.033951,0.5
RAB28_GRCh37_4:13485808-13485808_5'UTR_DEL_G-G--,7,194,7,0,97,97,0.034826,0.965174,0.965174,0.999125,0.033951,0.5
SVIL_GRCh37_10:29760116-29760116_Frame-Shift-Del_DEL_C-C--,7,194,7,0,97,97,0.034826,0.965174,0.965174,0.999125,0.033951,0.5
ZBTB20_GRCh37_3:114058003-114058003_Frame-Shift-Del_DEL_G-G--,15,186,13,2,91,95,0.074627,0.925373,0.967341,0.999125,0.031784,0.377419
DRD5_GRCh37_4:9785349-9785349_3'UTR_SNP_G-G-C,37,164,27,10,77,87,0.18408,0.81592,0.968699,0.999125,0.030426,0.260218
APC_GRCh37_5:112175951-112175952_Frame-Shift-Ins_INS_----A,6,195,6,0,98,97,0.029851,0.970149,0.970131,0.999125,0.028994,0.497436
QKI_GRCh37_6:163987695-163987695_Intron_DEL_T-T--,6,195,6,0,98,97,0.029851,0.970149,0.970131,0.999125,0.028994,0.497436
CTNNBL1_GRCh37_20:36361416-36361416_Frame-Shift-Del_DEL_A-A--,5,196,5,0,99,97,0.024876,0.975124,0.975051,0.999125,0.024074,0.494898
RP11-737O24.3_GRCh37_18:2946534-2946534_RNA_DEL_T-T--,5,196,5,0,99,97,0.024876,0.975124,0.975051,0.999125,0.024074,0.494898


## Testing process

In [66]:
def compute_metrics(testing:pd.DataFrame,classification:pd.Series):
    """Build a one-row table of binary-classification metrics.

    The true class of each row of ``testing`` is decoded from its index label
    (``C<digits>`` becomes 1, ``NC<digits>`` becomes 0) and compared with the
    0/1 predictions in ``classification``. The returned frame has a single
    row labelled 0 with the columns Accuracy, Sensitivity, Specificity,
    Precision, Miss Rate, False Discovery Rate and False Omission Rate.
    Denominators are not guarded against zero.
    """
    truth = (
        testing.index.to_series()
        .replace(r'^C\d+$','1',regex=True)
        .replace(r'^NC\d+$','0',regex=True)
        .astype(int)
    )
    truth_pos, truth_neg = truth == 1, truth == 0
    pred_pos, pred_neg = classification == 1, classification == 0

    # Confusion-matrix cells.
    tp = (truth_pos & pred_pos).sum(axis=0)
    fp = (truth_neg & pred_pos).sum(axis=0)
    fn = (truth_pos & pred_neg).sum(axis=0)
    tn = (truth_neg & pred_neg).sum(axis=0)

    # Shared denominators.
    everything = tp + tn + fp + fn
    actual_pos = tp + fn
    actual_neg = tn + fp
    called_pos = tp + fp
    called_neg = fn + tn

    scores = {
        "Accuracy": (tp + tn) / everything,
        "Sensitivity": tp / actual_pos,
        "Specificity": tn / actual_neg,
        "Precision": tp / called_pos,
        "Miss Rate": fn / actual_pos,
        "False Discovery Rate": fp / called_pos,
        "False Omission Rate": fn / called_neg,
    }
    return pd.DataFrame(scores, index=[0])

In [75]:
# Run the complete testing process
def create_tree(training:pd.DataFrame,testing:pd.DataFrame):
    """Fit a two-level decision tree on ``training`` and score it on ``testing``.

    The root split ``F`` is the highest-gain feature of ``training``. The rows
    with ``F == 1`` and the rows with ``F == 0`` are each split again on their
    own highest-gain feature (``alpha`` and ``beta``). A leaf's label comes
    from the sign of the second-level feature's balance: samples whose
    feature value is 1 are labelled 1 when the balance is positive, all other
    samples are labelled 1 when it is negative, and a zero balance labels
    both sides 0.

    Prints the three chosen features (tab-indented) and returns the metrics
    frame produced by ``compute_metrics``.
    """
    def top_split(node):
        # Highest-gain feature of `node` together with that feature's balance.
        ranked = create_feature_table(node).sort_values("gain(s)",ascending=False)
        return ranked.index[0], ranked.balance.head(1).sum()

    F = create_feature_table(training).sort_values(by="gain(s)",ascending=False).index[0]

    has_F = training[F] == 1
    lacks_F = training[F] == 0
    with_F = training[has_F]
    without_F = training[lacks_F]

    alpha, alpha_balance = top_split(with_F)
    beta, beta_balance = top_split(without_F)

    def classify(sample):
        # Choose the second-level test that belongs to the sample's root branch.
        if sample[F] == 1:
            feature, balance = alpha, alpha_balance
        else:
            feature, balance = beta, beta_balance
        if sample[feature] == 1:
            return 1 if balance > 0 else 0
        return 1 if balance < 0 else 0

    classification = testing.apply(classify, axis=1)

    for chosen in (F, alpha, beta):
        print(f"\t{chosen}")
    return compute_metrics(testing, classification)

# Fit and score on the same samples: these metrics describe how well the tree
# fits the data it was built from, not how it performs on unseen samples.
create_tree(training=data, testing=data)

	BRAF_GRCh37_7:140453136-140453136_Missense-Mutation_SNP_A-A-T
	RTL1_GRCh37_14:101347507-101347507_Frame-Shift-Del_DEL_G-G--
	APC_GRCh37_5:112175951-112175952_Frame-Shift-Ins_INS_----A


Unnamed: 0,Accuracy,Sensitivity,Specificity,Precision,Miss Rate,False Discovery Rate,False Omission Rate
0,0.631841,0.307692,0.979381,0.941176,0.692308,0.058824,0.431138


## Evaluate Tree

In [74]:
# Run three-fold cross-validation
def create_training_testing_df(group: pd.DataFrame, folds: int, random_state: int = 70):
    """Partition the rows of ``group`` into ``folds`` disjoint subsets.

    Folds are peeled off one at a time: with ``k`` folds still to produce, a
    fraction ``1/k`` of the remaining rows is sampled, so all folds end up
    with near-equal size. The last fold is whatever remains.

    Parameters
    ----------
    group : pd.DataFrame
        Rows to partition. Sampled rows are removed by index label, so the
        index labels are assumed to be unique.
    folds : int
        Number of subsets to produce; must be a positive integer.
    random_state : int, default 70
        Seed passed to every ``DataFrame.sample`` call, making the partition
        reproducible.

    Returns
    -------
    list of pd.DataFrame
        ``folds`` frames; with ``folds == 1`` the list holds ``group`` itself.

    Raises
    ------
    ValueError
        If ``folds`` is not a positive integer.
    """
    if folds != int(folds) or folds < 1:
        raise ValueError(f"folds must be a positive integer, got {folds!r}")

    parts = []
    remaining = group
    for still_needed in range(int(folds), 1, -1):
        part = remaining.sample(frac=1/still_needed, random_state=random_state)
        parts.append(part)
        remaining = remaining.drop(part.index)
    parts.append(remaining)
    return parts

sets = create_training_testing_df(data,3)

# NOTE(review): each trial fits the tree on a single fold and tests it on the
# other two combined, the reverse of conventional k-fold cross-validation
# (fit on k-1 folds, test on the held-out one). Confirm this is intended.
trial_results = []
for trial_number, fit_fold in enumerate(sets, start=1):
    print(f"Trial {trial_number}:")
    other_folds = [sets[j] for j in range(len(sets)) if j != trial_number - 1]
    trial_results.append(create_tree(fit_fold, pd.concat(other_folds)))
results_1, results_2, results_3 = trial_results

avg_metrics = (results_1 + results_2 + results_3) / 3
print(f"\n\t{avg_metrics}")

Trial 1:
	DRD5_GRCh37_4:9785349-9785349_3'UTR_SNP_G-G-C
	FUS_GRCh37_16:31196420-31196420_Silent_SNP_C-C-T
	APC_GRCh37_5:112175951-112175952_Frame-Shift-Ins_INS_----A
Trial 2:
	KRAS_GRCh37_12:25398281-25398281_Missense-Mutation_SNP_C-C-T
	HEPACAM_GRCh37_11:124794736-124794736_Silent_SNP_G-G-A_G-G-T
	APC_GRCh37_5:112175951-112175952_Frame-Shift-Ins_INS_----A
Trial 3:
	BRAF_GRCh37_7:140453136-140453136_Missense-Mutation_SNP_A-A-T
	HEPACAM_GRCh37_11:124794736-124794736_Silent_SNP_G-G-A_G-G-T
	TP53_GRCh37_17:7577539-7577539_Missense-Mutation_SNP_G-G-A

	   Accuracy  Sensitivity  Specificity  Precision  Miss Rate  \
0  0.517413     0.427987     0.621504   0.592829   0.572013   

   False Discovery Rate  False Omission Rate  
0              0.407171             0.417818  
