In [1]:
import pandas as pd
import numpy as np
import math
# Load the input table from a CSV in the working directory. The first CSV
# column becomes the row index; compute_phi later derives each row's class
# from that index (labels matching "C<digits>" vs "NC<digits>").
# NOTE(review): columns are presumably 0/1 feature indicators — confirm.
data = pd.read_csv("mutationc.csv",index_col=0)

### Phi Computation

In [2]:
def compute_h(P_C_t,P_NC_t):
    """Return the entropy, in bits, of a two-class proportion pair.

    Each strictly positive proportion ``p`` contributes ``-p * log2(p)``.
    Proportions that are zero (or negative) are skipped, so they add nothing
    and never reach ``math.log2``.

    Parameters
    ----------
    P_C_t : float
        Proportion of the first class.
    P_NC_t : float
        Proportion of the second class.

    Returns
    -------
    float
        Sum of the per-class entropy terms; ``0.0`` when neither proportion
        is positive.
    """
    entropy = 0.0
    for proportion in (P_C_t, P_NC_t):
        if proportion > 0:
            entropy -= proportion * math.log2(proportion)
    return entropy

def _count_classes(frame:pd.DataFrame):
    """Return ``(n_C, n_NC)``: rows whose index label is ``C<digits>`` / ``NC<digits>``."""
    n_C = frame[frame.index.str.contains(r"^C\d+$",regex=True)].shape[0]
    n_NC = frame[frame.index.str.contains(r"^NC\d+$",regex=True)].shape[0]
    return n_C, n_NC


def compute_phi(group:pd.DataFrame,t:str):
    """Compute split statistics for splitting ``group`` on feature column ``t``.

    Rows with ``group[t] == 1`` form the left child (t_L) and rows with
    ``group[t] == 0`` form the right child (t_R); rows holding any other value
    fall into neither child. A row's class is read from its index label:
    ``C<digits>`` is class C and ``NC<digits>`` is class NC.

    Parameters
    ----------
    group : pd.DataFrame
        Rows to split; must contain column ``t`` and use string index labels.
    t : str
        Name of the feature column to split on.

    Returns
    -------
    dict
        Keys ``"n(t_L)"``, ``"n(t_R)"``, ``"n(t_L,C)"``, ``"n(t_L,NC)"``,
        ``"n(t_R,C)"``, ``"n(t_R,NC)"``, ``"P_L"``, ``"P_R"``, ``"H(s,t)"``
        (child entropy weighted by ``P_L``/``P_R``), ``"H(T)"`` (entropy of
        ``group``), ``"gain(s)"`` (``H(T) - H(s,t)``) and ``"balance"``
        (share of class C in the left child minus its share in the right).

    Raises
    ------
    KeyError
        If ``t`` is not a column of ``group``.
    """
    tL = group[(group[t] == 1)]
    tR = group[(group[t] == 0)]

    n_t = group.shape[0]
    if n_t == 0:
        # An empty partition has nothing to split. Every proportion below
        # divides by n_t, so without this guard the function raised
        # ZeroDivisionError; report an all-zero, zero-gain split instead.
        return {"n(t_L)":0,"n(t_R)":0,
                "n(t_L,C)":0,"n(t_L,NC)":0,
                "n(t_R,C)":0,"n(t_R,NC)":0,
                "P_L":0.0,"P_R":0.0,
                "H(s,t)":0.0,"H(T)":0.0,"gain(s)":0.0,"balance":0.0}

    n_t_C, n_t_NC = _count_classes(group)
    n_tL = tL.shape[0]
    n_tR = tR.shape[0]
    n_tL_C, n_tL_NC = _count_classes(tL)
    n_tR_C, n_tR_NC = _count_classes(tR)

    P_L = n_tL / n_t
    P_R = n_tR / n_t
    P_C_t = n_t_C / n_t
    P_NC_t = n_t_NC / n_t
    # A child with no rows has undefined class proportions; treat them as 0 so
    # the empty child contributes nothing to the weighted entropy.
    P_C_tL = n_tL_C / n_tL if n_tL != 0 else 0
    P_NC_tL = n_tL_NC / n_tL if n_tL != 0 else 0
    P_C_tR = n_tR_C / n_tR if n_tR != 0 else 0
    P_NC_tR = n_tR_NC / n_tR if n_tR != 0 else 0

    # Positive balance: class C is relatively more frequent where t == 1.
    balance = P_C_tL - P_C_tR
    H_t = compute_h(P_C_t, P_NC_t)
    H_st = P_L*compute_h(P_C_tL,P_NC_tL) + P_R*compute_h(P_C_tR,P_NC_tR)
    gain = H_t - H_st

    return {"n(t_L)":n_tL,"n(t_R)":n_tR,
            "n(t_L,C)":n_tL_C,"n(t_L,NC)":n_tL_NC,
            "n(t_R,C)":n_tR_C,"n(t_R,NC)":n_tR_NC,
            "P_L":P_L,"P_R":P_R,
            "H(s,t)":H_st,"H(T)":H_t,"gain(s)":gain,"balance":balance}

def create_feature_table(group:pd.DataFrame):
    """Tabulate the split statistics of every column of ``group``.

    ``compute_phi(group, t)`` is evaluated once per column ``t`` and the
    results are stacked into a table with one row per column.

    Parameters
    ----------
    group : pd.DataFrame
        Rows to evaluate; each of its columns is treated as a candidate split.

    Returns
    -------
    pd.DataFrame
        Indexed by the column labels of ``group``, with columns ``"n(t_L)"``,
        ``"n(t_R)"``, ``"n(t_L,C)"``, ``"n(t_L,NC)"``, ``"n(t_R,C)"``,
        ``"n(t_R,NC)"``, ``"P_L"``, ``"P_R"``, ``"H(s,t)"``, ``"H(T)"``,
        ``"gain(s)"`` and ``"balance"``. Empty (columns only) when ``group``
        has no columns.
    """
    columns = ["n(t_L)","n(t_R)",
               "n(t_L,C)","n(t_L,NC)",
               "n(t_R,C)","n(t_R,NC)",
               "P_L","P_R",
               "H(s,t)","H(T)","gain(s)","balance"]

    features = list(group.columns)
    # Collect one record per feature and build the frame once. Enlarging a
    # DataFrame with `.loc[label] = ...` copies the whole table on every row
    # and leaves dtype inference to the first inserted row.
    rows = []
    for t in features:
        results = compute_phi(group, t)
        rows.append({name: results[name] for name in columns})

    if not rows:
        return pd.DataFrame(columns=columns)
    return pd.DataFrame(rows, index=features, columns=columns)

### Random Forest Generation

In [3]:
class tree:
    """Plain container for a depth-two decision tree.

    Attributes are stored exactly as passed; nothing is validated or computed.
    """

    def __init__(self, F=None, alpha=None, alpha_balance=None, beta=None, beta_balance=None, classification=None):
        """Store the split description.

        Parameters
        ----------
        F : optional
            Feature tested at the root.
        alpha, alpha_balance : optional
            Feature tested when the root feature equals 1, and its balance.
        beta, beta_balance : optional
            Feature tested otherwise, and its balance.
        classification : optional
            Stored unchanged; not interpreted by this class.
        """
        # Root split.
        self.F = F
        # Second-level split on the F == 1 side.
        self.alpha, self.alpha_balance = alpha, alpha_balance
        # Second-level split on the other side.
        self.beta, self.beta_balance = beta, beta_balance
        self.classification = classification

In [4]:
def classify(sample,dt:tree):
    """Predict a 0/1 label for ``sample`` using the depth-two tree ``dt``.

    The root feature ``dt.F`` chooses the second-level split: ``dt.alpha``
    when ``sample[dt.F] == 1``, otherwise ``dt.beta``. At that split the
    prediction is 1 when the sample's feature value agrees with the sign of
    the stored balance (feature == 1 and balance > 0, or feature != 1 and
    balance < 0), and 0 in every other case, including a balance of exactly 0.

    Parameters
    ----------
    sample
        Mapping-like object indexable by feature name.
    dt : tree
        Tree supplying ``F``, ``alpha``/``alpha_balance`` and
        ``beta``/``beta_balance``.

    Returns
    -------
    int
        ``1`` or ``0``.
    """
    # Select the second-level split that applies to this sample.
    if sample[dt.F] == 1:
        feature, balance = dt.alpha, dt.alpha_balance
    else:
        feature, balance = dt.beta, dt.beta_balance

    if sample[feature] == 1:
        return 1 if balance > 0 else 0
    return 1 if balance < 0 else 0

In [5]:
def _best_gain_feature(feature_table:pd.DataFrame, n_candidates:int, rng=None):
    """Draw ``n_candidates`` rows at random and return the label with the highest ``gain(s)``."""
    candidates = feature_table.sample(n=n_candidates, random_state=rng)
    return candidates.sort_values(by="gain(s)",ascending=False).index[0]


def create_tree(group:pd.DataFrame, random_state=None):
    """Fit one depth-two tree on a bootstrap resample of ``group``.

    A bootstrap sample (same size as ``group``, drawn with replacement) is
    taken. At the root and in each of the two resulting partitions,
    ``ceil(sqrt(n_columns))`` features are drawn at random and the one with
    the highest ``gain(s)`` is chosen.

    Parameters
    ----------
    group : pd.DataFrame
        Full dataset: one row per sample, one column per feature.
    random_state : int or numpy.random.Generator, optional
        Seed (or generator) for all random draws, for reproducible trees.
        With the default ``None`` pandas draws from NumPy's global random
        state, as before.

    Returns
    -------
    list
        ``[n_oob, oob, F, alpha, beta]``: the number of out-of-bag rows, the
        out-of-bag rows themselves, the root feature, the feature chosen in
        the ``F == 1`` partition and the feature chosen in the ``F == 0``
        partition.
    """
    rng = None if random_state is None else np.random.default_rng(random_state)

    bootstrap = group.sample(n=len(group),replace=True,random_state=rng)
    # Out-of-bag rows are the samples whose index label was never drawn.
    # Comparing row *values* (concat + drop_duplicates) is wrong here: two
    # different samples with identical feature rows would both be discarded
    # even when neither was drawn.
    oob = group.loc[~group.index.isin(bootstrap.index)]

    # Number of candidate features inspected at every node.
    n_candidates = math.ceil(math.sqrt(bootstrap.shape[1]))

    # Root split.
    F = _best_gain_feature(create_feature_table(bootstrap), n_candidates, rng)

    # Partition the bootstrap sample on the root feature.
    group_A = bootstrap[(bootstrap[F] == 1)]
    group_B = bootstrap[(bootstrap[F] == 0)]
    gA_ft = create_feature_table(group_A)
    gB_ft = create_feature_table(group_B)

    # Second-level splits, one per partition. The matching `balance` values
    # remain available in gA_ft / gB_ft under the chosen labels.
    alpha = _best_gain_feature(gA_ft, n_candidates, rng)
    beta = _best_gain_feature(gB_ft, n_candidates, rng)

    return [oob.shape[0],oob,F,alpha,beta]

In [6]:
# Fit nine trees on the full dataset. Each entry of `results` is the list
# returned by create_tree: [OOB size, OOB rows, root feature, feature chosen
# in the F == 1 partition, feature chosen in the F == 0 partition].
# create_tree draws random samples, so the trees differ from run to run
# unless the random state is fixed beforehand.
results = []
for trial in range(9):
    results.append(create_tree(data))

In [32]:
# Print the feature chosen at each node, one line per fitted tree. Iterating
# over `results` (rather than a hard-coded range(9)) keeps this cell correct
# if the number of trees changes. The positions are those of the list
# returned by create_tree: 2 = root, 3 = F == 1 child, 4 = F == 0 child.
for heading, position in (("Root Features --", 2),
                          ("Child (L) Features --", 3),
                          ("Child (R) Features --", 4)):
    print(heading)
    for fitted in results:
        print(f"\t{fitted[position]}")

Root Features --
	NEFM_GRCh37_8:24772062-24772062_Silent_SNP_G-G-A
	TECTA_GRCh37_11:121032949-121032949_Silent_SNP_C-C-T
	RNF111_GRCh37_15:59376343-59376343_Frame-Shift-Del_DEL_C-C--
	MICAL3_GRCh37_22:18300932-18300932_Frame-Shift-Del_DEL_G-G--
	BRAF_GRCh37_7:140453136-140453136_Missense-Mutation_SNP_A-A-T
	CARD11_GRCh37_7:2968323-2968323_Frame-Shift-Del_DEL_G-G--
	ZNF271_GRCh37_18:32888087-32888090_RNA_DEL_AAAC-AAAC--
	MBD6_GRCh37_12:57921002-57921002_Frame-Shift-Del_DEL_C-C--
	MYO16_GRCh37_13:109379898-109379898_Silent_SNP_C-C-T
Child (L) Features --
	TNNI3_GRCh37_19:55669006-55669006_5'UTR_SNP_G-G-A
	PPP1R26_GRCh37_9:138379043-138379043_Frame-Shift-Del_DEL_G-G--
	MDGA2_GRCh37_14:47530771-47530771_Frame-Shift-Del_DEL_T-T--
	PCLO_GRCh37_7:82582944-82582944_Frame-Shift-Del_DEL_T-T--
	EXTL3_GRCh37_8:28573612-28573612_Silent_SNP_G-G-A
	DLX1_GRCh37_2:172952994-172952994_3'UTR_SNP_G-G-A
	KIAA2018_GRCh37_3:113371901-113371901_3'UTR_DEL_A-A--
	SOX11_GRCh37_2:5840844-5840844_3'UTR_SNP_C-C-T
	

In [35]:
# Out-of-bag sample count per tree (position 0 of each create_tree result)
# and their mean. The total is no longer kept in a variable named `sum`:
# that shadowed the builtin for every later cell in the kernel session.
# NOTE: in a kernel where the old cell already ran, restart (or `del sum`)
# before running this one.
oob_sizes = [fitted[0] for fitted in results]

print("OOB Sizes --")
for trial, size in enumerate(oob_sizes):
    print(f"\ttrial({trial}): {size}")
print("OOB AVG --")
print(f"\t{sum(oob_sizes) / len(oob_sizes)}")

OOB Sizes --
	trial(0): 73
	trial(1): 72
	trial(2): 64
	trial(3): 65
	trial(4): 67
	trial(5): 77
	trial(6): 70
	trial(7): 67
	trial(8): 67
OOB AVG --
	69.11111111111111
