In [111]:
import pandas as pd
import numpy as np
# Load the input table from the working directory. The functions in this notebook
# read column 0 as sample labels matching C<digits> / NC<digits> and compare every
# remaining column against 0 and 1 (presumably binary mutation flags -- confirm
# against the CSV).
data = pd.read_csv("mutationc.csv")

In [112]:
# returns a list of `folds` disjoint dataframes that together cover `group`
def create_training_testing_df(group: pd.DataFrame, folds: int, random_state=None):
    """Randomly partition ``group`` into ``folds`` disjoint, near-equal parts.

    Parameters
    ----------
    group : pd.DataFrame
        Rows to split. Rows are removed from the pool by index label, so a
        duplicated label would remove every row carrying it; a unique index
        is expected.
    folds : int
        Number of parts to produce; must be a positive whole number.
    random_state : optional
        Forwarded to ``DataFrame.sample`` for every draw. Leave as ``None``
        for a different split on each call, or pass a seed to make the split
        reproducible.

    Returns
    -------
    list of pd.DataFrame
        ``folds`` frames. The caller decides which part is used for training
        and which for testing. With ``folds == 1`` the list holds ``group``
        itself.

    Raises
    ------
    ValueError
        If ``folds`` is not a positive whole number.
    """
    if int(folds) != folds or folds < 1:
        raise ValueError(f"folds must be a positive whole number, got {folds!r}")
    folds = int(folds)

    parts = []
    remaining = group
    # Drawing 1/k of what is left for k = folds, folds-1, ..., 2 gives every
    # part roughly len(group)/folds rows; the final part is whatever remains.
    for parts_left in range(folds, 1, -1):
        part = remaining.sample(frac=1 / parts_left, random_state=random_state)
        parts.append(part)
        remaining = remaining.drop(part.index)
    parts.append(remaining)
    return parts

# builds a per-feature table with TP, FP, FN, TN and TP-FP counts
def compute_confusion_table(group:pd.DataFrame):
    """Count confusion-matrix cells for every feature column of ``group``.

    Column 0 holds the sample labels: values matching ``C<digits>`` are read
    as 1 and values matching ``NC<digits>`` as 0. Every other column is
    treated as a prediction, where a cell equal to 1 predicts the positive
    class and a cell equal to 0 predicts the negative class.

    Returns a DataFrame indexed by feature column name with the columns
    ``TP``, ``FP``, ``FN``, ``TN`` and ``TP-FP``.
    """
    label_text = group.iloc[:, 0]
    label_text = label_text.replace(r'^C\d+$', '1', regex=True)
    label_text = label_text.replace(r'^NC\d+$', '0', regex=True)
    truth = label_text.astype(int).to_numpy()

    features = group.iloc[:, 1:]
    says_one = features == 1
    says_zero = features == 0

    # Row masks select the samples of each true class; summing the boolean
    # cells of those rows counts, per feature, how often it said 1 or 0.
    truly_one = truth == 1
    truly_zero = truth == 0

    counts = {
        "TP": says_one.loc[truly_one].sum(axis=0),
        "FP": says_one.loc[truly_zero].sum(axis=0),
        "FN": says_zero.loc[truly_one].sum(axis=0),
        "TN": says_zero.loc[truly_zero].sum(axis=0),
    }
    counts["TP-FP"] = counts["TP"] - counts["FP"]
    return pd.DataFrame(counts)

In [113]:
def testing_process(training:pd.DataFrame,testing:pd.DataFrame) :
    """Fit a two-level decision rule on ``training`` and score it on ``testing``.

    Both frames carry sample labels in column 0 (``C<digits>`` is read as 1,
    ``NC<digits>`` as 0) and one column per feature after it.

    The rule is built from TP-FP scores of ``compute_confusion_table``:

    * ``F``     - feature with the highest TP-FP on the whole training set.
    * ``alpha`` - best feature, other than ``F``, among training rows where
      ``F == 1``.
    * ``beta``  - best feature among training rows where ``F == 0``.

    A test sample is classified 1 when the feature of its branch
    (``alpha`` if ``F == 1``, otherwise ``beta``) equals 1, else 0.

    Parameters
    ----------
    training : pd.DataFrame
        Rows used to choose ``F``, ``alpha`` and ``beta``. Needs at least two
        feature columns.
    testing : pd.DataFrame
        Rows to classify. It is not modified; the classified copy is returned.

    Returns
    -------
    list
        ``[F, alpha, beta, classified_testing, metrics]`` where
        ``classified_testing`` is a copy of ``testing`` with an added
        ``classification`` column and ``metrics`` is a one-row DataFrame.
        A metric whose denominator is zero is reported as NaN.

    Raises
    ------
    IndexError
        If ``training`` has fewer than two feature columns, so no ``alpha``
        distinct from ``F`` exists.
    """
    def _ratio(numerator, denominator):
        # Each numerator is a component of its denominator, so a zero
        # denominator means 0/0; report NaN without a numpy warning.
        return numerator / denominator if denominator else float("nan")

    ct = compute_confusion_table(training)
    if len(ct.index) < 2:
        raise IndexError("training needs at least two feature columns to derive alpha")

    # find top TP-FP feature define as F (idxmax: first column wins a tie)
    F = ct['TP-FP'].idxmax()

    # derive group A and group B
    group_A = training[(training[F] == 1)]
    group_B = training[(training[F] == 0)]

    # get TP FP values for group A and group B
    gA_confusion_table = compute_confusion_table(group_A)
    gB_confusion_table = compute_confusion_table(group_B)

    # derive alpha and beta features
    # F is constant (== 1) inside group A, so it is removed explicitly rather
    # than assumed to occupy the first position of the ranking: another
    # feature can beat or tie it, and taking "the second entry" would then
    # return the wrong feature (possibly F itself).
    alpha = gA_confusion_table['TP-FP'].drop(F).idxmax()
    beta = gB_confusion_table['TP-FP'].idxmax()

    # classifying step, vectorised; work on a copy so the caller's frame is
    # left untouched
    classified = testing.copy()
    on_branch_A = classified[F] == 1
    classified['classification'] = np.where(
        on_branch_A, classified[alpha] == 1, classified[beta] == 1
    ).astype(int)

    # derive predicted values from classifications like before
    actual = classified.iloc[:,0].replace(r'^C\d+$','1',regex=True).replace(r'^NC\d+$','0',regex=True).astype(int)
    predicted = classified['classification']
    TP = int(((actual == 1) & (predicted == 1)).sum())
    FP = int(((actual == 0) & (predicted == 1)).sum())
    FN = int(((actual == 1) & (predicted == 0)).sum())
    TN = int(((actual == 0) & (predicted == 0)).sum())

    metrics = pd.DataFrame({"Accuracy":_ratio(TP + TN, TP + TN + FP + FN),
                            "Sensitivity":_ratio(TP, TP + FN),
                            "Specificity":_ratio(TN, TN + FP),
                            "Precision":_ratio(TP, TP + FP),
                            "Miss Rate":_ratio(FN, TP + FN),
                            "False Discovery Rate":_ratio(FP, TP + FP),
                            "False Omission Rate":_ratio(FN, FN + TN)},index=[0])

    return [F, alpha, beta, classified, metrics]

In [114]:
# Split the data into three parts; each trial trains on one part and is
# evaluated on the other two combined (kept in their original order).
sets = create_training_testing_df(data,3)
results_1, results_2, results_3 = (
    testing_process(
        sets[train_pos],
        pd.concat([part for pos, part in enumerate(sets) if pos != train_pos]),
    )
    for train_pos in range(3)
)

In [115]:
# Report the three features (F, alpha, beta) chosen in each trial.
for trial_no, outcome in enumerate((results_1, results_2, results_3), start=1):
    root_feature, alpha_feature, beta_feature = outcome[:3]
    print(f"Trial {trial_no} - Decision Tree Features:\n\t{root_feature}\n\t{alpha_feature}\n\t{beta_feature}\n")

Trial 1 - Decision Tree Features:
	BRAF_GRCh37_7:140453136-140453136_Missense-Mutation_SNP_A-A-T
	DRD5_GRCh37_4:9785349-9785349_3'UTR_SNP_G-G-C
	KRAS_GRCh37_12:25398284-25398284_Missense-Mutation_SNP_C-C-A_C-C-T_C-C-G

Trial 2 - Decision Tree Features:
	BRAF_GRCh37_7:140453136-140453136_Missense-Mutation_SNP_A-A-T
	DRD5_GRCh37_4:9785349-9785349_3'UTR_SNP_G-G-C
	KRAS_GRCh37_12:25398284-25398284_Missense-Mutation_SNP_C-C-A_C-C-T_C-C-G

Trial 3 - Decision Tree Features:
	DRD5_GRCh37_4:9785349-9785349_3'UTR_SNP_G-G-C
	DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-Del_DEL_C-C--
	KRAS_GRCh37_12:25398281-25398281_Missense-Mutation_SNP_C-C-T



In [118]:
# Element 4 of each result list is that trial's one-row metrics frame.
trial_metrics = [outcome[4] for outcome in (results_1, results_2, results_3)]

# Element-wise mean of the three metric frames.
avg_metrics = sum(trial_metrics[1:], trial_metrics[0]) / 3

for metrics_frame in trial_metrics:
    print(metrics_frame.Accuracy)
print(avg_metrics)

0    0.574627
Name: Accuracy, dtype: float64
0    0.634328
Name: Accuracy, dtype: float64
0    0.5
Name: Accuracy, dtype: float64
   Accuracy  Sensitivity  Specificity  Precision  Miss Rate  \
0  0.569652     0.308781     0.849524   0.676236   0.691219   

   False Discovery Rate  False Omission Rate  
0              0.323764             0.461935  
