# Computation of the final result and of the data for the report

In [15]:
import pandas as pd
import numpy as np
import pylab as pl

## Useful functions:

In [16]:
def DCG(relevances, part=None):
    """Compute the discounted cumulative gain of a ranked relevance list.

    The item at 0-based rank ``i`` contributes ``relevances[i] / log2(i + 2)``.

    Parameters
    ----------
    relevances : sequence of numbers
        Relevance scores ordered by rank, first-ranked item first.
    part : int or None, optional
        If given, only the first ``part`` items are scored. ``None`` (the
        default) scores the whole list.

    Returns
    -------
    float
        The DCG value; ``0.`` when there is nothing to score.
    """
    # Slicing with ``None`` keeps every element, so the default needs no
    # separate branch (and no ``== None`` comparison).
    relevances = np.asarray(relevances)[:part]
    n_relevances = len(relevances)
    if n_relevances == 0:
        return 0.
    # Rank 0 is divided by log2(2) == 1, rank 1 by log2(3), and so on.
    discounts = np.log2(np.arange(n_relevances) + 2)
    return np.sum(relevances / discounts)

def NDCG(relevances, part=None):
    """Compute the DCG of ``relevances`` normalised by the best achievable DCG.

    The normaliser is the DCG of the same scores sorted in descending order,
    evaluated with the same ``part`` cut-off. When that ideal DCG is zero the
    result is defined as ``0.``.
    """
    ideal_ranking = sorted(relevances, reverse=True)
    ideal_dcg = DCG(ideal_ranking, part)
    if ideal_dcg != 0:
        return DCG(relevances, part) / ideal_dcg
    return 0.

##### Read pathway data

In [17]:
# Pathway tables; both files are tab-separated.
df_pathway = pd.read_csv("df_pathway.csv", sep="\t")
df_relations = pd.read_csv("df_relations.csv", sep="\t")
# Set of gene identifiers from the "Gene" column, used for membership tests.
# Taken from the frame loaded above instead of parsing df_pathway.csv again.
pathway_set = set(df_pathway["Gene"])

In [18]:
def to_array_weights(str_weights):
    """Parse a "weight:gene weight:gene ..." string into sorted pairs.

    Entries are separated by single spaces; inside an entry the text before
    the first ":" is converted to ``float`` and the text between the first
    and second ":" is taken as the gene name.

    Returns a list of ``(weight, gene)`` tuples in descending order.
    """
    pairs = []
    for entry in str_weights.split(" "):
        fields = entry.split(":")
        pairs.append((float(fields[0]), fields[1]))
    pairs.sort(reverse=True)
    return pairs
def to_str_weights(array_weights):
    """Serialise ``(weight, gene)`` pairs as a "weight:gene ..." string.

    Each pair becomes ``str(weight) + ":" + str(gene)`` and the pieces are
    joined with single spaces, keeping the order of ``array_weights``.
    """
    entries = [str(pair[0]) + ":" + str(pair[1]) for pair in array_weights]
    return " ".join(entries)
def position_of_pathway_genes(weights):
    """Return the 0-based ranks at which pathway genes appear.

    ``weights`` is a "weight:gene ..." string. It is parsed and ordered by
    ``to_array_weights``; every position in that ordering whose gene is a
    member of the module-level ``pathway_set`` is reported.

    Returns a 1-D numpy integer array of positions in ascending order.
    """
    # A concrete list is required here: under Python 3 ``map`` yields a lazy
    # iterator, which ``np.where`` would not interpret as a boolean sequence.
    in_pathway = [pair[1] in pathway_set for pair in to_array_weights(weights)]
    return np.where(in_pathway)[0]

In [29]:
def transform_df_with_pathway_genes(df_result):
    """Annotate a result table with the pathway genes found in each row.

    Parameters
    ----------
    df_result : pandas.DataFrame
        Table with a "Weights" column holding "weight:gene ..." strings or
        null values. An existing "NumberPathwayGenes" column is renamed to
        "NumberOfCauseGenes" before the new columns are added.

    Returns
    -------
    pandas.DataFrame
        The renamed frame returned by ``rename`` (the argument itself is left
        untouched) with two added columns:
        "NumberPathwayGenes" - how many pathway genes the row contains, and
        "PositionPathway" - their 0-based positions, space-separated.
        Rows with a null "Weights" value get ``0`` and ``""``.
    """
    df_result = df_result.rename(columns={'NumberPathwayGenes': 'NumberOfCauseGenes'})
    pathway_position = []
    pathway_len = []
    # Iterate over the column values rather than looking each one up with
    # ``.loc[label, "Weights"]``: a label lookup returns a Series when index
    # labels repeat, which made the null check below fail.
    for weights in df_result["Weights"]:
        if pd.isnull(weights):
            p = []
        else:
            p = position_of_pathway_genes(weights)
        pathway_position.append(" ".join(map(str, p)))
        pathway_len.append(len(p))
    df_result["NumberPathwayGenes"] = pathway_len
    df_result["PositionPathway"] = pathway_position
    return df_result
# Load each method's result table and annotate it with pathway-gene columns.
lasso_result, randomized_lasso_result, rfe_lasso_result = [
    transform_df_with_pathway_genes(pd.read_csv(result_file))
    for result_file in ("lasso_result.csv",
                        "randomized_lasso_result.csv",
                        "rfe_lasso_result.csv")
]
#rfe_result = transform_df_with_pathway_genes(pd.read_csv("rfe_result.csv"))

## Producing the final list of genes: for every target, keep only the NUMBER_OF_GENES_TO_SELECT best

In [30]:
# How many top-weighted genes are kept per target in the final list; also the
# minimum "NumberGenes" a result row needs in order to be considered.
NUMBER_OF_GENES_TO_SELECT = 100

In [39]:
# Pool the result tables of the three methods.
df_data = pd.concat([lasso_result, randomized_lasso_result, rfe_lasso_result])

# Keep rows with at least NUMBER_OF_GENES_TO_SELECT genes and, within every
# (Method, Gene) group, take the first entry in ascending "NumberGenes" order.
# ``sort_values`` is used because ``DataFrame.sort`` no longer exists in
# current pandas releases.
df_maximum_selected = (
    df_data[df_data["NumberGenes"] >= NUMBER_OF_GENES_TO_SELECT]
    .sort_values("NumberGenes", ascending=True)
    .groupby(["Method", "Gene"], as_index=False)
    .first()
)

# NOTE: the spelling "NumberOfPatwayGenes" is kept on purpose; it becomes a
# column header in the CSV files written from this frame.
result_columns = ["Method", "Gene", "DCG", "NumberOfPatwayGenes", "PathwayRank", "SelectedGenes"]

# Collect one record per target and build the frame once at the end, instead
# of growing a DataFrame with ``append`` inside the loop (quadratic copying,
# and ``DataFrame.append`` is gone from current pandas).
result_rows = []
for i in df_maximum_selected.index:
    top_weights = to_array_weights(df_maximum_selected.loc[i, "Weights"])[:NUMBER_OF_GENES_TO_SELECT]
    selected_genes = to_str_weights(top_weights)
    pos_pathway = position_of_pathway_genes(selected_genes)
    # 0/1 relevance vector over the selected ranks: 1 where a pathway gene sits.
    dcg = DCG(np.in1d(range(NUMBER_OF_GENES_TO_SELECT), pos_pathway)*1)
    result_rows.append({"Method": df_maximum_selected.loc[i, "Method"],
                        "Gene": df_maximum_selected.loc[i, "Gene"],
                        "DCG": dcg,
                        "PathwayRank": " ".join(map(str, pos_pathway)),
                        "SelectedGenes": selected_genes,
                        "NumberOfPatwayGenes": len(pos_pathway)})

df_result = pd.DataFrame(result_rows, index=df_maximum_selected.index, columns=result_columns)


In [40]:
# Per-method averages of the two numeric score columns. They are selected
# explicitly because df_result also holds text columns ("Gene", "PathwayRank",
# "SelectedGenes") that cannot be averaged.
method_stat_df = df_result.groupby("Method", as_index=False)[["DCG", "NumberOfPatwayGenes"]].mean()
method_stat_df.to_csv("report/method_comparison.csv", index=False)
method_stat_df

Unnamed: 0,Method,DCG,NumberOfPatwayGenes
0,Lasso,1.104644,3.232558
1,RFE,1.031572,3.232558
2,RandomizedLasso,0.873925,2.976744


In [41]:
# Save the per-target results (one row per Method/Gene pair) without the index.
df_result.to_csv("report/result.csv", index = False)

## Producing final aggregation

In [48]:
# One row per Lasso target: the names of its selected genes, highest weight
# first. Rows are built as real lists; a ``map`` object is only a list under
# Python 2 and would end up in the frame as a single opaque value otherwise.
lasso_selected = df_result.loc[df_result["Method"] == "Lasso", "SelectedGenes"]
matrix_of_lists = [
    [pair[1] for pair in to_array_weights(selected)]
    for selected in lasso_selected
]
pd.DataFrame(matrix_of_lists).to_csv("lists_for_aggregation.csv", index = False)

In [49]:
# Run the external R script (IPython shell escape); its stdout is discarded.
# NOTE(review): presumably it reads lists_for_aggregation.csv and writes
# selected_genes_list.csv, and the argument 10 is the number of genes to
# return - confirm in lists_aggregation.r.
!Rscript lists_aggregation.r 10 > /dev/null

In [50]:
import os
# Delete the intermediate file written for the aggregation step.
os.remove("lists_for_aggregation.csv")
# Load the aggregated gene list, then delete its file as well; only the
# in-memory frame is used from here on.
selected_genes_df = pd.read_csv("selected_genes_list.csv")
os.remove("selected_genes_list.csv")

In [51]:
# Flag every aggregated entry whose "Genes" value is a member of pathway_set.
pathway_genes = list(pathway_set)
selected_genes_df["InPathway"] = np.in1d(selected_genes_df["Genes"], pathway_genes)

# Reversed row number: the first row gets the highest rank, the last row 0.
n_selected = selected_genes_df.shape[0]
selected_genes_df["Rank"] = (n_selected - 1) - np.array(selected_genes_df.index)

selected_genes_df.to_csv("report/selected_genes_list_lasso.csv", index = False)

selected_genes_df

Unnamed: 0,Genes,InPathway,Rank
0,VIT_14s0068g00930,True,9
1,VIT_14s0068g00920,True,8
2,VIT_18s0001g09400,False,7
3,VIT_06s0004g02620,False,6
4,VIT_03s0088g00260,False,5
5,VIT_04s0023g03370,True,4
6,VIT_05s0136g00260,True,3
7,VIT_00s0361g00040,True,2
8,VIT_06s0009g03040<br>VIT_06s0009g03050<br>VIT_...,True,1
9,VIT_15s0046g00170,True,0


In [47]:
def get_tex_compatible(df):
    """Return a copy of ``df`` whose "Genes" column can be typeset in TeX.

    Every underscore is prefixed with a backslash, and any entry that is then
    longer than 60 characters is cut to its first 30 characters followed by
    "...". The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named "Genes".

    Returns
    -------
    pandas.DataFrame
        The modified copy.
    """
    new_df = df.copy()
    # "\\_" is a backslash followed by an underscore, spelled with a valid
    # escape; the previous "\_" literal relied on an invalid escape sequence.
    new_df["Genes"] = new_df["Genes"].str.replace("_", "\\_")
    # A list (not a lazy ``map`` object) so the assignment behaves the same
    # under Python 2 and Python 3.
    new_df["Genes"] = [s[:30] + "..." if len(s) > 60 else s for s in new_df["Genes"]]
    return new_df

# First ten aggregated entries, TeX-escaped, for the report.
first_ten = selected_genes_df.iloc[:10, :]
get_tex_compatible(first_ten).to_csv("report/selected_genes_list_10.csv", index = False)

# Entries that are not flagged as pathway members: exported and displayed.
outside_pathway = selected_genes_df[np.logical_not(selected_genes_df["InPathway"])]
get_tex_compatible(outside_pathway).to_csv("report/selected_genes_not_pathway.csv", index = False)
outside_pathway

Unnamed: 0,Genes,InPathway,Rank
1,VIT_06s0004g02620,False,23
2,VIT_18s0001g09400,False,22
4,VIT_03s0088g00260,False,20
16,VIT_14s0060g00900,False,8
18,VIT_13s0064g01480,False,6
19,VIT_02s0012g01570,False,5
20,VIT_16s0100g00270,False,4
22,VIT_12s0028g01150,False,2
23,VIT_03s0091g01290,False,1
24,VIT_07s0005g06460,False,0
