# Aracne Setup
Aracne uses information theoretic approaches for constructing gene regulatory networks using gene expression data. 

In [1]:
import os
import pandas
import matplotlib.pyplot as plt
import seaborn
import numpy

%matplotlib inline

class FilePaths():
    def __init__(self):
        self.dire = r'/home/b3053674/Documents/Miscellaneous/Aracne/AracneMicroarray'
        self.input_file = os.path.join(self.dire, 'MicroarrayDEGs.csv')
        self.output_filename = os.path.join(self.dire, 'ArcneOutput.adj')
        self.TFlist_file=os.path.join(self.dire, 'TFsInMicroarrayDataProbeIDs.txt')
        
        
        
        
F=FilePaths()




## Aracne Output Formatting 
Aracne output is a large tsv file that needs formatting to a network file for import into cytoscape.

### Get annotation data from input file

In [2]:
data = pandas.read_csv(F.input_file, sep='\t')
anno_df = data[['ProbeID','GeneSymbol']]

print(anno_df)

           ProbeID GeneSymbol
0    11746909_a_at       A1CF
1    11736238_a_at      ABCA5
2      11724734_at      ABCB8
3      11723976_at      ABCC8
4    11718612_a_at      ABCD4
5    11758217_s_at    ABHD17C
6    11744541_a_at      ACKR3
7      11730931_at      ACSS3
8      11738454_at     ACTBL2
9      11727682_at     ACVR2B
10   11733762_a_at   ADAMTSL1
11   11756406_x_at        ADM
12   11743232_a_at       AFDN
13   11743984_a_at      AKAP1
14   11723804_a_at     AKAP12
15   11716720_a_at     AKT1S1
16     11733006_at     AMOTL2
17   11719576_s_at        AMT
18   11719577_a_at        AMT
19   11755899_x_at   ANKRD36B
20   11755987_a_at    ANKRD44
21   11756425_a_at     ANKS1A
22   11727683_a_at      AP3B2
23   11734819_s_at      APBB2
24   11723975_x_at      APOL3
25   11722736_s_at      APPL1
26   11733928_a_at      ARMC8
27   11751585_a_at       ASB8
28   11716966_a_at       ATN1
29   11716154_x_at    B4GALT1
..             ...        ...
183  11736576_x_at     TMEM53
184  11716

### Parse Aracne output file into pandas.DataFrame

In [3]:
def parse_data(data_file):
    with open(data_file) as f:
        data = [i for i in f if i[0]!='>' ]
        
    data =[i.split('\t') for i in data]
    headers = [i[0] for i in data]
    df =pandas.DataFrame(data, index=headers)
    
    return df
        
data = parse_data(F.output_filename)


### Extract the gene symbols from input file, merge with aracne output and replace ID's with GeneSymbols

In [4]:
def merge_with_annotation(data, anno_data):
    """
    
    """
    ## filter anno_data by entries in data 
    anno_data = anno_data.set_index('ProbeID')
    filtered = anno_data[anno_data.index.isin(list(data.index))]
    merged = pandas.merge(data, anno_data, left_index=True, right_index = True)
    merged = merged.reset_index()
    merged = merged.set_index(['ProbeID','GeneSymbol'])
    merged = merged.replace(merged.reset_index(level=1)['GeneSymbol'])
    merged = merged.drop(0, axis=1)
    merged.columns = range(merged.shape[1])
    return merged

merged = merge_with_annotation(data, anno_df)
print(merged)

                                0           1         2           3   \
ProbeID       GeneSymbol                                               
11746909_a_at A1CF           ACKR3  0.12346012   ANKRD44  0.09563734   
11736238_a_at ABCA5          ABCC8  0.17195092       AMT  0.13487245   
11724734_at   ABCB8         CLEC5A  0.12801919   EPB41L3   0.1886547   
11723976_at   ABCC8          ABCA5  0.17195092    CX3CL1  0.12825855   
11718612_a_at ABCD4            ADM  0.37476195   DENND2D  0.21161518   
11758217_s_at ABHD17C     ADAMTSL1  0.13565304  EPM2AIP1  0.19136123   
11744541_a_at ACKR3           A1CF  0.12346012      AFDN  0.16063678   
11730931_at   ACSS3          CCNJL  0.14317375     CTXN1  0.12212541   
11738454_at   ACTBL2         DSCC1  0.15376738      GMFG  0.13622747   
11727682_at   ACVR2B         ERBB3  0.17860863    HIVEP3  0.34162498   
11733762_a_at ADAMTSL1     ABHD17C  0.13565304     APOL3  0.11842228   
11756406_x_at ADM            ABCD4  0.37476195      BTG2  0.1893

### Create a network table with "interactant" as index and "interactors" and MI weights as values

In [5]:
def create_network_table(data):
    """
    
    
    ## sanity check 
                                 Interactant    Weight Interactant    Weight  \
    index         GeneSymbol                                               
    11746909_a_at A1CF             ABCA5  0.038230       ABCB8  0.024966   
    11736238_a_at ABCA5             A1CF  0.038230       ABCB8  0.024771  
    """
    col = ['Interactant','MI']*int(data.shape[1]/2) 
    data.columns = col
    
    
    ids = list(set(data.index.get_level_values(0)))
    print (data)
    
    
    
    
#     for id in ids[2:3]:
#         df  =data.loc[id]
# #         df = df.replace(None, numpy.nan)
#         print (df.transpose())
    
    
#     interactions = data['Interactant'].stack()
    
#     print (data)
#     weights = data['MI'].stack()
# #     weights.index = weights.index.droplevel(2)
#     weights = pandas.DataFrame(weights)
#     weights.columns = ['MI']
#     weights['MI'] = pandas.to_numeric(weights['MI'])
#     print (weights)

#     interactions.index = interactions.index.droplevel(2)
#     weights.index = weights.index.droplevel(2)
    
# #     print (data)
    

#     weights = pandas.DataFrame(weights)
#     interactions = pandas.DataFrame(interactions)
#     interactions.columns = ['Interactions']
#     weights.columns = ['MI']
#     df  = interactions.merge(weights, left_index=True, right_index=True)     
#     df = df.sort_index(level=1)
# #     df = df.sort_values(by = 'weights')
#     df = df.reset_index(level=1)
#     df = df.reset_index()
    
#     ## convert weights to float
#     df['MI'] = pandas.to_numeric(df['MI'])
    
#     ##Sort
#     df=df.sort_values(by = ['ProbeID','GeneSymbol','Interactions','MI'])
    
#     ##set index
#     df = df.set_index(['ProbeID','GeneSymbol','Interactions'])
    
#     return df
    
# print(create_network_table(merged))
# interactions = create_network_table(merged)

# print(interactions) #(100514, 1)

In [151]:


def create_network_table(data):
    """
    
    
    """
    ids = list(set(data.index.get_level_values(0)))
    df_dct = {}
    for i in ids:
        df = data.loc[i]
        df=df.reset_index()
        df = df.set_index(['GeneSymbol'])
        x = [i for i in range(df.shape[1]) if i%2==0]
        x1 = [i+1 for i in range(df.shape[1]) if i%2==0]
        MI = df[x1]
        names = df[x]
        gene = list(set(df.index.get_level_values(0)))[0]
        MI = pandas.DataFrame(pandas.to_numeric(MI.loc[gene])  ).reset_index(drop=True) 
        names = pandas.DataFrame(names.loc[gene]).reset_index(drop=True)
        df =pandas.concat([names,MI], axis=1)
        df.columns = ['Interactors','MI']
        df =df.fillna(value=numpy.nan)
        df = df.dropna(how='any')
        df.index.name = 'index'
        df_dct[(i,gene)] = df

    df = pandas.concat(df_dct)
    df.index=df.index.droplevel([0,2])
    df.index.name = 'Interactie'
    df = df.reset_index()
    df = df.set_index(['Interactie','Interactors'])
    return df
    
network = create_network_table(merged)
print(network)

                              MI
Interactie Interactors          
EPAS1      BTG2         0.102197
           CASTOR2      0.110531
           CCNJL        0.099119
           CMTM4        0.258810
           FER          0.136804
           HAUS5        0.141458
           ICAM4        0.095180
           MAFG         0.199179
           RND2         0.099025
           SHTN1        0.127087
           SORBS1       0.106015
           WIZ          0.143479
           XPO1         0.261555
           ZFPM1        0.133050
TSC22D1    ABCA5        0.107250
           CCDC90B      0.283641
           CDC20B       0.081119
           COL3A1       0.225131
           CX3CL1       0.093045
           EDC4         0.107014
           FAM170A      0.093862
           FANCD2       0.083065
           FBXW2        0.175528
           FER          0.120302
           GLUD1        0.186470
           HLA-G        0.114996
           JAZF1        0.114455
           KCNJ5        0.100776
          

In [155]:
def to_file(df, fname):
    df = df.reset_index()
    df.to_csv(fname, index=False, sep='\t')    
to_file(network, 'Network.txt')


In [7]:


'''
                                             MI          
                                           mean       std
ProbeID       GeneSymbol Interactions                    
11715412_a_at EPAS1      BTG2          0.143821  0.056323
                         CASTOR2       0.143821  0.056323
                         CCNJL         0.143821  0.056323
                         CMTM4         0.143821  0.056323
                         FER           0.143821  0.056323
'''

import pandas
import numpy
def summarize(data):
    data =data.reset_index(level=0, drop=True)
    interactors = sorted(list(set(data.index.get_level_values(0))))
    for interactor in interactors:
        for label, df in data.loc[interactor].groupby(level=0):
            print (label, df.loc[label])
            
            
            
#         print(interactor,'\n', data.loc[interactor])
#         print (data.loc[interactor])
    
#     print(data.groupby(level=[0,1,2]).aggregate(numpy.mean))
#     return data.groupby(level=[0,1,2]).aggregate(numpy.mean)
    
    
    
    
#     data =data.reset_index(level=0, drop=True)
#     for label, df in data.groupby(level=[0,1]):
#         df_dct[label] = df.loc[label].sum()
        
#     df = pandas.DataFrame(df_dct)
#     df = df.transpose()
#     df.index.name = ['Interactor','Interactant']
        
#     print(data.groupby(level=[0,1]).aggregate(numpy.mean))


print (summarize(interactions))

    
# reset = interactions.reset_index()
# tester = reset[  (reset['ProbeID'] == '11715412_a_at') & ( reset['GeneSymbol'] == 'EPAS1') & ( reset['Interactions'] == 'BTG2') ] 
# print(tester['MI'].sum()/tester['MI'].shape[0])
# print(tester.groupby(by = list(tester.columns[:2])).aggregate(numpy.mean) )




# print(interactions.groupby(level=[0,1,2]).aggregate([numpy.mean, numpy.std]))
# print(interactions.loc[['11715412_a_at','EPAS1','BTG2']])

NameError: name 'interactions' is not defined

In [None]:
ACKR3 = [0.095637,0.110439,0.110816, 0.119141,0.123460]
'''sum 0.5594929999999999 '''

ANKRD44 =[0.095637,0.110439, 0.110816, 0.119141, 0.123460]
print (sum(ANKRD44))