# Aracne Input Formatting
Aracne uses information-theoretic approaches to construct gene regulatory networks from gene expression data.

In [78]:
import os
import pandas
import matplotlib.pyplot as plt
import seaborn
import numpy
import logging
# Configure the root logger: every message is prefixed with a timestamp and the
# logger name, and anything at INFO level or above is emitted.
logging.basicConfig(format = '%(asctime)s - %(name)s - %(message)s')
LOG=logging.getLogger()
LOG.setLevel(logging.INFO)
%matplotlib inline

class FilePaths():
    """
    Collect every path used by the workflow and create the result directories.

    :param dire: directory holding the microarray input files. ``None`` (the
                 default) falls back to ``DEFAULT_DIRE`` so that ``FilePaths()``
                 behaves exactly as before.

    ``treated_results``, ``control_results`` and ``none_results`` each map a
    timepoint to the directory that receives the results for that timepoint.
    """
    # Alternative location used previously:
    #     r'/home/b3053674/Documents/Miscellaneous/Aracne/AracneMicroarray'
    DEFAULT_DIRE = r'C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray'

    def __init__(self, dire=None):
        self.dire = self.DEFAULT_DIRE if dire is None else dire
        self.input_file = os.path.join(self.dire, 'MicroarrayDEGs.csv')
        self.output_filename = os.path.join(self.dire, 'ArcneOutput.adj')
        self.TFlist_file = os.path.join(self.dire, 'TFsInMicroarrayDataProbeIDs.txt')
        self.analysis_dir = os.path.join(self.dire, 'Analysis')
        self.treated_dir = os.path.join(self.analysis_dir, 'TGFb')
        self.control_dir = os.path.join(self.analysis_dir, 'Control')
        self.none_dir = os.path.join(self.analysis_dir, 'None')

        # the jar is looked up next to (not inside) the project directory
        self.aracne_path = os.path.join(os.path.dirname(self.dire), 'aracne2.jar')

        self.timepoints = [15, 30, 60, 90, 120, 150, 180]

        self.treated_results = {i: os.path.join(self.treated_dir, str(i)) for i in self.timepoints}
        self.control_results = {i: os.path.join(self.control_dir, str(i)) for i in self.timepoints}
        self.none_results = {i: os.path.join(self.none_dir, str(i)) for i in [0, 180]}

        self._create_directories()

    ## create some organized directories for the output
    def _create_directories(self):
        """
        Create one directory per treatment and timepoint unless it already exists.
        """
        for treatment in (self.treated_results, self.control_results, self.none_results):
            for directory in treatment.values():
                if not os.path.isdir(directory):
                    os.makedirs(directory)
        
        
        
# Shared path registry used by the cells below; constructing it also creates
# the per-treatment / per-timepoint result directories on disk.
F=FilePaths()


## Splitting Data
The microarray data is currently in one large array. My initial testing for setting up an Aracne workflow was done on the entire frame, but I need to split the frame into control vs. treated averaged data. Perhaps it would be prudent to take the log ratio of treated to control as input and then average over repeats, then isolate the time points and run them separately.

### Parse Data into Useful Format

In [57]:
def parse_input_data(f):
    """
    Read the combined microarray table and label its columns by condition.

    The file is tab-separated and its first two columns become the row index.
    Every remaining column header must look like ``<treatment>_<time>_<repeat>``
    where time and repeat are integers.

    :param f: path or buffer handed to ``pandas.read_csv``
    :return: frame with the rows as read and columns relabelled with a
             MultiIndex named ``('Treatment', 'Time', 'Repeat')``
    :raises ValueError: if a header does not have the three fields, or if its
                        time / repeat field is not an integer
    """
    df = pandas.read_csv(f, sep='\t', index_col=[0, 1])
    treatments, times, repeats = [], [], []
    for label in df.columns:
        # split from the right so a treatment name may itself contain '_'
        parts = str(label).rsplit('_', 2)
        if len(parts) != 3:
            raise ValueError(
                "Column {!r} is not of the form '<treatment>_<time>_<repeat>'".format(label))
        treatments.append(parts[0])
        times.append(int(parts[1]))
        repeats.append(int(parts[2]))
    # Relabel the columns in place rather than transposing the whole frame twice
    df.columns = pandas.MultiIndex.from_arrays(
        [treatments, times, repeats], names=['Treatment', 'Time', 'Repeat'])
    return df

# Load the combined microarray table; columns are keyed by (Treatment, Time, Repeat).
data = parse_input_data(F.input_file)

## Split frame and average repeats
It turns out this may not be as necessary as I thought. Just use the 6 repeats rather than averaging first.

In [16]:
# def split_df(data):
#     """
    
#     """
# #     data = data.groupby(level = [0,1], axis=1).aggregate(numpy.mean)
#     data = data.transpose()
#     data = data.reset_index()
#     data['Time'] = pandas.to_numeric(data['Time'])
#     data = data.set_index(['Treatment','Time'])
#     data = data.sort_index(level=[0,1])
#     return data.transpose()

# # print(data)
# split = split_df(data)
# print(split)

Treatment                 Control                                      \
Time                          15       15       15       15       15    
ProbeID       GeneSymbol                                                
Repeat                          4        1        5        6        3   
11746909_a_at A1CF        2.68738  3.02086  2.46422  2.81869  2.78952   
11736238_a_at ABCA5        4.7083  4.57503  4.62884  4.66972  4.67257   
11724734_at   ABCB8       2.98169  3.04122  3.05107  2.89673   3.1669   
11723976_at   ABCC8       3.23419  3.59044  3.33374  3.33537  3.23487   
11718612_a_at ABCD4       5.25866  4.62595  5.12392  5.25076  5.16112   
11758217_s_at ABHD17C     8.07195  8.03954  8.04837  8.04649   8.1341   
11744541_a_at ACKR3       2.37871  3.02689  2.61286  2.49904  2.39577   
11730931_at   ACSS3       2.07238  2.44476  2.36787  2.54338  2.30433   
11738454_at   ACTBL2      2.31414  2.31212  2.37544  2.42575  2.60165   
11727682_at   ACVR2B      3.73684  3.53092  3.83393

## Write each treatment and time point to a structured directory tree

In [79]:
def write_to_csv(data, directory, treatment='TGFb', time=15):
    """
    Save the columns of one treatment / time combination as a tab-separated file.

    :param data: frame whose columns carry (at least) three index levels; the
                 first two are selected with ``(treatment, time)``
    :param directory: directory that receives the file
    :param treatment: value of the first column level to select
    :param time: value of the second column level to select
    :return: path of the written file, ``<directory>/<treatment>_<time>.csv``
    """
    # sort the column levels (done on the transposed frame, then flipped back)
    column_sorted = data.T.sort_index(level=[0, 1, 2]).T
    subset = column_sorted[(treatment, time)]

    fname = os.path.join(directory, '{}_{}.csv'.format(treatment, time))
    LOG.info('Writing csv to {}'.format(fname))

    # get rid of a file left behind by an earlier run before writing
    if os.path.isfile(fname):
        os.remove(fname)
    subset.to_csv(fname, sep='\t')
    return fname


# write_to_csv(data, F.control_results[15], treatment='Control',time=15)
# NOTE(review): this list is not referenced by any visible cell, and its 'None'
# spelling differs from the lower-case 'none' label that
# iterate_over_timepoints passes to write_to_csv.
treatments  = ['TGFb','Control','None']

def iterate_over_timepoints(data):
    """
    Write one csv per treatment and timepoint and collect the written paths.

    The timepoints come from the module-level ``F`` so that this loop always
    matches the result directories ``F`` created.

    :param data: frame accepted by ``write_to_csv``
    :return: ``{'treated': {time: path}, 'control': {time: path}, 'none': {time: path}}``
    """
    d = {'treated': {}, 'control': {}, 'none': {}}

    for t in F.timepoints:
        d['treated'][t] = write_to_csv(data, F.treated_results[t], treatment='TGFb', time=t)
        d['control'][t] = write_to_csv(data, F.control_results[t], treatment='Control', time=t)

    # the untreated samples only exist for the timepoints that have a result directory
    for t in sorted(F.none_results):
        d['none'][t] = write_to_csv(data, F.none_results[t], treatment='none', time=t)
    return d
            
# {'treated' | 'control' | 'none': {time: path of the csv written for that condition}}
files = iterate_over_timepoints(data)
            

2017-07-16 22:32:56,717 - root - Writing csv to C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\TGFb\15\TGFb_15.csv
2017-07-16 22:32:56,732 - root - Writing csv to C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\Control\15\Control_15.csv
2017-07-16 22:32:56,753 - root - Writing csv to C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\TGFb\30\TGFb_30.csv
2017-07-16 22:32:56,776 - root - Writing csv to C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\Control\30\Control_30.csv
2017-07-16 22:32:56,801 - root - Writing csv to C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\TGFb\60\TGFb_60.csv
2017-07-16 22:32:56,822 - root - Writing csv to C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\Control\60\Control_60.csv
2017-07-16 22:32:56,844 - root - Writing csv to C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\TGFb\90\TGFb_90

## Run Aracne on every input file with the same parameters

In [134]:
def run1aracne(input_path, dpi_tolernce=0.85, p_val=0.05):
    """
    Run the Aracne jar (``F.aracne_path``) on a single input file.

    :param input_path: path of the file handed to the jar with ``-i``
    :param dpi_tolernce: value handed to the jar with ``-e``
    :param p_val: value handed to the jar with ``-p``
    :return: path handed to the jar with ``-o``: ``input_path`` without its
             extension, followed by ``'Network.csv'``. It is returned whether
             or not the run succeeded; failures are only logged.
    """
    import subprocess  # function-scope import: the notebook's import cell does not load it

    output_path = os.path.splitext(input_path)[0] + 'Network.csv'
    # An argument list (no shell involved) keeps paths that contain spaces or
    # shell metacharacters intact.
    command = [
        'java', '-jar', F.aracne_path,
        '-i', input_path,
        '-o', output_path,
        '-e', str(dpi_tolernce),
        '-p', str(p_val),
    ]
    try:
        status = subprocess.call(command)
    except OSError as error:
        # e.g. no ``java`` executable available; report it and carry on, as a
        # failing os.system call would not have raised either
        LOG.error('Could not start {}: {}'.format(command[0], error))
    else:
        if status != 0:
            LOG.warning('Aracne exited with status {} for {}'.format(status, input_path))
    return output_path
    
def run_aracne(input_files, dpi_tolernce=0.85, p_val=0.05):
    """
    Run Aracne on every file of a nested ``{treatment: {time: path}}`` mapping.

    :param input_files: ``{treatment: {time: input path}}``
    :param dpi_tolernce: forwarded to ``run1aracne``
    :param p_val: forwarded to ``run1aracne``
    :return: ``{treatment: {time: path returned by run1aracne}}``
    """
    results = {}
    for treatment, paths_by_time in input_files.items():
        per_time = results[treatment] = {}
        for time, input_path in paths_by_time.items():
            print(input_path)
            LOG.info('Running {}'.format(input_path))
            per_time[time] = run1aracne(
                input_path, dpi_tolernce=dpi_tolernce, p_val=p_val)
    return results

# {treatment: {time: path of the Aracne output requested for that input file}}
network_files = run_aracne(files)


2017-07-16 23:18:59,436 - root - Running C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\Control\15\Control_15.csv


C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\Control\15\Control_15.csv


2017-07-16 23:19:00,263 - root - Running C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\Control\180\Control_180.csv


C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\Control\180\Control_180.csv


2017-07-16 23:19:01,278 - root - Running C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\Control\150\Control_150.csv


C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\Control\150\Control_150.csv


2017-07-16 23:19:02,214 - root - Running C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\Control\120\Control_120.csv


C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\Control\120\Control_120.csv


2017-07-16 23:19:03,178 - root - Running C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\Control\90\Control_90.csv


C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\Control\90\Control_90.csv


2017-07-16 23:19:04,055 - root - Running C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\Control\60\Control_60.csv


C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\Control\60\Control_60.csv


2017-07-16 23:19:04,765 - root - Running C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\Control\30\Control_30.csv


C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\Control\30\Control_30.csv


2017-07-16 23:19:05,486 - root - Running C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\None\0\none_0.csv


C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\None\0\none_0.csv


2017-07-16 23:19:06,244 - root - Running C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\None\180\none_180.csv


C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\None\180\none_180.csv


2017-07-16 23:19:07,292 - root - Running C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\TGFb\15\TGFb_15.csv


C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\TGFb\15\TGFb_15.csv


2017-07-16 23:19:08,071 - root - Running C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\TGFb\180\TGFb_180.csv


C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\TGFb\180\TGFb_180.csv


2017-07-16 23:19:08,848 - root - Running C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\TGFb\150\TGFb_150.csv


C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\TGFb\150\TGFb_150.csv


2017-07-16 23:19:09,588 - root - Running C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\TGFb\120\TGFb_120.csv


C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\TGFb\120\TGFb_120.csv


2017-07-16 23:19:10,338 - root - Running C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\TGFb\90\TGFb_90.csv


C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\TGFb\90\TGFb_90.csv


2017-07-16 23:19:11,108 - root - Running C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\TGFb\60\TGFb_60.csv


C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\TGFb\60\TGFb_60.csv


2017-07-16 23:19:11,859 - root - Running C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\TGFb\30\TGFb_30.csv


C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\TGFb\30\TGFb_30.csv


# Aracne Output Formatting 
Aracne output is a large tsv file that needs to be reformatted into a network file for import into Cytoscape.

### Get annotation data from input file

In [121]:
# Re-read the raw input file (this rebinds ``data``) and keep only the two
# annotation columns that map each probe ID to its gene symbol.
data = pandas.read_csv(F.input_file, sep='\t')
anno_df = data[['ProbeID','GeneSymbol']]

LOG.info(anno_df.head())

2017-07-16 23:09:22,832 - root -          ProbeID GeneSymbol
0  11746909_a_at       A1CF
1  11736238_a_at      ABCA5
2    11724734_at      ABCB8
3    11723976_at      ABCC8
4  11718612_a_at      ABCD4


### Parse Aracne output file into pandas.DataFrame

In [143]:
def parse_data(data_file):
    """
    Read one Aracne output file into a frame of strings.

    Lines starting with ``'>'`` and empty lines are skipped. Every other line is
    split on tabs; its first field is also used as the row label. Rows with
    fewer fields than the longest row are padded with missing values by pandas.

    :param data_file: path of the file to read
    :return: ``pandas.DataFrame`` with one row per retained line
    """
    rows = []
    with open(data_file) as f:
        for line in f:
            # drop the line terminator so it does not end up inside the last field
            line = line.rstrip('\r\n')
            if not line or line.startswith('>'):
                continue
            rows.append(line.split('\t'))

    headers = [row[0] for row in rows]
    return pandas.DataFrame(rows, index=headers)

def parse_all(data_files):
    """
    Parse every Aracne output file of a nested ``{treatment: {time: path}}`` mapping.

    :param data_files: ``{treatment: {time: path of an Aracne output file}}``
    :return: ``{treatment: {time: frame returned by parse_data}}``
    """
    return {
        treatment: {
            time: parse_data(path)
            for time, path in paths_by_time.items()
        }
        for treatment, paths_by_time in data_files.items()
    }
            
# {treatment: {time: raw Aracne table}}; logged in full for inspection
df_dct = parse_all(network_files)
LOG.info(df_dct)


2017-07-16 23:24:10,049 - root - {'control': {15:                          0              1           2              3    \
11746909_a_at  11746909_a_at  11744541_a_at  0.09219639    11727682_at   
11736238_a_at  11736238_a_at    11723976_at  0.22129466  11718612_a_at   
11724734_at      11724734_at  11718612_a_at  0.10148671  11733762_a_at   
11723976_at      11723976_at  11736238_a_at  0.22129466  11718612_a_at   
11718612_a_at  11718612_a_at  11736238_a_at  0.22563548    11724734_at   
11758217_s_at  11758217_s_at  11736238_a_at  0.18293891    11723976_at   
11744541_a_at  11744541_a_at  11746909_a_at  0.09219639  11736238_a_at   
11730931_at      11730931_at  11736238_a_at   0.1375739    11723976_at   
11738454_at      11738454_at  11736238_a_at  0.12183497    11723976_at   
11727682_at      11727682_at  11746909_a_at  0.23486881    11723976_at   
11733762_a_at  11733762_a_at  11746909_a_at  0.21582629    11724734_at   
11756406_x_at  11756406_x_at    11724734_at  0.12252463    117

### Extract the gene symbols from the input file, merge them with the Aracne output and replace probe IDs with gene symbols

In [144]:
def merge_with_annotation(data, anno_data):
    """
    Replace the probe IDs inside an Aracne table with gene symbols.

    :param data: frame indexed by probe ID whose column ``0`` repeats that probe
                 ID (the layout produced by ``parse_data``)
    :param anno_data: frame with a ``'ProbeID'`` and a ``'GeneSymbol'`` column
    :return: frame indexed by ``('ProbeID', 'GeneSymbol')``. Column ``0`` of the
             input is dropped and the remaining columns are renumbered from 0.
             Every cell equal to a probe ID that is a row of the merged table
             is replaced by that probe's gene symbol.
    """
    annotation = anno_data.set_index('ProbeID')
    # inner join on the index: only probes present in both frames are kept
    merged = pandas.merge(data, annotation, left_index=True, right_index=True)
    # Name the joined index explicitly; the next step needs a 'ProbeID' column
    # and should not depend on merge carrying the name over.
    merged.index.name = 'ProbeID'
    merged = merged.reset_index()
    merged = merged.set_index(['ProbeID', 'GeneSymbol'])
    # Series of gene symbols labelled by probe ID, used as an old -> new lookup
    probe_to_symbol = merged.reset_index(level=1)['GeneSymbol']
    merged = merged.replace(probe_to_symbol)
    merged = merged.drop(0, axis=1)
    merged.columns = range(merged.shape[1])
    return merged

def merge_for_all_time_points(dct):
    """
    Annotate every parsed Aracne table with gene symbols.

    :param dct: ``{treatment: {time: frame}}``
    :return: the same nesting, with each frame passed through
             ``merge_with_annotation`` together with the module-level ``anno_df``
    """
    return {
        treatment: {
            time: merge_with_annotation(frame, anno_df)
            for time, frame in frames_by_time.items()
        }
        for treatment, frames_by_time in dct.items()
    }

# {treatment: {time: Aracne table with probe IDs replaced by gene symbols}}
merged_dct = merge_for_all_time_points(df_dct)
LOG.info(merged_dct)

2017-07-16 23:24:41,190 - root - {'control': {15:                             0           1         2           3         4    \
ProbeID       GeneSymbol                                                      
11746909_a_at A1CF        ACKR3  0.09219639    ACVR2B  0.23486881  ADAMTSL1   
11736238_a_at ABCA5       ABCC8  0.22129466     ABCD4  0.22563548   ABHD17C   
11724734_at   ABCB8       ABCD4  0.10148671  ADAMTSL1  0.09057609       ADM   
11723976_at   ABCC8       ABCA5  0.22129466     ABCD4  0.17084997   ABHD17C   
11718612_a_at ABCD4       ABCA5  0.22563548     ABCB8  0.10148671     ABCC8   
11758217_s_at ABHD17C     ABCA5  0.18293891     ABCC8  0.21582629     ABCD4   
11744541_a_at ACKR3        A1CF  0.09219639     ABCA5  0.09219639     ABCD4   
11730931_at   ACSS3       ABCA5   0.1375739     ABCC8  0.18778519     ABCD4   
11738454_at   ACTBL2      ABCA5  0.12183497     ABCC8  0.10450354     ABCD4   
11727682_at   ACVR2B       A1CF  0.23486881     ABCC8  0.07994476     ACKR3   
11

### Create a network table with "interactant" as index and "interactors" and MI weights as values

In [145]:


def create_network_table(data):
    """
    Reshape a wide Aracne table into a long table with one row per edge.

    :param data: frame whose index has the probe ID as first level and a
                 ``'GeneSymbol'`` level, and whose integer-labelled columns
                 alternate between an interactor name (even positions) and its
                 MI value (odd positions)
    :return: frame indexed by ``('Interactie', 'Interactors')`` with a single
             numeric ``'MI'`` column, sorted by both index levels. Padding
             cells (missing name or MI) are dropped.

    ``pandas.concat`` raises ``ValueError`` when ``data`` has no rows.
    """
    probe_ids = list(set(data.index.get_level_values(0)))
    tables = {}
    for probe_id in probe_ids:
        df = data.loc[probe_id]
        df = df.reset_index()
        df = df.set_index(['GeneSymbol'])
        # The comprehension variable must not reuse the loop variable's name:
        # under Python 2 it leaks and would overwrite the probe id used in the
        # dictionary key below.
        name_cols = [col for col in range(df.shape[1]) if col % 2 == 0]
        mi_cols = [col + 1 for col in name_cols]
        MI = df[mi_cols]
        names = df[name_cols]
        gene = list(set(df.index.get_level_values(0)))[0]
        MI = pandas.DataFrame(pandas.to_numeric(MI.loc[gene])).reset_index(drop=True)
        names = pandas.DataFrame(names.loc[gene]).reset_index(drop=True)
        df = pandas.concat([names, MI], axis=1)
        df.columns = ['Interactors', 'MI']
        # turn None padding into NaN so that dropna removes the padded cells
        df = df.fillna(value=numpy.nan)
        df = df.dropna(how='any')
        df.index.name = 'index'
        # keyed by probe as well as gene so that two probes of one gene do not collide
        tables[(probe_id, gene)] = df

    df = pandas.concat(tables)
    # keep only the gene symbol (middle level) of the (probe, gene, row) index
    df.index = df.index.droplevel([0, 2])
    df.index.name = 'Interactie'
    df = df.reset_index()
    df = df.set_index(['Interactie', 'Interactors'])
    df = df.sort_index(level=[0, 1])
    return df

def create_networks(data):
    """
    Build a network table for every annotated Aracne table.

    :param data: ``{treatment: {time: frame}}``
    :return: ``{treatment: {time: frame returned by create_network_table}}``
    """
    network_dct = {}
    for treatment, frames_by_time in data.items():
        per_time = network_dct.setdefault(treatment, {})
        for time, frame in frames_by_time.items():
            per_time[time] = create_network_table(frame)
    return network_dct

# {treatment: {time: long-format edge table indexed by ('Interactie', 'Interactors')}}
networks = create_networks(merged_dct)
#     print(network)

## Output results to file for import into cytoscape

In [158]:
def _drop_mirrored_edges(df):
    """Keep the first row for every unordered gene pair and set of row values."""
    if df.index.nlevels < 2:
        # no gene pair to compare: fall back to plain duplicate removal
        return df.drop_duplicates()
    sources = df.index.get_level_values(0)
    targets = df.index.get_level_values(1)
    seen = set()
    keep = []
    for source, target, values in zip(sources, targets, df.values.tolist()):
        # a frozenset makes (A, B) and (B, A) the same key
        key = (frozenset((source, target)), tuple(values))
        keep.append(key not in seen)
        seen.add(key)
    return df[numpy.array(keep, dtype=bool)]


def create_sif(data, output_directory):
    """
    Write every network table to a tab-separated file.

    Files are named ``<treatment>_<time>.csv`` and placed in
    ``<F.analysis_dir>/<output_directory>``, which is created when missing.

    Before writing, mirrored edges are removed: of two rows that join the same
    two genes with the same values, only the first is kept. Rows are compared
    on the gene pair as well as the values, because ``DataFrame.drop_duplicates``
    ignores the index and would also discard unrelated edges that merely share
    an MI value. This assumes the first two index levels hold the two gene
    names, as in the tables built by ``create_network_table``.

    :param data: ``{treatment: {time: network table}}``
    :param output_directory: name of the directory below ``F.analysis_dir``
    :return: ``{treatment: {time: path of the written file}}``
    """
    ## create out dir
    directory = os.path.join(F.analysis_dir, output_directory)
    if not os.path.isdir(directory):
        os.makedirs(directory)

    filename_dct = {}
    for treatment in data:
        filename_dct[treatment] = {}
        for time in data[treatment]:
            filename = os.path.join(directory, '{}_{}.csv'.format(treatment, time))
            filename_dct[treatment][time] = filename
            ## remove duplicates
            df = _drop_mirrored_edges(data[treatment][time])
            df.to_csv(filename, sep='\t')
    LOG.info('Data written to: {}'.format(directory))
    return filename_dct
    
    
# Write all network tables below <analysis dir>/AnalysisOutput; the returned
# path mapping is shown as the cell output.
create_sif(networks, 'AnalysisOutput')

2017-07-16 23:39:20,568 - root - Data written to: C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\AnalysisOutput


{'control': {15: 'C:\\Users\\Ciaran\\Documents\\Miscellaneous\\Aracne\\AracneMicroarray\\Analysis\\AnalysisOutput\\control_15.csv',
  30: 'C:\\Users\\Ciaran\\Documents\\Miscellaneous\\Aracne\\AracneMicroarray\\Analysis\\AnalysisOutput\\control_30.csv',
  60: 'C:\\Users\\Ciaran\\Documents\\Miscellaneous\\Aracne\\AracneMicroarray\\Analysis\\AnalysisOutput\\control_60.csv',
  90: 'C:\\Users\\Ciaran\\Documents\\Miscellaneous\\Aracne\\AracneMicroarray\\Analysis\\AnalysisOutput\\control_90.csv',
  120: 'C:\\Users\\Ciaran\\Documents\\Miscellaneous\\Aracne\\AracneMicroarray\\Analysis\\AnalysisOutput\\control_120.csv',
  150: 'C:\\Users\\Ciaran\\Documents\\Miscellaneous\\Aracne\\AracneMicroarray\\Analysis\\AnalysisOutput\\control_150.csv',
  180: 'C:\\Users\\Ciaran\\Documents\\Miscellaneous\\Aracne\\AracneMicroarray\\Analysis\\AnalysisOutput\\control_180.csv'},
 'none': {0: 'C:\\Users\\Ciaran\\Documents\\Miscellaneous\\Aracne\\AracneMicroarray\\Analysis\\AnalysisOutput\\none_0.csv',
  180: 'C:\

### Remove Duplicate Entries

In [None]:
# NOTE(review): ``network`` is not defined by any visible cell (the cells above
# build ``networks``, a nested dict) and this cell carries no execution count.
# drop_duplicates compares column values only and ignores the index.
network = network.drop_duplicates()

### Write to file

In [None]:
def to_file(df, fname):
    """
    Write a frame to a tab-separated file with its index turned into columns.

    :param df: frame to write
    :param fname: path of the file to create
    """
    flattened = df.reset_index()
    flattened.to_csv(fname, sep='\t', index=False)
# NOTE(review): relies on ``network``, which no visible cell defines.
to_file(network, 'Network.txt')
