# Aracne Input Formatting
Aracne uses information-theoretic approaches to construct gene regulatory networks from gene expression data.

In [261]:
import os
import pandas
import matplotlib.pyplot as plt
import seaborn
import numpy
import logging
logging.basicConfig(format = '%(asctime)s - %(name)s - %(message)s')
LOG=logging.getLogger()
LOG.setLevel(logging.INFO)
%matplotlib inline

class FilePaths():
    """
    Collect every file and directory path used by the Aracne workflow.

    Instantiating the class also creates the per-treatment / per-timepoint
    result directories on disk (see ``_create_directories``).

    :param dire: directory that holds ``MicroarrayDEGs.csv`` and receives the
        ``Analysis`` tree. When omitted, the original hard-coded location
        (``DEFAULT_DIRE``) is used, so ``FilePaths()`` behaves as before;
        pass another directory to run the notebook on a different machine.
    """
    DEFAULT_DIRE = r'/home/b3053674/Documents/Miscellaneous/Aracne/AracneMicroarray'

    def __init__(self, dire=None):
        self.dire = self.DEFAULT_DIRE if dire is None else dire
        self.input_file = os.path.join(self.dire, 'MicroarrayDEGs.csv')
        self.output_filename = os.path.join(self.dire, 'ArcneOutput.adj')
        # The TF list and the Aracne jar live one level above ``dire``
        self.TFlist_file = os.path.join(os.path.dirname(self.dire), 'TFsInMicroarrayDataProbeIDs.txt')
        self.analysis_dir = os.path.join(self.dire, 'Analysis')
        self.treated_dir = os.path.join(self.analysis_dir, 'TGFb')
        self.control_dir = os.path.join(self.analysis_dir, 'Control')
        self.none_dir = os.path.join(self.analysis_dir, 'None')

        self.aracne_path = os.path.join(os.path.dirname(self.dire), 'aracne2.jar')

        self.timepoints = [15, 30, 60, 90, 120, 150, 180]

        # One result directory per time point: {time: directory}
        self.treated_results = {i: os.path.join(self.treated_dir, str(i)) for i in self.timepoints}
        self.control_results = {i: os.path.join(self.control_dir, str(i)) for i in self.timepoints}
        self.none_results = {i: os.path.join(self.none_dir, str(i)) for i in [0, 180]}

        self._create_directories()

    def _create_directories(self):
        """
        Create every result directory that does not exist yet.

        Directories that already exist are left untouched.
        """
        for treatment in (self.treated_results, self.control_results, self.none_results):
            for directory in treatment.values():
                if os.path.isdir(directory):
                    continue
                try:
                    os.makedirs(directory)
                except OSError:
                    # The directory may have been created between the check
                    # and the call; only fail when it is genuinely missing.
                    if not os.path.isdir(directory):
                        raise
        
        
        
# Shared FilePaths instance; later cells read their paths from it (e.g. F.input_file).
F=FilePaths()


## Splitting Data
Microarray data is currently in one massive array. My initial testing for setting up an Aracne workflow was done on the entire frame, but I need to split the frame into control vs. treated averaged data. Perhaps it would be prudent to take the log ratio of treated to control as input, then average over repeats, and then isolate the time points and run them separately.

### Parse Data into Useful Format

In [262]:
def parse_input_data(f):
    """
    Read the tab-separated expression table and relabel its columns.

    The first two columns of the file are used as the row index. Every other
    column header is split on ``_`` into three parts, which become a
    (Treatment, Time, Repeat) column index; Time and Repeat are converted
    to integers.

    :param f: path of the tab-separated input file
    :return: pandas.DataFrame with the relabelled columns
    """
    frame = pandas.read_csv(f, sep='\t', index_col=[0, 1])
    treatments, times, repeats = zip(*[label.split('_') for label in frame.columns])

    # Put samples on the rows so the label parts can be attached as columns
    samples = frame.transpose().reset_index(drop=True)
    samples['Treatment'] = treatments
    samples['Time'] = [int(value) for value in times]
    samples['Repeat'] = [int(value) for value in repeats]

    # Promote the three label columns to the index, then flip back
    return samples.set_index(['Treatment', 'Time', 'Repeat']).transpose()

# Load the microarray table; columns are labelled by (Treatment, Time, Repeat).
data = parse_input_data(F.input_file)

## Split frame and average repeats
It turns out this may not be as necessary as I thought: just use the 6 repeats rather than averaging first.

In [220]:
# NOTE: the split_df helper below is disabled (entirely commented out);
# the only statement this cell executes is the placeholder print at the end.
# def split_df(data):
#     """
    
#     """
# #     data = data.groupby(level = [0,1], axis=1).aggregate(numpy.mean)
#     data = data.transpose()
#     data = data.reset_index()
#     data['Time'] = pandas.to_numeric(data['Time'])
#     data = data.set_index(['Treatment','Time'])
#     data = data.sort_index(level=[0,1])
#     return data.transpose()

# # print(data)
# split = split_df(data)
# print(split)
print('d')

d


## Write each treatment and time point to a structured directory tree

In [263]:
def write_to_csv(data, directory, treatment='TGFb', time=15):
    """
    Write the columns of one (treatment, time) pair to a tab-separated file.

    :param data: DataFrame whose columns carry three index levels (the code
        sorts on levels 0, 1 and 2 and selects on the first two)
    :param directory: directory in which the file is written
    :param treatment: first-level column label to select
    :param time: second-level column label to select
    :return: path of the written file, ``<directory>/<treatment>_<time>.csv``
    """
    # Sort the column labels (done on the transposed frame, then flipped back)
    ordered = data.transpose().sort_index(level=[0, 1, 2]).transpose()
    subset = ordered[treatment, time]

    fname = os.path.join(directory, '{}_{}.csv'.format(treatment, time))
    LOG.info('Writing csv to {}'.format(fname))

    # Drop any file left over from a previous run before writing
    if os.path.isfile(fname):
        os.remove(fname)
    subset.to_csv(fname, sep='\t')
    return fname


# write_to_csv(data, F.control_results[15], treatment='Control',time=15)
# NOTE(review): `treatments` is not referenced again in the visible part of this notebook.
treatments  = ['TGFb','Control','None']

def iterate_over_timepoints(data):
    """
    Write one file per treatment and time point via ``write_to_csv``.

    :param data: DataFrame handed unchanged to ``write_to_csv``
    :return: nested dict ``{'treated' | 'control' | 'none': {time: path}}``
        holding the path returned by ``write_to_csv`` for every file
    """
    written = {'treated': {}, 'control': {}, 'none': {}}

    # Treated and control samples share the same seven time points
    for t in (15, 30, 60, 90, 120, 150, 180):
        written['treated'][t] = write_to_csv(data, F.treated_results[t], treatment='TGFb', time=t)
        written['control'][t] = write_to_csv(data, F.control_results[t], treatment='Control', time=t)

    # NOTE(review): the label selected here is lower-case 'none', while the
    # output directory and the `treatments` list use 'None' -- confirm which
    # spelling the column labels of `data` actually carry.
    for t in (0, 180):
        written['none'][t] = write_to_csv(data, F.none_results[t], treatment='none', time=t)
    return written
            
# Write every per-treatment / per-timepoint input file and keep the paths.
files = iterate_over_timepoints(data)
            

2017-07-18 12:50:35,549 - root - Writing csv to /home/b3053674/Documents/Miscellaneous/Aracne/AracneMicroarray/Analysis/TGFb/15/TGFb_15.csv
2017-07-18 12:50:35,730 - root - Writing csv to /home/b3053674/Documents/Miscellaneous/Aracne/AracneMicroarray/Analysis/Control/15/Control_15.csv
2017-07-18 12:50:35,737 - root - Writing csv to /home/b3053674/Documents/Miscellaneous/Aracne/AracneMicroarray/Analysis/TGFb/30/TGFb_30.csv
2017-07-18 12:50:35,744 - root - Writing csv to /home/b3053674/Documents/Miscellaneous/Aracne/AracneMicroarray/Analysis/Control/30/Control_30.csv
2017-07-18 12:50:35,750 - root - Writing csv to /home/b3053674/Documents/Miscellaneous/Aracne/AracneMicroarray/Analysis/TGFb/60/TGFb_60.csv
2017-07-18 12:50:35,756 - root - Writing csv to /home/b3053674/Documents/Miscellaneous/Aracne/AracneMicroarray/Analysis/Control/60/Control_60.csv
2017-07-18 12:50:35,762 - root - Writing csv to /home/b3053674/Documents/Miscellaneous/Aracne/AracneMicroarray/Analysis/TGFb/90/TGFb_90.csv
20

## Write a routine that runs Aracne on all files with the same parameters

In [None]:
import subprocess
from multiprocessing.pool import ThreadPool

def run1aracne(input_path, dpi_tolerance=0.85, p_val=None, tf_list = F.TFlist_file, threshold=None, algorithm='fixed_bandwidth', mode='complete',
               bootstrap=6, bins=6):
    """
    Run the Aracne jar on a single input file and wait for it to finish.

    :param input_path: input file, passed to Aracne with ``-i``. The output
        path is derived by replacing its last four characters with
        ``Network.csv``.
    :param dpi_tolerance: value passed with ``-e``
    :param p_val: value passed with ``-p``; only used when ``threshold`` is None
    :param tf_list: file passed with ``-l``
    :param threshold: value passed with ``-t``; takes precedence over ``p_val``
    :param algorithm: one of 'fixed_bandwidth', 'variable_bandwidth',
        'naive_bayes', 'adaptive_partitioning' (passed with ``-a``)
    :param mode: one of 'complete', 'discovery', 'preprocessing' (``-m``)
    :param bootstrap: value passed with ``-r``
    :param bins: value passed with ``-b``
    :return: path of the output file handed to Aracne with ``-o``
    :raises Exception: if ``mode`` or ``algorithm`` is not a known value
    :raises ValueError: if both ``threshold`` and ``p_val`` are None
    """
    modes = ['complete','discovery','preprocessing']
    if mode not in modes:
        raise Exception('{} not in {}'.format(mode, modes))
    algorithms = ['fixed_bandwidth','variable_bandwidth','naive_bayes', 'adaptive_partitioning']
    if algorithm not in algorithms:
        raise Exception('{} not in {}'.format(algorithm, algorithms))

    # Fail with a clear message instead of hitting an unbound `command` below
    if threshold is None and p_val is None:
        raise ValueError('one of p_val or threshold must not be None')

    output_path = input_path[:-4]+'Network.csv'

    # A threshold, when given, wins over the p-value
    if threshold is not None:
        cutoff = ['-t', threshold]
    else:
        cutoff = ['-p', p_val]

    # Build the argument list directly rather than splitting a formatted
    # string, so paths that contain spaces stay intact.
    command = ['java', '-jar', F.aracne_path,
               '-i', input_path,
               '-o', output_path,
               '-e', dpi_tolerance,
               '-l', tf_list]
    command += cutoff
    command += ['-a', algorithm, '-m', mode, '-r', bootstrap, '-b', bins]
    command = ['{}'.format(part) for part in command]

    tool = subprocess.Popen(command, stdout=subprocess.PIPE)
    # communicate() drains stdout and waits for the process to exit
    output, _ = tool.communicate()
    LOG.info('\nOutput:\n\n: {}'.format(output))
    return output_path
    
    
def get_results(data):
    """
    Replace every pending result in a nested dict by its value.

    :param data: ``{treatment: {time: result}}`` where each result exposes a
        zero-argument ``get()`` (e.g. what ``ThreadPool.apply_async`` returns)
    :return: the same ``data`` object, modified in place so that each entry
        holds the value returned by ``get()``
    """
    for per_time in data.values():
        # Snapshot the items so entries can be reassigned while looping
        for time_point, pending in list(per_time.items()):
            per_time[time_point] = pending.get()
    return data


def run_aracne(input_files,dpi_tolerance=0.85, p_val=None, threshold = None, tf_list=F.TFlist_file, algorithm='fixed_bandwidth', mode='complete',
               bootstrap=6, bins=6):
    """
    Run ``run1aracne`` on every input file using a pool of worker threads.

    :param input_files: ``{treatment: {time: input path}}``
    :param dpi_tolerance: forwarded to ``run1aracne``
    :param p_val: forwarded to ``run1aracne``
    :param threshold: forwarded to ``run1aracne``
    :param tf_list: forwarded to ``run1aracne``
    :param algorithm: forwarded to ``run1aracne``
    :param mode: forwarded to ``run1aracne``
    :param bootstrap: forwarded to ``run1aracne``
    :param bins: forwarded to ``run1aracne``
    :return: ``{treatment: {time: value returned by run1aracne}}``
    :raises Exception: if both ``p_val`` and ``threshold`` are None
    """
    if p_val is None and threshold is None:
        raise Exception('one of p_val or threshold must not be None')

    num = None  # set to the number of workers you want (it defaults to the cpu count of your machine)
    tp = ThreadPool(num)
    out = {}
    try:
        for treatment in sorted(input_files):
            out[treatment] = {}
            for time in sorted(input_files[treatment]):
                input_path = input_files[treatment][time]
                LOG.info('Running {}'.format(input_path))
                # Arguments are positional and follow run1aracne's signature
                out[treatment][time] = tp.apply_async(
                    run1aracne,
                    (input_path, dpi_tolerance, p_val, tf_list, threshold,
                     algorithm, mode, bootstrap, bins))
    finally:
        # Always release the pool, even if scheduling a job fails part-way
        tp.close()
        tp.join()
    return get_results(out)

# print(files)
# NOTE: both threshold and p_val are supplied here; run1aracne checks
# threshold first, so the p_val given in this call is not passed to Aracne.
network_files = run_aracne(files, threshold=0.5, algorithm='variable_bandwidth', p_val=0.1, bins=20 )
# run1aracne(files['control'][15])

2017-07-18 13:24:52,811 - root - Running /home/b3053674/Documents/Miscellaneous/Aracne/AracneMicroarray/Analysis/Control/15/Control_15.csv
2017-07-18 13:24:52,811 - root - Running /home/b3053674/Documents/Miscellaneous/Aracne/AracneMicroarray/Analysis/Control/30/Control_30.csv
2017-07-18 13:24:52,813 - root - Running /home/b3053674/Documents/Miscellaneous/Aracne/AracneMicroarray/Analysis/Control/60/Control_60.csv
2017-07-18 13:24:52,840 - root - Running /home/b3053674/Documents/Miscellaneous/Aracne/AracneMicroarray/Analysis/Control/90/Control_90.csv
2017-07-18 13:24:52,841 - root - Running /home/b3053674/Documents/Miscellaneous/Aracne/AracneMicroarray/Analysis/Control/120/Control_120.csv
2017-07-18 13:24:52,871 - root - Running /home/b3053674/Documents/Miscellaneous/Aracne/AracneMicroarray/Analysis/Control/150/Control_150.csv
2017-07-18 13:24:52,899 - root - Running /home/b3053674/Documents/Miscellaneous/Aracne/AracneMicroarray/Analysis/Control/180/Control_180.csv
2017-07-18 13:24:52,9

# Aracne Output Formatting 
Aracne output is a large TSV file that needs to be reformatted into a network file for import into Cytoscape.

### Get annotation data from input file

In [269]:
# Re-read the raw input file; note that this rebinds `data`, which earlier
# held the frame returned by parse_input_data.
data = pandas.read_csv(F.input_file, sep='\t')
# Keep only the probe-ID -> gene-symbol annotation columns
anno_df = data[['ProbeID','GeneSymbol']]

LOG.info(anno_df.head())

2017-07-18 12:57:54,541 - root -          ProbeID GeneSymbol
0  11746909_a_at       A1CF
1  11736238_a_at      ABCA5
2    11724734_at      ABCB8
3    11723976_at      ABCC8
4  11718612_a_at      ABCD4


### Parse Aracne output file into pandas.DataFrame

In [270]:
def parse_data(data_file):
    """
    Read one Aracne output file into a DataFrame of strings.

    Lines starting with ``>`` and blank lines are skipped. Every remaining
    line is split on tabs; its first field is used as the row label and the
    full list of fields (first field included) becomes the row. Line endings
    are stripped first so that neither the last field of a row nor the label
    of a single-field row carries a trailing newline.

    :param data_file: path of the Aracne output file
    :return: pandas.DataFrame indexed by the first field of each line, with
        integer column labels; shorter rows are padded with missing values
    """
    with open(data_file) as f:
        rows = [line.rstrip('\r\n').split('\t') for line in f
                if line.strip() and not line.startswith('>')]

    headers = [row[0] for row in rows]
    return pandas.DataFrame(rows, index=headers)

def parse_all(data_files):
    """
    Apply ``parse_data`` to every file of a nested path dictionary.

    :param data_files: ``{treatment: {time: path}}``
    :return: new dict ``{treatment: {time: value returned by parse_data}}``
    """
    return {
        treatment: {time: parse_data(path) for time, path in by_time.items()}
        for treatment, by_time in data_files.items()
    }
            
# Parse every Aracne output file listed in network_files.
df_dct = parse_all(network_files)
LOG.info(df_dct)


TypeError: coercing to Unicode: need string or buffer, ApplyResult found

### Extract the gene symbols from the input file, merge them with the Aracne output and replace probe IDs with gene symbols

In [195]:
def merge_with_annotation(data, anno_data):
    """
    Attach gene symbols to a parsed Aracne table.

    :param data: DataFrame whose index is matched against the ``ProbeID``
        values of ``anno_data`` (presumably the output of ``parse_data`` --
        confirm against the caller)
    :param anno_data: DataFrame with ``ProbeID`` and ``GeneSymbol`` columns
    :return: DataFrame indexed by (ProbeID, GeneSymbol); column 0 of the
        merged frame is dropped and the remaining columns are renumbered
        from 0
    """
    anno_data = anno_data.set_index('ProbeID')
    # NOTE(review): `filtered` is computed but never used below; the merge
    # works on the unfiltered `anno_data`.
    filtered = anno_data[anno_data.index.isin(list(data.index))]
    # Index-on-index merge of the Aracne rows with their annotation
    merged = pandas.merge(data, anno_data, left_index=True, right_index = True)
    merged = merged.reset_index()
    # NOTE(review): this relies on reset_index() producing a 'ProbeID'
    # column, i.e. on the merged index keeping that name -- confirm for the
    # pandas version in use.
    merged = merged.set_index(['ProbeID','GeneSymbol'])
    # Replace values using a Series indexed by ProbeID with GeneSymbol values;
    # presumably this swaps probe IDs found in the cells for gene symbols.
    merged = merged.replace(merged.reset_index(level=1)['GeneSymbol'])
    # Drop the column labelled 0 (when `data` comes from parse_data this is
    # the row's own identifier, which is already in the index)
    merged = merged.drop(0, axis=1)
    merged.columns = range(merged.shape[1])
    return merged

def merge_for_all_time_points(dct):
    """
    Apply ``merge_with_annotation`` to every table of a nested dictionary.

    Each table is merged with the module-level ``anno_df`` annotation frame.

    :param dct: ``{treatment: {time: DataFrame}}``
    :return: new dict with the same keys holding the merged tables
    """
    return {
        treatment: {
            time: merge_with_annotation(table, anno_df)
            for time, table in by_time.items()
        }
        for treatment, by_time in dct.items()
    }

# Annotate every parsed Aracne table with gene symbols.
merged_dct = merge_for_all_time_points(df_dct)
LOG.info(merged_dct)

2017-07-17 00:41:46,473 - root - {'control': {15:                             0           1         2           3         4    \
ProbeID       GeneSymbol                                                      
11746909_a_at A1CF        ACKR3  0.09219639    ACVR2B  0.23486881  ADAMTSL1   
11736238_a_at ABCA5       ABCC8  0.22129466     ABCD4  0.22563548   ABHD17C   
11724734_at   ABCB8       ABCD4  0.10148671  ADAMTSL1  0.09057609       ADM   
11723976_at   ABCC8       ABCA5  0.22129466     ABCD4  0.17084997   ABHD17C   
11718612_a_at ABCD4       ABCA5  0.22563548     ABCB8  0.10148671     ABCC8   
11758217_s_at ABHD17C     ABCA5  0.18293891     ABCC8  0.21582629     ABCD4   
11744541_a_at ACKR3        A1CF  0.09219639     ABCA5  0.09219639     ABCD4   
11730931_at   ACSS3       ABCA5   0.1375739     ABCC8  0.18778519     ABCD4   
11738454_at   ACTBL2      ABCA5  0.12183497     ABCC8  0.10450354     ABCD4   
11727682_at   ACVR2B       A1CF  0.23486881     ABCC8  0.07994476     ACKR3   
11

### Create a network table with "interactant" as index and "interactors" and MI weights as values

In [196]:


def create_network_table(data):
    """
    Reshape a merged Aracne table into a long (gene, interactor) -> MI table.

    For every distinct first-level index value of ``data`` the row is split
    into its even-numbered columns (interactor names) and odd-numbered
    columns (MI values, converted to numbers). The name/MI pairs are stacked
    into a two-column frame and pairs with a missing value are dropped.

    :param data: DataFrame indexed by (ProbeID, GeneSymbol) with integer
        column labels that alternate interactor name / MI value, starting
        at 0
    :return: DataFrame with a single ``MI`` column, indexed by
        (``Interactie``, ``Interactors``) and sorted on both levels, where
        ``Interactie`` is the gene symbol of the source row
    """
    ids = list(set(data.index.get_level_values(0)))
    df_dct = {}
    for probe_id in ids:
        df = data.loc[probe_id]
        df = df.reset_index()
        df = df.set_index(['GeneSymbol'])

        # Even columns hold interactor names, the following odd column the MI.
        # The comprehension uses its own variable name: reusing the loop
        # variable would overwrite the probe id under Python 2, where
        # list-comprehension variables leak into the enclosing scope.
        name_cols = list(range(0, df.shape[1], 2))
        mi_cols = [col + 1 for col in name_cols]
        MI = df[mi_cols]
        names = df[name_cols]

        gene = list(set(df.index.get_level_values(0)))[0]
        MI = pandas.DataFrame(pandas.to_numeric(MI.loc[gene])).reset_index(drop=True)
        names = pandas.DataFrame(names.loc[gene]).reset_index(drop=True)

        df = pandas.concat([names, MI], axis=1)
        df.columns = ['Interactors','MI']
        # Normalise missing entries to NaN, then drop incomplete pairs
        df = df.fillna(value=numpy.nan)
        df = df.dropna(how='any')
        df.index.name = 'index'
        df_dct[(probe_id, gene)] = df

    # Keys are (probe id, gene) tuples, so the concatenated index has three
    # levels; keep only the gene level.
    df = pandas.concat(df_dct)
    df.index = df.index.droplevel([0,2])
    df.index.name = 'Interactie'
    df = df.reset_index()
    df = df.set_index(['Interactie','Interactors'])
    df = df.sort_index(level=[0,1])
    return df

def create_networks(data):
    """
    Apply ``create_network_table`` to every table of a nested dictionary.

    :param data: ``{treatment: {time: DataFrame}}``
    :return: new dict with the same keys holding the network tables
    """
    return {
        treatment: {
            time: create_network_table(table)
            for time, table in by_time.items()
        }
        for treatment, by_time in data.items()
    }

# Build one network table per treatment and time point.
networks = create_networks(merged_dct)
#     print(network)

## Output results to file for import into cytoscape

In [197]:
def create_sif(data, output_directory):
    """
    Write every network table to a comma-separated file.

    Files are named ``<treatment>_<time>.csv`` and placed in
    ``<F.analysis_dir>/<output_directory>``, which is created when missing.

    :param data: ``{treatment: {time: DataFrame}}``
    :param output_directory: directory name, relative to ``F.analysis_dir``
    :return: ``{treatment: {time: path of the written file}}``
    """
    directory = os.path.join(F.analysis_dir, output_directory)
    if not os.path.isdir(directory):
        os.makedirs(directory)

    filename_dct = {}
    for treatment, by_time in data.items():
        written = filename_dct.setdefault(treatment, {})
        for time, df in by_time.items():
            filename = os.path.join(directory, '{}_{}.csv'.format(treatment, time))
            written[time] = filename
            # Tables are written as they are; duplicate rows are not removed
            df.to_csv(filename, sep=',')

    LOG.info('Data written to: {}'.format(directory))
    return filename_dct
    
    
# Write all network tables below <F.analysis_dir>/AnalysisWithTFList.
create_sif(networks, 'AnalysisWithTFList')

2017-07-17 00:42:11,234 - root - Data written to: C:\Users\Ciaran\Documents\Miscellaneous\Aracne\AracneMicroarray\Analysis\AnalysisWithTFList


{'control': {15: 'C:\\Users\\Ciaran\\Documents\\Miscellaneous\\Aracne\\AracneMicroarray\\Analysis\\AnalysisWithTFList\\control_15.csv',
  30: 'C:\\Users\\Ciaran\\Documents\\Miscellaneous\\Aracne\\AracneMicroarray\\Analysis\\AnalysisWithTFList\\control_30.csv',
  60: 'C:\\Users\\Ciaran\\Documents\\Miscellaneous\\Aracne\\AracneMicroarray\\Analysis\\AnalysisWithTFList\\control_60.csv',
  90: 'C:\\Users\\Ciaran\\Documents\\Miscellaneous\\Aracne\\AracneMicroarray\\Analysis\\AnalysisWithTFList\\control_90.csv',
  120: 'C:\\Users\\Ciaran\\Documents\\Miscellaneous\\Aracne\\AracneMicroarray\\Analysis\\AnalysisWithTFList\\control_120.csv',
  150: 'C:\\Users\\Ciaran\\Documents\\Miscellaneous\\Aracne\\AracneMicroarray\\Analysis\\AnalysisWithTFList\\control_150.csv',
  180: 'C:\\Users\\Ciaran\\Documents\\Miscellaneous\\Aracne\\AracneMicroarray\\Analysis\\AnalysisWithTFList\\control_180.csv'},
 'none': {0: 'C:\\Users\\Ciaran\\Documents\\Miscellaneous\\Aracne\\AracneMicroarray\\Analysis\\AnalysisWith

In [168]:
# Inspect one of the resulting tables (control samples, time point 15).
print(networks['control'][15])

                              MI
Interactie Interactors          
A1CF       ACKR3        0.092196
           ACVR2B       0.234869
           ADAMTSL1     0.215826
           AFDN         0.137574
           AKAP12       0.079945
           AKT1S1       0.117514
           AMOTL2       0.117514
           AMT          0.088152
           AMT          0.082545
           ANKS1A       0.167275
           APBB2        0.079945
           APOL3        0.196808
           ASB8         0.096007
           BRF1         0.082545
           C9orf72      0.196709
           CAMK2A       0.150851
           CD8A         0.150851
           CD8A         0.121835
           CDC20B       0.105768
           CMTM4        0.085956
           COL3A1       0.126849
           CORO1A       0.129231
           CPEB3        0.080257
           CX3CL1       0.126078
           CYFIP2       0.234869
           DACT1        0.090576
           DUSP2        0.104504
           DUSP3        0.124923
          