In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the human protein-function annotation table (tab-separated) and display it.
exp9606 = pd.read_csv('9606_experimental.tsv', delimiter='\t')
exp9606

Unnamed: 0,species,ProteinId,Protein,Function,English,Type,DeterminationType,Number
0,9606,ENSP00000000233,ARF5,GO:0006810,transport,UniProtKB-KW,IEA,3
1,9606,ENSP00000000233,ARF5,GO:0006810,transport,UniProtKB-KW,IEA,3
2,9606,ENSP00000000233,ARF5,GO:0007154,cell communication,InterPro,IEA,2
3,9606,ENSP00000000233,ARF5,GO:0007154,cell communication,InterPro,IEA,2
4,9606,ENSP00000000233,ARF5,GO:0007165,signal transduction,InterPro,IEA,2
5,9606,ENSP00000000233,ARF5,GO:0007165,signal transduction,InterPro,IEA,2
6,9606,ENSP00000000233,ARF5,GO:0007264,small GTPase mediated signal transduction,InterPro,IEA,2
7,9606,ENSP00000000233,ARF5,GO:0007264,small GTPase mediated signal transduction,InterPro,IEA,2
8,9606,ENSP00000000233,ARF5,GO:0008104,protein localization,UniProtKB-KW,IEA,3
9,9606,ENSP00000000233,ARF5,GO:0008104,protein localization,UniProtKB-KW,IEA,3


In [3]:
# Distinct identifiers found in the annotation table, as string arrays.
funcs = np.unique(exp9606['Function']).astype(str)    # every unique function ID
pnames = np.unique(exp9606['ProteinId']).astype(str)  # every unique protein ID

In [4]:
# Two separate, initially empty lookup tables:
#   plabels: protein ID  -> label vector over functions
#   flabels: function ID -> label vector over proteins
plabels, flabels = {}, {}

In [5]:
# One independent all-zero int8 vector per protein, with one slot per function.
pzeros = [np.zeros(len(funcs), dtype=np.int8) for _ in range(len(pnames))]
len(pzeros), pzeros[0].shape

(17901, (16322,))

In [19]:
# One independent all-zero int8 vector per function, with one slot per protein.
fzeros = [np.zeros(len(pnames), dtype=np.int8) for _ in range(len(funcs))]
len(fzeros), fzeros[0].shape

(16322, (17901,))

In [7]:
# 0-d int8 array holding 1; written into the label vectors to mark an annotation.
one = np.array(1, dtype=np.int8)

In [8]:
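# Disabled benchmark: times a linear search through a plain Python list of function IDs,
# as the baseline for the dict-based lookup built in the following cells.
#pyfuncs = list(funcs)
#%timeit pyfuncs.index('GO:0006810')
#del pyfuncs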

In [9]:
# Function ID -> position in `funcs`. A dict lookup was faster than searching a Python list.
hfuncs = {func_id: position for position, func_id in enumerate(funcs)}

In [10]:
# Protein ID -> position in `pnames`. A dict lookup was faster than searching a Python list.
hprots = {protein_id: position for position, protein_id in enumerate(pnames)}

In [11]:
#%timeit hfuncs['GO:0006810']

In [12]:
# Give every protein its own zero vector, taken from the end of `pzeros`.
# This consumes `pzeros`, so the cell cannot be re-run without rebuilding that list.
plabels.update((protein_id, pzeros.pop()) for protein_id in pnames)

In [20]:
# Give every function its own zero vector, taken from the end of `fzeros`.
# This consumes `fzeros`, so the cell cannot be re-run without rebuilding that list.
flabels.update((func_id, fzeros.pop()) for func_id in funcs)

In [21]:
# Record every (protein, function) annotation in both lookup tables.
#
# The two ID columns are iterated directly instead of using DataFrame.iterrows(),
# which builds a full Series for every row and is very slow on a table this size.
# Positions come from the hfuncs / hprots dicts; an earlier %timeit comparison
# found the dict lookup ~400X faster than pyfuncs.index(...).
for protein_id, function_id in zip(exp9606.ProteinId, exp9606.Function):
    plabels[protein_id][hfuncs[function_id]] = one  # protein's vector: mark this function
    flabels[function_id][hprots[protein_id]] = one  # function's vector: mark this protein

In [23]:
# Disabled leftovers: `full` is not defined in any visible cell.
#np.save('data/labels.npy', full)
#del full
# Display the array of unique function IDs.
funcs

array(['GO:0000002', 'GO:0000003', 'GO:0000009', ..., 'GO:2001304',
       'GO:2001306', 'GO:2001311'],
      dtype='<U10')

In [22]:
# Write both label tables to disk, dropping each one from the namespace right after it is saved.
# NOTE(review): these are Python dicts, so np.save presumably pickles them and loading
# would need allow_pickle=True — confirm. The 'data/' directory must already exist.
np.save('data/plabels_dict.npy', plabels) # saves a dict of {PROTEIN_NAME : FUNCTION_LABELS}
del plabels
np.save('data/flabels_dict.npy', flabels) # saves a dict of {FUNCTION_NAME : PROTEIN_LABELS}
del flabels

In [None]:
#np.save('data/labels_protein_order.npy', pnames) # saves a list of PROTEIN_NAME's for easier indexing

In [None]:
# Disabled: would save the function order, i.e. the meaning of each position in a FUNCTION_LABELS vector.
# NOTE(review): with this save disabled, the function order is not written by any visible cell — confirm consumers can rebuild it.
#np.save('data/labels_function_order.npy', funcs) # saves a list of FUNCTION_NAMES, this is order of functions in each entry of FUNCTION_LABELS
# Drop the function array and both ID -> position lookup dicts.
del funcs, hfuncs, hprots

In [None]:
# Release the remaining annotation-stage objects.
# The previous version listed `zeros`, a name never defined in this notebook, so the
# cell raised NameError after deleting only exp9606. The zero-vector lists are actually
# called `pzeros` and `fzeros` (both emptied by the pop() loops), so delete those instead.
del exp9606, pzeros, fzeros, one, pnames

In [None]:
import csv

In [None]:
# Path to the protein-protein interaction matrix.
interaction_file = 'InteractionData/Protein-Protein_Combined-Interactions.txt'
# This is the file 9606.protein.links.full.v10.5.txt, pre-processed with Perl into array format
# (combined score only) so that it can be read more quickly.

In [None]:
# Parse the interaction file.
#   line 1     : protein names (comma-separated) -> pnames
#   other lines: one matrix row each, space-separated integer scores held in the first CSV field
# Every score is shifted by +1 and the matrix is stored as int16.
#
# Changes from the earlier version:
#   * np.int was removed from NumPy (1.24); the builtin int is used instead.
#   * next(reader) replaces the direct reader.__next__() call.
#   * newline='' is what the csv module documentation asks for when opening files.
#   * Blank lines (for example a trailing newline) are skipped instead of raising IndexError.
#   * Each row is built straight into an int16 array rather than a list of Python ints.
with open(interaction_file, 'r', newline='') as data:
    reader = csv.reader(data)
    pnames = np.asarray(next(reader))  # header line: names of all proteins
    in1 = np.array(
        [
            np.fromiter((int(k) for k in line[0].split(' ')), dtype=np.int16)
            for line in reader
            if line  # csv.reader yields [] for an empty line
        ],
        dtype=np.int16,
    ) + 1
del data, reader

In [None]:
# Quick sanity summary of the raw interaction matrix (dtype is shown twice, as before).
(
    in1.shape,
    in1.dtype,
    in1.sum(),
    in1.max(),
    in1.min(),
    in1.dtype,
    in1,
    np.unique(in1).shape,
)

In [None]:
# Persist the unnormalized first-order interaction matrix together with the
# protein order that labels its rows and columns.
for out_path, array in (
    ('data/interactions_raw.npy', in1),
    ('data/interactions_order.npy', pnames),
):
    np.save(out_path, array)

In [None]:
# Float64 copy of the raw matrix; normalized in place by a later cell.
nin1 = np.array(in1, dtype=np.float64)

In [None]:
# Mean of each row of the raw interaction matrix.
means = in1.mean(axis=1)

In [None]:
# Per-protein normalization coefficient: square root of the row mean.
coef = np.sqrt(means)

In [None]:
# Symmetric normalization, in place: entry (i, j) ends up divided by coef[i] * coef[j].
# Not idempotent — re-running this cell divides the matrix a second time.
nin1 /= coef                 # divide column j by coef[j]
nin1 /= coef[:, np.newaxis]  # divide row i by coef[i]
nin1.shape, np.unique(nin1).shape

In [None]:
# Check that the normalized matrix is mirrored across its diagonal.
#
# Some mirrored pairs differ by an extremely small floating-point amount, so an exact
# comparison is too strict. The earlier approach (multiply by 10**16, cast to int16)
# overflowed int16 for every entry, so the cast values were meaningless and the
# comparison could not absorb those tiny differences. np.isclose compares within a
# tolerance instead.
inin1 = nin1.transpose()  # mirrored view of nin1 (no copy); name kept because a later cell deletes it
mnin1 = np.isclose(nin1, inin1).astype(np.int16)  # 1 where an entry matches its mirror image, else 0

In [None]:
np.unique(mnin1).shape # number of distinct values in mnin1; there should be exactly one when nin1 is mirrored across its diagonal

In [None]:
np.all(mnin1) # True when every entry of mnin1 is non-zero, meaning nin1 is mirrored across its diagonal

In [None]:
mnin1.min(), mnin1.max() # check that no value is 0; a 0 means nin1 is not mirrored across its diagonal

In [None]:
# Drop the temporaries used only for the mirror check.
del inin1, mnin1

In [None]:
# NOTE(review): presumably per-order weights for the 1st/2nd/3rd-order interaction terms,
# but no visible cell reads `weights` — the composites are built as plain sums. Confirm intent.
weights = [1, 0.8, 0.6]

In [None]:
# Higher-order interactions as matrix powers of the normalized first-order matrix.
nin2 = nin1 @ nin1  # 2nd-order interactions
nin3 = nin2 @ nin1  # 3rd-order interactions

In [None]:
# Composited Normalized Interactions: running sums of the per-order matrices.
cnin2 = nin1 + nin2   # up to 2nd order
cnin3 = cnin2 + nin3  # up to 3rd order (same left-to-right sum as nin1 + nin2 + nin3)

In [None]:
# Display the shapes of both composited matrices.
cnin2.shape, cnin3.shape

In [None]:
# Write the 1st-order matrix and both composites to disk, down-cast to float32.
for out_path, matrix in (
    ('data/interactions_normalized_1.npy', nin1),
    ('data/interactions_normalized_2.npy', cnin2),
    ('data/interactions_normalized_3.npy', cnin3),
):
    np.save(out_path, matrix.astype(np.float32))