In [21]:
import csv
import numpy as np

In [2]:
interaction_file = 'InteractionData/Protein-Protein_Combined-Interactions.txt'
# This is the file 9606.protein.links.full.v10.5.txt, pre-processed with Perl into
# array format (keeping only the combined score) so that it reads more quickly.

In [3]:
# Load the pre-processed interaction matrix.
#   * first CSV row  -> protein names (one per column of the matrix)
#   * remaining rows -> one matrix row each, stored as a single space-separated field
# `newline=''` is what the csv module asks for when it is handed a file object.
with open(interaction_file, 'r', newline='') as data:
    reader = csv.reader(data)
    pnames = np.asarray(next(reader))  # grab first line, which is the names of all proteins
    # Builtin `int` replaces the `np.int` alias, which no longer exists in current NumPy.
    # Blank rows (csv yields an empty list for them) are skipped instead of crashing on line[0].
    # NOTE(review): every value is shifted by +1 after parsing; the reason is not
    # documented in this notebook -- confirm it is intended.
    in1 = np.array(
        [[int(k) for k in line[0].split(' ')] for line in reader if line],
        dtype=np.int16,
    ) + 1  # deserialize the rest of the data
del data, reader

In [4]:
# Sanity check: the protein-name vector and the dimensions of the interaction matrix.
pnames, in1.shape

(array(['9606.ENSP00000370023', '9606.ENSP00000344331',
        '9606.ENSP00000311364', ..., '9606.ENSP00000379654',
        '9606.ENSP00000321424', '9606.ENSP00000398632'], dtype='<U20'),
 (19576, 19576))

In [5]:
# Quick summary of the raw matrix: shape, dtype, total, max, min, dtype (listed twice),
# the array itself, and the number of distinct values it contains.
in1.shape, in1.dtype, in1.sum(), in1.max(), in1.min(), in1.dtype, in1, np.unique(in1).shape

((19576, 19576),
 dtype('int16'),
 3163492402,
 1000,
 0,
 dtype('int16'),
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int16),
 (851,))

In [6]:
np.save('data/PPI_raw.npy', in1)
# saves the unnormalized first-order interaction data
np.save('data/PPI-Order.npy', pnames)
# saves the protein order that goes with PPI_raw.npy

In [7]:
# Floating-point copy of the raw matrix; the normalisation below is written into it.
nin1 = in1.astype(np.float64)
nin1.shape

(19576, 19576)

In [8]:
# Mean raw score of each row of the interaction matrix.
means = in1.mean(axis=1)
means.shape

(19576,)

In [9]:
# Per-protein scaling factor: square root of the row mean.
# The matrix is later divided by this factor along both axes.
coef = np.sqrt(means)
coef.shape

(19576,)

In [10]:
# Symmetric normalisation: nin1[i, j] = in1[i, j] / (coef[i] * coef[j]).
#
# The first pass reads from the untouched integer matrix `in1` and writes into the
# pre-allocated float64 buffer `nin1`; the second pass rescales that buffer in place.
# Because the input is always `in1`, re-running this cell reproduces the same result
# instead of dividing an already-normalised matrix a second time.
# NOTE(review): a row whose mean is 0 gives coef == 0 and therefore inf/nan entries;
# no guard is applied here -- confirm that such rows cannot occur.
np.divide(
    in1, coef,                 # broadcasts along the last axis: column j / coef[j]
    dtype=np.float64,
    out=nin1
)
np.divide(
    nin1,
    coef[:, np.newaxis],       # column vector: row i / coef[i]
    dtype=np.float64,
    out=nin1
)
nin1.shape, np.unique(nin1).shape

((19576, 19576), (7615513,))

In [11]:
# Check every entry of nin1 against its mirror across the diagonal.
#
# Mirrored pairs can differ by an extremely small floating-point amount, so the
# comparison is tolerance-based (np.isclose defaults: rtol=1e-05, atol=1e-08).
# The previous approach scaled by 10**16 and cast to int16; almost every scaled value
# is far outside the int16 range, so the cast result was not meaningful and the
# equality test could pass regardless of the data.
# Note: np.isclose allocates several temporaries the size of nin1.
inin1 = np.isclose(nin1, nin1.T)
# boolean mask: True where nin1[i, j] matches nin1[j, i] within tolerance
mnin1 = inin1.astype(np.int16)
# same mask as ints (1 = symmetric entry, 0 = mismatch) for the checks that follow

In [12]:
np.unique(mnin1).shape
# Number of distinct values in the comparison mask.
# mnin1 should contain only one value when the matrix
# compared in the previous cell is mirrored across the diagonal.

(1,)

In [13]:
np.all(mnin1)
# Returns True if mnin1 is all 1's,
# meaning that the compared matrix is mirrored across the diagonal.

True

In [14]:
mnin1.min(), mnin1.max()
# Check that no value is 0.
# If any value is 0, the compared matrix is not
# mirrored across the diagonal.

(1, 1)

In [15]:
# The symmetry-check arrays are as large as the matrix itself; drop them to free memory.
del inin1, mnin1

In [16]:
# Higher-order interactions are obtained by repeated matrix products.
nin2 = nin1 @ nin1  # 2nd-order interactions
nin3 = nin2 @ nin1  # 3rd-order interactions

In [17]:
# Composited normalized interactions: running sums over the interaction orders.
cnin2 = nin1 + nin2   # orders 1..2
cnin3 = cnin2 + nin3  # orders 1..3 (same left-to-right sum as nin1 + nin2 + nin3)

In [18]:
# Both composites should keep the square shape of the interaction matrix.
cnin2.shape, cnin3.shape

((19576, 19576), (19576, 19576))

In [19]:
# Persist the normalized matrices, down-cast to float32, one file per composite order.
for path, matrix in (
    ('data/PPI_normalized_1.npy', nin1),
    ('data/PPI_normalized_2.npy', cnin2),
    ('data/PPI_normalized_3.npy', cnin3),
):
    np.save(path, matrix.astype(np.float32))

In [2]:
import pandas as pd

In [23]:
# Load the tab-separated human function-annotation table and display it.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which suggests the file
# carries a saved index column -- confirm whether it should be read with index_col=0.
exp9606 = pd.read_csv('9606_experimental.tsv', sep='\t') # human function data
exp9606

Unnamed: 0,species,ProteinId,Protein,Function,English,Type,DeterminationType,Number
0,9606,ENSP00000000233,ARF5,GO:0006810,transport,UniProtKB-KW,IEA,3
1,9606,ENSP00000000233,ARF5,GO:0006810,transport,UniProtKB-KW,IEA,3
2,9606,ENSP00000000233,ARF5,GO:0007154,cell communication,InterPro,IEA,2
3,9606,ENSP00000000233,ARF5,GO:0007154,cell communication,InterPro,IEA,2
4,9606,ENSP00000000233,ARF5,GO:0007165,signal transduction,InterPro,IEA,2
5,9606,ENSP00000000233,ARF5,GO:0007165,signal transduction,InterPro,IEA,2
6,9606,ENSP00000000233,ARF5,GO:0007264,small GTPase mediated signal transduction,InterPro,IEA,2
7,9606,ENSP00000000233,ARF5,GO:0007264,small GTPase mediated signal transduction,InterPro,IEA,2
8,9606,ENSP00000000233,ARF5,GO:0008104,protein localization,UniProtKB-KW,IEA,3
9,9606,ENSP00000000233,ARF5,GO:0008104,protein localization,UniProtKB-KW,IEA,3


In [24]:
funcs = np.unique(exp9606.Function).astype(str) # all unique function IDs (np.unique returns them sorted)
afnames = np.unique(exp9606.ProteinId).astype(str) # all unique protein IDs in the annotation table

In [23]:
# Label matrix: one row per protein in pnames, one column per function in funcs,
# stored as signed bytes and initialised to 0.
plabels = np.zeros((len(pnames), len(funcs)), dtype=np.int8)
len(plabels), plabels[0].shape

(19576, (16322,))

In [24]:
# Hash from protein ID to its row index in pnames (a dict lookup is faster than searching).
# The first five characters of each name (the '9606.' prefix) are dropped from the key.
hprots = {name[5:]: row for row, name in enumerate(pnames)}

In [25]:
# Hash from function ID to its column index in funcs (a dict lookup is faster than searching).
hfuncs = {func: col for col, func in enumerate(funcs)}

In [26]:
# The value 1 as a zero-dimensional signed-byte array, matching the dtype of plabels.
one = np.array(1, dtype=np.int8)

In [27]:
# Mark every (protein, function) annotation in the label matrix.
#
# Membership is tested against the `hprots` dict rather than by scanning the `pnames`
# array for each row: the hash lookup was measured to be ~400X faster, and the row
# index is needed anyway. Iterating the two columns directly avoids building a Series
# per row as `iterrows` does.
# NOTE(review): this is equivalent to the former `'9606.' + id in pnames` test as long
# as every entry of pnames starts with the five-character '9606.' prefix -- confirm.
for protein_id, function_id in zip(exp9606.ProteinId, exp9606.Function):
    row = hprots.get(protein_id)
    if row is not None:  # proteins absent from the interaction matrix are skipped
        plabels[row, hfuncs[function_id]] = one

In [28]:
# Inspect the filled label matrix (proteins x functions).
plabels.shape, plabels

((19576, 16322), array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 1, 0, ..., 0, 0, 0]], dtype=int8))

In [29]:
# saves the protein-by-function label matrix
np.save('data/GO.npy', plabels)

In [30]:
# saves the function IDs in the order used for the columns of plabels
np.save('data/Function-Order.npy', funcs)

In [19]:
# Maps a protein ID to a label vector over funcs; filled from the .gvs file below.
cancerfuncs = {}

In [26]:
# Build one label vector per protein from the tab-separated .gvs file.
# Each line is: <protein> TAB <function> TAB <function> ...
#
# The dict is rebuilt from scratch so the cell does not depend on whatever an earlier
# run left in `cancerfuncs`.
# NOTE(review): the recorded "invalid index to scalar variable" error is consistent
# with stale non-array values in the dict from a previous run -- confirm.
cancerfuncs = {}
with open('InteractionData/9606humanCancerExperimental.gvs', 'r') as gvs_file:
    for line in gvs_file:
        # rstrip('\n') only removes a newline that is actually there, so a final line
        # without one no longer loses the last character of its last function ID.
        fields = line.rstrip('\n').split('\t')
        protein = fields[0]
        annotated = [name for name in fields[1:] if name]  # ignore empty fields
        if not protein or not annotated:
            continue  # blank line, or a protein with no functions listed
        labels = cancerfuncs.get(protein)
        if labels is None:
            labels = cancerfuncs[protein] = np.zeros(shape=funcs.shape, dtype=np.uint16)
        for name in annotated:
            # A function ID missing from hfuncs still raises KeyError, as before.
            labels[hfuncs[name]] = 1

IndexError: invalid index to scalar variable.