In [1]:
import pandas as pd
import numpy as np

# Reformat data for PyTorch and Save

In [2]:
# Read the two input tables; in both files the first CSV column is used as the initial index.
proteins = pd.read_csv('data/Datasetv1.3.csv', index_col=0)
protein_links = pd.read_csv('data/protein_links.csv', index_col=0)

# Cast '#string_protein_id' to int and promote it to the index of `proteins`,
# so that rows can later be looked up by protein id with `.loc`.
proteins = (
    proteins
    .astype({'#string_protein_id': int})
    .set_index('#string_protein_id')
)


In [3]:
# Collect every protein id that appears on either end of a link.
# np.unique both de-duplicates and returns the ids in ascending order,
# so no separate sort step is required.
unique_proteins = np.unique([
    *protein_links.protein1,
    *protein_links.protein2,
])
unique_proteins

array([   233,    412,   1008, ..., 485668, 485672, 485678])

In [4]:
# Contiguous id system starting at 0 (for the PyTorch link list, etc.).
# protein_ids_dict:     new id (position in the sorted `unique_proteins`) -> original protein id
# protein_ids_dict_inv: original protein id -> new id
protein_ids_dict = dict(enumerate(unique_proteins))
protein_ids_dict_inv = {
    protein: new_id for new_id, protein in protein_ids_dict.items()
}

In [5]:
# Re-express every (protein1, protein2) link in the new 0-based id system.
# Each endpoint column is translated through protein_ids_dict_inv and the two
# translated columns are zipped back into a list of (id1, id2) tuples.
link_list = list(zip(
    map(protein_ids_dict_inv.__getitem__, protein_links.protein1),
    map(protein_ids_dict_inv.__getitem__, protein_links.protein2),
))

In [6]:
# Persist the id mapping and the re-indexed link list.
# NOTE: np.save wraps a dict in a pickled 0-d object array, so reading it back
# needs np.load(..., allow_pickle=True).item().
np.save(file='data/protein_ids_dict.npy', arr=protein_ids_dict)
# The list of (id1, id2) tuples is converted to an array by np.save.
np.save(file='data/link_list.npy', arr=link_list)

In [7]:
# Build the X feature matrix.
# Candidate feature columns: 'protein_size' plus every column of `proteins` from position 4 onward.
all_features = ['protein_size', *proteins.columns[4:]]

# A feature counts as "valid" when more than 1000 proteins have a non-missing value for it.
# (The original note equates this cutoff with "> 5% of proteins"; the ratio itself is not checked here.)
# NOTE(review): valid_features is computed but not used below -- X is built from all_features.
valid_features = [
    column for column in all_features
    if proteins[column].notna().sum() > 1000
]

# FIXME: currently the feature matrix is mostly NaN's. this is a placeholder while we get a proper protein feature dataset
# Rows follow the order of `unique_proteins`, i.e. row i belongs to new id i.
proteins_ = proteins.loc[unique_proteins, all_features].fillna(0)  # simply replace NaN's with 0 :(
X = proteins_.to_numpy()

In [8]:
# Write the feature matrix X to disk in NumPy's .npy format.
np.save(file='data/protein_features.npy', arr=X)

In [9]:
# build Y labels
# FIXME: where do these labels come from?
# Select the labels by `unique_proteins` so that entry i of Y belongs to the same
# protein as row i of the feature matrix (built from `proteins.loc[unique_proteins]`)
# and as node id i in the link list. Taking `proteins.Label` directly would keep the
# CSV row order and every protein in the table (linked or not); since np.save stores
# only the values and drops the index, the saved labels would then not line up with
# the saved features.
Y = proteins.loc[unique_proteins, 'Label']

In [10]:
# Write the label vector Y to disk in NumPy's .npy format.
np.save(file='data/protein_labels.npy', arr=Y)