In [10]:
import numpy as np
from tqdm import tqdm
import pandas as pd

In [12]:
# Read the table whose 'mhc_name', 'peptide' and 'affinity' columns are used
# by the cells below.
csv_path = 'data/cf_ba_combined.csv'
df = pd.read_csv(csv_path)

In [4]:
# Give every distinct MHC name a consecutive integer id, numbered in the
# order the names come back from Series.unique().
mhc_id = df['mhc_name'].unique()
mhc_name2id = {name: index for index, name in enumerate(mhc_id)}

# Same numbering scheme for the distinct peptides.
pep_id = df['peptide'].unique()
pep_name2id = {name: index for index, name in enumerate(pep_id)}

In [None]:
import math

# Rows whose affinity is strictly below this value become positive
# MHC-peptide edges; every other row becomes a negative edge.
BINDER_AFFINITY_CUTOFF = 500
# Edge weight is 1 - log(affinity) / log(AFFINITY_LOG_REFERENCE), so an
# affinity of 1 maps to weight 1 and an affinity of 50000 maps to weight 0.
AFFINITY_LOG_REFERENCE = 50000

affinity = df['affinity'].to_numpy(dtype=float)

# log() is undefined for non-positive values. Fail up front with a readable
# message instead of a bare "math domain error" part-way through the table.
# (NaN affinities are not rejected here; they end up among the negative edges
# with a NaN weight.)
n_non_positive = int((affinity <= 0).sum())
if n_non_positive:
    raise ValueError(
        f"'affinity' must be > 0 to be log-scaled; "
        f"found {n_non_positive} non-positive value(s)"
    )

# Translate names to node ids with plain dict lookups so that a name missing
# from the mapping still raises KeyError. Forcing int64 keeps the index arrays
# integer-typed even when one of the two edge sets turns out to be empty.
mhc_ids = np.array([mhc_name2id[name] for name in df['mhc_name']], dtype=np.int64)
pep_ids = np.array([pep_name2id[name] for name in df['peptide']], dtype=np.int64)

# Compute every weight once, then split by the binder mask; both edge sets
# share the same weighting formula.
all_weights = 1 - np.log(affinity) / math.log(AFFINITY_LOG_REFERENCE)
is_binder = affinity < BINDER_AFFINITY_CUTOFF

edge1, edge2 = mhc_ids[is_binder], pep_ids[is_binder]
negedge1, negedge2 = mhc_ids[~is_binder], pep_ids[~is_binder]

# Row 0 holds MHC ids, row 1 holds peptide ids; one column per edge.
edge_mp = np.stack((edge1, edge2))
edge_mp_neg = np.stack((negedge1, negedge2))
edge_weight_mp = all_weights[is_binder]
edge_weight_mp_neg = all_weights[~is_binder]

In [7]:
# Append the negative MHC-peptide edges (and their weights) after the positive
# ones. NOTE: this rebinds edge_mp / edge_weight_mp to the merged arrays, so
# running this cell again without first re-running the edge-building cell
# appends the negatives a second time.
edges_to_merge = (edge_mp, edge_mp_neg)
weights_to_merge = (edge_weight_mp, edge_weight_mp_neg)
edge_mp = np.concatenate(edges_to_merge, axis=1)
edge_weight_mp = np.concatenate(weights_to_merge, axis=0)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Flatten every entry of the loaded array to a single row vector and compute
# the cosine similarity between all pairs of rows.
mhc = np.load('data/mhc_2d_lm.npy')
mhc = mhc.reshape(mhc.shape[0], -1)
sim_mhc = cosine_similarity(mhc)

# Keep pairs whose similarity is strictly above the threshold.
threshold = 0.7
mask = sim_mhc > threshold
src, dst = np.where(mask)

# Requiring src < dst removes self-pairs and keeps exactly one of the two
# orientations (i, j) / (j, i) of every remaining pair.
upper = src < dst
edge_mm = np.stack((src[upper], dst[upper]))
edge_weight_mm = sim_mhc[mask][upper]

In [None]:
from scipy.sparse import lil_matrix
from sklearn.metrics.pairwise import cosine_similarity
# Load the array and flatten everything after the first axis, giving one row
# vector per entry.
pt = np.load('data/pt_2d_lm.npy')
pt = pt.reshape(pt.shape[0], -1)
def batch_cosine_similarity_sparse(X, threshold=0.6, batch_size=10000):
    """Pairwise cosine similarity of the rows of ``X``, kept only above ``threshold``.

    The similarity is computed ``batch_size`` rows at a time against all of
    ``X``; entries less than or equal to ``threshold`` are set to zero, so
    they are not stored in the sparse result.

    Parameters
    ----------
    X : 2-D array
        One row per item; passed to ``cosine_similarity`` in row slices.
    threshold : float, default 0.6
        Similarities ``<= threshold`` are dropped.
    batch_size : int, default 10000
        Number of rows of ``X`` processed per batch.

    Returns
    -------
    scipy.sparse.csr_matrix
        ``(X.shape[0], X.shape[0])`` float matrix holding the surviving
        similarities.
    """
    from scipy.sparse import csr_matrix, vstack

    n_rows = X.shape[0]
    blocks = []
    for start in tqdm(range(0, n_rows, batch_size)):
        batch_similarity = cosine_similarity(X[start:start + batch_size], X)
        batch_similarity[batch_similarity <= threshold] = 0
        # Convert each batch straight to CSR instead of slice-assigning the
        # dense block into a LIL matrix, which inserts element by element.
        blocks.append(csr_matrix(batch_similarity, dtype=float))

    if not blocks:
        # No batch was produced (e.g. X has no rows): all-zero matrix.
        return csr_matrix((n_rows, n_rows), dtype=float)
    return vstack(blocks, format='csr')

# Keep only row pairs of `pt` whose cosine similarity exceeds 0.8 (this
# overrides the function's default threshold of 0.6).
threshold = 0.8
sim_pt = batch_cosine_similarity_sparse(pt, threshold=threshold)

In [21]:
# Turn the sparse similarity matrix into an edge list: one (row, col) column
# per stored entry, with the stored similarity as the edge weight.
sim_pt_coo = sim_pt.tocoo()
src, dst = sim_pt_coo.row, sim_pt_coo.col

# Requiring src < dst removes self-pairs and keeps exactly one of the two
# orientations (i, j) / (j, i) of every remaining pair.
upper = src < dst
edge_pp = np.stack((src[upper], dst[upper]))
edge_pp_weight = sim_pt_coo.data[upper]

In [9]:
# Write every edge-index array and its weight array to disk, in this order.
outputs = (
    ('data/edge_mp.npy', edge_mp),
    ('data/edge_mm.npy', edge_mm),
    ('data/edge_pp.npy', edge_pp),
    ('data/edge_pp_weight.npy', edge_pp_weight),
    ('data/edge_mp_weight.npy', edge_weight_mp),
    ('data/edge_mm_weight.npy', edge_weight_mm),
)
for out_path, array in outputs:
    np.save(out_path, array)