In [3]:
import torch
import torch.nn as nn
import torch.optim as optim

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares



ModuleNotFoundError: No module named 'implicit'

In [3]:
class Autoencoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(hidden_size, input_size),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

In [4]:
df = pd.read_csv('data/pp-contingency-matrix.csv')
df

Unnamed: 0,uniprot_id,reactome_pathway_id
0,A0A075B5J3,R-MMU-198955
1,A0A075B5J3,R-MMU-202165
2,A0A075B5J3,R-MMU-202168
3,A0A075B5J3,R-MMU-202174
4,A0A075B5J3,R-MMU-202214
...,...,...
79492,V9GXR0,R-MMU-983422
79493,V9GXR0,R-MMU-983424
79494,V9GXR0,R-MMU-983425
79495,V9GXR0,R-MMU-983427


In [5]:
protein_pathway_df = pd.crosstab(df['uniprot_id'], df['reactome_pathway_id'])
protein_pathway_df

reactome_pathway_id,R-MMU-1006143,R-MMU-1006169,R-MMU-1008200,R-MMU-1008243,R-MMU-1008248,R-MMU-1011576,R-MMU-1011598,R-MMU-1011600,R-MMU-1013012,R-MMU-1013013,...,R-MMU-994140,R-MMU-994148,R-MMU-996727,R-MMU-996755,R-MMU-997237,R-MMU-997263,R-MMU-997309,R-MMU-997311,R-MMU-997314,R-MMU-997326
uniprot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0A075B5J3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0A075B5J4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0A075B5J7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0A075B5J9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0A075B5K0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
V9GWY0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
V9GX35,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
V9GXQ3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
V9GXR0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# compute the number of 1 in each row
stats = protein_pathway_df.sum(axis=1)
# extract the average, minimum and maximum from stats
avg = stats.mean()
min = stats.min()
max = stats.max()
# print the results
print('Average: ', avg)
print('Minimum: ', min)
print('Maximum: ', max)
# compute the frequency of each value in stats
freq = stats.value_counts()
# print the frequency
print(freq)


Average:  8.304293324976497
Minimum:  1
Maximum:  446
1      2853
2      1378
4       885
3       820
5       524
       ... 
104       1
86        1
192       1
100       1
95        1
Name: count, Length: 112, dtype: int64


In [7]:
protein_pathway_matrix = protein_pathway_df.values
protein_pathway_matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [8]:
# print the number of colmns 
print('Number of columns: ', protein_pathway_matrix.shape[1])

Number of columns:  9390


## Autoencoders

In [26]:
ae = Autoencoder(protein_pathway_matrix.shape[1], 50)
criterion = nn.BCELoss()
optimizer = optim.Adam(ae.parameters(), lr=0.001)

protein_pathway_tensor = torch.FloatTensor(protein_pathway_matrix)

for epoch in range(100):  # Number of epochs
    optimizer.zero_grad()  # Zero the gradients
    outputs = ae(protein_pathway_tensor)  # Forward pass
    loss = criterion(outputs, protein_pathway_tensor)  # Compute the loss
    loss.backward()  # Backward pass
    optimizer.step()  # Update the weights

    print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, 100, loss.item()))

Epoch [1/100], Loss: 0.6944
Epoch [2/100], Loss: 0.6937
Epoch [3/100], Loss: 0.6929
Epoch [4/100], Loss: 0.6919
Epoch [5/100], Loss: 0.6908
Epoch [6/100], Loss: 0.6893
Epoch [7/100], Loss: 0.6877
Epoch [8/100], Loss: 0.6857
Epoch [9/100], Loss: 0.6834
Epoch [10/100], Loss: 0.6808
Epoch [11/100], Loss: 0.6779
Epoch [12/100], Loss: 0.6746
Epoch [13/100], Loss: 0.6709
Epoch [14/100], Loss: 0.6669
Epoch [15/100], Loss: 0.6625
Epoch [16/100], Loss: 0.6577
Epoch [17/100], Loss: 0.6526
Epoch [18/100], Loss: 0.6471
Epoch [19/100], Loss: 0.6412
Epoch [20/100], Loss: 0.6350
Epoch [21/100], Loss: 0.6284
Epoch [22/100], Loss: 0.6215
Epoch [23/100], Loss: 0.6143
Epoch [24/100], Loss: 0.6068
Epoch [25/100], Loss: 0.5990
Epoch [26/100], Loss: 0.5910
Epoch [27/100], Loss: 0.5828
Epoch [28/100], Loss: 0.5744
Epoch [29/100], Loss: 0.5658
Epoch [30/100], Loss: 0.5571
Epoch [31/100], Loss: 0.5483
Epoch [32/100], Loss: 0.5393
Epoch [33/100], Loss: 0.5303
Epoch [34/100], Loss: 0.5212
Epoch [35/100], Loss: 0

In [27]:
z = ae.encoder(torch.from_numpy(protein_pathway_matrix).float())
z

tensor([[1.3409, 1.1902, 1.2969,  ..., 1.3034, 1.3574, 1.3377],
        [1.3409, 1.1902, 1.2969,  ..., 1.3034, 1.3574, 1.3377],
        [2.3627, 1.8935, 1.7375,  ..., 2.4602, 2.5662, 2.4860],
        ...,
        [0.8103, 0.7288, 0.7123,  ..., 0.8376, 0.7317, 0.7703],
        [1.6432, 1.5359, 1.5853,  ..., 1.7429, 1.6684, 1.7545],
        [0.2384, 0.2281, 0.2227,  ..., 0.2328, 0.2161, 0.2320]],
       grad_fn=<ReluBackward0>)

In [50]:
target_protein = torch.from_numpy(protein_pathway_matrix[60]).float()
print(target_protein)

target = ae.encoder(target_protein)
similarities = cosine_similarity([target.detach().numpy()], z.detach().numpy())

k = 5
top_k_proteins = similarities.argsort()[0][::-1][:k]
for p in top_k_proteins:
    print(p, similarities[0][p])

tensor([0., 0., 0.,  ..., 0., 0., 0.])
50 1.0000001
20 1.0000001
19 1.0000001
1511 1.0000001
43 1.0000001


## MF

In [67]:
matrix = csr_matrix(protein_pathway_matrix)
mf_model = AlternatingLeastSquares(factors=120)

mf_model.fit(matrix)

protein_factors = mf_model.user_factors

100%|██████████| 15/15 [00:00<00:00, 31.29it/s]


In [69]:
target_protein = 30
protein_factors = mf_model.user_factors
similarities = cosine_similarity([protein_factors[target_protein]], protein_factors)

k = 5
top_k_proteins = similarities.argsort()[0][::-1][:k]
for p in top_k_proteins:
    print(p, similarities[0][p])




82 1.0
100 1.0
41 1.0
40 1.0
39 1.0


## Subset selection

In [None]:
train = pd.read_csv('data/train.csv', index_col=0)
target = 'group'
features = train.columns.tolist()
features.remove(target)
features

In [6]:
best = []
acc = {}
while len(best) < 3:
    max_acc = 0
    remaining_features = list(set(features) - set(best))
    new_pval = pd.Series(index=remaining_features)
    for new_column in remaining_features:
        model = LogisticRegression()
        model.fit(train[best + [new_column]], train[target])
        target_predicted = model.predict(train[best + [new_column]])
        acc = metrics.accuracy_score(train[target], target_predicted)
        if max_acc < acc:
            max_acc = acc
            max_column = new_column
    best.append(max_column)
    features.remove(max_column)
    print('Best columns: ', best)
    print('Accuracy: ', max_acc)

  new_pval = pd.Series(index=remaining_features)


Best columns:  ['E9PWX8']
Accuracy:  1.0


  new_pval = pd.Series(index=remaining_features)


Best columns:  ['E9PWX8', 'D3Z7B5']
Accuracy:  1.0


  new_pval = pd.Series(index=remaining_features)


Best columns:  ['E9PWX8', 'D3Z7B5', 'E9Q1M6']
Accuracy:  1.0


In [1]:
best = []
acc = {}
max_acc = 0
remaining_features = list(set(features) - set(best))
new_pval = pd.Series(index=remaining_features)
for new_column in remaining_features:
    model = LogisticRegression()
    model.fit(train[best + [new_column]], train[target])
    target_predicted = model.predict(train[best + [new_column]])
    acc = metrics.accuracy_score(train[target], target_predicted)
    if max_acc < acc:
        max_acc = acc
        max_column = new_column
best.append(max_column)
features.remove(max_column)

while len(best) < 3:
    last = best[-1]
    # get from reactome_pathway_id the index of the row with last
    target_protein = protein_pathway_df.index.get_loc(last)
    similarities = cosine_similarity([protein_factors[target_protein]], protein_factors)

    k = 5
    top_k_proteins = similarities.argsort()[0][::-1][:k]

    for k in top_k_proteins:
        max_acc = 0
        new_column = protein_pathway_df.index[k]
        print(k, new_column)
        


NameError: name 'features' is not defined

In [74]:
key = 'A0A075B5J9'
if key in protein_pathway_df.index:
    print('Found')
    print(protein_pathway_df.loc[key])
else:
    print('Not found')


Found
reactome_pathway_id
R-MMU-1006143    0
R-MMU-1006169    0
R-MMU-1008200    0
R-MMU-1008243    0
R-MMU-1008248    0
                ..
R-MMU-997263     0
R-MMU-997309     0
R-MMU-997311     0
R-MMU-997314     0
R-MMU-997326     0
Name: A0A075B5J9, Length: 9390, dtype: int64


In [None]:
        model = LogisticRegression()
        model.fit(train[best + [new_column]], train[target])
        target_predicted = model.predict(train[best + [new_column]])
        acc = metrics.accuracy_score(train[target], target_predicted)
        if max_acc < acc:
            max_acc = acc
            max_column = new_column

    best.append(max_column)
    features.remove(max_column)
    print('Best columns: ', best)
    print('Accuracy: ', max_acc)



In [None]:


k = 5
top_k_proteins = similarities.argsort()[0][::-1][:k]
for p in top_k_proteins:
    print(p, similarities[0][p])   

    print('Best columns: ', best)
    print('Accuracy: ', max_acc)