# SSQA - Tutorials

In [1]:
import sys, os, re, random, warnings, subprocess, torch
# Make the parent of the current working directory importable, so the
# project-level modules imported in later cells can be resolved from here.
sys.path.append(os.path.dirname(os.getcwd()))
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

This notebook walks through the full pipeline for pattern matching.

## 1 - Data Collection



In [None]:
from config import PFAM_DATA
from data import *

In [None]:
# Prepare the family folder from `full.fasta`, fetch the structures of the
# family, then build the patterns inside the same folder.
# NOTE(review): `DATA` is not imported explicitly and `DATASET` is only
# assigned in a later cell — presumably both come from `from data import *`
# or from a previous kernel state; confirm before a Restart & Run All.
dataset_dir = f"{DATA}/{DATASET}"

pfam_data(dataset_dir, "full.fasta")
structfam = get_structures(DATA, DATASET)
build_patterns(structfam, dataset_dir)

Define the dataset name `DATASET` and the name `filename` of the file containing all the **aligned** sequences. The data file `data.pt` will be stored in the `PFAM_DATA/DATASET` folder.

In [None]:
# Name of the dataset sub-folder (used as `{PFAM_DATA}/{DATASET}` in later cells).
DATASET = "russ"
# Name of the FASTA file holding the aligned sequences, passed to `build_protein_df`.
filename = "aligned.fasta"

We start with the first two steps:
- Clustering the sequences and splitting the clusters between the training and testing sets with MMseqs2
- Building the HMM profiles with HH-suite (`hhmake`)

In [None]:
# Working folder of the dataset. It was previously referenced as `folder`
# without ever being defined, and the MMseqs outputs were written under
# `DATA` while every other step read from `PFAM_DATA`.
# NOTE(review): confirm that `{PFAM_DATA}/{DATASET}/tmp` is where
# `cluster_weights` expects to find the clustering output.
folder = f"{PFAM_DATA}/{DATASET}"

build_protein_df(folder, filename)

# We build clusters with MMSEQS.
# Arguments are passed as a list (no shell) so paths containing spaces are
# safe, and `check=True` stops the cell if the tool exits with an error
# instead of silently continuing with missing files.
subprocess.run(
    [
        "mmseqs", "easy-cluster",
        f"{folder}/unaligned.fasta",
        f"{folder}/tmp/clusters.tsv",
        f"{folder}/tmp",
        "--min-seq-id", "0.7",
    ],
    check=True,
)

# We compute cluster weights
cluster_weights(folder)

# We split between training and validation set (useful for training RBM)
split_train_val_set(folder)

# We compute profiles
subprocess.run(["hhmake", "-i", f"{folder}/aligned.fasta", "-M", "100"], check=True)
build_profiles(folder)

The next step is to retrieve the pattern. Three methods are available:
1. Retrieving the specific structure from a known `uniprot_id`
2. Retrieving the available structures from the PFAM family `pfam_id`
3. Using `PatternInference` to retrieve a pattern by inference (if no pattern is available)

In [None]:
# Method 1 : Retrieving the specific structure from a known `uniprot_id`
# NOTE(review): `DATA` and `MUT_DATASET` are not defined in any visible cell —
# presumably they are exported by `from data import *`; confirm.

uniprot_id = "P0A9J8"
nat_seq = (
    "TSENPLLALREKISALDEKLLALLAERRELAVEVGKAKLLSHRPVRDIDRERDLLERLITLGKAHHLDAHYITRLFQLIIEDSVLTQQALLQQH"
)

# Keep only the first element returned by the search and display it.
patterns = search_pattern(f"{DATA}/{MUT_DATASET}", uniprot_id, nat_seq)
patterns[0]

In [None]:
# Method 2 : Retrieving available structure from the PFAM family `pfam_id`
# The result used to be stored in `strucfam` (typo) while `structfam` was
# passed on, so the structures fetched here were never the ones used.
# NOTE(review): the data-collection cell calls `get_structures(DATA, DATASET)`
# with two arguments — confirm which signature is the right one.

structfam = get_structures(DATASET)
build_patterns(structfam, f"{PFAM_DATA}/{DATASET}")

In [None]:
# Method 3 : Use the `PatternInference` to retrieve a pattern by inference (if no pattern available)

family_dir = f"{PFAM_DATA}/{DATASET}"
infer_pattern(family_dir, indices=[0])

## 2 - Secondary Structure Inference

### Training

The first step is to train the model, if that has not been done already (a trained model is available in `data/utils`). Here is how to do it with the adapted `NetSurfP2`.

Reference : 

*NetSurfP-2.0: Improved prediction of protein structural features by integrated deep learning, Klausen, Michael Schantz and Jespersen, Martin Closter and Nielsen, Henrik and Jensen, Kamilla Kjaergaard and Jurtz, Vanessa Isabell and Soenderby, Casper Kaae and Sommer, Morten Otto Alexander and Winther, Ole and Nielsen, Morten and Petersen, Bent and others*

In [None]:
from torch.utils.data import DataLoader
from torch import optim

from data import SecondaryStructureAnnotatedDataset, collate_sequences_train
from ss_inference import NetSurfP2

We first retrieve the dataset that we collected and reformatted from `NetSurfP2`. The training and validation sets are available in `data/utils`.

In [None]:
# Training and validation data.
# The second argument (50) matches the first argument given to `NetSurfP2`
# below — presumably the number of input features (cf. the
# `nsp2_50feats.h5` checkpoint name); TODO confirm.
train_dataset = SecondaryStructureAnnotatedDataset(f"{UTILS}/training_set.pt", 50)
train_loader = DataLoader(train_dataset, batch_size = 15, collate_fn = collate_sequences_train,
                        shuffle = True, drop_last=True)

val_dataset = SecondaryStructureAnnotatedDataset(f"{UTILS}/validation_set.pt", 50)
val_loader = DataLoader(val_dataset, batch_size = 15, collate_fn = collate_sequences_train,
                        shuffle=False, drop_last=False)

# Use the GPU when one is available and fall back to the CPU otherwise, so
# the notebook no longer crashes on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = NetSurfP2(50, name="netsurp2")
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

We then train the model; with these parameters, 5 epochs should be enough to reach a plateau.

In [None]:
# Train for 5 epochs and keep on disk only the weights of the epoch with the
# best validation accuracy seen so far.
max_acc = 0
for epoch in range(5):
    model.train_epoch(train_loader, optimizer, epoch)
    mean_ss3_acc, _ = model.val_epoch(val_loader, epoch)

    # Nothing to save unless this epoch strictly improves on the best one.
    if not mean_ss3_acc > max_acc:
        continue

    torch.save(model.state_dict(), f"{UTILS}/nsp2_50feats.h5")
    max_acc = mean_ss3_acc

### Predicting

Once training is done, the model is ready for use. We load a dataset using `SSQAData_SSinf` and `collate_sequences`.

In [None]:
# Sequences to annotate, served in order (no shuffling, last partial batch
# kept): the prediction cell writes batch k into rows
# [k * batch_size, (k + 1) * batch_size) of its output tensors.
batch_size = 16
dataset = SSQAData_SSinf(f"{PFAM_DATA}/{DATASET}/data.pt")
loader = DataLoader(
    dataset,
    batch_size=batch_size,
    collate_fn=collate_sequences,
    shuffle=False,
    drop_last=False,
)

We load the model :

In [None]:
# Build the network and restore the trained weights for inference.
# - No optimizer is created here: it was never used for prediction and it
#   silently replaced the global `optimizer` of the training section.
# - `map_location=device` lets a checkpoint saved on a GPU be loaded on
#   whatever device is in use.
model_ss = NetSurfP2(50, "nsp")
model_ss = model_ss.to(device)
# Inference only: switch off training-time behaviour of layers such as
# dropout / batch-norm, in case the network contains any.
model_ss.eval()
model_ss.load_state_dict(torch.load(f"{UTILS}/nsp2_50feats.h5", map_location=device))

We can then predict

In [None]:
# Output buffers: one row per sequence, 3 (resp. 8) class probabilities per
# position, 100 positions. Positions beyond the length returned by the model
# for a batch stay at zero.
ss3 = torch.zeros(len(dataset), 3, 100)
ss8 = torch.zeros(len(dataset), 8, 100)

# Inference only: eval mode (in case the network has dropout / batch-norm
# layers) and no autograd graph, which saves memory.
model_ss.eval()
with torch.no_grad():
    for batch_idx, data in enumerate(loader):
        # Rows of the output buffers filled by this batch (the slice is
        # clamped automatically for the last, possibly smaller, batch).
        start = batch_size * batch_idx
        stop = batch_size * (batch_idx + 1)

        # `as_tensor` avoids re-copying the batch when it is already a tensor;
        # `.to(device)` follows the device the model lives on instead of
        # assuming CUDA.
        x = torch.as_tensor(data[0]).float().to(device)
        _, s8, s3 = model_ss(x)

        # `torch.softmax` replaces `F.softmax`: `F` is never imported in this
        # notebook. Softmax is taken over dim 1 (the class dimension of the
        # buffers above).
        ss_ = torch.softmax(s3, 1).detach().cpu()
        ss3[start:stop, :, :ss_.size(-1)] = ss_
        ss_ = torch.softmax(s8, 1).detach().cpu()
        ss8[start:stop, :, :ss_.size(-1)] = ss_

## 3 - SSQA

For an overview of how to use SSQA, please refer to the `Chorismate Mutase - Russ et al. 2020` notebook.

## 4 - Restricted Boltzmann Machine

In [None]:
# Dataset and loader used to train the RBM.
# NOTE(review): this reads from `DATA` while the earlier sections use
# `PFAM_DATA` — confirm both point to the same place.
dataset = SSQAData_RBM(f"{DATA}/{DATASET}/data.pt")
loader = DataLoader(dataset, batch_size = 100, shuffle = True)

# Dimensions of the stored sequence tensor. The first one (the number of
# sequences) is no longer unpacked into `batch_size`: doing so overwrote the
# batch size of the prediction section and broke its row indexing on re-run.
n_sequences, q, N = dataset.seqs.size()

In [None]:
# Weighted average of the dataset items, accumulated into a (q+1, N) buffer.
pots = torch.zeros(q + 1, N)
for weight, item in zip(dataset.weights, dataset):
    pots += weight * item

total_weight = torch.sum(dataset.weights)
pots /= total_weight

# Centre along dim 0 (subtract the mean over the q+1 rows), then flatten to
# a float vector on the working device.
centered = pots - pots.mean(0, keepdim=True)
pots = centered.view(-1).float().to(device)

In [None]:
# Names of the layers, as used by the training / validation calls.
visible_layers = ["sequence"]
hidden_layers = ["hidden"]

# Visible one-hot layer initialised with `pots`, and a Gaussian hidden layer
# of size 200.
v = OneHotLayer(pots, N=N, q=q+1, name="sequence")
h = GaussianLayer(N=200, name="hidden")

# Single edge connecting the visible layer to the hidden layer.
E = [(v.name, h.name)]

model_rbm = MRF(layers = {v.name: v,
                    h.name: h}, edges = E, name = "")

for visible in visible_layers:
    edge = model_rbm.get_edge(visible, "hidden")

# The optimizer must update the RBM parameters; it was previously built on
# `model`, which is the NetSurfP2 network of the secondary-structure section.
optimizer = optim.Adam(model_rbm.parameters(), lr=0.001)

# Optional: restart from saved weights instead of training from scratch.
#model_rbm.load(f"{DATA}/{DATASET}/weights/seq-reg-200_4320.h5")
#model_rbm.ais(n_inter = 2000, verbose = True)

In [None]:
# Train the RBM, running a validation pass every 30 epochs.
# The calls are made on `model_rbm`: `model` is the NetSurfP2 network, whose
# `train_epoch` takes different arguments.
# NOTE(review): `gamma` is not defined in any visible cell, and `val_loader`
# is the NetSurfP2 validation loader from the training section — an RBM
# validation loader and a value for `gamma` must be provided before running.
# NOTE(review): weights are saved under `PFAM_DATA` but the sampling section
# loads them from `DATA` — confirm both point to the same place.
for epoch in range(40000):
    model_rbm.train_epoch(optimizer, loader, visible_layers, hidden_layers, [gamma], epoch,
          savepath=f"{PFAM_DATA}/{DATASET}/weights/seq-reg-200")
    if epoch % 30 == 0:
        model_rbm.val(val_loader, visible_layers, hidden_layers, epoch)

## 5 - Sampling with RBM and SSQA

In [None]:
visible_layers = ["sequence"]
hidden_layers = ["hidden"]

# Name (without extension) of the weight file to load from
# `{DATA}/{DATASET}/weights/`. It was previously a bare, never-assigned name.
# The value below is the checkpoint referenced in the RBM section
# (`seq-reg-200_4320.h5`); change it to the checkpoint you want to sample from.
file_weights = "seq-reg-200_4320"

# Same architecture as the one used for training: a one-hot visible layer
# initialised with `pots` and a Gaussian hidden layer of size 200.
v = OneHotLayer(pots, N=N, q=q+1, name="sequence")
h = GaussianLayer(N=200, name="hidden")

E = [(v.name, h.name)]

model_rbm = MRF(layers = {v.name: v,
                    h.name: h},
            edges = E,
            name = "")

# `model1` does not exist anywhere in this notebook; the edges belong to the
# model that was just built.
for visible in visible_layers:
    edge = model_rbm.get_edge(visible, "hidden")

model_rbm.load(f"{DATA}/{DATASET}/weights/{file_weights}.h5")
model_rbm.ais(n_inter = 2000, verbose = True)