In [25]:
from secondary_structure.utils import *
import pandas as pd
import pickle

In [26]:
DATA = "/home/cyril/Documents/These/data/luciferase"
MODELS = "/home/cyril/Documents/These/torch-pgm/nsp2/models"

aa_letters = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 
              'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V',  'W', 'Y']
aa_dict = {k:i for i,k in enumerate(aa_letters)}

In [36]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

import pandas as pd

def from_fasta_to_df(file):
    proteins_df = pd.DataFrame(columns = ["aligned_seq", "seq"])
    with open(file, "r") as input_handle:
        for seq in SeqIO.parse(input_handle, "fasta"):
            sequence = str(seq.seq)
            if "X" in sequence: 
                continue
            proteins_df.loc[seq.id] = [sequence, sequence.replace(".", "")]
    return proteins_df

def from_df_to_fasta(df, folder):
    records_aligned = []
    records_unaligned = []
    for ind, data in df.iterrows():
        records_aligned.append(SeqRecord(Seq(data.aligned_seq), id = ind))
        records_unaligned.append(SeqRecord(Seq(data.seq), id = ind))

    with open(f"{folder}/aligned.fasta", "w") as handle:        
        SeqIO.write(records_aligned, handle, "fasta")
    with open(f"{folder}/unaligned.fasta", "w") as handle:        
        SeqIO.write(records_unaligned, handle, "fasta")

In [37]:
filename = 'full.fasta'
proteins_df = from_fasta_to_df(f"{DATA}/{filename}")


In [23]:
proteins_df

NameError: name 'proteins_df' is not defined

In [5]:
filename = 'full.fasta'
proteins_df = from_fasta_to_df(f"{DATA}/{filename}")
proteins_df = proteins_df.drop_duplicates(subset = ["seq"])
proteins_df.seq = proteins_df.aligned_seq.apply(lambda s : "".join([c for c in s if c in aa_letters]))
proteins_df["length"] = proteins_df.seq.apply(lambda seq : len(seq))
# proteins_df = proteins_df[(proteins_df.length <= 32) * (proteins_df.length >= 28)]
from_df_to_fasta(proteins_df, DATA)

In [21]:
df = pd.read_csv(f"{DATA}/out_cluster.tsv", sep="\t", names = ["ind","cluster"])

In [15]:
lux_seqs = np.array(list(proteins_df.seq.apply(lambda x : [aa_dict[c] for c in x])))

In [19]:
pickle.dump(lux_seqs, open(f"{DATA}/raw_seqs.pkl", "wb"))

In [27]:
import netsurfp2 as nsp2
with open(f"{DATA}/aligned.fasta") as f:
    protlist = nsp2.parse_fasta(f)

In [28]:
http://www.rcsb.org/pdb/rest/customReport?pfamAccession=&customReportColumns=pfamAccession,pfamId,pfamDescription&format=csv&service=wsfile

{'0000_A0A1H5RHN3_9PSEU_3-297': ['A0A1H5RHN3_9PSEU/3-297 <unknown description>',
  'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXLSVSLGLWQDRXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXPPEEXXAXXLETAXRAXAXXEXXXXXXXXXXXAXXXXXAXXXXXXXGXXYXPXEXLWIGXEMAXTXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXWDAXXXFALXXXGXXXTAXIGXXXXXAXXXAXXXXXXXTXXXXSXXXXXXXRXXXXXXXXMXXXSXXLXTXXFXXXXXXGXXPXXLXXAXXXXVXXXXTXXVXXXXXRXXXXXDPXXAXXXXXXXXTXIXAXMXGXXXVXAXSVXAXXXXXXXXXALTGXXXXXXXXXXXXXXXXXXXXXXXXRXXXXXXSXXXXXTXXXXXGXVXXAXXLXXXXXGXXTXXXXXSXXXXXXSXXXXDXXXXXVXXXXVXXVXXXXXXXXXXXXXXXXXXXRXXXGXXXWXXHXXXXXXXXXGRXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXSRXDXXXRXXXXAXXXXXAXXXXXTXXXXAXXXXXXXXXLXXXXXXXAXXXEXXXXXSXXXXAXXXIXXXAXXXXXVXXXRXXXQXXXLXXXLXXXXXXXXXDGXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXQKSXXXXTXXVXXXXXXDXXXXXGXTVXXXXXVGSXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXRGYXRXXLXRXXLXXXAXAXPKSXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXPXXXXLXXXXTXXXXXIXXXXXXXXAXXXXXAXXXXXXXXXXFXXXXXXXXXGXX

In [None]:
process_msa(f"{DATA}/aligned.fasta")

In [3]:
prof = process_msa(a[1])["profile"]

NameError: name 'a' is not defined

In [24]:
def process_msa(seq, parse_to_mem = False):
    hhm_name = f"{DATA}/aligned.hhm"
    msa_name = f"{DATA}/aligned.fasta"

    with open(hhm_name) as fp:
        res = parse_hhm(fp, seq=seq)
        del res["seq"]
    return res

def freq(freqstr):
    if freqstr == '*':
        return 0.
    p = 2**(int(freqstr) / -1000)
    assert 0 <= p <= 1.0
    return p


def parse_hhm(hhmfp, seq=None):
    neff = None
    for line in hhmfp:
        if line[0:4] == 'NEFF' and neff is None:
            neff = float(line[4:].strip())
        if line[0:8] == 'HMM    A':
            header1 = line[7:].strip().split('\t')
            break

    header2 = next(hhmfp).strip().split('\t')
    next(hhmfp)

    hh_seq = []
    profile = []
    for line in hhmfp:
        if line[:2] == '//':
            break
        aa = line[0]
        hh_seq.append(aa)

        freqs = line.split(None, 2)[2].split('\t')[:20]
        features = {h: freq(i) for h, i in zip(header1, freqs)}
        assert len(freqs) == 20

        mid = next(hhmfp)[7:].strip().split('\t')

        features.update({h: freq(i) for h, i in zip(header2, mid)})

        profile.append(features)
        next(hhmfp)

    hh_seq = ''.join(hh_seq)
    seq = seq or hh_seq
    profile = vectorize_profile(profile, seq, hh_seq)

    return {
        'seq': seq,
        'profile': profile,
        'neff': neff,
        'header': header1 + header2,
    }





def vectorize_profile(profile,
                      seq,
                      hh_seq,
                      amino_acids=None,
                      profile_header=None):
    if profile_header is None:
        profile_header = PROFILE_HEADER

    if amino_acids is None:
        amino_acids = AMINO_ACIDS

    seqlen = len(seq)
    aalen = len(amino_acids)
    proflen = len(profile_header)
    profmat = np.zeros((seqlen, aalen + proflen + 1), dtype='float')

    for i, aa in enumerate(seq):
        aa_idx = amino_acids.find(aa)
        if aa_idx > -1:
            profmat[i, aa_idx] = 1.

    if len(profile) == len(seq):
        for i, pos in enumerate(profile):
            for j, key in enumerate(profile_header, aalen):
                profmat[i, j] = pos[key]
    else:
        hh_index = -1
        for i, restype in enumerate(seq):
            if restype != 'X':
                hh_index += 1
                assert restype == hh_seq[hh_index]

            if hh_index >= 0:
                for j, key in enumerate(profile_header, aalen):
                    profmat[i, j] = profile[hh_index][key]

    profmat[:, -1] = 1.

    return profmat

In [28]:
from tqdm.notebook import tqdm
import random

protlist = {k:v for k,v in random.sample(protlist.items(), 100)}
dataset = dict()
for k,(_,v) in tqdm(protlist.items()):
    prof = process_msa(v)["profile"]
    idx = np.where(prof[:,:20].sum(1))
    dataset[k] = prof[idx]




HBox(children=(FloatProgress(value=0.0), HTML(value='')))




In [33]:
import pickle
pickle.dump(dataset, open(f"{DATA}/hmm_dataset", "wb"))

In [21]:
import pypdb
import ssbio

ModuleNotFoundError: No module named 'ssbio'