In [2]:
import json
import os
from Bio import SeqIO

In [3]:
# Set paths of raw MobiDB files
# All three are relative paths, so they are resolved against the kernel's
# current working directory. The number at the end of each file name matches
# the protein count listed for that file further down in this notebook.
# Set path of curated-disorder-merge fasta file
curated_fasta_path = "../../../data/MobiDB/curated-disorder-merge-2074.fasta"

# Set path of derived-missing_residues-th_90 fasta file
missing_fasta_path = "../../../data/MobiDB/derived-missing_residues-th_90-35136.fasta"

# Set path of derived-mobile-th_90 fasta file
mobile_fasta_path = "../../../data/MobiDB/derived-mobile-th_90-1137.fasta"

In [4]:
# Set save locations for generated fasta files
# Directory that receives the aggregate allseq.fasta
allseq_path = "out/"

# Directory that receives the aggregate alldisorder.fasta
alldisorder_path = "out/"

# Directory that receives one AA sequence fasta file per protein
seq_fastas_path = "out/Seq/"

# Directory that receives one binary disorder fasta file per protein
disorder_fastas_path = "out/Disorder/"

# Make save locations
# os.makedirs creates missing parent directories as well, so the result no
# longer depends on the order of this list, and exist_ok=True makes the cell
# safe to re-run when the directories are already there.
for path in [allseq_path, alldisorder_path, seq_fastas_path, disorder_fastas_path]:
    os.makedirs(path, exist_ok=True)

In [5]:
def _wrap_fasta_lines(sequence, width=80):
    """Return sequence split into newline-separated chunks of at most width characters."""
    return "\n".join(sequence[i:i + width] for i in range(0, len(sequence), width))


def _write_fasta_entry(file_path, mode, header, sequence):
    """Write one FASTA record (">" + header line, then the wrapped sequence) to file_path."""
    with open(file_path, mode) as file:
        file.write(">" + header + "\n" + _wrap_fasta_lines(sequence) + "\n")


def add_protein(protein_code, protein_dict):
    """
    Using the inputted protein code, write protein information into allseq.fasta and
    alldisorder.fasta and create new individual AA seq and binary disorder fasta files
    for the protein.

    The target directories are read from the module-level variables allseq_path,
    alldisorder_path, seq_fastas_path and disorder_fastas_path. Sequences are
    wrapped at 80 characters per line.

    Input:
    - protein_code => string of acc number assigned to protein by MobiDB
    - protein_dict => dict with format: (key, value) = ("Q9UPN6", ["Q9UPN6|sequence|SR-rel...","MEAVK...", "10000..."])

    Output:
    - new protein entry appended to allseq.fasta and alldisorder.fasta
    - new individual fasta files for protein (<code>_seq.fasta and
      <code>_disorder.fasta), overwritten if they already exist

    Raises:
    - KeyError if protein_code is not a key of protein_dict
    - ValueError if the entry does not hold exactly three items
    """
    # Resolve and unpack the entry once, before any file is opened, so a bad
    # protein code or malformed entry cannot leave partially written output.
    if protein_code not in protein_dict:
        raise KeyError(f"protein code {protein_code!r} not found in protein_dict")
    header, aa_seq, disorder_seq = protein_dict[protein_code]

    # os.path.join works whether or not the configured directory ends with "/"
    # Write AA sequence of protein into allseq.fasta as entry
    _write_fasta_entry(os.path.join(allseq_path, "allseq.fasta"), "a", header, aa_seq)

    # Write individual AA sequence fasta file for protein
    _write_fasta_entry(
        os.path.join(seq_fastas_path, protein_code + "_seq" + ".fasta"), "w", header, aa_seq
    )

    # Write binary disorder sequence of protein into alldisorder.fasta as entry
    _write_fasta_entry(
        os.path.join(alldisorder_path, "alldisorder.fasta"), "a", header, disorder_seq
    )

    # Write individual binary disorder sequence fasta file for protein
    _write_fasta_entry(
        os.path.join(disorder_fastas_path, protein_code + "_disorder" + ".fasta"),
        "w",
        header,
        disorder_seq,
    )

In [6]:
def merge_mobile_missing(protein_code, mobile_dict, missing_dict):
    """
    Merges missing and mobile binary disorder annotations for protein given by input protein_code
    into one binary disorder sequence and writes this sequence into allseq.fasta and alldisorder.fasta
    and new individual AA seq and binary disorder fasta files for the protein.

    Residue is labeled 1 in merged annotation as long as it is labeled 1 by either missing or mobile annotations.
    Otherwise the mobile label is kept, so the merged annotation always has exactly one label per residue.
    The header and AA sequence written out are the ones stored in mobile_dict.

    Input:
    - protein_code => string of acc number assigned to protein by MobiDB
    - mobile_dict => dict containing info for proteins with mobile annotations
    - missing_dict => dict containing info for proteins with missing annotations
    ** dict format: (key, value) = ("Q9UPN6", ["Q9UPN6|sequence|SR-rel...","MEAVK...", "10000..."])

    Output:
    - new protein entry in allseq.fasta
    - merged binary disorder sequence protein entry alldisorder.fasta
    - new individual fasta files for protein

    Raises:
    - KeyError if protein_code is missing from either dict
    - ValueError if the two annotations have different lengths
    """
    mobile_entry = mobile_dict[protein_code]
    missing_entry = missing_dict[protein_code]
    mobile_labels = mobile_entry[2]
    missing_labels = missing_entry[2]

    # zip() would silently truncate to the shorter annotation and leave the
    # merged labels misaligned with the AA sequence, so fail loudly instead.
    if len(mobile_labels) != len(missing_labels):
        raise ValueError(
            f"annotation lengths differ for {protein_code!r}: "
            f"mobile={len(mobile_labels)}, missing={len(missing_labels)}"
        )

    # Merge annotations: one output label per residue, never skipping a position
    merge_seq = "".join(
        "1" if "1" in (mobile_label, missing_label) else mobile_label
        for mobile_label, missing_label in zip(mobile_labels, missing_labels)
    )

    # Create temporary protein_dict for protein
    merge_protein_dict = {protein_code: [mobile_entry[0], mobile_entry[1], merge_seq]}

    # Write fasta files
    add_protein(protein_code, merge_protein_dict)

Each raw MobiDB file contains the following number of proteins:
 - curated-disorder-merge: 2074
 - derived-missing_residues-th_90: 35136
 - derived-mobile-th_90: 1137
 - total unique (a protein that appears in more than one file is counted once): 36391

In [7]:
# Read every record of the three raw MobiDB FASTA files into memory with
# biopython, in the order curated -> missing -> mobile
curated_fasta, missing_fasta, mobile_fasta = (
    list(SeqIO.parse(fasta_path, 'fasta'))
    for fasta_path in (curated_fasta_path, missing_fasta_path, mobile_fasta_path)
)

In [8]:
# Keep only the records this notebook needs
# Besides the amino acid sequences, the raw FASTAs also hold residue labels
# from varying sources and other annotations. For each file we keep the
# records whose description mentions "sequence" (the AA sequence) or the
# file's own disorder annotation tag (the binary disorder sequence).
# A record is emitted once per tag found in its description.
# NOTE(review): tags are matched as substrings of the whole description, so a
# description containing both tags is kept twice - confirm headers never do.
clean_curated_fasta = [
    record
    for record in curated_fasta
    for tag in ("sequence", "curated-disorder-merge")
    if tag in record.description
]

clean_missing_fasta = [
    record
    for record in missing_fasta
    for tag in ("sequence", "derived-missing_residues-th_90")
    if tag in record.description
]

clean_mobile_fasta = [
    record
    for record in mobile_fasta
    for tag in ("sequence", "derived-mobile-th_90")
    if tag in record.description
]

In [9]:
# Sanity check on the cleaned lists
# Each protein contributes two records (AA seq and binary label seq), so every
# length should be twice the number of proteins in that category
print("length of clean_curated_fasta:", len(clean_curated_fasta))
print("length of clean_missing_fasta:", len(clean_missing_fasta))
print("length of clean_mobile_fasta:", len(clean_mobile_fasta))

length of clean_curated_fasta: 4148
length of clean_missing_fasta: 70272
length of clean_mobile_fasta: 2274


In [10]:
# Create dicts from cleaned lists
# Key: protein acc, i.e. the text before the first "|" of the description
# Value: [description, AA sequence string, binary disorder sequence string]
# (key, value) = ("Q9UPN6", ["Q9UPN6|sequence|SR-rel...","MEAVKTFNSELYSLND...", "100000000000..."])
def _index_by_accession(records, label_tag):
    """Build the acc -> [description, AA seq, disorder seq] dict for one source.

    records is a cleaned record list and label_tag the substring identifying
    its disorder annotation records.
    """
    indexed = {}

    # First pass: start each value list from the "sequence" record
    for record in records:
        if "sequence" in record.description:
            accession = record.description.split("|", 1)[0]
            indexed[accession] = [record.description, str(record.seq)]

    # Second pass: append the binary disorder string to the matching value list
    for record in records:
        if label_tag in record.description:
            accession = record.description.split("|", 1)[0]
            indexed[accession].append(str(record.seq))

    return indexed


curated_dict = _index_by_accession(clean_curated_fasta, "curated-disorder-merge")
missing_dict = _index_by_accession(clean_missing_fasta, "derived-missing_residues-th_90")
mobile_dict = _index_by_accession(clean_mobile_fasta, "derived-mobile-th_90")

In [11]:
# Sanity check on the dicts
# There is one key per protein, so every length should equal the number of
# proteins in that category
print("length of curated_dict:", len(curated_dict))
print("length of missing_dict:", len(missing_dict))
print("length of mobile_dict:", len(mobile_dict))

length of curated_dict: 2074
length of missing_dict: 35136
length of mobile_dict: 1137


In [12]:
# Collect the protein accs of each dict as a set
# set(some_dict) is built from the dict's keys
curated_accs, missing_accs, mobile_accs = map(
    set, (curated_dict, missing_dict, mobile_dict)
)

In [13]:
# Sanity check on the accession sets
# Every length should equal the number of proteins in that category
print("length of curated_accs:", len(curated_accs))
print("length of missing_accs:", len(missing_accs))
print("length of mobile_accs:", len(mobile_accs))

length of curated_accs: 2074
length of missing_accs: 35136
length of mobile_accs: 1137


In [14]:
# Union of the three accession sets: every protein acc appears exactly once
all_accs = curated_accs.union(missing_accs, mobile_accs)

In [15]:
# Sanity check on the combined set
# Its length should equal the total number of distinct proteins
print("length of all_accs:", len(all_accs))

length of all_accs: 36391


In [16]:
# Generate fasta files
# add_protein() opens allseq.fasta and alldisorder.fasta in append mode, so
# empty both files first; otherwise every re-run of this cell would duplicate
# all entries in them.
for aggregate_file in (allseq_path + "allseq.fasta", alldisorder_path + "alldisorder.fasta"):
    with open(aggregate_file, "w"):
        pass

# Iterate in sorted order: the iteration order of a set is arbitrary, which
# would make the entry order of the aggregate files change from run to run.
# Annotation priority per protein: curated, then merged mobile + missing,
# then whichever single derived annotation is available.
for acc in sorted(all_accs):
    if acc in curated_accs:
        add_protein(acc, curated_dict)
    elif acc in missing_accs and acc in mobile_accs:
        merge_mobile_missing(acc, mobile_dict, missing_dict)
    elif acc in missing_accs:
        add_protein(acc, missing_dict)
    elif acc in mobile_accs:
        add_protein(acc, mobile_dict)