In [125]:
import os
import json
from Bio import SeqIO

In [126]:
#locations of the raw MobiDB data files (uploaded to Google Drive)

#curated-disorder-merge fasta file
curated_fasta_loc = "../MobiDB Data/Raw/curated-disorder-merge-2074.fasta"
#derived-missing_residues-th_90 fasta file
missing_fasta_loc = "../MobiDB Data/Raw/derived-missing_residues-th_90-35136.fasta"
#derived-mobile-th_90 fasta file
mobile_fasta_loc = "../MobiDB Data/Raw/derived-mobile-th_90-1137.fasta"

#echo the configured input locations, one per line
print(
    f"curated-disorder-merge fasta file location: '{curated_fasta_loc}'",
    f"derived-missing_residues-th_90 fasta file location: '{missing_fasta_loc}'",
    f"derived-mobile-th_90 fasta file location: '{mobile_fasta_loc}'",
    sep="\n",
)

curated-disorder-merge fasta file location: '../MobiDB Data/Raw/curated-disorder-merge-2074.fasta'
derived-missing_residues-th_90 fasta file location: '../MobiDB Data/Raw/derived-missing_residues-th_90-35136.fasta'
derived-mobile-th_90 fasta file location: '../MobiDB Data/Raw/derived-mobile-th_90-1137.fasta'


In [127]:
#directories that the generated fasta files are saved into

#allseq.fasta (every AA sequence in one file)
allseq_loc = "../MobiDB Data/Processed/"
#alldisorder.fasta (every binary disorder sequence in one file)
alldisorder_loc = "../MobiDB Data/Processed/"
#individual AA sequence fasta files
seq_fastas_loc = "../MobiDB Data/Processed/Seq/"
#individual disorder fasta files
disorder_fastas_loc = "../MobiDB Data/Processed/Disorder/"

#echo the configured save locations, one per line
print(
    f"allseq.fasta save location: '{allseq_loc}'",
    f"alldisorder.fasta save location: '{alldisorder_loc}'",
    f"seq fastas save location: '{seq_fastas_loc}'",
    f"disorder fastas save location: '{disorder_fastas_loc}'",
    sep="\n",
)

allseq.fasta save location: '../MobiDB Data/Processed/'
alldisorder.fasta save location: '../MobiDB Data/Processed/'
seq fastas save location: '../MobiDB Data/Processed/Seq/'
disorder fastas save location: '../MobiDB Data/Processed/Disorder/'


In [128]:
#define function for generating fasta files

def add_protein(protein_code, protein_dict):
    
    """
    Using the inputted protein code, write protein information into allseq.fasta and 
    alldisorder.fasta and create new individual AA seq and binary disorder fasta files
    for the protein.
    
    All fields of the protein are read before any file is opened, so a missing or
    incomplete entry raises without leaving allseq.fasta and alldisorder.fasta out of sync.
    Save directories that do not exist yet are created.
    
    Input:
    - protein_code => string of acc number assigned to protein by MobiDB
    - protein_dict => dict with format:(key, value) = ("Q9UPN6", ["Q9UPN6|sequence|SR-rel...","MEAVK...", "10000..."])
    
    Output:
    - new protein entry appended to allseq.fasta and alldisorder.fasta
    - new individual fasta files for protein ("<protein_code>_seq.fasta" and
      "<protein_code>_disorder.fasta"), overwritten if they already exist
    
    Raises:
    - KeyError => protein_code is not a key of protein_dict
    - IndexError => the value list of the protein has fewer than three items
    
    """
    
    #look the protein up once, before touching any file
    protein_info = protein_dict[protein_code]
    header, aa_seq, disorder_seq = protein_info[0], protein_info[1], protein_info[2]
    
    #both fasta entries share the same header (the description stored at index 0)
    seq_entry = ">" + header + "\n" + aa_seq + "\n"
    disorder_entry = ">" + header + "\n" + disorder_seq + "\n"
    
    #(directory, file name, open mode, fasta entry) for each of the four outputs
    #combined files are appended to, individual files are (re)written
    outputs = (
        (allseq_loc, "allseq.fasta", "a", seq_entry),
        (seq_fastas_loc, protein_code + "_seq" + ".fasta", "w", seq_entry),
        (alldisorder_loc, "alldisorder.fasta", "a", disorder_entry),
        (disorder_fastas_loc, protein_code + "_disorder" + ".fasta", "w", disorder_entry),
    )
    
    for directory, file_name, mode, fasta_entry in outputs:
        #an empty directory string means the current working directory, which needs no creation
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(os.path.join(directory, file_name), mode) as file:
            file.write(fasta_entry)

In [129]:
#define function for merging missing and mobile binary disorder annotations 
#into one binary disorder sequence and writing it into fasta files 

def merge_mobile_missing(protein_code, mobile_dict, missing_dict):
    
    """
    Merges missing and mobile binary disorder annotations for protein given by inputted protein_code 
    into one binary disorder sequence and writes this sequence into allseq.fasta and alldisorder.fasta 
    and new individual AA seq and binary disorder fasta files for the protein.
    
    Residue is labeled 1 in merged annotation as long as it is labeled 1 by either missing or mobile annotations.
    Otherwise the residue keeps the label given by the mobile annotation, so the merged sequence
    always has the same length as the two inputs.
    
    The description and AA sequence written to the fasta files are taken from mobile_dict.
    
    Input:
    - protein_code => string of acc number assigned to protein by MobiDB
    - mobile_dict => dict containing info for proteins with mobile annotations
    - missing_dict => dict containing info for proteins with missing annotations
    ** dict format: (key, value) = ("Q9UPN6", ["Q9UPN6|sequence|SR-rel...","MEAVK...", "10000..."])
    
    Output:
    - new protein entry in allseq.fasta
    - merged binary disorder sequence protein entry alldisorder.fasta
    - new individual fasta files for protein
    
    Raises:
    - KeyError => protein_code is missing from mobile_dict or missing_dict
    - ValueError => the mobile and missing annotations have different lengths
    
    """
    
    mobile_seq = mobile_dict[protein_code][2]
    missing_seq = missing_dict[protein_code][2]
    
    #annotations are merged position by position, so they must describe the same residues
    if len(mobile_seq) != len(missing_seq):
        raise ValueError(
            f"cannot merge disorder annotations for '{protein_code}': "
            f"mobile annotation has {len(mobile_seq)} residues, "
            f"missing annotation has {len(missing_seq)}"
        )
    
    #merging annotations: '1' wins, anything else falls back to the mobile label
    merge_seq = "".join(
        '1' if '1' in (mobile_label, missing_label) else mobile_label
        for mobile_label, missing_label in zip(mobile_seq, missing_seq)
    )
    
    #create temporary protein_dict for protein
    merge_protein_dict = {
        protein_code: [mobile_dict[protein_code][0], mobile_dict[protein_code][1], merge_seq]
    }
    
    #write fasta files
    add_protein(protein_code, merge_protein_dict)

Throughout the notebook, the code is sanity-checked by verifying that the lengths of the dictionaries and lists match the expected protein counts.

From exploring the data, we know that there are:
 - 2074 proteins in curated-disorder-merge
 - 35136 proteins in derived-missing_residues-th_90
 - 1137 proteins in derived-mobile-th_90
 - 36391 proteins when all combined and repeats disregarded

In [130]:
#create a biopython SeqIO parser for each of the three raw fasta files

curated_fasta, missing_fasta, mobile_fasta = (
    SeqIO.parse(fasta_loc, 'fasta')
    for fasta_loc in (curated_fasta_loc, missing_fasta_loc, mobile_fasta_loc)
)

In [131]:
#load fastas into lists

#parse straight from the file locations so this cell can be re-run on its own:
#a SeqIO.parse iterator can only be consumed once, so looping a second time over
#parser objects created in another cell would silently produce empty lists
curated_fasta_lst = list(SeqIO.parse(curated_fasta_loc, 'fasta'))
missing_fasta_lst = list(SeqIO.parse(missing_fasta_loc, 'fasta'))
mobile_fasta_lst = list(SeqIO.parse(mobile_fasta_loc, 'fasta'))

In [132]:
#clean lists of unnecessary entries

#for every dataset keep only the entries whose description contains
#"sequence" (AA sequence of protein) or the dataset's own disorder tag
#(binary disorder sequence of protein)
clean_curated_fasta_lst = []
clean_missing_fasta_lst = []
clean_mobile_fasta_lst = []

#(raw list, cleaned list to fill, disorder tag of the dataset)
cleaning_plan = (
    (curated_fasta_lst, clean_curated_fasta_lst, "curated-disorder-merge"),
    (missing_fasta_lst, clean_missing_fasta_lst, "derived-missing_residues-th_90"),
    (mobile_fasta_lst, clean_mobile_fasta_lst, "derived-mobile-th_90"),
)

for raw_entries, kept_entries, disorder_tag in cleaning_plan:
    for entry in raw_entries:
        #each marker is tested on its own, so an entry whose description
        #contains both markers is kept once per marker
        for marker in ("sequence", disorder_tag):
            if marker in entry.description:
                kept_entries.append(entry)

In [133]:
#sanity check on the cleaned lists

#each protein contributes two entries (AA seq and binary disorder seq),
#so every length should be double the amount of proteins in its category
print(
    f"length of clean_curated_fasta_lst: '{len(clean_curated_fasta_lst)}'",
    f"length of clean_missing_fasta_lst: '{len(clean_missing_fasta_lst)}'",
    f"length of clean_mobile_fasta_lst: '{len(clean_mobile_fasta_lst)}'",
    sep="\n",
)

length of clean_curated_fasta_lst: '4148'
length of clean_missing_fasta_lst: '70272'
length of clean_mobile_fasta_lst: '2274'


In [134]:
#create dicts from cleaned lists

#key of every dict is the protein acc, i.e. the text before the first "|" of the description
curated_dict, missing_dict, mobile_dict = {}, {}, {}

#(cleaned list, dict to fill, disorder tag of the dataset)
datasets = (
    (clean_curated_fasta_lst, curated_dict, "curated-disorder-merge"),
    (clean_missing_fasta_lst, missing_dict, "derived-missing_residues-th_90"),
    (clean_mobile_fasta_lst, mobile_dict, "derived-mobile-th_90"),
)

#first pass: start each value list with the description and the AA sequence string
#(key, value) = ("Q9UPN6", ["Q9UPN6|sequence|SR-rel...","MEAVKTFNSELYSLNDYKP..."])
for records, acc_dict, _ in datasets:
    for record in records:
        if "sequence" in record.description:
            acc = record.description.split("|")[0]
            acc_dict[acc] = [record.description, str(record.seq)]

#second pass: append the binary disorder sequence string to the value list
#(key, value) = ("Q9UPN6", ["Q9UPN6|sequence|SR-rel...","MEAVKTFNSELYSLND...", "100000000000..."])
#runs only after every first pass has finished, so the value lists already exist
for records, acc_dict, disorder_tag in datasets:
    for record in records:
        if disorder_tag in record.description:
            acc = record.description.split("|")[0]
            acc_dict[acc].append(str(record.seq))

In [135]:
#sanity check on the dicts
#each dict holds one key per protein, so its length should equal
#the amount of proteins in its category
print(
    f"length of curated_dict: '{len(curated_dict)}'",
    f"length of missing_dict: '{len(missing_dict)}'",
    f"length of mobile_dict: '{len(mobile_dict)}'",
    sep="\n",
)

length of curated_dict: '2074'
length of missing_dict: '35136'
length of mobile_dict: '1137'


In [136]:
#create lists of protein acc in each dict(will be used in final preprocessing)

#list() over a dict yields its keys in insertion order
curated_acc_lst = list(curated_dict)
missing_acc_lst = list(missing_dict)
mobile_acc_lst = list(mobile_dict)

In [137]:
#sanity check on the acc lists
#each list has one acc per protein, so its length should equal
#the amount of proteins in its category
print(
    f"length of curated_acc_lst: '{len(curated_acc_lst)}'",
    f"length of missing_acc_lst: '{len(missing_acc_lst)}'",
    f"length of mobile_acc_lst: '{len(mobile_acc_lst)}'",
    sep="\n",
)

length of curated_acc_lst: '2074'
length of missing_acc_lst: '35136'
length of mobile_acc_lst: '1137'


In [138]:
#create list of all protein acc excluding repeats(will be used in final preprocessing)

#order is kept: curated acc first, then missing acc not seen yet, then mobile acc not seen yet
#repeats are detected with a set, so each lookup is a hash lookup instead of
#a scan through lists holding tens of thousands of acc
all_acc_lst = []
seen_acc = set()

for acc in curated_acc_lst + missing_acc_lst + mobile_acc_lst:
    if acc not in seen_acc:
        seen_acc.add(acc)
        all_acc_lst.append(acc)

In [139]:
#check length of list
#length should equal the total amount of unique proteins across the
#curated, missing and mobile acc lists (repeats counted once)

print(f"length of all_acc_lst: '{len(all_acc_lst)}'")

length of all_acc_lst: '36391'


In [140]:
#generate fasta files

#start allseq.fasta and alldisorder.fasta from empty files: add_protein opens them
#in append mode, so re-running this cell would otherwise duplicate every entry
for combined_fasta in (allseq_loc + "allseq.fasta", alldisorder_loc + "alldisorder.fasta"):
    with open(combined_fasta, "w"):
        pass

#annotation used per protein: curated if available, else missing and mobile merged,
#else whichever of the two exists
#membership is tested on the dicts (hash lookup); their keys are exactly the acc lists
for acc in all_acc_lst:
    if acc in curated_dict:
        add_protein(acc, curated_dict)
    elif acc in missing_dict and acc in mobile_dict:
        merge_mobile_missing(acc, mobile_dict, missing_dict)
    elif acc in missing_dict:
        add_protein(acc, missing_dict)
    elif acc in mobile_dict:
        add_protein(acc, mobile_dict)