In [1]:
# Import the libraries and functions
from functions import *

## Load the data
We start with uploading the file containing information regarding the disordered regions (`curated.mjson`) which further will be used to filter the instances from the input.

In [2]:
# Set the maximum width of the columns
pd.set_option('display.max_colwidth', 20)

In [3]:
# Create a dataframe with DisProt instances from curated.mjson database
data = list()

with open('curated.mjson', 'r') as file:
    for line in file:
        obj = json.loads(line)
        rows = json_parser(obj)
        data.extend(rows)

curated_disprot = pd.DataFrame(data)

# Calculate the length of disordered regions
curated_disprot['length'] = curated_disprot['end'] - curated_disprot['start'] + 1
curated_disprot = curated_disprot[(curated_disprot['feature'] == 'disorder') & (curated_disprot['source'] == 'disprot')]
curated_disprot['region'] = curated_disprot.apply(lambda row: f"{row['acc']}_{row['start']}-{row['end']}", axis=1)
curated_disprot.to_csv('curated_disprot.csv', index=False)

print(f'The number of instances in the Curated Disprot database: {len(curated_disprot)}')
curated_disprot.head()

The number of instances in the Curated Disprot database: 3151


Unnamed: 0,acc,evidence,feature,source,start,end,length,region
6927,P03265,curated,disorder,disprot,294,334,41,P03265_294-334
6928,P03265,curated,disorder,disprot,454,464,11,P03265_454-464
6929,P49913,curated,disorder,disprot,134,170,37,P49913_134-170
6930,P03045,curated,disorder,disprot,1,107,107,P03045_1-107
6931,P00004,curated,disorder,disprot,1,105,105,P00004_1-105


Apart from separate columns with the Uniprot ID, database related information and the start-end position of the disordered region we create a `region` column comprising the ID, start and end position. This will be used for further disordered regions selection.

Then we upload the XML file with the BLAST results and filter it using the previously loaded `curated_disprot` dataframe.

In [4]:
# Set the path to the input XML files
database_path = '{}/databases/uniprot'.format(directory)
files = os.listdir(database_path)

common_df = pd.DataFrame()

# Merge files into one dataframe
for file_name in files:
    file_path = os.path.join(database_path, file_name)
    df = blast_parser(file_path) # iterates over XML files
    common_df = pd.concat([common_df, df], ignore_index=True)

# print(f'The number of instances in the dataframe: {len(common_df)}')

In [5]:
# Filter only disordered regions in the common_df filtering with the Uniprot IDs from the curated_disordered
disordered = common_df[common_df['query_id'].isin(curated_disprot['acc'])]
disordered.to_csv('disordered_df.csv', index=False)

print(f'The number of rows with the disordered regions: {len(disordered)}')
disordered.head()

The number of rows with the disordered regions: 7393


Unnamed: 0,query_id,subject_id,query_len,hsp_len,query_seq,match_seq,subject_seq,query_start,query_end,subject_start,subject_end,identity,positive,gaps,eval,bit_score,count
200,Q9H832,A0A6J2FM24,354,356,MAESPTEEAATA--GA...,MAESPTEEAATA GA...,MAESPTEEAATATAGA...,1,354,1,354,350,350,4,0.0,1851.0,200
201,Q9H832,A0A3Q7W6Y2,354,356,MAESPTEEAATA--GA...,MAESPTEEAATA GA...,MAESPTEEAATATAGA...,1,354,1,354,350,350,4,0.0,1851.0,200
202,Q9H832,A0A2U3VK69,354,356,MAESPTEEAATA--GA...,MAESPTEEAATA GA...,MAESPTEEAATATAGA...,1,354,1,354,350,350,4,0.0,1851.0,200
203,Q9H832,A0A2Y9JVH5,354,358,MAESPTEEAATA----...,MAESPTEEAATA ...,MAESPTEEAATATATA...,1,354,1,356,351,351,6,0.0,1854.0,200
204,Q9H832,A0A8C7ALE4,354,358,MAESPTEEAATA----...,MAESPTEEAATA ...,MAESPTEEAATATATA...,1,354,1,356,351,351,6,0.0,1854.0,200


In [6]:
# # Open XML file as a dataframe - for one file
# input_file = '{}/databases/uniprot/curated_uniprot.fasta_75'.format(directory) # change the file name if necessary
# df = blast_parser(input_file)
# print(f'The number of instances: {len(df)}')

# # Keep only disordered regions in the initial dataframe filtering with curated_disordered dataframe
# disordered = df[df['query_id'].isin(curated_disprot['acc'])]
# disordered.to_csv('disordered_df.csv', index=False)

# print(f'The number of rows with the disordered regions: {len(disordered)}')
# disordered.head()

In [7]:
# # Extract the disordered regions IDs and positions
# dis_regs = set()

# for i, row in disordered.iterrows():
#     dis_id = row[0]
#     matching_row = curated_disprot[curated_disprot['acc'] == dis_id]
#     if not matching_row.empty:
#         region = matching_row['region']
#         dis_regs.update(region)

# print(f'The number of disordered regions in the database: {len(dis_regs)}')

# # Define an array of disordered regions ids
# disprot_ids = dis_regs

# # Dropdown list of Uniprot query IDs for disordered regions
# output = widgets.Select(options=disprot_ids,
#     rows=10,
#     description='Uniprot ID: ',
#     layout={'width': 'max-content'},
#     disabled=False)

# display(output)

In [8]:
# Save the data for building MSA of the disordered regions
# id_dis = output.value # the Uniprot ID + start and end positions
# id_split = id_dis.split('_')[0] # the Uniprot ID
# i = 1 # change to the necessary region
# store the same id for all notebooks
# %store id_dis 
# id_split
# %store i
# selected_dis = disordered[disordered['query_id'].isin([id_dis.split('_')[0]])] # the information for one query ID
# print(f'The number of sequences for the MSA of the {id_split} protein: {len(selected_dis)}')
# selected_dis.head()

In [9]:
# Filter the curated_disprot dataframe based on selected ID
# curated_query = curated_disprot[curated_disprot['region'] == id_dis]
# curated_query

## 1. Multiple Sequence Alignment

Within this framework, we will compare the MSA results obtained directly from the BLAST output with those generated from the ClustalOmega and MAFFT.

### 1.1 MSA from the BLAST output (local alignment)
This code iterates through each row of the dataframe for a previously selected query ID. If there is no gap, it maps the amino acid from the subject sequence to the corresponding position in the query sequence, starting from the beginning of the query sequence. The resulting mapped amino acids are then added to a new count. In the end we have a FASTA file with the subject IDs and the aligned sequences.

In [10]:
# # Retrieve query sequence and its length from the disordered dataframe
# query_sequence = disordered[disordered['query_id'] == id_split]['query_seq'].unique()[0]
# query_sequence = re.sub(r'[-]', '', query_sequence)
# query_len = disordered[disordered['query_id'] == id_split]['query_len'].unique()[0]
# print(f'The Uniprot ID: {id_split}', '\n'
#       f'The length of the sequence: {query_len}', '\n'
#       f'The sequence: {query_sequence}')

In [11]:
# Create a dataframe of disprot sequences
uniprot_ids = disordered['query_id'].unique()

data = []

for i in uniprot_ids:
    query_sequence = disordered[disordered['query_id'] == i]['query_seq'].unique()[0]
    query_sequence = re.sub(r'[-]', '', query_sequence)
    query_len = len(query_sequence)
    
    data.append([i, query_sequence, query_len])

disprot_sequences = pd.DataFrame(data, columns=['disprot_id', 'query_sequence', 'query_len'])
print(f'The number of proteins with the disordered regions: {len(disprot_sequences)}')
disprot_sequences.head()

The number of proteins with the disordered regions: 39


Unnamed: 0,disprot_id,query_sequence,query_len
0,Q9H832,MAESPTEEAATAGAGA...,354
1,Q8IW19,MSGGFELQPRDGGPRV...,511
2,Q99967,MADHMMAMNHGRFPDG...,270
3,Q9CXY6,MRGDRGRGRGGRFGSR...,390
4,Q8R464,PLLLLWAAAAGPGTGQ...,379


In [12]:
# # 1.2 Build the MSA from the BLAST - for all alignments
# # Iterate through each row in the disprot_sequences 
# for ind_q, row_q in disprot_sequences.iterrows():
#     query_id = row_q['disprot_id']
#     query_sequence = row_q['query_sequence']
#     query_len = row_q['query_len']
#     selected_dis = disordered[disordered['query_id'].isin([query_id])]
#     print(ind_q, query_id, query_len, query_sequence[:3],"...", query_sequence[-3:])
    
#     # Save the file
#     out_file = f'{directory}/results/alignments/output_files/blast/{query_id}_blast.fasta'
    
#     # Initialize the first row with a length of the query sequence
#     with open(out_file, 'w') as fout:
#         mapped_seq = ['-'] * query_len

#         # Write the header line for the query sequence
#         fout.write('>{}\n'.format(query_id))

#         # Map the query sequence to the mapped_seq list
#         c = 0
#         for l_q in query_sequence:
#             if l_q != ' ' and l_q != '-':
#                 mapped_seq[c] = l_q
#                 c += 1

#         # Write the query_mapped_seq sequence to the output file
#         fout.write('{}\n'.format(''.join(mapped_seq)))

#         # Map the subject sequences to the mapped_seq list and write to the output file
#         for index, row in selected_dis.iterrows():
#             if query_id == row['subject_id']:
#                 continue
                
#             c = 0
#             query_start = row['query_start']
#             for l_q, l_s in zip(row['query_seq'], row['subject_seq']):
#                 if l_q != ' ' and l_q != '-': # if the initial aa from query is not empty or gapped
#                     if query_start + c - 1 < len(mapped_seq): # added the condition
#                         mapped_seq[query_start + c - 1] = l_s if l_s != ' ' else '-' # assign aa to subject
#                         c += 1
#             fout.write('>{}\n{}\n'.format(row['subject_id'], ''.join(mapped_seq)))

### 1.2. MSA from the ClustalOmega (global alignment)

Initially, we need to preprocess the dataframe into a suitable input format file for ClustalOmega. For each Uniprot ID (query and all the subject) we extract the sequence from Uniprot using `get_fasta` function. 

These sequences are NOT aligned at that time!

In [13]:
### RUNS FOR A LONG TIME
# # Retrieve the unaligned sequences from the local machine - for all disprot sequences at once
# for d_id, d_row in disprot_sequences.iterrows():
#     id_split = d_row['disprot_id']
#     seq = d_row['query_sequence']
#     output_file = f'{directory}/results/alignments/input_files/{id_split}_input.fasta'
    
#     # Iterating over selected_dis
#     with open(output_file, 'w') as fout:
#         selected_dis = disordered[disordered['query_id'].isin([id_split])]
#         # Write the query sequence to the output file as the first line
#         fout.write(">{}\n{}\n".format(id_split, seq))
#         print(id_split, len(seq)) # correct

#         for index, row in selected_dis.iterrows():
#             accession = row['subject_id']
#             sequence = get_fasta(accession)
#             print(index, accession, len(sequence))
#             if id_split == accession: # remove duplicates
#                 continue
#             fout.write(sequence)

In [14]:
# # Retrieve the unaligned sequences from the local machine - for one sequence
# output_file = f'{directory}/results/alignments/input_files/{id_split}_input.fasta'

# with open(output_file, 'w') as fout:
#     # Write the query sequence to the output file as the first line
#     fout.write('>{}\n{}\n'.format(id_split, query_sequence))

#     for index, row in selected_dis.iterrows():
#         accession = row['subject_id']
#         sequence = get_fasta(accession)
#         print(index, accession, len(sequence))
#         if id_split == accession: # remove duplicates
#             continue
#         fout.write(sequence)

In [17]:
# ClustalOmega MSA generator
def clustalo_generator(input_folder, output_folder):
    # Iterate over all input files in the input folder
    for input_file in os.listdir(input_folder):
        if input_file.endswith('.fasta'): # remove 1 later
            # Extract ID from the file name
            id_split = os.path.splitext(input_file)[0].split('_')[0]
            print(f'MSA ClustalOmega is generated for the {id_split} protein')
            output_file = os.path.join(output_folder, f'{id_split}_clustal.fasta') # name of the output file

            # Define the ClustalOmega command
            clustalomega_cline = ClustalOmegaCommandline(
                infile=os.path.join(input_folder, input_file),
                outfile=output_file,
                outputorder='input-order',
                verbose=False,
                auto=True,
                force=True)

            # Run the ClustalOmega command
            subprocess.run(str(clustalomega_cline), shell=True)

After obtaining the sequences, we build the ClustalOmega MSA using the function `clustalo_generator`.

In [18]:
# Run the ClustalOmega generator - it takes a while
input_folder = directory + '/results/alignments/input_files'
output_folder = directory + '/results/alignments/output_files/clustal'

clustalo_generator(input_folder, output_folder)

P00742
Q93KQ4
Q9BYF1
A4L7I2
O14958
P07342
A8AZZ3
O14727
P01019
Q9CXY6
Q9H832
Q86FP8
P01097
O15922
Q9BYI3
S7W634
J8TM36
O00585
A1L1Q4
P00392
Q8IU57
O43474
Q84852
P00736
Q8IW19
Q8R464
Q99967
Q8WUG5
Q5VZK9
O35274
O00308
S6B291
Q9H0E2
O43791




Q5T4W7
O88339
P04370-5
Q9Z2F5
Q8K4J6


In [26]:
# Remove gaps from the query sequence of the alignment
def remove_extra_gaps(alignment):
    # Get the first sequence
    first_seq = alignment[0].seq

    # Find the positions of non-gap characters in the first sequence
    non_gap_positions = [i for i, base in enumerate(first_seq) if base != '-']

    # Create a list to hold SeqRecord objects for the filtered alignment
    filtered_seqs = []

    for seq_record in alignment:
        # Extract and join the non-gap characters
        filtered_seq = Seq(''.join(seq_record.seq[i] for i in non_gap_positions), seq_record.seq.alphabet)
        
        # Create a new SeqRecord with the filtered sequence
        filtered_record = SeqRecord(
            seq=filtered_seq,
            id=seq_record.id,
            description=seq_record.description)
        
        filtered_seqs.append(filtered_record)

    # Create a new alignment with filtered SeqRecord objects
    new_alignment = MultipleSeqAlignment(filtered_seqs, alphabet=alignment._alphabet)

    return new_alignment

# Apply gaps removing to all files
def process_folder(input_folder):
    # Iterate over all input files in the input folder
    for input_file in os.listdir(input_folder):
        if input_file.endswith('.fasta'):
            # Set file paths
            file_path = os.path.join(input_folder, input_file)

            # Read the alignment and remove extra gaps
            alignment = AlignIO.read(file_path, 'fasta')
            filtered_alignment = remove_extra_gaps(alignment)

            # Write the filtered alignment to a new file
            with open(file_path, "w") as output_handle:
                AlignIO.write(filtered_alignment, output_handle, 'fasta')

In [27]:
# Remove gaps from all ClustalOmega MSA
folder = f'{directory}/results/alignments/output_files/clustal'

process_folder(folder)

We choose the following parameters for MSA ClustalOmega:

- Output format: Pearson/FASTA
- Order: input

Next, we will save the outputs in the corresponding directories. After that, we need to open the files in Jalview and remove gaps in the first line corresponding to the query Uniprot ID to maintain the correct length of sequences.
As a result, we have 2 alignments for comparison: BLAST and ClustalOmega.

### 1.3 Preparing the proteins for analysis

In [891]:
# Prepare all sequences for the following comparison - the alignments fasta files
blast_path = f'{directory}/results/alignments/output_files/blast/'
# al_blast = f'{directory}/results/alignments/output_files/blast/{id_split}_blast.fasta'
# al_clustal = f'{directory}/results/alignments/output_files/clustal/{id_split}_clustal.fasta'

# Make a dataframes of these alignments
# blast_seqs = get_seqs(al_blast)
blast_seqs = print_dis_seqs(blast_path, 'blast', id_split)
# clustal_seqs = get_seqs(al_clustal)
%store blast_seqs 
# clustal_seqs

# The number of rows and columns (sequences and length of the sequence)
id_split, blast_seqs.shape # clustal_seqs.shape

Stored 'blast_seqs' (ndarray)


('P00392', (200, 561))

In [892]:
# blast_seqs_dict = {}

# for ind_q, row_q in disprot_sequences.iterrows():
#     query_id = row_q['disprot_id']
    
#     for blast_file in os.listdir(blast_path):
#         blast_seqs = print_dis_seqs(blast_path, 'blast', query_id)
#         blast_seqs_dict[blast_file] = blast_seqs

In [893]:
# for blast_file, blast_seqs in blast_seqs_dict.items():
#     print(f"BLAST file: {blast_file}, MSA shape: {blast_seqs.shape}")

In [894]:
# Calculate the statistics
# Returns the values of occupancy and entropy for each alignment
def stats_calculation(seqs, q_id):
    
    data = []
    aa = 'ACDEFGHIKLMNPQRSTVWY'

    for i, column in enumerate(seqs.T):

        count = Counter(column)
        try:
            count.pop('-')
        except KeyError:
            pass
        count_sorted = sorted(count.items(), key=lambda x:x[1], reverse=True)

        non_gap = np.count_nonzero(column != '-')
        occupancy = non_gap / column.size

        probabilities = [count.get(k, 0.0) / column.size for k in aa]

        entropy = scipy.stats.entropy(probabilities, base=20)
        data.append([i, q_id, occupancy, entropy, count_sorted])

    df_calc = pd.DataFrame(data, columns=['pos', 'query_id', 'occupancy', 'entropy', 'counts'])
    return df_calc

In [895]:
blast_calc = stats_calculation(blast_seqs, id_split) # initial MSA BLAST
blast_calc.to_csv(f'results/stats/blast_calc_{id_split}.csv')
# clustal_calc = stats_calculation(clustal_seqs, id_split) # initial MSA ClustalOmega

In [896]:
# Merge files with statistics
folder_path = 'results/stats/'
stats_total = pd.DataFrame()

for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):
        file_path = os.path.join(folder_path, filename)
        data = pd.read_csv(file_path, index_col=0)
        stats_total = pd.concat([stats_total, data], ignore_index=True)

stats_total.to_csv('results/stats_total.csv')
print(f'The length of the stats_total dataframe: {len(stats_total)}')
stats_total.head()

The length of the stats_total dataframe: 21870


Unnamed: 0,pos,query_id,occupancy,entropy,counts
0,0,P01019,1.0,0.0,"[('M', 200)]"
1,1,P01019,1.0,0.346925,"[('A', 143), ('T..."
2,2,P01019,1.0,0.110752,"[('P', 186), ('A..."
3,3,P01019,1.0,0.191072,"[('A', 169), ('T..."
4,4,P01019,1.0,0.211104,"[('G', 166), ('S..."


In [897]:
print(stats_total['query_id'].nunique())
print(stats_total['query_id'].unique())

39
['P01019' 'A1L1Q4' 'Q9BYI3' 'Q9H832' 'Q5T4W7' 'S7W634' 'Q8IW19' 'Q93KQ4'
 'Q9CXY6' 'Q8IU57' 'J8TM36' 'O43474' 'Q84852' 'O88339' 'Q8R464' 'Q99967'
 'A8AZZ3' 'P07342' 'P01097' 'Q9Z2F5' 'Q8K4J6' 'O00585' 'P00392' 'O14727'
 'Q86FP8' 'P00742' 'P04370-5' 'Q5VZK9' 'Q8WUG5' 'O35274' 'S6B291' 'P00736'
 'A4L7I2' 'O15922' 'O00308' 'Q9BYF1' 'O14958' 'O43791' 'Q9H0E2']


### 1.4 Calculating and removing redundant regions from MSA
We will use CD-Hit tool. Given the aligned sequences as an input and setting the threshold of 62% we will keep only non-redundant regions in MSA.

In [838]:
# # Calculate redundancy 
# def calculate_Nf(msa_file, threshold, id_split):

#     output_file = f"/Users/alina/HMM/results/alignments/input_files/non-redundant/Nf_{id_split}.fasta"
#     cd_hit_path = "/Users/alina/cd-hit/cd-hit"

#     # Run CD-HIT to cluster the sequences (excluding the first line) and remove redundancy
#     cmd = f"{cd_hit_path} -i {msa_file} -o {output_file} -c {threshold} -n 4 > /dev/null"
#     subprocess.call(cmd, shell=True)

#     # Read the first line from the original MSA file
#     with open(msa_file, "r") as msa_handle:
#         first_record = next(SeqIO.parse(msa_handle, "fasta"))

#     # Temporarily store the non-redundant sequences in a list
#     non_redundant_sequences = []
#     with open(output_file, "r") as output_handle:
#         for record in SeqIO.parse(output_handle, "fasta"):
#             non_redundant_sequences.append(record)

#     # Write the non-redundant sequences to the output file
#     with open(output_file, "w") as final_handle:
#         SeqIO.write([first_record] + non_redundant_sequences, final_handle, "fasta")

#     # Count the number of sequences in the MSA and the non-redundant MSA
#     total_sequences = sum(1 for record in SeqIO.parse(msa_file, "fasta"))
#     non_redundant_sequences_count = len(non_redundant_sequences)

#     # Calculate the effective sequences (Nf)
#     Nf = non_redundant_sequences_count / total_sequences
#     print("The number of non-redundant sequences:", non_redundant_sequences_count)
#     print("The total number of sequences:", total_sequences)
#     print("The ratio of non-redundant sequences (Nf):", "{:.2f}".format(Nf))

#     return

In [839]:
# Calculate redundancy and the number of effective sequences with calculate_Nf function using the .fasta as an input
# ali_file = f'{directory}/results/alignments/input_files/{id_split}_input_1.fasta'
# print(id_split)
# calculate_Nf(ali_file, 0.62, id_split) 

# calculate_Nf(al_blast, 0.62, id_split)
# calculate_Nf(al_clustal, 0.62, id_split)
# calculate_Nf(al_mafft, 0.62, id_split)

In [840]:
# # Save the generated cluster file for the MSA notebook
# data_file = f"/Users/alina/HMM/results/alignments/input_files/non-redundant/Nf_{id_split}.fasta.clstr"
# %store data_file

In [841]:
# # Load non_redundant MSA from ClustalOmega
# non_redundant = f'{directory}/results/alignments/output_files/clustal/non-redundant/Nf_{id_split}_clustal.fasta'

# nr_seqs = get_seqs(non_redundant)
# %store nr_seqs

# # Check the shape of non-redundant MSA
# print(f"The shape of non-redundant MSA for {id_split} protein:", nr_seqs.shape)
# nr_seqs

### 1.5 Defining disordered regions in MSA

Here we will look at the positions of disordered regions in an alignment. Then we'll extract these regions for the separate analysis.

In [794]:
# # Extract the lists of start and end regions
# start_regions = curated_query['start'].tolist()
# end_regions = curated_query['end'].tolist()
# print(f'The lists of start and end positions of the {id_split} disordered regions: \n'
#       f'Start regions: {start_regions}, \n'
#       f'End regions: {end_regions}')

In [795]:
# output_directory = f'{directory}/results/alignments/output_files/disordered'

# start_regions = []
# end_regions = []

# for i, row in curated_disprot.iterrows():
# #     id_dis = row.loc['acc']
#     start_regions.append(row.loc['start'])
#     end_regions.append(row.loc['end'])
#     separate_disordered_regions = select_dis_regions(al_blast, id_dis, start_regions, end_regions, output_directory)

# print(f"The lists of start and end positions of the disordered regions: \n"
#           f"Start regions: {start_regions}, \n"
#           f"End regions: {end_regions}")

### 1.5.1 Define disordered regions for the redundant MSAs

In [126]:
# Saves disordered regions from MSA output in separate FASTA files
def select_dis_regions(msa_file, query_id, output_directory):

    sep_sequences = []  # Collect trimmed sequences
    id_split = query_id.split('_')[0]
    start_position = int(query_id.split('_')[1].split('-')[0])
    end_position = int(query_id.split('_')[1].split('-')[1])
    print(id_split, start_position, end_position)

    with open(msa_file, 'r') as msa_handle:
        msa_records = list(SeqIO.parse(msa_handle, 'fasta'))

    records = []
    query_record_id = str(query_id)
    subject_record_ids = []

    for j, record in enumerate(msa_records):
        sequence = record.seq
        if len(sequence) >= start_position > 0 and end_position <= len(sequence):
            trimmed_sequence = sequence[start_position - 1: end_position]

            if j == 0:
                record_id = query_record_id
                description = ''
            else:
                if record.id not in subject_record_ids:
                    subject_record_ids.append(record.id)
                record_id = subject_record_ids[-1]
                description = record.description

            disordered_record = SeqIO.SeqRecord(trimmed_sequence, id=record_id, description=description)
            records.append(disordered_record)
        else:
            print(f'Invalid region: start={start_position}, end={end_position}')

    if records:
        output_file_separate = os.path.join(output_directory, f'{query_id}.fasta')
        SeqIO.write(records, output_file_separate, 'fasta')

    sep_sequences.extend([record.seq for record in records])  # Extend the collected sequences

    return sep_sequences

In [157]:
def print_dis_seqs(directory, align_type, query_id):

    for file_name in os.listdir(directory):
        if f'{query_id}_' in file_name and f'_disordered' not in file_name:
            alignment_file = os.path.join(directory, file_name)
            seqs = []

            with open(alignment_file) as f:
                for record in AlignIO.read(f, 'fasta'):
                    seqs.append(np.array(list(record.seq), dtype='str'))

    if seqs:
        return np.array(seqs, dtype='str')
    else:
        return np.array([])  # Return an empty array if no sequences were found

In [172]:
# Split the disordered regions with the select_dis_regions function
output_directory = f'{directory}/results/alignments/output_files/disordered'
separate_disordered_regions = select_dis_regions(al_blast, id_dis, output_directory)

# if there is more than 1 region
dis_seqs = print_dis_seqs(output_directory, 'disordered', id_split)
%store dis_seqs
# if isinstance(dis_seqs, list):  # Check if dis_seqs is a list of several disordered regions
#     print(id_split, dis_seqs[0].shape, type(dis_seqs[0]))
#     print(id_split, dis_seqs[1].shape, type(dis_seqs[1]))
# else: # if dis_seqs consists of one disordered region only
#     print(id_split, dis_seqs.shape, type(dis_seqs))
    
print(id_split, dis_seqs.shape, type(dis_seqs))    

Q8IW19 450 511
Stored 'dis_seqs' (ndarray)
Q8IW19 (200, 62) <class 'numpy.ndarray'>


In [250]:
dis_seqs.shape

(200, 62)

In [251]:
blast_seqs.shape

(200, 651)

In [173]:
# # Calculation of occupancy and entropy
# if isinstance(dis_seqs, list):
#     dis_calc0 = stats_calculation(dis_seqs[0], id_split)
#     dis_calc1 = stats_calculation(dis_seqs[0], id_split)
#     %store dis_calc0 dis_calc1
# else:
dis_calc = stats_calculation(dis_seqs, id_split)
%store dis_calc

Stored 'dis_calc' (DataFrame)


### 1.5.2 Define disordered regions for the non-redundant MSAs

In [170]:
# Split the disordered regions with the help of select_dis_regions function
output_directory = f"{directory}/results/alignments/output_files/disordered/non-redundant"
separate_disordered_regions = select_dis_regions(non_redundant, id_dis, output_directory)

# if there are more than 1 region
dis_seqs_nr = print_dis_seqs(output_directory, 'disordered', id_split)
%store dis_seqs_nr
# if isinstance(dis_seqs, list):  # Check if dis_seqs is a list of several disordered regions
#     print(id_dis, dis_seqs_nr[0].shape, type(dis_seqs_nr[0]))
#     print(id_dis, dis_seqs_nr[1].shape, type(dis_seqs_nr[1]))
# else: # if dis_seqs consists of one disordered region only
print(id_split, dis_seqs_nr.shape, type(dis_seqs_nr))

Q8IW19 450 511
Stored 'dis_seqs_nr' (ndarray)
Q8IW19 (4, 62) <class 'numpy.ndarray'>


In [171]:
# # Calculation of occupancy and entropy
# if isinstance(dis_seqs, list):
#     dis_calc_nr0 = stats_calculation(dis_seqs_nr[0], id_split)
#     dis_calc_nr1 = stats_calculation(dis_seqs_nr[0], id_split)
#     %store dis_calc_nr0 dis_calc_nr1
# else:
dis_calc_nr = stats_calculation(dis_seqs_nr, id_split)
%store dis_calc_nr

Stored 'dis_calc_nr' (DataFrame)
