In [1]:
# Set the path and input parameters
import os
directory = os.getcwd() # the main directory of the project

# The credentials for the remote cluster
name = 'alina'
server = 'ecate'

In [2]:
# Import the libraries and functions
from functions import *

## Load the data
We start with uploading the file containing information regarding the disordered regions (`curated.mjson`) which further will be used to filter the instances from the input.

In [3]:
# Set the maximum width of the columns
pd.set_option('display.max_colwidth', 20)

In [4]:
# Create a dataframe with DisProt instances from curated.mjson database
data = list()

with open('curated.mjson', 'r') as file:
    for line in file:
        obj = json.loads(line)
        rows = json_parser(obj)
        data.extend(rows)

curated_disprot_df = pd.DataFrame(data)

# Calculate the length of disordered regions
curated_disprot_df['length'] = curated_disprot_df['end'] - curated_disprot_df['start'] + 1
curated_disprot_df = curated_disprot_df[(curated_disprot_df['feature'] == 'disorder') & (curated_disprot_df['source'] == 'disprot')]
curated_disprot_df['region'] = curated_disprot_df.apply(lambda row: '{}_{}-{}'.format(row['acc'], row['start'], row['end']), axis=1)
curated_disprot_df.to_csv('curated_disprot.csv', index=False)

print('The number of instances in the Curated Disprot database: {}'.format(len(curated_disprot_df)))
curated_disprot_df.head()

The number of instances in the Curated Disprot database: 3151


Unnamed: 0,acc,evidence,feature,source,start,end,length,region
6927,P03265,curated,disorder,disprot,294,334,41,P03265_294-334
6928,P03265,curated,disorder,disprot,454,464,11,P03265_454-464
6929,P49913,curated,disorder,disprot,134,170,37,P49913_134-170
6930,P03045,curated,disorder,disprot,1,107,107,P03045_1-107
6931,P00004,curated,disorder,disprot,1,105,105,P00004_1-105


Apart from separate columns with the Uniprot ID, database related information and the start-end position of the disordered region we create a `region` column comprising the ID, start and end position. This will be used for further disordered regions selection.

Then we upload the XML file with the BLAST results and filter it using the previously loaded `curated_disprot` dataframe.

In [5]:
# Set the path to the input XML files
database_path = '{}/databases/uniprot'.format(directory)
files = os.listdir(database_path)

common_df = pd.DataFrame()

# Merge files into one dataframe
for file_name in files:
    file_path = os.path.join(database_path, file_name)
    df = blast_parser(file_path) # iterates over XML files
    common_df = pd.concat([common_df, df], ignore_index=True)

In [6]:
# Filter only disordered regions in the common_df filtering with the Uniprot IDs from the curated_disordered
disordered_df = common_df[common_df['query_id'].isin(curated_disprot_df['acc'])]
disordered_df.to_csv('disordered_df.csv', index=False)

print('The number of rows with the disordered regions: {}'.format(len(disordered_df)))
disordered_df.head()

The number of rows with the disordered regions: 7393


Unnamed: 0,query_id,subject_id,query_len,hsp_len,query_seq,subject_seq,query_start,query_end,subject_start,subject_end,eval,bit_score,count
200,Q9H832,A0A6J2FM24,354,356,MAESPTEEAATA--GA...,MAESPTEEAATATAGA...,1,354,1,354,0.0,1851.0,200
201,Q9H832,A0A3Q7W6Y2,354,356,MAESPTEEAATA--GA...,MAESPTEEAATATAGA...,1,354,1,354,0.0,1851.0,200
202,Q9H832,A0A2U3VK69,354,356,MAESPTEEAATA--GA...,MAESPTEEAATATAGA...,1,354,1,354,0.0,1851.0,200
203,Q9H832,A0A2Y9JVH5,354,358,MAESPTEEAATA----...,MAESPTEEAATATATA...,1,354,1,356,0.0,1854.0,200
204,Q9H832,A0A8C7ALE4,354,358,MAESPTEEAATA----...,MAESPTEEAATATATA...,1,354,1,356,0.0,1854.0,200


## 1. Multiple Sequence Alignment

Within this framework, we will compare the MSA results obtained directly from the BLAST output with that generated from the ClustalOmega.

### 1.1 MSA from the BLAST output (local alignment)
This code iterates through each row of the dataframe for a previously selected query ID. If there is no gap, it maps the amino acid from the subject sequence to the corresponding position in the query sequence, starting from the beginning of the query sequence. The resulting mapped amino acids are then added to a new count. In the end we have a FASTA file with the subject IDs and the aligned sequences.

We construct a simple dataframe of query IDs and sequences. They will be used to set the maximum length of the alignment.

In [7]:
# Create a dataframe of disprot sequences
uniprot_ids = disordered_df['query_id'].unique()

data = []

for i in uniprot_ids:
    query_sequence = disordered_df[disordered_df['query_id'] == i]['query_seq'].unique()[0]
    query_sequence = re.sub(r'[-]', '', query_sequence)
    query_len = disordered_df[disordered_df['query_id'] == i]['query_len'].unique()[0]
    
    data.append([i, query_sequence, query_len])

disprot_sequences = pd.DataFrame(data, columns=['disprot_id', 'query_sequence', 'query_len'])
print('The number of proteins with the disordered regions: {}'.format(len(disprot_sequences)))

The number of proteins with the disordered regions: 39


In [8]:
# Build the MSA from the BLAST output
for ind_q, row_q in disprot_sequences.iterrows():
    query_id = row_q['disprot_id']
    query_sequence = row_q['query_sequence']
    query_len = row_q['query_len']
    selected_dis = disordered_df[disordered_df['query_id'].isin([query_id])]
    print(ind_q, query_id, query_len, query_sequence[:3], '...', query_sequence[-3:])
    
    # Save the files
    out_file = '{}/results/alignments/output_files/blast/{}_blast.fasta'.format(directory, query_id)
    
    # Initialize the first row with a length of the query sequence
    with open(out_file, 'w') as fout:
        mapped_seq = ['-'] * query_len

        # Write the header line for the query sequence
        fout.write('>{}\n'.format(query_id))

        # Map the query sequence to the mapped_seq list
        c = 0
        for l_q in query_sequence:
            if l_q != ' ' and l_q != '-':
                mapped_seq[c] = l_q
                c += 1

        # Write the mapped_seq sequence to the output file
        fout.write('{}\n'.format(''.join(mapped_seq)))

        # Map the subject sequences to the mapped_seq list and write to the output file
        for index, row in selected_dis.iterrows():
            if query_id == row['subject_id']:
                continue
                
            c = 0
            query_start = row['query_start']
            for l_q, l_s in zip(row['query_seq'], row['subject_seq']):
                if l_q != ' ' and l_q != '-': # if the initial aa from query is not empty or gapped
                    if query_start + c - 1 < len(mapped_seq): # added the condition
                        mapped_seq[query_start + c - 1] = l_s if l_s != ' ' else '-' # assign aa to subject
                        c += 1
            fout.write('>{}\n{}\n'.format(row['subject_id'], ''.join(mapped_seq)))

0 Q9H832 354 MAE ... LRV
1 Q8IW19 511 MSG ... KRK
2 Q99967 270 MAD ... VSC
3 Q9CXY6 390 MRG ... TQE
4 Q8R464 388 PLL ... FFI
5 Q8K4J6 964 QSV ... SCL
6 Q9BYI3 521 MFT ... STD
7 Q9BYF1 805 MSS ... TSF
8 Q8WUG5 538 MAS ... PAL
9 Q9H0E2 274 QRG ... EEP
10 Q84852 1225 TSV ... HVH
11 Q5VZK9 1371 MTE ... IFV
12 Q8IU57 520 MAG ... YMA
13 Q5T4W7 220 MEL ... CLG
14 Q93KQ4 116 MGS ... QKL
15 Q86FP8 80 MKH ... NQD
16 P00742 488 GES ... EVI
17 P01019 476 MAP ... PLS
18 O88339 575 MST ... FLL
19 O35274 817 MMK ... NST
20 P07342 687 YSS ... GKH
21 O15922 447 AGG ... RHP
22 O00308 870 MAS ... GQE
23 O14727 1248 MDA ... TLE
24 P00736 705 WLL ... EED
25 P00392 561 MTH ... CAG
26 O43474 513 MAV ... RHF
27 O43791 374 MSR ... KQS
28 J8TM36 277 MQK ... RSS
29 A1L1Q4 408 LSF ... MWR
30 A4L7I2 2474 MDS ... GPK
31 S7W634 831 TSE ... RHL
32 Q9Z2F5 430 MSG ... DQL
33 S6B291 466 YAM ... PGK
34 O14958 399 MKR ... INT
35 A8AZZ3 195 MKK ... AVK
36 P04370-5 169 MAS ... ARR
37 O00585 134 MAQ ... KGP
38 P01097 85 MLP 

### 1.2. MSA from the ClustalOmega (global alignment)

Initially, we need to preprocess the dataframe into a suitable input format file for ClustalOmega. For each Uniprot ID (query and all the subjects) we extract the sequence from Uniprot using `get_fasta` function. 

These sequences are NOT aligned at that time!

In [9]:
# # RUNS FOR A LONG TIME - 4-5 minutes for one MSA
# # Retrieve the unaligned sequences from the local machine
# for d_id, d_row in disprot_sequences.iterrows():
#     id_split = d_row['disprot_id']
#     seq = get_fasta(id_split) # the whole sequence including the name of the protein
#     lines = seq.split('\n')
#     seq_join = ''.join(lines[1:]) # only the sequence of amino acids
#     output_file = '{}/results/alignments/input_files/{}_input.fasta'.format(directory, id_split)
    
#     with open(output_file, 'w') as fout:
#         selected_dis = disordered_df[disordered_df['query_id'].isin([id_split])]
#         # Write the query sequence to the output file as the first line
#         fout.write('>{}\n{}\n'.format(id_split, seq_join))
#         print('Retrieving sequences for {} MSA, query sequence length: {}'.format(id_split, len(seq_join)))

#         for index, row in selected_dis.iterrows():
#             accession = row['subject_id']
#             sequence = get_fasta(accession) # the whole sequence including the name of the protein
#             lines_subj = sequence.split('\n')
#             seq_subj_join = ''.join(lines_subj[1:]) # only the sequence of amino acids
#             print(index, accession, len(seq_subj_join))
#             if id_split == accession: # remove duplicates of the query sequence
#                 continue
#             fout.write(sequence)

After obtaining the sequences, we build the ClustalOmega MSA using the function `clustalo_generator`.

In [10]:
# # Run the ClustalOmega generator - 1-5 minutes for an alignment
# input_folder = '{}/results/alignments/input_files'.format(directory)
# output_folder = '{}/results/alignments/output_files/clustal'.format(directory)

# clustalo_generator(input_folder, output_folder)

Initially these sequences contain a lot of gaps. However, to properly extract the disordered regions we need that the length of the query sequence will be the length of the whole MSA, and this sequence should not contain any gaps. To process the data we apply the `process_folder` function that removes all the gaps from the first sequence of the ClustalOmega MSA.

In [11]:
# Remove gaps from all ClustalOmega MSAs
folder = '{}/results/alignments/output_files/clustal'.format(directory)

process_folder(folder)

### 1.3 Calculation of statistics
In this step, we calculate the occupancy and entropy for each of BLAST and ClustalOmega MSA. Occupancy will show how many gaps are in the certain column whereas the entropy will demonstrate how many varios amino acids are in there. Both these metrics help to derive insights on the quality of the alignment.

In [12]:
# Set paths to the input alignments fasta files - BLAST and ClustalOmega
blast_path = '{}/results/alignments/output_files/blast/'.format(directory)
clustal_path = '{}/results/alignments/output_files/clustal/'.format(directory)

In [13]:
# Run the process_files function
blast_stats = process_files(blast_path, 'blast')
clustal_stats = process_files(clustal_path, 'clustal')

# %store saves data in the memory and provides a quick accession
# from the other notebooks in the folder
%store blast_stats clustal_stats

Stored 'blast_stats' (DataFrame)
Stored 'clustal_stats' (DataFrame)


### 1.4 Calculating and removing redundant regions from MSA
We will use CD-Hit tool. Given the aligned sequences as an input and setting the threshold of 62% we will keep only non-redundant regions in MSA.

In [14]:
# Set output paths and parameters for non-redundant MSAs
blast_path_nr = '{}/results/alignments/output_files/blast/non-redundant'.format(directory)
clustal_path_nr = '{}/results/alignments/output_files/clustal/non-redundant'.format(directory)

red_seqs = []
threshold = 0.9 # play with the value
word_size = 4

In [15]:
# Select non-redundant sequences for BLAST MSAs
for file in os.listdir(blast_path):
    if file.endswith('.fasta'):
        input_file=os.path.join(blast_path, file)
        output_file=os.path.join(blast_path_nr, file)
        protein_id = os.path.splitext(file)[0].split('_')[0]
        calc_red = calculate_red(input_file, output_file, threshold, word_size, protein_id)

Non-redundant seqs for P07342: 10 Total no. of seqs: 200 Ratio: 0.05
Non-redundant seqs for O14958: 3 Total no. of seqs: 200 Ratio: 0.01
Non-redundant seqs for A4L7I2: 1 Total no. of seqs: 200 Ratio: 0.01
Non-redundant seqs for Q9BYF1: 22 Total no. of seqs: 200 Ratio: 0.11
Non-redundant seqs for Q93KQ4: 10 Total no. of seqs: 73 Ratio: 0.14
Non-redundant seqs for P00742: 4 Total no. of seqs: 200 Ratio: 0.02
Non-redundant seqs for Q9H832: 9 Total no. of seqs: 200 Ratio: 0.04
Non-redundant seqs for Q9CXY6: 1 Total no. of seqs: 200 Ratio: 0.01
Non-redundant seqs for P01019: 7 Total no. of seqs: 200 Ratio: 0.04
Non-redundant seqs for O14727: 7 Total no. of seqs: 200 Ratio: 0.04
Non-redundant seqs for A8AZZ3: 8 Total no. of seqs: 106 Ratio: 0.08
Non-redundant seqs for J8TM36: 20 Total no. of seqs: 200 Ratio: 0.10
Non-redundant seqs for S7W634: 9 Total no. of seqs: 200 Ratio: 0.04
Non-redundant seqs for Q9BYI3: 1 Total no. of seqs: 200 Ratio: 0.01
Non-redundant seqs for O15922: 3 Total no. of

In [16]:
# Select non-redundant sequences for ClustalOmega MSAs
for file in os.listdir(clustal_path):
    if file.endswith('.fasta'):
        input_file=os.path.join(clustal_path, file)
        output_file=os.path.join(clustal_path_nr, file)
        protein_id = os.path.splitext(file)[0].split('_')[0]
        calc_red = calculate_red(input_file, output_file, threshold, word_size, protein_id)

Non-redundant seqs for P01097: 6 Total no. of seqs: 195 Ratio: 0.03
Non-redundant seqs for O00308: 1 Total no. of seqs: 191 Ratio: 0.01
Non-redundant seqs for P00392: 1 Total no. of seqs: 185 Ratio: 0.01
Non-redundant seqs for O43791: 1 Total no. of seqs: 193 Ratio: 0.01
Non-redundant seqs for O00585: 23 Total no. of seqs: 191 Ratio: 0.12
Non-redundant seqs for A1L1Q4: 16 Total no. of seqs: 191 Ratio: 0.08
Non-redundant seqs for Q9H0E2: 6 Total no. of seqs: 195 Ratio: 0.03
Non-redundant seqs for S6B291: 3 Total no. of seqs: 193 Ratio: 0.02
Non-redundant seqs for A4L7I2: 1 Total no. of seqs: 200 Ratio: 0.01
Non-redundant seqs for Q8IW19: 1 Total no. of seqs: 190 Ratio: 0.01
Non-redundant seqs for Q9Z2F5: 1 Total no. of seqs: 192 Ratio: 0.01
Non-redundant seqs for S7W634: 2 Total no. of seqs: 196 Ratio: 0.01
Non-redundant seqs for Q8K4J6: 1 Total no. of seqs: 194 Ratio: 0.01
Non-redundant seqs for P04370-5: 1 Total no. of seqs: 192 Ratio: 0.01
Non-redundant seqs for O43474: 1 Total no. o

### 1.5 Defining disordered regions in MSA

Here we will look at the positions of disordered regions in an alignment. Then we'll extract these regions for the separate analysis.

### 1.5.1 Define disordered regions for the initial MSAs

In [17]:
# Set the output path for the disordered regions for BLAST and ClustalOmega MSAs
out_dir_blast = '{}/results/alignments/output_files/disordered/blast'.format(directory)
out_dir_clustal = '{}/results/alignments/output_files/disordered/clustal'.format(directory)

In [18]:
# Iterate through files in the input folder - BLAST
for file in os.listdir(blast_path):
    if file.endswith('.fasta'):
        input_file = os.path.join(blast_path, file)
        protein_id = os.path.splitext(file)[0].split('_')[0]

        # Extract relevant information based on the protein ID
        rows = curated_disprot_df[curated_disprot_df['acc'] == protein_id]

        for index, row in rows.iterrows():
            region = row['region']
            start_positions = [row['start']] 
            end_positions = [row['end']]

            # Output file name (only the region)
            output_file_name = '{}.fasta'.format(region)
            output_file_path = os.path.join(out_dir_blast, output_file_name)

            # Call the function for each row
            print('Applying function for region: {}.'.format(region))
            select_dis_regions(input_file, protein_id, start_positions, end_positions, region, output_file_path)
            
print('Done.')

Applying function for region: P07342_580-595.
Applying function for region: P07342_647-687.
Applying function for region: O14958_371-399.
Applying function for region: A4L7I2_1658-1856.
Applying function for region: Q9BYF1_769-805.
Applying function for region: Q93KQ4_51-81.
Applying function for region: P00742_90-126.
Applying function for region: Q9H832_1-99.
Applying function for region: Q9H832_327-354.
Applying function for region: Q9CXY6_29-44.
Applying function for region: Q9CXY6_347-390.
Applying function for region: P01019_44-64.
Applying function for region: P01019_436-450.
Applying function for region: O14727_1-104.
Applying function for region: A8AZZ3_24-44.
Applying function for region: A8AZZ3_116-134.
Applying function for region: A8AZZ3_146-195.
Applying function for region: J8TM36_236-249.
Applying function for region: S7W634_1-32.
Applying function for region: Q9BYI3_149-253.
Applying function for region: O15922_1-89.
Applying function for region: O15922_230-240.
Applyi

In [19]:
# Iterate through files in the input folder - ClustalOmega
for file in os.listdir(clustal_path):
    if file.endswith('.fasta'):
        input_file = os.path.join(clustal_path, file)
        protein_id = os.path.splitext(file)[0].split('_')[0]

        # Extract relevant information based on the protein ID
        rows = curated_disprot_df[curated_disprot_df['acc'] == protein_id]
        
        for index, row in rows.iterrows():
            region = row['region']
            start_positions = [row['start']] 
            end_positions = [row['end']]

            # Output file name (only the region)
            output_file_name = '{}.fasta'.format(region)
            output_file_path = os.path.join(out_dir_clustal, output_file_name)

            # Call the function for each row
            print('Applying function for region: {}.'.format(region))
            select_dis_regions(input_file, protein_id, start_positions, end_positions, region, output_file_path)
            
print('Done.')

Applying function for region: P01097_28-50.
Applying function for region: O00308_662-702.
Applying function for region: P00392_550-561.
Applying function for region: O43791_169-178.
Applying function for region: O43791_357-374.
Applying function for region: O00585_71-111.
Applying function for region: A1L1Q4_1-71.
Applying function for region: Q9H0E2_1-53.
Applying function for region: S6B291_240-465.
Applying function for region: A4L7I2_1658-1856.
Applying function for region: Q8IW19_399-420.
Applying function for region: Q8IW19_450-511.
Applying function for region: Q9Z2F5_306-430.
Applying function for region: S7W634_1-32.
Applying function for region: Q8K4J6_67-98.
Applying function for region: Q8K4J6_111-142.
Applying function for region: Q8K4J6_155-186.
Applying function for region: P04370-5_1-169.
Applying function for region: O43474_1-130.
Applying function for region: O14727_1-104.
Applying function for region: Q9BYF1_769-805.
Applying function for region: P01019_44-64.
Applyi

### 1.5.2 Define disordered regions for the non-redundant MSAs

In [20]:
# Set the output path for the disordered regions for BLAST and ClustalOmega MSAs
out_dir_blast_nr = '{}/results/alignments/output_files/disordered/blast/non-redundant'.format(directory)
out_dir_clustal_nr = '{}/results/alignments/output_files/disordered/clustal/non-redundant'.format(directory)

In [21]:
# Iterate through non-redundant MSAs in the input folder - BLAST
for file in os.listdir(blast_path_nr):
    if file.endswith('.fasta'):
        input_file = os.path.join(blast_path_nr, file)
        protein_id = os.path.splitext(file)[0].split('_')[0]

        # Extract relevant information based on the protein ID
        rows = curated_disprot_df[curated_disprot_df['acc'] == protein_id]
        
        for index, row in rows.iterrows():
            region = row['region']
            start_positions = [row['start']] 
            end_positions = [row['end']]

            # Output file name (only the region)
            output_file_name = '{}.fasta'.format(region)
            output_file_path = os.path.join(out_dir_blast_nr, output_file_name)

            # Call the function for each row
            print('Applying function for region: {}.'.format(region))
            select_dis_regions(input_file, protein_id, start_positions, end_positions, region, output_file_path)

print('Done.')

Applying function for region: P07342_580-595.
Applying function for region: P07342_647-687.
Applying function for region: O14958_371-399.
Applying function for region: A4L7I2_1658-1856.
Applying function for region: Q9BYF1_769-805.
Applying function for region: Q93KQ4_51-81.
Applying function for region: P00742_90-126.
Applying function for region: Q9H832_1-99.
Applying function for region: Q9H832_327-354.
Applying function for region: Q9CXY6_29-44.
Applying function for region: Q9CXY6_347-390.
Applying function for region: P01019_44-64.
Applying function for region: P01019_436-450.
Applying function for region: O14727_1-104.
Applying function for region: A8AZZ3_24-44.
Applying function for region: A8AZZ3_116-134.
Applying function for region: A8AZZ3_146-195.
Applying function for region: J8TM36_236-249.
Applying function for region: S7W634_1-32.
Applying function for region: Q9BYI3_149-253.
Applying function for region: O15922_1-89.
Applying function for region: O15922_230-240.
Applyi

In [22]:
# Iterate through non-redundant MSAs in the input folder - ClustalOmega
for file in os.listdir(clustal_path_nr):
    if file.endswith('.fasta'):
        input_file = os.path.join(clustal_path_nr, file)
        protein_id = os.path.splitext(file)[0].split('_')[0]

        # Extract relevant information based on the protein ID
        rows = curated_disprot_df[curated_disprot_df['acc'] == protein_id]
        
        for index, row in rows.iterrows():
            region = row['region']
            start_positions = [row['start']] 
            end_positions = [row['end']]

            # Output file name (only the region)
            output_file_name = '{}.fasta'.format(region)
            output_file_path = os.path.join(out_dir_clustal_nr, output_file_name)

            # Call the function for each row
            print('Applying function for region: {}.'.format(region))
            select_dis_regions(input_file, protein_id, start_positions, end_positions, region, output_file_path)
            
print('Done.')

Applying function for region: P01097_28-50.
Applying function for region: O00308_662-702.
Applying function for region: P00392_550-561.
Applying function for region: O43791_169-178.
Applying function for region: O43791_357-374.
Applying function for region: O00585_71-111.
Applying function for region: A1L1Q4_1-71.
Applying function for region: Q9H0E2_1-53.
Applying function for region: S6B291_240-465.
Applying function for region: A4L7I2_1658-1856.
Applying function for region: Q8IW19_399-420.
Applying function for region: Q8IW19_450-511.
Applying function for region: Q9Z2F5_306-430.
Applying function for region: S7W634_1-32.
Applying function for region: Q8K4J6_67-98.
Applying function for region: Q8K4J6_111-142.
Applying function for region: Q8K4J6_155-186.
Applying function for region: P04370-5_1-169.
Applying function for region: O43474_1-130.
Applying function for region: O14727_1-104.
Applying function for region: Q9BYF1_769-805.
Applying function for region: P01019_44-64.
Applyi