In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.ndimage as ndimage
from Bio import SeqIO

In [2]:
def get_segments(aa_seq, label_seq, segment_type, accession, description):
    slices = ndimage.find_objects(ndimage.label(label_seq)[0])
    ds = []
    for s in slices:
        segment = aa_seq[s[0]]  # Unpack 1-element slice tuple
        d = {'accession': accession, 'description': description, 'segment_type': segment_type, 'len': len(segment)}
        aa_counts = count_amino_acids(segment)
        d.update(aa_counts)
        ds.append(d)
    return ds


def count_amino_acids(aa_seq):
    aa_codes = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H',
                'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 
                'Y', 'V', 'O', 'U', 'B', 'Z', 'X', 'J']
    d = {aa: 0 for aa in aa_codes}
    for aa in aa_seq:
        d[aa] += 1
    return d

In [3]:
fasta_seq = SeqIO.parse('../generate_fastas/out/allseq.fasta', 'fasta')
fasta_disorder = SeqIO.parse('../generate_fastas/out/alldisorder.fasta', 'fasta')

In [4]:
protein_seq_dict = {}
for protein in fasta_seq:
    protein_seq_dict[protein.id.split("|")[0]] = str(protein.seq)

In [85]:
rows = []
for protein in fasta_disorder:
    dis_labels = [s == '1' for s in protein.seq]
    ord_labels = [s == '0' for s in protein.seq]

    accession = protein.id.split("|")[0]
    description = protein.description.split("|")[-1]
    aa_seq = protein_seq_dict[accession]
    
    # Disordered regions have the code 'D' and ordered regions have the code 'O'
    # The entire protein is added with the code 'P'
    ds_dis = get_segments(aa_seq, dis_labels, 'D', accession, description)
    ds_ord = get_segments(aa_seq, ord_labels, 'O', accession, description)
    ds_all = get_segments(aa_seq, [True for _ in range(len(aa_seq))], 'P', accession, description)

    # Add ds to rows
    rows.extend(ds_dis)
    rows.extend(ds_ord)
    rows.extend(ds_all)
df1 = pd.DataFrame(rows)
print(df1)

Empty DataFrame
Columns: []
Index: []


# 1 Length Distributions
## Length Distribution of Protein 

In [84]:
protein = df1[df1['segment_type'] == 'P']
plt.hist(protein['len'], bins=50)
plt.yscale('log')
plt.ylabel('Number of entries')
plt.xlabel('Number of Amino Acids')
plt.title('Length of Proteins')

KeyError: 'segment_type'

### Outliers

In [None]:
print(protein[protein['len'] > 15000])

## Length Distribution of Proteins After Removing Outliers

In [None]:
protein_outrm =  protein[protein['len'] < 15000]
plt.hist(protein_outrm['len'], bins=50)
plt.yscale('log')
plt.ylabel('Number of entries')
plt.xlabel('Number of Amino Acids')
plt.title('Length of Proteins')

## Length Distribution of Disordered Regions

In [None]:
disorder = df1[df1['segment_type'] == 'D']
plt.hist(disorder['len'], bins=50)
plt.yscale('log')
plt.ylabel('Number of entries')
plt.xlabel('Number of Amino Acids')
plt.title('Length of Disordered Regions')

## Length Distribution of Ordered Regions

In [None]:
order = df1[df1['segment_type'] == 'O']
plt.hist(order['len'], bins=50)
plt.yscale('log')
plt.ylabel('Number of entries')
plt.xlabel('Number of Amino Acids')
plt.title('Length of Ordered Regions')

### Outliers 

In [None]:
order[order['len'] > 15000]

In [None]:
df1[df1['accession'] == 'Q8WZ42']

# 2 Fraction disordered distribution

In [None]:
disorder_lengths = disorder.groupby('accession')['len'].sum().rename('D_len')
df2 = protein[['accession', 'len']].merge(disorder_lengths, on='accession', how='left').fillna(0)
df2['D_frac'] = df2['D_len'] / df2['len']

plt.hist(df2['D_frac'], bins=50)
plt.yscale('log')
plt.ylabel('Number of entries')
plt.xlabel('Fraction disordered')
plt.title('Fraction Disordered Distribution')

# 3 Fraction ordered distribution

In [None]:
order_lengths = order.groupby('accession')['len'].sum().rename('O_len')
df2 = df2.merge(order_lengths, on='accession', how='left').fillna(0)
df2['O_frac'] = df2['O_len'] / df2['len']

plt.hist(df2['O_frac'], bins=50)
plt.yscale('log')
plt.ylabel('Number of entries')
plt.xlabel('Fraction ordered')
plt.title('Fraction Ordered Distribution')

# 4 Correlation of fraction disordered with number of disordered segments

## 2D Histogram

In [None]:
D_segnum = disorder.groupby('accession').size().rename('D_segnum')
df2 = df2.merge(D_segnum, on='accession', how='left').fillna(0)

plt.hist2d(df2['D_segnum'], df2['D_frac'], bins=25)
plt.ylabel('Fraction of Disordered Content')
plt.xlabel('Number of Disordered Regions')
plt.title('Number Disordered Regions vs. Disordered Content')

## Scatter plot

In [None]:
plt.scatter(df2['D_segnum'], df2['D_frac'], s=6, alpha=0.25)
plt.ylabel('Fraction of Disordered Content')
plt.xlabel('Number of Disordered Regions')
plt.title('Number Disordered Regions vs. Disordered Content')

### Outliers

In [None]:
df2[df2['D_segnum'] > 50]

In [None]:
df1[df1['accession'] == 'O94248']

## Scatter plot without Outliers 

In [None]:
df2_outrm = df2[df2['D_segnum'] < 50]
plt.scatter(df2_outrm['D_segnum'], df2_outrm['D_frac'], s=6, alpha=0.25)
plt.ylabel('Fraction of Disordered Content')
plt.xlabel('Number of Disordered Regions')
plt.title('Number Disordered Regions vs. Disordered Content')

## Correlation Coefficient

In [None]:
correlation = df2_outrm['D_segnum'].corr(df2_outrm['D_frac'])
correlation

# 5 Correlation of average length of disordered segments with number of disordered segments

## 2D Histogram 

In [None]:
df3 = disorder[['accession', 'len']].groupby('accession').mean().merge(D_segnum, on='accession')
plt.hist2d(df3['len'], df3['D_segnum'], bins=25)
plt.ylabel('Number of Disordered Regions ')
plt.xlabel('Average Length of Disordered Content')
plt.title('Average Length of Disordered Content vs. Number of Disordered Regions')

## Scatter plot

In [None]:
plt.scatter(df3['len'], df3['D_segnum'], s=6, alpha=0.25)
plt.ylabel('Number of Disordered Regions')
plt.xlabel('Average Length of Disordered Regions')
plt.title('Number of Disordered Regions vs. Average Length of Disordered Regions')

## Scatter plot without outliers

In [None]:
df3_outrm = df3[df3['D_segnum'] < 50]
plt.scatter(df3_outrm['len'], df3_outrm['D_segnum'], s=6, alpha=0.25)
plt.ylabel('Number of Disordered Regions')
plt.xlabel('Average Length of Disordered Regions')
plt.title('Number of Disordered Regions vs. Average Length of Disordered Regions')

# 6 Number disordered segments in proteins distribution

In [None]:
plt.hist(df2['D_segnum'], bins=50)
plt.yscale('log')
plt.ylabel('Number of entries')
plt.xlabel('Number of Disordered Segments')
plt.title('Number of Disordered Segments Distribution')

### Outliers

In [None]:
df2[df2['D_segnum'] > 60]

In [None]:
plt.hist(df2.loc[df2['D_segnum'] < 60, 'D_segnum'], bins=50)
plt.yscale('log')
plt.ylabel('Number of entries')
plt.xlabel('Number of Disordered Segments')
plt.title('Number of Disordered Segments Distribution')

# 7 Amino acid distributions and enrichment 

## Disordered Segments 

In [None]:
aa_codes = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H',
            'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 
            'Y', 'V', 'O', 'U', 'B', 'Z', 'X', 'J']

In [None]:
aa_counts_dis = df.loc[df['segment_type'] == 'D', aa_codes].sum()
aa_fracs_dis = aa_counts_dis /aa_counts_dis.sum()
plt.bar(aa_codes, aa_fracs_dis)
plt.ylabel('Fraction of Amino Acid Makeup')
plt.xlabel('Amino Acids')
plt.title('Disordered Region Amino Acid Distribution')

In [None]:
aa_counts = df.loc[df['segment_type'] == 'P', aa_codes].sum()
aa_fracs = aa_counts / aa_counts.sum()
plt.bar(aa_codes, aa_fracs)
plt.ylabel('Fraction of Amino Acid Makeup')
plt.xlabel('Amino Acids')
plt.title('Total Amino Acid Distribution')

In [None]:
aa_fracs_dif = aa_fracs_dis - aa_fracs
plt.bar(aa_codes, aa_fracs_dif)
plt.ylabel('Difference of Fraction of Amino Acid Makeup')
plt.xlabel('Amino Acids')
plt.title('Difference of Protein and Disordered Segments Amino Acid Distribution')

## Overlay Bar Chart Amino Acid Distributions over Proteins vs. Disordered Segments

In [None]:
def overlapped_bar(df, show=False, width=0.9, alpha=.5,
                   title='', xlabel='', ylabel='', **plot_kwargs):
    """
    Like a stacked bar chart except bars on top of each other with transparency
    
    source: https://stackoverflow.com/questions/23293011/how-to-plot-a-superimposed-bar-chart-using-matplotlib-in-python"
    """
    xlabel = xlabel or df.index.name
    N = len(df)
    M = len(df.columns)
    indices = np.arange(N)
    colors = ['steelblue', 'firebrick', 'darksage', 'goldenrod', 'gray'] * int(M / 5. + 1)
    for i, label, color in zip(range(M), df.columns, colors):
        kwargs = plot_kwargs
        kwargs.update({'color': color, 'label': label})
        plt.bar(indices, df[label], width=width, alpha=alpha if i else 1, **kwargs)
        plt.xticks(indices + 0.5 * width, ['{}'.format(idx) for idx in df.index.values])
    plt.legend()
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    if show:
        plt.show()
    return plt.gcf()

In [None]:
df_overlap_bar = pd.DataFrame(np.matrix([aa_fracs, aa_fracs_dis]).T, columns=['Protein', 'Disordered Regions'],
                              index=pd.Index(aa_codes))
overlapped_bar(df_overlap_bar, show=True, title='Amino Acid Distributions over Protein vs. Disordered Regions',
               xlabel='Amino Acid', ylabel='Fraction of Amino Acid Makeup')