# Statistics

This notebook goes through an MGnify release folder. Firstly, it analyzes the release folder structure to get the number of clusters discarded due to BIAS, to UniProt matches or to MGnify matches, along with other useful statistics.

Afterwards, it checks the overall distributions of SEED multiple sequence alignment statistics, such as dimensions, occupancy score, conservation score and prettiness score. Moreover, it compares pre-trimming and post-trimming SEED multiple sequence alignments by plotting their score distributions side by side.

In [2]:
# Print the current working directory (IPython shell escape).
# NOTE(review): looks like a debugging leftover; its saved output exposes an absolute local path
!pwd

/home/damiclem/Scrivania/UniPD/Traineeship/MGnifam/notebook


In [4]:
# Dependencies
import matplotlib.pyplot as plt
from glob import glob, iglob
from os import path
import numpy as np
import sys
import os

%matplotlib inline

In [7]:
# Add custom dependencies to path
# NOTE(review): this appends the grandparent of the current working directory
# (dirname(cwd) + '/..'); presumably that is where the `src` package lives — confirm
sys.path.append(path.dirname(os.getcwd()) + '/..')

# Custom dependencies
from src.msa import MSA
from src.msa import consensus, occupancy, conservation, prettiness

In [8]:
# Constants
# Root of the release folder, relative to the notebook's working directory
RELEASE_PATH = '../'
# Sub-paths are built with path.join: RELEASE_PATH already ends with a separator,
# so plain string concatenation would yield doubled separators (e.g. '..//plots')
PLOTS_PATH = path.join(RELEASE_PATH, 'plots')
# Glob pattern matching every batch folder of the release
BATCH_PATH = path.join(RELEASE_PATH, 'batch*')
BUILD_PATH = path.join(RELEASE_PATH, 'build')

## Batch and build statistics

In [9]:
# Get number of passed clusters, discarded due to bias, to UniProt or to MGnify
def get_stats(batch_path):
    """Count the `MGYP*` entries of a batch folder, split by category.

    Args:
        batch_path: path of the folder to inspect.

    Returns:
        A 4-tuple of integers `(passed, bias, uniprot, mgnify)`: the number
        of `MGYP*` entries found directly inside `batch_path` and inside its
        `BIAS`, `UniProt` and `MGnify` sub-folders, in this order.
    """
    # Sub-folder of each category, relative to the batch folder; the empty
    # tuple stands for the batch folder itself (i.e. passed clusters)
    categories = ((), ('BIAS',), ('UniProt',), ('MGnify',))

    counts = []
    for parts in categories:
        # Glob pattern matching every cluster entry of the current category
        pattern = path.join(batch_path, *parts, 'MGYP*')
        # Consume the lazy iterator, keeping only the number of matches
        counts.append(len(list(iglob(pattern))))

    # Return statistics
    return tuple(counts)

In [None]:
# Collect batch folders into a sorted list rather than a one-shot iterator:
# an `iglob` generator is exhausted after its first loop, so re-running a
# cell that iterates over it would silently see no batches at all
batch_iter = sorted(glob(BATCH_PATH))

In [None]:
# Plot BATCHES statistics

# Set plot parameters
bins = 10
density = False

# Initialize statistics: absolute counts and fractions of the batch total
num_passed, num_bias, num_uniprot = [], [], []
prc_passed, prc_bias, prc_uniprot = [], [], []
# Loop through each batch
for batch_path in batch_iter:
    # Get statistics: (passed, BIAS, UniProt, MGnify) counts
    batch_stats = get_stats(batch_path)
    # Define total number of clusters in current batch
    num_clusters = sum(batch_stats)
    # Skip empty batches: fractions are undefined there (division by zero)
    if num_clusters == 0:
        continue
    # Save absolute counts
    num_passed.append(batch_stats[0])
    num_bias.append(batch_stats[1])
    num_uniprot.append(batch_stats[2])
    # Save fractions over the total number of clusters in the batch
    prc_passed.append(batch_stats[0] / num_clusters)
    prc_bias.append(batch_stats[1] / num_clusters)
    prc_uniprot.append(batch_stats[2] / num_clusters)

# Make plot: top row shows absolute counts, bottom row shows fractions.
# Axes are shared within a row only, since counts and fractions live on
# very different scales
fig, axs = plt.subplots(2, 3, figsize=(30, 20), sharex='row', sharey='row')
axs[0, 0].set_title('Passed clusters')
axs[0, 1].set_title('BIAS clusters')
axs[0, 2].set_title('UniProt clusters')
# NOTE: the keyword is `density`; a misspelled keyword makes `hist` raise
axs[0, 0].hist(num_passed, bins=bins, density=density)
axs[0, 1].hist(num_bias, bins=bins, density=density)
axs[0, 2].hist(num_uniprot, bins=bins, density=density)
axs[1, 0].hist(prc_passed, bins=bins, density=density)
axs[1, 1].hist(prc_bias, bins=bins, density=density)
axs[1, 2].hist(prc_uniprot, bins=bins, density=density)
# Label what is being measured on each row
for ax in axs[0]:
    ax.set_xlabel('Number of clusters')
for ax in axs[1]:
    ax.set_xlabel('Fraction of clusters')
plt.show()

In [None]:
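# # Plot BUILD statistics
# NOTE(review): this whole cell is disabled draft code and would not run as written
# (e.g. `fig, ax` is assigned but `axs` is used, `bar()` is called with histogram
# arguments or with no arguments at all) — fix before re-enabling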

# # Define build path
# build_path = BUILD_PATH

# # Initialize statistics
# num_passed, num_bias, num_uniprot, num_mgnify = 0.0, 0.0, 0.0, 0.0
# prc_passed, prc_bias, prc_uniprot, num_mgnify = 0.0, 0.0, 0.0, 0.0
# # Get statistics
# build_stats = get_stats(build_path)
# # Compute total number of clusters
# num_clusters = sum(list(build_stats))
# # Define total number of clusters in current batch
# num_passed, num_bias, num_uniprot, num_mgnify = build_stats
# # Define percentages
# prc_passed = num_passed / num_clusters
# prc_bias = num_bias / num_clusters
# prc_uniprot = num_uniprot / num_clusters
# prc_mgnify = num_mgnify / num_clusters
# # Make plot
# fig, ax = plt.subplots(2, 4, figsize=(40, 20), sharex=True, sharey=True)
# axs[0, 0].set_title('Passed clusters')
# axs[0, 1].set_title('BIAS clusters')
# axs[0, 2].set_title('UniProt clusters')
# axs[0, 3].set_title('MGnify clusters')
# axs[0, 0].bar(num_passed, bins=bins, denisty=density)
# axs[0, 1].bar(num_bias, bins=bins, denisty=density)
# axs[0, 2].bar(num_uniprot, bins=bins, denisty=density)
# axs[0, 3].bar()
# axs[1, 0].bar(prc_passed, bins=bins, denisty=density)
# axs[1, 1].bar(prc_bias, bins=bins, denisty=density)
# axs[1, 2].bar(prc_uniprot, bins=bins, denisty=density)
# axs[1, 3].bar([])
# plt.show()

## MSA statistics

In [10]:
# Get occupancy value and conservation bit-score
def get_msa_stats(msa_path):
    """Compute occupancy, conservation and prettiness of one alignment file.

    Args:
        msa_path: path of the alignment file, handed to `MSA.from_aln`.

    Returns:
        A 3-tuple `(occ, csv, prt)` holding the results of `occupancy`,
        `conservation` and `prettiness`, in this order.
    """
    # Parse the alignment file
    msa = MSA.from_aln(msa_path)
    # The alignment shape is needed later on by the prettiness score
    num_rows, num_cols = msa.aln.shape

    # Both occupancy and conservation are derived from the consensus
    cons = consensus(msa.aln)
    occ_scores = occupancy(cons)
    csv_scores = conservation(cons)
    # Prettiness combines conservation with the alignment dimensions
    prt_score = prettiness(csv_scores, num_rows, num_cols)

    # Return scores
    return occ_scores, csv_scores, prt_score

In [None]:
# Get MSA statistics

# Initialize statistics dictionaries with explicit keys, so that the plotting
# code below still works (on empty lists) when no cluster is found
stats_raw = {'occ': [], 'csv': [], 'prt': []}
stats_trim = {'occ': [], 'csv': [], 'prt': []}

# Define iterator through every passed cluster in every batch
clusters_iter = iglob(BATCH_PATH + '/MGYP*')

# Loop through every passed cluster
for cluster_path in clusters_iter:
    # Define current cluster's raw SEED alignment path
    seed_raw_path = path.join(cluster_path, 'SEED_raw')
    # Define current cluster's trimmed SEED alignment path
    seed_trim_path = path.join(cluster_path, 'SEED')

    # Pair each alignment with the dictionary collecting its statistics
    for seed_path, stats in ((seed_raw_path, stats_raw), (seed_trim_path, stats_trim)):
        # get_msa_stats returns exactly three values:
        # occupancy, conservation and prettiness
        occ, csv, prt = get_msa_stats(seed_path)
        stats['occ'].extend(occ)
        stats['csv'].extend(csv)
        stats['prt'].append(prt)

# Make plot: one row per trimming stage, one column per statistic.
# The x axis is shared within a column only, so that pre- and post-trimming
# distributions are comparable while different statistics keep their own scale
fig, axs = plt.subplots(2, 3, sharex='col', sharey=False)
# Set horizontal titles
axs[0, 0].set_title('Prettiness score')
axs[0, 1].set_title('Occupancy distribution')
axs[0, 2].set_title('Conservation distribution')
# Set vertical labels
axs[0, 0].set_ylabel('Pre-trimming scores')
axs[1, 0].set_ylabel('Post-trimming scores')

# Statistic shown in each column, along with its number of histogram bins
columns = (('prt', 30), ('occ', 100), ('csv', 100))
# Row 0 holds pre-trimming statistics, row 1 holds post-trimming ones
for i, stats in enumerate((stats_raw, stats_trim)):
    for j, (key, num_bins) in enumerate(columns):
        values = stats[key]
        # Make histogram
        axs[i, j].hist(values, bins=num_bins, density=False)
        # Add mean line of the very same values shown in the histogram
        # (skipped when there is nothing to average)
        if len(values) > 0:
            axs[i, j].axvline(np.mean(values), color='r')
# Show plot
plt.show()