In [None]:
import datetime

# Import matplotlib before seaborn
import matplotlib as mpl
import matplotlib.pyplot as plt

import itertools  # for color palette cycling
import re

import pandas as pd
import seaborn as sns

%load_ext autoreload
%autoreload 2
%matplotlib inline


In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys
sys.path.append('/work/general_scripts')

In [None]:
mpl.use('Agg')

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from matplotlib import rc
rc('text', usetex=True)

In [None]:
import exploration_utils
import plot_subplots # not actually used?
import rnaseq_plot_utils

In [None]:
! ls demos/

In [None]:
from matplotlib import rc
rc('text', usetex=True)  # do need this for any $$ formulas
rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']})
# http://stackoverflow.com/questions/2537868/sans-serif-math-with-latex-in-matplotlib
# The preamble is set as ONE newline-joined string rather than a list of
# strings: newer matplotlib releases only accept a string for this rcParam,
# and a comma-free string is also accepted by the older list-based validator.
mpl.rcParams['text.latex.preamble'] = '\n'.join([
#       r'\usepackage{siunitx}',   # i need upright \micro symbols, but you need...
#       r'\sisetup{detect-all}',   # ...this to force siunitx to actually use your fonts
       r'\usepackage{helvet}',    # set the normal font here
       r'\usepackage{sansmath}',  # load up the sansmath so that math -> helvet
       r'\sansmath'               # <- tricky! -- gotta actually tell tex to use!
])

In [None]:
#! ls ./map_to_contigs_longer_than_1500bp/*.tsv
! ls ./map_to_contigs_longer_than_1500bp/*.dat

In [None]:
! mkdir -p figures

In [None]:
unders = exploration_utils.load_underscore_stats()

In [None]:
unders.head(2)

In [None]:
# just a test of plotting ability ()
fix, ax = plt.subplots(1, 1, figsize=(3,2))
pd.Series([1,2, 3,4,4,4,1]).plot.hist(ax=ax)
# Note that everything defaults to TeX
ax.set_xlabel('A_bC_d_E_fG')

In [None]:
# just a test of plotting ability ()
fix, ax = plt.subplots(1, 1, figsize=(3,2))
pd.Series([1,2, 3,4,4,4,1]).plot.hist(ax=ax)
ax.set_xlabel(r'A_bC_d_E_fG') #'A_{bC}_{d}_{E}_{fG}')
ax.set_title('low' + r' $\mathregular{O_{2}}$' + ' replicate {}'.format(99))

In [None]:
meta_cols = ['week', 'oxygen', 'replicate']
underscore_cols = ['frac of RNA reads: __alignment_not_unique',
                   'frac of RNA reads: __ambiguous',
                   'frac of RNA reads: __no_feature',
                   'frac of RNA reads: __not_aligned',
                   'frac of RNA reads: __too_low_aQual']
frac_df = unders[meta_cols + underscore_cols] #.set_index(meta_cols)
frac_df.head()

In [None]:
datetime.datetime.now()

In [None]:
start = datetime.datetime.now()
counts_stats = exploration_utils.load_counts_w_processing()
print(datetime.datetime.now() - start)
counts_stats.head(2)

In [None]:
counts_stats.columns

In [None]:
counts_stats.head(2)

In [None]:
p = rnaseq_plot_utils.plot_underscore_bars(
        input_df=counts_stats, filename=None, 
        portrait=True)
#plt.tight_layout()

In [None]:
p.savefig('./figures/170424_fracs_mapped_unmapped_etc.pdf', 
          bbox_inches='tight')

In [None]:
p.savefig('./figures/170424_fracs_mapped_unmapped_etc.png', 
          bbox_inches='tight')

In [None]:
for tup, df in frac_df.groupby(['oxygen', 'replicate']):
    print(tup, df.shape)

In [None]:
unders['frac of RNA reads: __not_aligned'].max()

In [None]:
unders.head(2)

In [None]:
def under_cleaner(somelist):
    """
    Return a new list of the given names with underscores tidied away.

    For each name, every double underscore ('__') is deleted first, and
    then each single underscore that remains is turned into a space.
    """
    return [name.replace('__', '').replace('_', ' ') for name in somelist]

under_cleaner(unders.columns)

In [None]:
# Work on a copy of `unders` whose column names have been run through
# under_cleaner, so the original frame keeps its raw column names.
unders_cleaned = unders.copy()
unders_cleaned.columns = under_cleaner(unders.columns)
# Columns of `unders` whose name contains a double underscore, and the
# cleaned form of those names (which is how they appear in unders_cleaned).
cnames = [c for c in unders if '__' in c]
cnames_cleaned = under_cleaner(cnames)
# One faceted plot per cleaned column, each saved as a PDF under ./figures/
# with a file name derived from the column name via filename_cleaner.
for c in cnames_cleaned:
    var = c
    p = exploration_utils.plot_faceted(unders_cleaned, var)
    desc = exploration_utils.filename_cleaner(var)
    fname = './figures/170222_{}.pdf'.format(desc)
    p.savefig(fname)
# Re-assert LaTeX text rendering once the loop is done.
# NOTE(review): presumably one of the plotting helpers changes this rc
# setting -- confirm, otherwise this line is redundant.
rc('text', usetex=True)

In [None]:
print(unders.memory_usage().sum())

# DELETE unders
del unders

In [None]:
dict(zip(['frac of RNA reads: __alignment_not_unique', 'frac of RNA reads: __ambiguous', 'frac of RNA reads: __no_feature', 'frac of RNA reads: __not_aligned', 'frac of RNA reads: __too_low_aQual'],
['#66c2a5', '#fc8d62', '#8da0cb', '#e78ac3', '#a6d854']))

In [None]:
merged_df = exploration_utils.load_counts_w_processing()

In [None]:
merged_df.head(3)

In [None]:
merged_df_cleaned = merged_df.copy()
merged_df_cleaned.columns = under_cleaner(merged_df_cleaned.columns)
cnames = [c for c in merged_df if '__' in c]
cnames_cleaned = under_cleaner(cnames)

In [None]:
def plot_underscores_by_series(dataframe):
    """
    Plot the "frac ... __*" columns against week, one panel per series.

    A 2 x 4 grid of axes with shared x and y axes is created: the top row
    holds oxygen == 'low', the bottom row oxygen == 'high', and the four
    columns hold replicates 1 to 4.  In every panel the
    'sum(frac RNA-seq mapped to genes)' column is drawn as a thick black
    line and each underscore column as a thinner coloured line.

    Column names are passed through under_cleaner before plotting, so the
    legend labels are the cleaned names.

    dataframe: frame with 'week', 'oxygen', 'replicate',
        'sum(frac RNA-seq mapped to genes)' and the columns whose names
        contain both 'frac' and '__'.  It is not modified.

    Returns the figure created by plt.subplots.

    Raises KeyError if an (oxygen, replicate) pair outside the 2 x 4 grid
    described above is present.
    """
    x = 'week'
    df_cleaned = dataframe.copy()
    df_cleaned.columns = under_cleaner(dataframe.columns)

    fig, axs = plt.subplots(2, 4, figsize=(15, 6), sharex=True, sharey=True)
    # (oxygen, replicate) -> axis: row 0 is 'low', row 1 is 'high';
    # replicate r lives in column r - 1.
    axd = {(oxygen, replicate): axs[row, replicate - 1]
           for row, oxygen in enumerate(['low', 'high'])
           for replicate in range(1, 5)}

    colors = ['#66c2a5', '#fc8d62', '#8da0cb', '#e78ac3', '#a6d854']
    series = [c for c in dataframe if ('frac' in c) and ('__' in c)]
    series_cleaned = under_cleaner(series)
    # Cycle the palette so that a frame with more than five matching columns
    # reuses colours instead of failing with a KeyError on the sixth one.
    colord = dict(zip(series_cleaned, itertools.cycle(colors)))

    for tup, df in df_cleaned.groupby(['oxygen', 'replicate']):
        ax = axd[tup]
        ax.set_title('{} O2, rep {}'.format(tup[0], tup[1]))
        # Sort a fresh frame so the lines are drawn in week order without
        # touching the group handed out by groupby.
        df = df.sort_values('week', ascending=False)
        ax.plot(df[x], df['sum(frac RNA-seq mapped to genes)'],
                color='black', linewidth=4)
        for s in series_cleaned:
            ax.plot(df[x], df[s], color=colord[s], label=s)
        ax.set_xlabel(x)

    # A single legend, attached to the top-right panel and pushed outside
    # the grid.
    axs[0, 3].legend(bbox_to_anchor=(2.5, 1.))
    return fig

p = plot_underscores_by_series(merged_df)
p.savefig('./figures/170222_underscore_series_together.pdf', bbox_inches='tight')

In [None]:
fig, ax = plt.subplots(1,1, figsize=(4,2.5))
merged_df['check sum'].hist(bins=20, ax=ax)
ax.set_xlabel('sum of reads in .bam file\n(sum $>$ 1 when reads map to multiple locations)')
ax.set_ylabel('number of samples')
ax.set_title('Recovery of .fastq reads in .bam,\nas reported by Samtools')
fig.savefig('./figures/170223_fastq_read_recovery_by_samtools.pdf')

In [None]:
colnames = [c for c in merged_df.columns if not c.startswith('__')]
colnames = [c for c in colnames if 
            ('cryptic' not in c) and ('LakWas' not in c) and ('sample' not in c)]
merged_df[merged_df['check sum'] > 1.1][colnames]

In [None]:
counts =  exploration_utils.load_counts() 

In [None]:
# Release some memory before plotting.
colnames_to_drop = ['LakWas type name', 'sample number',
                    'cryptic metagenome name', 'cryptic metatranscriptome name',
                    'fastq'
                   ]
for c in colnames_to_drop:
    try:
        del counts[c]
    except KeyError:
        # The column is not there (e.g. this cell was already run once);
        # report which one and carry on with the rest.
        print("deleting {} didn't work".format(c))

In [None]:
counts.head(3)

In [None]:
sns.choose_colorbrewer_palette(data_type='qualitative')

In [None]:
exploration_utils.filename_cleaner('Particulate methane monooxygenase alpha subunit precursor')

In [None]:
# just a test of plotting ability ()
fix, ax = plt.subplots(1, 1, figsize=(3,2))
pd.Series([1,2, 3,4,4,4,1]).plot.hist(ax=ax)
# Raw string so the TeX backslashes (\mbox, \_) reach matplotlib untouched
# instead of relying on Python leaving unknown escape sequences alone.
ax.set_xlabel(r'\mbox{time\_ABC} ($O_2$)')

In [None]:
# Demo of what I will run in a loop below: 
p = exploration_utils.plot_abundance_of_genes_with_same_names(
    'Particulate methane monooxygenase alpha subunit precursor',
    counts, portrait=True)
p.savefig('./figures/170223_pmmo_copy_expression.pdf', 
          bbox_inches='tight')

In [None]:
counts.head()

In [None]:
import sys
sys.prefix

In [None]:
import resource
print('Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

In [None]:
# Don't run the function; it takes more memory. 
# frac_sums = exploration_utils.load_frac_sums()
frac_sums = counts.groupby('sample id')['frac RNA-seq reads'].sum()
frac_sums = pd.merge(exploration_utils.get_sample_info(), 
                            frac_sums.to_frame().reset_index())

In [None]:
frac_sums.head(2)

In [None]:
fig, ax = plt.subplots(figsize=(4, 2.5))
frac_sums['frac RNA-seq reads'].hist(bins=20, ax = ax)
ax.set_xlabel('fraction of reads mapped to genes')
ax.set_ylabel('frequency')
fig.savefig('./figures/170222_sum_of_frac_reads_mapped_to_genes.pdf', 
            bbox_inches='tight')

In [None]:
def plot_frac_rna_mapped_to_genes():
    """
    Faceted plot of the 'frac RNA-seq reads' column of the notebook-level
    `frac_sums` frame, drawn with exploration_utils.plot_faceted.

    Returns whatever plot_faceted returns (the caller saves it with
    .savefig).
    """
    return exploration_utils.plot_faceted(frac_sums, 'frac RNA-seq reads')

p = plot_frac_rna_mapped_to_genes()
# Saved under ./figures/ like every other figure in this notebook, rather
# than into the current working directory.
p.savefig('./figures/170222_frac_RNA_reads_mapped_to_genes.pdf',
          bbox_inches='tight')

In [None]:
counts_nonzero = counts[counts['RNA reads'] > 0]
counts_nonzero.shape

In [None]:
gene_read_totals = counts_nonzero.groupby(
    'product')['RNA reads'].sum().sort_values(ascending=False)
gene_read_totals.head(30)

In [None]:
counts.head(2)

In [None]:
counts.shape

In [None]:
gene_counts = counts[['locus', 'product']].drop_duplicates()
gene_counts.shape
gene_counts = gene_counts.groupby('product').count()
gene_counts = gene_counts.reset_index().rename(columns={'locus':'# gene copies'})
gene_counts.head()

In [None]:
gene_counts.head()

In [None]:
grt = gene_read_totals.to_frame().reset_index().rename(
        columns={'RNA reads': 'sum(RNA reads), all samples'})
grt.head(3)

In [None]:
gene_product_summary = pd.merge(grt, gene_counts)
print(gene_product_summary.shape)
gene_product_summary.sort_values('sum(RNA reads), all samples', 
                      ascending=False, inplace=True)
gene_product_summary.head(30)

In [None]:
gene_product_summary.sort_values('# gene copies', 
                                 ascending=False).head(10)

In [None]:
print("{:,d}".format(12345678))

In [None]:
gene_product_summary.head(30)

In [None]:
def commas(d):
    """Render the integer `d` with comma thousands separators (1234567 -> '1,234,567')."""
    return format(d, ',d')

def no_format(d):
    """Identity formatter: hand `d` back exactly as received."""
    return d

In [None]:
pd.set_option('max_colwidth', 999)

In [None]:
import textwrap

In [None]:
tex_filepath = "./map_to_contigs_longer_than_1500bp/top_genes.tex"
with open(tex_filepath, "w") as f:
    f.write(gene_product_summary.head(300).to_latex(
        index=False, longtable=True, 
        formatters=[no_format, commas, commas]))

In [None]:
#  gene_read_totals.to_csv('./map_to_contigs_longer_than_1500bp/gene_read_totals.tsv', sep='\t')
gene_product_summary.to_csv('./map_to_contigs_longer_than_1500bp/gene_product_summary.tsv', sep='\t')

In [None]:
gene_read_totals.to_csv('./map_to_contigs_longer_than_1500bp/gene_read_totals.tsv', sep='\t')

In [None]:
! realpath ./map_to_contigs_longer_than_1500bp

In [None]:
#sample_info.head()

In [None]:
exploration_utils.prep_gene_cts('Particulate methane monooxygenase alpha subunit precursor', counts).head()

In [None]:
def dict_of_sample_names(
        sample_info_path='/work/m4b_binning/assembly/data/sample_info/sample_info_w_cryptic.tsv'):
    """
    Dict that converts from 8888.8.111111.GGAAGG type numbers to
    high_O2_rep_3_week_2 type strings for PhD thesis.

    sample_info_path: tab-separated sample-info table containing the
        columns 'oxygen', 'replicate', 'week' and
        'cryptic metatranscriptome name'.  Defaults to the project's
        sample_info_w_cryptic.tsv.

    Returns a dict mapping each 'cryptic metatranscriptome name' to
    '<oxygen>_O2_rep_<replicate>_week_<week>'.
    """
    si = pd.read_csv(sample_info_path, sep='\t')
    # replicate and week are cast to str so they can be concatenated with
    # the oxygen label.
    names = (si['oxygen'] + '_O2_rep_' + si['replicate'].astype(str)
             + '_week_' + si['week'].astype(str))
    return dict(zip(si['cryptic metatranscriptome name'].tolist(),
                    names.tolist()))

dict_of_sample_names()

In [None]:
! mkdir -p ./figures/gene_reads

In [None]:
exploration_utils.filename_cleaner('170222_read_counts_Ammonia monooxygenase/methane monooxygenase%2C subunit C')

In [None]:
! ls ./figures/gene_reads/ | head

In [None]:
exploration_utils.plot_read_counts_by_product(
    'Particulate methane monooxygenase alpha subunit precursor',
    counts)

In [None]:
exploration_utils.plot_read_fracs_by_product(
    'Particulate methane monooxygenase alpha subunit precursor',
    counts)

In [None]:
counts_nonzero.head(2)

In [None]:
counts.shape

In [None]:
print(counts.columns)
counts.head(2)

In [None]:
list(range(0, 20))

In [None]:
from multiprocessing import Pool
from functools import partial
from itertools import repeat

In [None]:
args = list(zip(repeat('yello'), range(0, 20)))
print(args)

In [None]:
def print_base_s(base, s):
    """
    Print `base` immediately followed by `s` on one line.

    Both values are rendered with str.format, so `s` may be a non-string
    (such as the integers produced by range) as well as a string.
    """
    print('{}{}'.format(base, s))

# `args` is an EMPTY iterable here (zip over an empty string), so the
# starmap below makes no calls and returns an empty list.
args = zip('')
# Bind `base` positionally.  Binding it by keyword would collide with the
# first positional argument starmap supplies ("multiple values for 'base'").
plot_fun = partial(print_base_s, 'hello ')
# The context manager shuts the ten workers down even if starmap raises.
with Pool(processes=10) as pool:
    results = pool.starmap(plot_fun, args)
results


In [None]:
num_plots = 5 #15 #50
gene_names = counts_nonzero.groupby('product')['RNA reads'].sum().sort_values(
    ascending=False).index.tolist()[0:num_plots]

# DIDN'T WORK
#plot_fun = partial(
#    exploration_utils.plot_read_counts_by_product, 
#    sample_info=counts)
#pool.starmap(exploration_utils.plot_read_counts_by_product, 
#             list(zip(gene_names, repeat(counts))))

# plot_fun must exist BEFORE the pool is created: with the fork start method
# the workers are a snapshot of the kernel taken when Pool(...) is called, so
# a function defined afterwards is missing in the workers (or is a stale
# earlier definition of the same name).
def plot_fun(gene_name):
    """Plot read counts for one gene product using the notebook-level `counts` frame."""
    return exploration_utils.plot_read_counts_by_product(gene_name, counts)

# The context manager shuts the workers down even if one of the plots raises.
with Pool(processes=3) as pool:
    pool.map(plot_fun, gene_names)  # doesn't print plots to notebook  :( 

#for g in gene_names:
#    p = exploration_utils.plot_read_counts_by_product(g, counts)

In [None]:
counts_nonzero.groupby('product')['frac RNA-seq reads'].sum().sort_values(
    ascending=False).head(5)

In [None]:
! pwd

In [None]:
def plot_fun_colors(gene_name, colors):
    """
    Draw the same-name gene abundance plot for `gene_name` twice from the
    notebook-level `counts` frame: first with portrait=True, then with
    portrait=False.  `colors` is forwarded as `top_colors` both times.
    """
    for is_portrait in (True, False):
        exploration_utils.plot_abundance_of_genes_with_same_names(
            gene_name=gene_name, dataframe=counts, portrait=is_portrait,
            top_colors=colors)
    
colors = [
    '#ff8101', # orange
    '#e41a1c', # red
    '#a8572c', # brown
    '#4eae4b', # green
    '#994fa1', # purple
    '#377eb8', # blue
    #'#fdfc33', # hard-to-see yellow
    '#d1d027', # dark yellow
    '#f482be',  # pink
    ]
    
plot_fun_colors('Particulate methane monooxygenase alpha subunit precursor',
        colors=colors)

In [None]:
num_plots = 15 #50
gene_names = counts_nonzero.groupby('product')['frac RNA-seq reads'].sum().sort_values(
    ascending=False).index.tolist()[0:num_plots]
print(gene_names)

In [None]:
gene_names.remove('hypothetical protein') # too many copies
print(gene_names)

In [None]:
gene_names.index('Ammonia monooxygenase/methane monooxygenase%2C subunit C')

In [None]:
print(len(gene_names))

In [None]:
counts.head(2)

In [None]:
# plot_fun must exist BEFORE the pool is created: with the fork start method
# the workers are a snapshot of the kernel taken when Pool(...) is called, so
# defining it afterwards leaves the workers running whatever `plot_fun` an
# earlier cell happened to define.
def plot_fun(gene_name):
    """
    Plot the abundance of all genes sharing the product name `gene_name`,
    numbering the figure by the gene's 1-based position in `gene_names`.
    """
    fignum = gene_names.index(gene_name) + 1
    # NOTE(review): fignum goes in as the third positional argument; other
    # calls in this notebook pass gene_name=, dataframe=, portrait= and
    # top_colors= by keyword -- confirm the third parameter is the figure
    # number.
    return exploration_utils.plot_abundance_of_genes_with_same_names(
        gene_name, counts, fignum)

# The context manager shuts the workers down even if one of the plots raises.
with Pool(processes=4) as pool:
    pool.map(plot_fun, gene_names)  # doesn't print plots to notebook  :( 

#for g in gene_names:
#    p = exploration_utils.plot_read_fracs_by_product(g, counts, fignum=fignum)
#    if fignum > 1:  # can't do the hypothetical protein with 413685 copies. 
#        # TODO: check that there aren't too many genes.  
#        exploration_utils.plot_abundance_of_genes_with_same_names(g, counts, fignum=fignum)
#    fignum += 1

In [None]:
# pick out some more by hand.
gene_names = [
    'Capsid protein (F protein)', 
    'Microvirus H protein (pilot protein)',
    'Bacteriophage replication gene A protein (GPA)',
    'S-layer protein',
    # The second 'Bacteriophage replication gene A protein (GPA)' entry was
    # dropped: it made two workers draw the same plot at the same time.
    'Phage Tail Collar Domain protein', # REMOVE
    ]

# plot_fun must exist BEFORE the pool is created: with the fork start method
# the workers are a snapshot of the kernel taken when Pool(...) is called, so
# defining it afterwards leaves the workers running whatever `plot_fun` an
# earlier cell happened to define.
def plot_fun(gene_name):
    """
    Draw the same-name gene abundance plot for `gene_name` from the
    notebook-level `counts` frame, once with portrait=True and once with
    portrait=False.
    """
    exploration_utils.plot_abundance_of_genes_with_same_names(
        gene_name=gene_name, dataframe=counts, portrait=True)
    exploration_utils.plot_abundance_of_genes_with_same_names(
        gene_name=gene_name, dataframe=counts, portrait=False)

# The context manager shuts the workers down even if one of the plots raises.
with Pool(processes=4) as pool:
    pool.map(plot_fun, gene_names)  # doesn't print plots to notebook  :( 

#for g in gene_names:
#    p = exploration_utils.plot_abundance_of_genes_with_same_names(
#        g, counts, portrait=True)
#    p = exploration_utils.plot_abundance_of_genes_with_same_names(
#        g, counts, portrait=False)

In [None]:
sns.color_palette("Set1", 8).as_hex()

In [None]:
# pick out some more by hand.

gene_names = [ 
    #'Particulate methane monooxygenase alpha subunit precursor',
    'Ammonia monooxygenase/methane monooxygenase%2C subunit C', 
    'Particulate methane monooxygenase beta subunit',
    'Methanol dehydrogenase [cytochrome c] subunit 2 precursor',
    'Methanol dehydrogenase [cytochrome c] subunit 1 precursor',
    'Methane monooxygenase component C']
# Uses the `plot_fun` defined in an earlier cell.  The context manager shuts
# the workers down even if one of the plots raises, which a trailing
# pool.close() would not.
with Pool(processes=4) as pool:
    pool.map(plot_fun, gene_names)  # doesn't print plots to notebook  :( 



#for g in gene_names:
#    p = exploration_utils.plot_abundance_of_genes_with_same_names(
#        g, counts, portrait=True)
#    p = exploration_utils.plot_abundance_of_genes_with_same_names(
#        g, counts, portrait=False)

In [None]:
gene_names = [
    'Methanol dehydrogenase [cytochrome c] subunit 1',
    '3-hexulose-6-phosphate synthase',
    'Transketolase 1',
    'Formaldehyde-activating enzyme',
    '3-hexulose-6-phosphate isomerase',
    
    # other interesting looking stuff
    'Outer membrane porin F precursor',
    # The next two names had line-wrap hyphenation pasted into them
    # ("pro- teins", "syn- thase"); restored to the unbroken words.
    # NOTE(review): confirm against the 'product' column of `counts`.
    'Bacterial extracellular solute-binding proteins%2C family 3',
    'Phenolphthiocerol synthesis polyketide synthase type I Pks15/1',   
    ]

# Uses the `plot_fun` defined in an earlier cell.  The context manager shuts
# the workers down even if one of the plots raises, which a trailing
# pool.close() would not.
with Pool(processes=4) as pool:
    pool.map(plot_fun, gene_names)  # doesn't print plots to notebook  :( 

## Look for Xox

In [None]:
# Xox is annotated as methanol dehydrogenase, so we will have to tease the two apart.

In [None]:
gene_counts.head()

In [None]:
gene_counts[gene_counts['product'].str.contains('Xox')]

In [None]:
# Case-insensitive match: the product names used in this notebook are
# capitalised ('Methanol dehydrogenase ...'), which a case-sensitive search
# for 'methanol' would miss.
gene_counts[gene_counts['product'].str.contains('methanol', case=False)]