In [None]:
# Import matplotlib before seaborn
import matplotlib as mpl
import matplotlib.pyplot as plt

import itertools  # for color palette cycling
import re

import pandas as pd
import seaborn as sns

%load_ext autoreload
%autoreload 2
%matplotlib inline


In [None]:
import sys
# NOTE(review): hard-coded absolute path. The local helper modules imported
# further down (exploration_utils, plot_subplots, rnaseq_plot_utils) are
# presumably found through this entry -- confirm before running elsewhere.
sys.path.append('/work/general_scripts')

In [None]:
# Select the non-interactive Agg backend.
# NOTE(review): pyplot is already imported and `%matplotlib inline` has already
# run by this point, so this call may override the inline backend (or be
# ignored, depending on the matplotlib version) -- confirm this is intended.
mpl.use('Agg')

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import exploration_utils
import plot_subplots # not actually used?
import rnaseq_plot_utils

In [None]:
! ls demos/

In [None]:
from matplotlib import rc

# Render all figure text through LaTeX, with Helvetica as the sans-serif face.
rc('text', usetex=True)
rc('font', **{'family': 'sans-serif', 'sans-serif': ['Helvetica']})

# Sans-serif math with LaTeX:
# http://stackoverflow.com/questions/2537868/sans-serif-math-with-latex-in-matplotlib
# The preamble is handed over as ONE newline-joined string: newer matplotlib
# releases no longer accept a list of strings for this rcParam.
# NOTE(review): older releases are expected to accept the string form too (they
# split a string value on commas, and none of these lines contains one).
mpl.rcParams['text.latex.preamble'] = '\n'.join([
#       r'\usepackage{siunitx}',   # i need upright \micro symbols, but you need...
#       r'\sisetup{detect-all}',   # ...this to force siunitx to actually use your fonts
       r'\usepackage{helvet}',    # set the normal font here
       r'\usepackage{sansmath}',  # load up the sansmath so that math -> helvet
       r'\sansmath'               # <- tricky! -- gotta actually tell tex to use!
])

In [None]:
#! ls ./map_to_contigs_longer_than_1500bp/*.tsv
! ls ./map_to_contigs_longer_than_1500bp/*.dat

In [None]:
! mkdir -p figures

In [None]:
unders = exploration_utils.load_underscore_stats()

In [None]:
unders.head(2)

In [None]:
# Smoke test of the plotting setup: a tiny histogram on one small axis.
# (The figure handle was previously misspelled `fix`.)
fig, ax = plt.subplots(1, 1, figsize=(3, 2))
pd.Series([1, 2, 3, 4, 4, 4, 1]).plot.hist(ax=ax)
# Note that everything defaults to TeX.
# NOTE(review): with usetex switched on, the bare underscores in this label are
# presumably interpreted by LaTeX -- confirm that the label renders.
ax.set_xlabel('A_bC_d_E_fG')

In [None]:
# Smoke test: math-mode axis label plus a title assembled from three pieces.
# (The figure handle was previously misspelled `fix`.)
fig, ax = plt.subplots(1, 1, figsize=(3, 2))
pd.Series([1, 2, 3, 4, 4, 4, 1]).plot.hist(ax=ax)
ax.set_xlabel('$A_bC_d_E_fG$')
# Raw string: '\m' is not a valid Python escape sequence, so the non-raw form
# triggers an invalid-escape warning. The resulting text is unchanged.
ax.set_title('low' + r' $\mathregular{O_2}$' + ' replicate {}'.format(99))

In [None]:
# Columns that describe a sample.
meta_cols = ['week', 'oxygen', 'replicate']

# Read-fraction columns, one per double-underscore category.
underscore_cols = [
    'frac of RNA reads: __alignment_not_unique',
    'frac of RNA reads: __ambiguous',
    'frac of RNA reads: __no_feature',
    'frac of RNA reads: __not_aligned',
    'frac of RNA reads: __too_low_aQual',
]

# Narrow `unders` to the sample description plus those fraction columns.
frac_df = unders[meta_cols + underscore_cols]  # .set_index(meta_cols)
frac_df.head()

In [None]:
counts_stats = exploration_utils.load_counts_w_processing()
counts_stats.head(2)

In [None]:
counts_stats.columns

In [None]:
counts_stats.head(2)

In [None]:
p = rnaseq_plot_utils.plot_underscore_bars(
        input_df=counts_stats, filename=None, portrait=True)

In [None]:
p.savefig('./figures/170316_fracs_mapped_unmapped_etc.pdf', bbox_inches='tight')

In [None]:
# One line per (oxygen, replicate) group: its key and the shape of its sub-frame.
for series_key, series_df in frac_df.groupby(['oxygen', 'replicate']):
    print(series_key, series_df.shape)

In [None]:
unders['frac of RNA reads: __not_aligned'].max()

In [None]:
unders.head(2)

In [None]:
def under_cleaner(somelist):
    """Return a new list of the given names with their underscores cleaned out.

    Every double underscore is removed first; each underscore still left
    after that is turned into a single space. The input is not modified.
    """
    return [name.replace('__', '').replace('_', ' ') for name in somelist]

under_cleaner(unders.columns)

In [None]:
# Copy of `unders` whose column labels have been passed through under_cleaner.
unders_cleaned = unders.copy()
unders_cleaned.columns = under_cleaner(unders.columns)

# Only the columns that originally contained '__' get a faceted plot of their own.
cnames = [name for name in unders if '__' in name]
cnames_cleaned = under_cleaner(cnames)

for var in cnames_cleaned:
    p = exploration_utils.plot_faceted(unders_cleaned, var)
    # The file name is derived from the cleaned column name.
    fname = './figures/170222_{}.pdf'.format(
        exploration_utils.filename_cleaner(var))
    p.savefig(fname)
rc('text', usetex=True)

In [None]:
dict(zip(['frac of RNA reads: __alignment_not_unique', 'frac of RNA reads: __ambiguous', 'frac of RNA reads: __no_feature', 'frac of RNA reads: __not_aligned', 'frac of RNA reads: __too_low_aQual'],
['#66c2a5', '#fc8d62', '#8da0cb', '#e78ac3', '#a6d854']))

In [None]:
merged_df = exploration_utils.load_counts_w_processing()

In [None]:
merged_df.head(3)

In [None]:
# Copy of the merged table whose column labels have been passed through under_cleaner.
merged_df_cleaned = merged_df.copy()
merged_df_cleaned.columns = under_cleaner(merged_df.columns)

# Original and cleaned names of the columns that contain a double underscore.
cnames = [label for label in merged_df if '__' in label]
cnames_cleaned = under_cleaner(cnames)

In [None]:
def plot_underscores_by_series(dataframe):
    """Plot the per-category read fractions against week, one panel per series.

    A 2 x 4 grid is drawn: the top row holds the 'low' oxygen replicates 1-4,
    the bottom row the 'high' oxygen replicates 1-4. Each panel shows the
    'sum(frac RNA-seq mapped to genes)' column as a thick black line, plus one
    coloured line for every column whose name contains both 'frac' and '__'.
    Column names are passed through ``under_cleaner`` before plotting, so the
    legend shows the cleaned names.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide the columns 'week', 'oxygen', 'replicate' and
        'sum(frac RNA-seq mapped to genes)'. The frame itself is not modified.

    Returns
    -------
    The matplotlib figure holding the eight panels.

    Raises
    ------
    KeyError
        If an (oxygen, replicate) group has no panel in the 2 x 4 grid.
    """
    x = 'week'
    oxygen_rows = ('low', 'high')
    replicate_cols = (1, 2, 3, 4)

    df_cleaned = dataframe.copy()
    df_cleaned.columns = under_cleaner(dataframe.columns)

    fig, axs = plt.subplots(len(oxygen_rows), len(replicate_cols),
                            figsize=(15, 6), sharex=True, sharey=True)
    # (oxygen, replicate) -> axis; rows follow oxygen_rows, columns replicate_cols.
    axd = {(oxygen, rep): axs[row, col]
           for row, oxygen in enumerate(oxygen_rows)
           for col, rep in enumerate(replicate_cols)}

    colors = ['#66c2a5', '#fc8d62', '#8da0cb', '#e78ac3', '#a6d854']
    series = [c for c in dataframe if ('frac' in c) and ('__' in c)]
    series_cleaned = under_cleaner(series)
    # Cycle through the palette so that a sixth (or later) series reuses a
    # colour instead of being dropped by zip() and raising KeyError below.
    colord = dict(zip(series_cleaned, itertools.cycle(colors)))

    for tup, df in df_cleaned.groupby(['oxygen', 'replicate']):
        ax = axd[tup]
        ax.set_title('{} O2, rep {}'.format(tup[0], tup[1]))
        # sort_values returns a new frame, so the group is never mutated.
        df = df.sort_values('week', ascending=False)
        ax.plot(df[x], df['sum(frac RNA-seq mapped to genes)'],
                color='black', linewidth=4)
        for s in series_cleaned:
            ax.plot(df[x], df[s], color=colord[s], label=s)
        ax.set_xlabel(x)

    # One legend for the whole grid, anchored to the right of the top-right panel.
    axs[0, 3].legend(bbox_to_anchor=(2.5, 1.))
    return fig

p = plot_underscores_by_series(merged_df)
p.savefig('./figures/170222_underscore_series_together.pdf', bbox_inches='tight')

In [None]:
# Distribution of the per-sample 'check sum' values, written out as a PDF.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 2.5))
merged_df['check sum'].hist(ax=ax, bins=20)
ax.set_title('Recovery of .fastq reads in .bam,\nas reported by Samtools')
ax.set_xlabel('sum of reads in .bam file\n(sum > 1 when reads map to multiple locations)')
ax.set_ylabel('number of samples')
fig.savefig('./figures/170223_fastq_read_recovery_by_samtools.pdf')

In [None]:
# Columns to show: nothing that starts with '__', and nothing whose name
# mentions one of the excluded tags.
excluded_tags = ('cryptic', 'LakWas', 'sample')
colnames = [
    c for c in merged_df.columns
    if not c.startswith('__') and not any(tag in c for tag in excluded_tags)
]
# Rows whose 'check sum' exceeds 1.1, restricted to those columns.
merged_df[merged_df['check sum'] > 1.1][colnames]

In [None]:
counts =  exploration_utils.load_counts() 

In [None]:
sns.choose_colorbrewer_palette(data_type='qualitative')

In [None]:
exploration_utils.filename_cleaner('Particulate methane monooxygenase alpha subunit precursor')

In [None]:
# Smoke test: axis label that mixes an \mbox with inline math.
# (The figure handle was previously misspelled `fix`.)
fig, ax = plt.subplots(1, 1, figsize=(3, 2))
pd.Series([1, 2, 3, 4, 4, 4, 1]).plot.hist(ax=ax)
# Raw string: '\m' and '\_' are not valid Python escape sequences, so the
# non-raw form triggers invalid-escape warnings. The resulting text is unchanged.
ax.set_xlabel(r'\mbox{time\_ABC} ($O_2$)')

In [None]:
# Demo of what I will run in a loop below: 
p = exploration_utils.plot_abundance_of_genes_with_same_names(
    'Particulate methane monooxygenase alpha subunit precursor',
    counts)
p.savefig('./figures/170223_pmmo_copy_expression.pdf', bbox_inches='tight')

In [None]:
counts.head()

In [None]:
import sys
sys.prefix

In [None]:
import resource
# Report the maximum resident set size recorded for this process so far.
# NOTE(review): the units of ru_maxrss are platform-dependent (kilobytes on
# Linux, bytes on macOS), so the '(kb)' label presumably assumes Linux -- confirm.
print('Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

In [None]:
# Don't run the function; it takes more memory.
# frac_sums = exploration_utils.load_frac_sums()
# Instead, add up 'frac RNA-seq reads' within each sample and attach the sample
# information (pd.merge joins on the columns the two frames have in common).
frac_per_sample = counts.groupby('sample id')['frac RNA-seq reads'].sum()
frac_sums = pd.merge(exploration_utils.get_sample_info(),
                     frac_per_sample.to_frame().reset_index())

In [None]:
frac_sums.head(2)

In [None]:
# Distribution of the per-sample summed 'frac RNA-seq reads', saved as a PDF.
fig, ax = plt.subplots(figsize=(4, 2.5))
frac_sums['frac RNA-seq reads'].hist(ax=ax, bins=20)
ax.set_ylabel('frequency')
ax.set_xlabel('fraction of reads mapped to genes')
fig.savefig(
    './figures/170222_sum_of_frac_reads_mapped_to_genes.pdf',
    bbox_inches='tight',
)

In [None]:
def plot_frac_rna_mapped_to_genes():
    """Return the faceted plot of 'frac RNA-seq reads'.

    Reads the notebook-level ``frac_sums`` table and hands it to
    ``exploration_utils.plot_faceted``; whatever that helper returns is
    passed straight back to the caller.
    """
    return exploration_utils.plot_faceted(frac_sums, 'frac RNA-seq reads')
    
p = plot_frac_rna_mapped_to_genes()
# Save under ./figures/ like the other figures written by this notebook; the
# path previously had no directory, so the file landed in the working directory.
p.savefig('./figures/170222_frac_RNA_reads_mapped_to_genes.pdf',
          bbox_inches='tight')

In [None]:
counts_nonzero = counts[counts['RNA reads'] > 0]
counts_nonzero.shape

In [None]:
gene_read_totals = counts_nonzero.groupby(
    'product')['RNA reads'].sum().sort_values(ascending=False)
gene_read_totals.head(30)

In [None]:
counts.head(2)

In [None]:
counts.shape

In [None]:
# Number of distinct loci per product name: de-duplicate the (locus, product)
# pairs, count the loci within each product, then label the count column.
gene_counts = (
    counts[['locus', 'product']]
    .drop_duplicates()
    .groupby('product')
    .count()
    .reset_index()
    .rename(columns={'locus': '# gene copies'})
)
gene_counts.head()

In [None]:
gene_counts.head()

In [None]:
grt = gene_read_totals.to_frame().reset_index().rename(
        columns={'RNA reads': 'sum(RNA reads), all samples'})
grt.head(3)

In [None]:
# Combine the read totals with the copy counts (pd.merge joins on the columns
# the two frames share), then order by total reads, largest first.
gene_product_summary = pd.merge(grt, gene_counts)
print(gene_product_summary.shape)
gene_product_summary = gene_product_summary.sort_values(
    'sum(RNA reads), all samples', ascending=False)
gene_product_summary.head(30)

In [None]:
gene_product_summary.sort_values('# gene copies', ascending=False).head(30)

In [None]:
#  gene_read_totals.to_csv('./map_to_contigs_longer_than_1500bp/gene_read_totals.tsv', sep='\t')
gene_product_summary.to_csv('./map_to_contigs_longer_than_1500bp/gene_product_summary.tsv', sep='\t')

In [None]:
gene_read_totals.to_csv('./map_to_contigs_longer_than_1500bp/gene_read_totals.tsv', sep='\t')

In [None]:
#sample_info.head()

In [None]:
exploration_utils.prep_gene_cts('Particulate methane monooxygenase alpha subunit precursor', counts).head()

In [None]:
! mkdir -p ./figures/gene_reads

In [None]:
exploration_utils.filename_cleaner('170222_read_counts_Ammonia monooxygenase/methane monooxygenase%2C subunit C')

In [None]:
! ls ./figures/gene_reads/ | head

In [None]:
exploration_utils.plot_read_counts_by_product(
    'Particulate methane monooxygenase alpha subunit precursor',
    counts)

In [None]:
exploration_utils.plot_read_fracs_by_product(
    'Particulate methane monooxygenase alpha subunit precursor',
    counts)

In [None]:
counts_nonzero.head(2)

In [None]:
num_plots = 50

# Rank products by their summed 'RNA reads' over the non-zero rows, largest
# first, and keep the first `num_plots` names.
reads_by_product = counts_nonzero.groupby('product')['RNA reads'].sum()
gene_names = reads_by_product.sort_values(
    ascending=False).index.tolist()[:num_plots]

# One read-count plot per selected product.
for g in gene_names:
    p = exploration_utils.plot_read_counts_by_product(g, counts)

In [None]:
counts_nonzero.groupby('product')['frac RNA-seq reads'].sum().sort_values(
    ascending=False)

In [None]:
# Rank products by their summed 'frac RNA-seq reads' over the rows of
# counts_nonzero, largest first, and keep the first `num_plots` names.
num_plots = 50
gene_names = counts_nonzero.groupby('product')['frac RNA-seq reads'].sum().sort_values(
    ascending=False).index.tolist()[0:num_plots]

# fignum counts up from 1 and is passed through to both plotting helpers.
fignum=1
for g in gene_names:
    p = exploration_utils.plot_read_fracs_by_product(g, counts, fignum=fignum)
    # The per-copy abundance plot is skipped for the first (top-ranked) product only.
    if fignum > 1:  # can't do the hypothetical protein with 413685 copies.
        # TODO: check that there aren't too many genes.
        # NOTE(review): this presumes the top-ranked product is always the
        # many-copy hypothetical protein -- confirm if the ranking can change.
        exploration_utils.plot_abundance_of_genes_with_same_names(g, counts, fignum=fignum)
    fignum += 1

## Look for Xox

In [None]:
# Xox is annotated as methanol dehydrogenase, so we will have to tease the two apart.

In [None]:
gene_counts.head()

In [None]:
gene_counts[gene_counts['product'].str.contains('Xox')]

In [None]:
# Case-insensitive match: the product names seen elsewhere in this notebook
# start with a capital letter (e.g. 'Particulate methane monooxygenase ...'),
# so a case-sensitive 'methanol' would miss names beginning with 'Methanol'.
gene_counts[gene_counts['product'].str.contains('methanol', case=False)]

## Plot abundance of different genes in the 8 series. 

In [None]:
counts.head()

In [None]:
gene = 'Particulate methane monooxygenase alpha subunit precursor'
counts[counts['product'] == gene].head()

In [None]:
toy = counts[counts['product'] == gene]
top = toy.groupby('locus')['frac RNA-seq reads'].max().sort_values(
    ascending=False).to_frame().reset_index()
top.head()

In [None]:
from cycler import cycler

In [None]:
import seaborn as sns