In [None]:
# Import matplotlib before seaborn
import matplotlib as mpl
import matplotlib.pyplot as plt

import re

import pandas as pd
import seaborn as sns

%load_ext autoreload
%autoreload 2
%matplotlib inline


In [None]:
import exploration_utils

In [None]:
# Shell escape: list the contents of the demos/ directory.
! ls demos/

In [None]:
# Shell escape: list the .dat files in the mapping output directory.
# The equivalent listing of .tsv files is left disabled on the next line.
#! ls ./map_to_contigs_longer_than_1500bp/*.tsv
! ls ./map_to_contigs_longer_than_1500bp/*.dat

In [None]:
# Create the figures/ output directory if it does not exist yet (-p makes
# this a no-op on re-run); cells below save PDFs under ./figures/.
! mkdir -p figures

In [None]:
# Load the "underscore" read statistics through the project helper.
# NOTE(review): the helper's data source and full schema are not visible in
# this notebook; cells below rely on columns such as
# 'frac of RNA reads: __not_aligned', 'oxygen', 'replicate' and 'week'.
unders = exploration_utils.load_underscore_stats()

In [None]:
# Preview the first rows of the loaded table.
unders.head()

In [None]:
# Largest value in the '__not_aligned' fraction column.
unders['frac of RNA reads: __not_aligned'].max()

In [None]:
# Save one faceted figure per column of `unders` whose name contains '__'.
# The output file name is derived from the column name via the project's
# filename_cleaner helper.
cnames = [col for col in unders if '__' in col]
for c in cnames:
    p = exploration_utils.plot_faceted(unders, c)
    fname = './figures/170222_{}.pdf'.format(
        exploration_utils.filename_cleaner(c))
    p.savefig(fname)

In [None]:
# Show a single row, enough to read off the column names.
unders.head(1)

In [None]:
# Display the series-name -> hex-colour pairing (same five colours that
# plot_underscores_by_series uses).
{
    'frac of RNA reads: __alignment_not_unique': '#66c2a5',
    'frac of RNA reads: __ambiguous': '#fc8d62',
    'frac of RNA reads: __no_feature': '#8da0cb',
    'frac of RNA reads: __not_aligned': '#e78ac3',
    'frac of RNA reads: __too_low_aQual': '#a6d854',
}

In [None]:
# Load the read-count table through the project helper.
# NOTE(review): schema is not visible here; later cells use the columns
# 'sample id', 'frac RNA-seq reads', 'RNA reads', 'product' and 'locus'.
counts =  exploration_utils.load_counts() 

In [None]:
# Preview the first rows of the count table.
counts.head()

In [None]:
# Per-sample total of 'frac RNA-seq reads' (a Series indexed by 'sample id').
frac_sums = counts.groupby('sample id')['frac RNA-seq reads'].sum() #.min()

In [None]:
# Preview the per-sample totals as a two-column frame, with the value column
# renamed to the label used in the merged table below.
frac_sums.reset_index().rename(columns={'frac RNA-seq reads': 'sum(RNA-seq mapped to genes)'}).head()

In [None]:
# Attach the per-sample gene-mapped total to the underscore statistics.
# No join key is given, so pandas joins on the columns the two frames share
# (inner join, the default).
merged_df = unders.merge(
    frac_sums.reset_index().rename(
        columns={'frac RNA-seq reads': 'sum(RNA-seq mapped to genes)'}))

In [None]:
# Sanity check: add up every ': __' category column plus the gene-mapped
# total for each row and store the result in 'check sum'.
cnames_to_sum = [c for c in merged_df.columns
                 if c == 'sum(RNA-seq mapped to genes)' or ': __' in c]
# Left fold starting from 0, so the columns are added in listed order and a
# NaN in any column propagates to the row's total.
merged_df['check sum'] = sum((merged_df[c] for c in cnames_to_sum), 0)

merged_df.head()

In [None]:
# Distribution of the per-row 'check sum' values, in 20 bins.
merged_df['check sum'].hist(bins=20)

In [None]:
# Shell escape: print the notebook's working directory.
! pwd

In [None]:
# Shell escape: list the working directory.
! ls

In [None]:
def plot_underscores_by_series(dataframe):
    """Plot the underscore read fractions against week, one panel per series.

    Builds a 2 x 4 grid of panels sharing both axes: the top row holds the
    'low' oxygen groups and the bottom row the 'high' ones, with replicates
    1-4 left to right. In each panel the 'sum(RNA-seq mapped to genes)'
    column is drawn as a thick black line, and every column whose name
    contains both 'frac' and '__' is drawn as a thin coloured line.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns 'oxygen', 'replicate', 'week',
        'sum(RNA-seq mapped to genes)' and the 'frac ... __...' columns.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding all panels. The legend is attached to the
        top-right panel and placed outside the grid.

    Raises
    ------
    KeyError
        If a (oxygen, replicate) group has no panel in the 2 x 4 layout.
    """
    x = 'week'
    total_col = 'sum(RNA-seq mapped to genes)'
    oxygen_levels = ('low', 'high')
    replicates = (1, 2, 3, 4)

    fig, axs = plt.subplots(len(oxygen_levels), len(replicates),
                            figsize=(15, 6), sharex=True, sharey=True)
    # (oxygen, replicate) -> panel; rows follow oxygen_levels, columns follow
    # replicates.
    axd = {(oxy, rep): axs[row, col]
           for row, oxy in enumerate(oxygen_levels)
           for col, rep in enumerate(replicates)}

    colors = ['#66c2a5', '#fc8d62', '#8da0cb', '#e78ac3', '#a6d854']
    series = [c for c in dataframe if ('frac' in c) and ('__' in c)]
    # Cycle through the palette so that more series than colours reuses
    # colours instead of leaving some series without an entry.
    colord = {s: colors[i % len(colors)] for i, s in enumerate(series)}

    handles, labels = [], []
    for tup, df in dataframe.groupby(['oxygen', 'replicate']):
        if tup not in axd:
            raise KeyError(
                'no panel for (oxygen, replicate) = {!r}; expected one of '
                '{}'.format(tup, sorted(axd)))
        ax = axd[tup]
        ax.set_title('{} O2, rep {}'.format(tup[0], tup[1]))
        # sort_values returns a new frame, so the group is never mutated.
        df = df.sort_values(x, ascending=False)
        # Label the total line so it shows up in the legend as well.
        ax.plot(df[x], df[total_col],
                color='black', linewidth=4, label=total_col)
        for s in series:
            ax.plot(df[x], df[s], color=colord[s], label=s)
        ax.set_xlabel(x)
        # Every populated panel draws the same lines, so any one of them can
        # supply the legend entries.
        handles, labels = ax.get_legend_handles_labels()

    # Build the legend from explicit handles so it does not depend on the
    # top-right panel itself having data.
    if handles:
        axs[0, 3].legend(handles, labels, bbox_to_anchor=(2.5, 1.))
    return fig
        
# Rebuild the merged table from its inputs (this rebinds merged_df, so any
# columns added to it by earlier cells are not carried over), then draw and
# save the combined series figure.
genes_total_by_sample = frac_sums.reset_index().rename(
    columns={'frac RNA-seq reads': 'sum(RNA-seq mapped to genes)'})
merged_df = unders.merge(genes_total_by_sample)
p = plot_underscores_by_series(merged_df)
p.savefig('./figures/170222_underscore_series_together.pdf',
          bbox_inches='tight')

In [None]:
# Histogram of the per-sample gene-mapped read fraction, saved as a PDF.
fig, ax = plt.subplots(figsize=(4, 2.5))
frac_sums.hist(ax=ax, bins=20)
ax.set(xlabel='fraction of reads mapped to genes', ylabel='frequency')
fig.savefig(
    './figures/170222_sum_of_frac_reads_mapped_to_genes.pdf',
    bbox_inches='tight',
)

In [None]:
# Preview frac_sums as a frame with 'sample id' moved out of the index.
frac_sums.to_frame().reset_index().head()

In [None]:
# Join the per-sample gene-mapped fraction onto the sample metadata returned
# by the project helper. No key is given, so the join uses the columns the
# two frames share.
sample_info = exploration_utils.get_sample_info()
frac_sums_by_sample = frac_sums.to_frame().reset_index()
frac_sums_merged = pd.merge(sample_info, frac_sums_by_sample)
frac_sums_merged.head()

In [None]:
def plot_frac_rna_mapped_to_genes():
    """Return the faceted plot of 'frac RNA-seq reads' for every sample.

    Reads the notebook-level ``frac_sums_merged`` frame and delegates the
    drawing to ``exploration_utils.plot_faceted``; whatever that helper
    returns is passed straight back to the caller.
    """
    return exploration_utils.plot_faceted(frac_sums_merged,
                                          'frac RNA-seq reads')
    
# Draw the faceted gene-mapped-fraction plot and save it alongside the other
# figures. The file now goes under ./figures/ (created by the mkdir cell
# above) instead of the working directory, matching every other savefig call
# in this notebook.
p = plot_frac_rna_mapped_to_genes()
p.savefig('./figures/170222_frac_RNA_reads_mapped_to_genes.pdf',
          bbox_inches='tight')

In [None]:
# Keep only rows with at least one RNA read, then show the (rows, columns)
# shape of the filtered table.
counts_nonzero = counts[counts['RNA reads'] > 0]
counts_nonzero.shape

In [None]:
# Total RNA reads per 'product' across all rows, largest first; show the
# top 30.
gene_read_totals = counts_nonzero.groupby('product')['RNA reads'].sum().sort_values(ascending=False)
gene_read_totals.head(30)

In [None]:
# Peek at the unfiltered count table again.
counts.head(2)

In [None]:
# Shape of the unfiltered table, for comparison with counts_nonzero above.
counts.shape

In [None]:
# Number of distinct (locus, product) pairs per product, i.e. how many loci
# carry each product name. Built as a single chain so gene_counts is bound
# once to its final form; the previous mid-cell `gene_counts.shape` was a
# no-op (only a cell's last expression is displayed) and has been dropped.
gene_counts = (
    counts[['locus', 'product']]
    .drop_duplicates()
    .groupby('product')
    .count()
    .reset_index()
    .rename(columns={'locus': '# gene copies'})
)
gene_counts.head()

In [None]:
# Preview the per-product copy counts.
gene_counts.head()

In [None]:
# Turn the per-product read totals into a two-column frame ('product' plus
# the renamed total column) so it can be merged with gene_counts.
grt = gene_read_totals.to_frame().reset_index().rename(
        columns={'RNA reads': 'sum(RNA reads), all samples'})
grt.head(3)

In [None]:
# Combine the per-product read totals with the per-product copy counts and
# rank the products by total reads, highest first.
top_genes = pd.merge(grt, gene_counts)
print(top_genes.shape)
top_genes = top_genes.sort_values(
    by='sum(RNA reads), all samples', ascending=False)
top_genes.head(30)

In [None]:
# Same table ranked by number of gene copies instead; display only, the
# top_genes frame itself is not reordered.
top_genes.sort_values('# gene copies', ascending=False).head(30)

In [None]:
# Write the ranked table as a tab-separated file. The disabled line is the
# gene_read_totals export, which the next cell performs.
#  gene_read_totals.to_csv('./map_to_contigs_longer_than_1500bp/gene_read_totals.tsv', sep='\t')
top_genes.to_csv('./map_to_contigs_longer_than_1500bp/top_genes.tsv', sep='\t')

In [None]:
# Write the per-product read totals as a tab-separated file.
gene_read_totals.to_csv('./map_to_contigs_longer_than_1500bp/gene_read_totals.tsv', sep='\t')

In [None]:
# Disabled preview of a sample_info table.
#sample_info.head()

In [None]:
# Preview what the project helper prep_gene_cts returns for one product.
# NOTE(review): the helper's output schema is not visible in this notebook.
exploration_utils.prep_gene_cts('Particulate methane monooxygenase alpha subunit precursor', counts).head()

In [None]:
# Create the ./figures/gene_reads directory if it is missing.
! mkdir -p ./figures/gene_reads

In [None]:
# Show what filename_cleaner produces for a name containing '/', '%' and
# spaces.
exploration_utils.filename_cleaner('170222_read_counts_Ammonia monooxygenase/methane monooxygenase%2C subunit C')

In [None]:
# Shell escape: first entries currently in ./figures/gene_reads/.
! ls ./figures/gene_reads/ | head

In [None]:
# Read-count plot for a single product, via the project helper.
exploration_utils.plot_read_counts_by_product(
    'Particulate methane monooxygenase alpha subunit precursor',
    counts)

In [None]:
# Read-fraction plot for the same product, via the project helper.
exploration_utils.plot_read_fracs_by_product(
    'Particulate methane monooxygenase alpha subunit precursor',
    counts)

In [None]:
# Peek at the non-zero-read rows.
counts_nonzero.head(2)

In [None]:
# Read-count plot for each of the num_plots products with the most RNA reads
# (summed over the non-zero rows). The plots themselves are drawn from the
# full `counts` table.
num_plots = 50
reads_per_product = counts_nonzero.groupby('product')['RNA reads'].sum()
gene_names = (reads_per_product
              .sort_values(ascending=False)
              .head(num_plots)
              .index
              .tolist())

for g in gene_names:
    p = exploration_utils.plot_read_counts_by_product(g, counts)

In [None]:
# Products ranked by their summed 'frac RNA-seq reads', largest first.
counts_nonzero.groupby('product')['frac RNA-seq reads'].sum().sort_values(
    ascending=False)

In [None]:
# Read-fraction plot for each of the num_plots products with the largest
# summed 'frac RNA-seq reads' (over the non-zero rows). The plots are drawn
# from the full `counts` table.
num_plots = 50
gene_names = counts_nonzero.groupby('product')['frac RNA-seq reads'].sum().sort_values(
    ascending=False).index.tolist()[0:num_plots]

# enumerate supplies the 1-based figure number passed to the helper, so the
# counter can no longer drift out of step with the loop.
for fignum, g in enumerate(gene_names, start=1):
    p = exploration_utils.plot_read_fracs_by_product(g, counts, fignum=fignum)