In [None]:
# Import matplotlib before seaborn
import matplotlib as mpl
import matplotlib.pyplot as plt

import itertools  # for color palette cycling
import re

import pandas as pd
import seaborn as sns

%load_ext autoreload
%autoreload 2
%matplotlib inline


In [None]:
# Location of the tab-separated table read by the next cell. The directory
# and the file share the same stem, so the literal is split at that boundary.
tsvFile = (
    '/work/rnaseq/alignments/map_to_contigs_longer_than_1500bp/'
    'map_to_contigs_longer_than_1500bp.tsv'
)

In [None]:
# Load the tab-separated table into a DataFrame.
tsv = pd.read_csv(
    tsvFile,
    index_col=0,  # the first column supplies the row labels
    sep='\t',
)

In [None]:
# Dimensions of the loaded table as (number of rows, number of columns).
tsv.shape

In [None]:
# Column labels of the loaded table.
tsv.columns

In [None]:
# Preview the first five rows of the table.
tsv.head()

In [None]:
# Total of each column (axis=0 collapses the rows, giving one value per
# column). Kept under its own name because `sums` is assigned the per-row
# totals further down in this notebook; sharing one name for both quantities
# makes the result depend on which cell ran last.
column_sums = tsv.sum(axis=0)
column_sums.head()

Don't normalize by these column sums: the counts should really be divided by the fastq read total, which I haven't brought into this notebook yet.

**TODO:** revisit this normalization if we end up thinning.

In [None]:
# Total of each row: summing along the columns axis leaves one value per
# row label of `tsv`.
sums = tsv.sum(axis='columns')


In [None]:
# Peek at the first two per-row totals.
sums.head(2)

In [None]:
# Preview the first few rows whose total is strictly positive.
sums.loc[sums.gt(0)].head()

In [None]:
# Keep only the rows whose total is strictly positive; the plots below work
# from this filtered series.
has_reads = sums.gt(0)
sums_nonzero = sums.loc[has_reads]

In [None]:
# Histogram of the nonzero per-row read sums that fall below 1e6, with the
# bar counts shown on a log-scaled y axis.
fig, ax = plt.subplots(1, 1, figsize=(4, 2.5))
# log=True lets hist() switch the y axis to log scale itself. This replaces
# plt.yscale('log', nonposy='clip'): the `nonposy` keyword was renamed to
# `nonpositive` in newer Matplotlib releases and is no longer accepted there.
ax.hist(sums_nonzero[sums_nonzero < 1e6], bins=100, log=True)
ax.set_xlabel('number of reads for gene copy')
ax.set_ylabel('number of genes');

In [None]:
# Histogram of the nonzero per-row read sums that fall below 1e8, with the
# bar counts shown on a log-scaled y axis.
fig, ax = plt.subplots(1, 1, figsize=(4, 2.5))
# `bins` belongs to hist(), not to the axis-scale call: it was previously
# passed to plt.yscale(), which does not take it, so the cell errored and the
# histogram never used 100 bins. log=True also avoids the `nonposy` keyword,
# which newer Matplotlib releases no longer accept.
ax.hist(sums_nonzero[sums_nonzero < 1e8], bins=100, log=True)
ax.set_xlabel('number of reads for gene copy')
ax.set_ylabel('number of genes');

In [None]:
# Scratch check: plain str() of the float 1e6 gives '1000000.0'.
str(1e6)

In [None]:
# Two side-by-side histograms of the nonzero per-row read sums, split at
# `cutoff`: values below it on the left panel, values at or above it on the
# right panel. The figure is then written to a PDF.
fig, axs = plt.subplots(1, 2, figsize=(8, 2.5))
cutoff = 1e6
dfs = [sums_nonzero[sums_nonzero < cutoff],
       sums_nonzero[sums_nonzero >= cutoff]]
titles = ['frequency of read sums,\nif < {:.1E}'.format(cutoff),
          'frequency of read sums,\nif >= {:.1E}'.format(cutoff)]
for ax, title, df in zip(axs, titles, dfs):
    ax.set_title(title)
    # log=True puts the bar counts on a log-scaled y axis.
    ax.hist(df, bins=30, log=True)
    ax.set_xlabel('number of reads for gene copy')
    ax.set_ylabel('number of genes')
    # Rotate the x tick labels on *this* axes. The earlier plt.xticks(...)
    # call inside the loop acted on pyplot's current axes rather than `ax`,
    # and a second loop then had to redo the rotation for every panel.
    ax.tick_params(axis='x', labelrotation=90)

fig.savefig('160223_freq_of_reads_assigned_to_genes--not_normalized_by_seq_depth.pdf',
            bbox_inches='tight')

In [None]:
# For each candidate cutoff, count how many rows of `sums_nonzero` have a
# total at or above that cutoff, and collect the results in one table.
vals = [1, 2, 10, 100, 1000, 1e4, 1e5]

# Build the table in a single constructor call rather than concatenating a
# one-row frame per cutoff: this needs no special case for the first value
# and gives the rows distinct index labels (0..n-1) instead of repeating 0.
remaining_df = pd.DataFrame({
    'cutoff (sum of reads across samples)': vals,
    # A boolean mask sums to the number of True entries.
    'number remaining': [int((sums_nonzero >= v).sum()) for v in vals],
})

remaining_df

In [None]:
# Line plot of how many rows remain at each read-sum cutoff, on log-log axes,
# saved to a PDF. Reads the two columns of `remaining_df` by their labels.
fig, ax = plt.subplots(1, 1, figsize=(4, 2.5))
x = 'cutoff (sum of reads across samples)'
y = 'number remaining'
ax.plot(remaining_df[x], remaining_df[y], marker='o')
# The column labels double as the axis labels.
ax.set_xlabel(x)
ax.set_ylabel(y)
ax.set_yscale('log')
ax.set_xscale('log')
# Title text corrected: it previously read "left if for".
ax.set_title('Number of genes left for\ndifferent read count cutoffs')
fig.savefig('170223_number_of_genes_left_at_different_read_count_cutoffs.pdf', 
           bbox_inches='tight')