In [1]:
%matplotlib inline
import glob 
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the pickled count table and swap its rows and columns.
# (This was done separately for every sample.)
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
df = pd.read_pickle('NajaNCOUNTS.pickle.gz').transpose()

In [3]:
def _readTwoColumnMap(path):
    """Parse a whitespace-delimited two-column text file into a dict.

    Every non-blank line is split once on its first run of whitespace: the
    first field becomes the key and the remainder of the line (which may
    itself contain whitespace) becomes the value. If a key occurs more than
    once, the last occurrence wins.

    Args:
        path: location of the text file to read.

    Returns:
        dict mapping first field -> rest of line.

    Raises:
        ValueError: if a non-blank line contains only a single field.
    """
    mapping = {}
    with open(path) as handle:
        for lineNumber, line in enumerate(handle, start=1):
            parts = line.strip().split(None, 1)
            if not parts:
                # Blank lines (e.g. a trailing newline) carry no mapping.
                continue
            if len(parts) != 2:
                raise ValueError(
                    '%s, line %d: expected two whitespace-separated fields, got %r'
                    % (path, lineNumber, line)
                )
            key, value = parts
            mapping[key] = value
    return mapping


# First field -> rest of line for each of the two mapping files.
tgIdToGene = _readTwoColumnMap('../../tgMapList.tsv')
scaffoldToGeneId = _readTwoColumnMap('../../tgMap.tsv')

# Translate every column label of df through both maps
# (column label -> scaffoldToGeneId -> tgIdToGene). A label missing from
# either map raises KeyError, exactly as before.
longColnames = [tgIdToGene[scaffoldToGeneId[index]] for index in df.columns]


In [4]:
# (rows, columns) of the table before columns sharing an annotation are merged
print((len(df.index), len(df.columns)))

(1, 213263)


In [5]:
# Replace the column labels of df, in place, with the names collected in
# longColnames; the same name may occur more than once at this point.
df.columns = longColnames
# Drop the temporary list. Because the name is deleted here, this cell cannot
# be re-run on its own: the cell that builds longColnames must be run again first.
del longColnames

In [6]:
def barPlot(sums, figsize=(150, 5), labelsize=9):
    """Draw one bar per entry of ``sums``, labelled with its index values.

    Args:
        sums: Series-like object; it must support ``len()`` and expose an
            ``.index`` attribute, which supplies the x tick labels.
        figsize: (width, height) of the figure in inches. The default is
            very wide so that a large number of bars remains legible.
        labelsize: font size of the x tick labels.

    Returns:
        None. The figure is created through pyplot and is shown by the
        notebook's inline backend.
    """
    positions = range(len(sums))
    fig, ax = plt.subplots(figsize=figsize)
    ax.bar(positions, sums)
    # One tick per bar, labelled vertically so neighbouring labels do not overlap.
    ax.set_xticks(positions)
    ax.set_xticklabels(sums.index, rotation=90)
    ax.xaxis.set_tick_params(labelsize=labelsize)
# Example call (left disabled):
#barPlot(df.sum())

In [7]:
import scanpy as sc

  from ._conv import register_converters as _register_converters


In [8]:
# Global Scanpy configuration for this notebook.
sc.settings.verbosity = 3  # verbosity levels: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=80)  # low dpi (dots per inch) yields small inline figures
# Print the Scanpy version, the run date and the versions of its numeric
# dependencies so the environment is recorded in the cell output.
sc.logging.print_version_and_date()
sc.logging.print_versions_dependencies_numerics()

Running Scanpy 1.4 on 2019-04-09 17:58.
Dependencies: anndata==0.6.17 numpy==1.15.4 scipy==1.0.0 pandas==0.23.4 scikit-learn==0.19.1 statsmodels==0.9.0 python-igraph==0.7.1 louvain==0.6.1 


In [9]:
# Collapse columns that share the same label by summing them.
# groupby(..., axis=1) is deprecated in recent pandas releases, so the table
# is transposed, grouped on its (now row) labels, and transposed back; the
# result has one column per distinct label, in sorted label order.
summedCols = df.T.groupby(level=0).sum().T

In [10]:
# (rows, columns) of the table once columns sharing an annotation are merged
print((len(summedCols.index), len(summedCols.columns)))

(1, 70786)


In [11]:
# Substrings that mark a gene name for exclusion (matching is case-sensitive).
banned = [
    'rRNA',
    'ribosomal',
    'mitochondrial',
    'mitochondrion',
    'microsatellite',
    'transposon',
    'SINE',
    'LINE repeat',
]
# Keep a gene only when none of the banned substrings occurs in its name.
# NOTE(review): keptGenes is recomputed from adata.var.index in a later cell
# and is not used by any cell visible here -- confirm the filter is applied downstream.
keptGenes = [
    name
    for name in summedCols.columns
    if all(fragment not in name for fragment in banned)
]


In [12]:
# Wrap the collapsed count table in an AnnData object.
adata = sc.AnnData(summedCols)
# Left disabled -- presumably unnecessary because the preceding groupby/sum
# already merged columns with identical labels (TODO confirm).
#adata.var_names_make_unique()
# Bare expression: shows the object's summary as the cell output.
adata

AnnData object with n_obs × n_vars = 1 × 70786 

In [13]:
# Derive a batch label per observation: take element 0 of the observation
# name, keep the text before the first '_', then the text after the last '-'.
# NOTE(review): if obs_names holds plain strings, ob[0] is only the first
# character of the name; ob[0] selects a whole field only when each entry is
# a tuple (e.g. from a MultiIndex) -- confirm which case applies here.
batches = [
    ob[0].split('_', 1)[0].rsplit('-', 1)[-1]
    for ob in adata.obs_names
]
adata.obs['batch'] = batches

In [14]:
# Convert back to a DataFrame and swap rows and columns for the RaceID analysis.
exportFrame = adata.to_df().T

In [15]:
# Gene names of adata that contain none of the banned substrings.
# NOTE(review): this repeats the earlier keptGenes computation (there based on
# summedCols.columns) and the result is not used by any cell visible here.
keptGenes = [
    name
    for name in adata.var.index
    if all(fragment not in name for fragment in banned)
]

In [16]:
# Write the transposed table to CSV in the current working directory.
# NOTE(review): exportFrame is built from the full adata, so the `banned` /
# `keptGenes` filter is NOT applied to this export -- confirm that is intended.
exportFrame.to_csv('./bulkNajMappedDeduplicated.csv')
