In [1]:
# import standard python packages
import pandas as pd
import subprocess
import os
import dill
import datetime

# add the utils and env directories to the path
import sys
sys.path.append('../../utils/')
sys.path.append('../../env/')

# import functions from utils directory files
from string_functions import *
from biofile_handling import *

# import paths to software installs from env
from install_locs import *

In [2]:
################
# general info #
################

# Specify the name of the species folder in Amazon S3
species = 'Mus_musculus'

# Specify any particular identifying conditions, eg tissue type:
conditions = 'adultbrain'

################

# Short species prefix from prefixify(); computed once and reused for the folder name
species_prefix = prefixify(species)

# Specify folder as destination for file downloads
output_folder = '../../output/' + species_prefix + '_' + conditions + '/'

# makedirs with exist_ok=True also creates a missing '../../output/' parent
# and is safe to re-run (no exists()/mkdir() race, no error if already present)
os.makedirs(output_folder, exist_ok=True)

# Build the sample descriptor, then load the docket for this sample.
# NOTE(review): get_from_s3()/unpickle()/s3_to_local() are defined in biofile_handling;
# judging by the names and the log output, they fetch the pickled docket from S3,
# load it, and download its listed files into output_folder -- confirm there.
species_SampleDict = SampleDict(species, conditions, output_folder)
species_BioFileDocket = BioFileDocket(species_SampleDict).get_from_s3().unpickle()
species_BioFileDocket.s3_to_local()

file Mmus_adultbrain_sample_BioFileDocket.pkl already exists at ../../output/Mmus_adultbrain/Mmus_adultbrain_sample_BioFileDocket.pkl
file GCF_000001635.23_GRCm38.p3_genomic.gff already exists at ../../output/Mmus_adultbrain/GCF_000001635.23_GRCm38.p3_genomic.gff
file GCF_000001635.23_GRCm38.p3_genomic.fna already exists at ../../output/Mmus_adultbrain/GCF_000001635.23_GRCm38.p3_genomic.fna
file GSM2906405_Brain1_dge_coerced.txt already exists at ../../output/Mmus_adultbrain/GSM2906405_Brain1_dge_coerced.txt
file Mmus_adultbrain_uniprot-idmm.tsv already exists at ../../output/Mmus_adultbrain/Mmus_adultbrain_uniprot-idmm.tsv
file Mmus_adultbrain_gtf-idmm.tsv already exists at ../../output/Mmus_adultbrain/Mmus_adultbrain_gtf-idmm.tsv
file GCF_000001635.23_GRCm38.p3_genomic_cDNA.fna already exists at ../../output/Mmus_adultbrain/GCF_000001635.23_GRCm38.p3_genomic_cDNA.fna
file GCF_000001635.23_GRCm38.p3_genomic_cDNA.fna.transdecoder.bed already exists at ../../output/Mmus_adultbrain/GCF_0

In [3]:
# Source of the cell annotation table and where to store it locally
file_url = 'https://figshare.com/ndownloader/files/11083451?private_link=865e694ad06d5857db4b'
output_filename = 'Mmus_cellannot.csv'
output_filepath = species_SampleDict.directory + output_filename

# Download with wget. stdout is still discarded, but stderr is captured so that a
# failed download is reported here instead of surfacing later as a confusing
# read_csv error on an empty or partial file.
wget_result = subprocess.run(['wget', file_url, '-O', output_filepath],
                             stdout=subprocess.DEVNULL, stderr=subprocess.PIPE,
                             encoding='utf-8', errors='replace')
if wget_result.returncode != 0:
    # Only the tail of stderr is shown; wget writes its progress log there too
    raise RuntimeError(
        'wget failed (exit code ' + str(wget_result.returncode) + ') while downloading '
        + file_url + ' to ' + output_filepath + ':\n' + wget_result.stderr[-2000:]
    )

# The first (unnamed) CSV column holds the row labels, so use it as the index
cell_annots = pd.read_csv(output_filepath, index_col = 'Unnamed: 0')
display(cell_annots)

Unnamed: 0,Cell.name,ClusterID,Tissue,Batch,Cell.Barcode,Annotation
1,Bladder_1.AAAACGAAAACGGGGCGA,Bladder_1,Bladder,Bladder_1,AAAACGAAAACGGGGCGA,Stromal cell_Dpt high(Bladder)
2,Bladder_1.AAAACGAAGCGGCCGCTA,Bladder_5,Bladder,Bladder_1,AAAACGAAGCGGCCGCTA,Stromal cell_Car3 high(Bladder)
3,Bladder_1.AAAACGAAGTACTAGCAT,Bladder_16,Bladder,Bladder_1,AAAACGAAGTACTAGCAT,Vascular smooth muscle progenitor cell(Bladder)
4,Bladder_1.AAAACGACGTTGCTGTGT,Bladder_8,Bladder,Bladder_1,AAAACGACGTTGCTGTGT,Vascular endothelial cell(Bladder)
5,Bladder_1.AAAACGAGCGAGCGAGTA,Bladder_4,Bladder,Bladder_1,AAAACGAGCGAGCGAGTA,Urothelium(Bladder)
...,...,...,...,...,...,...
270844,NeonatalPancreas_1.AGTTTAAAAACGCTGAAA,NeonatalPancreas_,NeonatalPancreas,NeonatalPancreas_1,AGTTTAAAAACGCTGAAA,Urothelium(NeonatalPancreas)
270845,NeonatalPancreas_1.GGGTTTGAACGCTCTACC,NeonatalPancreas_,NeonatalPancreas,NeonatalPancreas_1,GGGTTTGAACGCTCTACC,Urothelium(NeonatalPancreas)
270846,NeonatalPancreas_1.GCTGTGACAATATTTAGG,NeonatalPancreas_,NeonatalPancreas,NeonatalPancreas_1,GCTGTGACAATATTTAGG,Urothelium(NeonatalPancreas)
270847,NeonatalPancreas_1.GTCCCGGATCTTTATTGT,NeonatalPancreas_,NeonatalPancreas,NeonatalPancreas_1,GTCCCGGATCTTTATTGT,Urothelium(NeonatalPancreas)


In [4]:
# Batch label (value of the 'Batch' column) identifying the sample to extract
sample_name = 'Brain_1'

# Select this batch's rows and the two columns needed downstream in one step,
# then rename them. Chaining on .loc avoids mutating a slice in place
# (rename(inplace=True) on a derived frame) and keeps the cell idempotent.
sample_cell_annots = (
    cell_annots
    .loc[cell_annots['Batch'] == sample_name, ['Cell.name', 'Annotation']]
    .rename(columns = {'Cell.name': 'cell_barcode', 'Annotation': 'celltype'})
)

# Fail early rather than writing and registering an empty annotation file
if sample_cell_annots.empty:
    raise ValueError("no rows in cell_annots with Batch == '" + sample_name + "'")

display(sample_cell_annots)

# File name is built as <species prefix>_<conditions>_<sample>_cellannot.tsv
output_filename = '_'.join([prefixify(species_BioFileDocket.species), species_BioFileDocket.conditions, sample_name, 'cellannot.tsv'])

# Describe the output file; the docket's gxc entry is recorded as its source
output_CellAnnotFile = CellAnnotFile(
    filename = output_filename,
    sampledict = species_SampleDict,
    sources = [species_BioFileDocket.gxc]
)

# Write a tab-separated table without the index column, then register it on the docket
sample_cell_annots.to_csv(output_CellAnnotFile.path, sep = '\t', index = False)
species_BioFileDocket.add_keyfile(output_CellAnnotFile, 'cellannot')

Unnamed: 0,cell_barcode,celltype
45644,Brain_1.AAAACGAAAACGTCAAAG,Myelinating oligodendrocyte(Brain)
45645,Brain_1.AAAACGAAAGTTAAAACG,Myelinating oligodendrocyte(Brain)
45646,Brain_1.AAAACGAAAGTTACGTTG,Myelinating oligodendrocyte(Brain)
45647,Brain_1.AAAACGAAAGTTTATTGT,Myelinating oligodendrocyte(Brain)
45648,Brain_1.AAAACGAACCTAGGGTTT,Myelinating oligodendrocyte(Brain)
...,...,...
48924,Brain_1.TTTAGGGATCTTAAGTAC,Macrophage_Klf2 high(Brain)
48925,Brain_1.TTTAGGTAAGGGGGGCGA,Myelinating oligodendrocyte(Brain)
48926,Brain_1.TTTAGGTCGTAAGTAATG,Myelinating oligodendrocyte(Brain)
48927,Brain_1.TTTAGGTGCAATCCGACG,Myelinating oligodendrocyte(Brain)


In [5]:
# Persist the updated docket. These three calls are order-dependent.
# NOTE(review): method behavior is defined in biofile_handling, not in this notebook;
# the descriptions below are inferred from the names and the cell's log output -- confirm there.

# Presumably uploads the docket's local files to S3; the log output of this cell
# shows files already present in the bucket being skipped.
species_BioFileDocket.local_to_s3()
# Presumably re-serializes the docket so the newly added 'cellannot' keyfile is recorded.
species_BioFileDocket.pickle()
# Pushes with overwrite = True, presumably replacing the docket copy already stored in S3.
species_BioFileDocket.push_to_s3(overwrite = True)

GCF_000001635.23_GRCm38.p3_genomic.gff already exists in S3 bucket, skipping upload. set overwrite = True to overwrite the existing file.
GCF_000001635.23_GRCm38.p3_genomic.fna already exists in S3 bucket, skipping upload. set overwrite = True to overwrite the existing file.
GSM2906405_Brain1_dge_coerced.txt already exists in S3 bucket, skipping upload. set overwrite = True to overwrite the existing file.
Mmus_adultbrain_uniprot-idmm.tsv already exists in S3 bucket, skipping upload. set overwrite = True to overwrite the existing file.
Mmus_adultbrain_gtf-idmm.tsv already exists in S3 bucket, skipping upload. set overwrite = True to overwrite the existing file.
GCF_000001635.23_GRCm38.p3_genomic_cDNA.fna already exists in S3 bucket, skipping upload. set overwrite = True to overwrite the existing file.
GCF_000001635.23_GRCm38.p3_genomic_cDNA.fna.transdecoder.bed already exists in S3 bucket, skipping upload. set overwrite = True to overwrite the existing file.
GCF_000001635.23_GRCm38.p3_g