In [1]:
# import standard python packages
import pandas as pd
import subprocess
import os
import dill
import datetime

# add the utils and env directories to the path
import sys
sys.path.append('../../utils/')
sys.path.append('../../env/')

# import functions from utils directory files
from string_functions import *
from biofile_handling import *

# import paths to software installs from env
from install_locs import *

In [2]:
################
# general info #
################

# Specify the name of the species folder in Amazon S3
species = 'Danio_rerio'

# Specify any particular identifying conditions, eg tissue type:
conditions = 'adultbrain'

################

# Prefix derived from the species name; reused below so the folder name and
# any later file names are built from the same value.
species_prefix = prefixify(species)

# Specify folder as destination for file downloads
output_folder = '../../output/' + species_prefix + '_' + conditions + '/'

# makedirs(exist_ok=True) also creates a missing '../../output/' parent and
# does not raise if the folder already exists (no separate exists() check).
os.makedirs(output_folder, exist_ok=True)

# Describe the sample, then retrieve its docket and pull the files it tracks
# into output_folder.
# NOTE(review): presumably get_from_s3() downloads the pickled docket and
# unpickle() loads it -- confirm against biofile_handling.
species_SampleDict = SampleDict(species, conditions, output_folder)
species_BioFileDocket = BioFileDocket(species_SampleDict).get_from_s3().unpickle()
species_BioFileDocket.s3_to_local()

file Drer_adultbrain_sample_BioFileDocket.pkl already exists at ../../output/Drer_adultbrain/Drer_adultbrain_sample_BioFileDocket.pkl
file GCF_000002035.5_GRCz10_genomic.gff already exists at ../../output/Drer_adultbrain/GCF_000002035.5_GRCz10_genomic.gff
file GCF_000002035.5_GRCz10_genomic.fna already exists at ../../output/Drer_adultbrain/GCF_000002035.5_GRCz10_genomic.fna
file GSM3768152_Brain_8_dge.txt already exists at ../../output/Drer_adultbrain/GSM3768152_Brain_8_dge.txt
file Drer_adultbrain_uniprot-idmm.tsv already exists at ../../output/Drer_adultbrain/Drer_adultbrain_uniprot-idmm.tsv
file Drer_adultbrain_gtf-idmm.tsv already exists at ../../output/Drer_adultbrain/Drer_adultbrain_gtf-idmm.tsv
file GCF_000002035.5_GRCz10_genomic_cDNA.fna already exists at ../../output/Drer_adultbrain/GCF_000002035.5_GRCz10_genomic_cDNA.fna
file GCF_000002035.5_GRCz10_genomic_cDNA.fna.transdecoder.bed already exists at ../../output/Drer_adultbrain/GCF_000002035.5_GRCz10_genomic_cDNA.fna.transde

In [10]:
# Download URL of the cell annotation workbook and its local destination
file_url = 'https://ndownloader.figstatic.com/files/30949762'
output_filename = 'Drer_cellannot.xlsx'
output_filepath = species_SampleDict.directory + output_filename

# check=True makes a failed download raise CalledProcessError here; wget's
# output is discarded, so without it a failure would only surface later as an
# unrelated-looking read_excel error on an empty or partial file.
subprocess.run(['wget', file_url, '-O', output_filepath],
               stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
               check=True)

cell_annots = pd.read_excel(output_filepath, sheet_name= 'cell barcode of each cell')

# Split the 'cell barcode' column on '.' once and reuse both pieces:
# the first piece becomes 'Batch', the second becomes 'cell_barcode'.
barcode_parts = cell_annots['cell barcode'].str.split('.', expand = True)
cell_annots['Batch'] = barcode_parts[0]
cell_annots['cell_barcode'] = barcode_parts[1]
display(cell_annots)

Unnamed: 0,cell barcode,Cluster,Annotation,Batch,cell_barcode
0,Brain_1.TTCCGCAAGTACTGTGCG,Brain_cluster3,Brain.Microglia,Brain_1,TTCCGCAAGTACTGTGCG
1,Brain_1.TTCATATGTGCGTTTAGG,Brain_cluster3,Brain.Microglia,Brain_1,TTCATATGTGCGTTTAGG
2,Brain_1.TGCGGAACTTATGCTCAA,Brain_cluster3,Brain.Microglia,Brain_1,TGCGGAACTTATGCTCAA
3,Brain_1.TGCGGAGTTGCCGCGTCC,Brain_cluster3,Brain.Microglia,Brain_1,TGCGGAGTTGCCGCGTCC
4,Brain_1.GCAGGAGAATTATGATCA,Brain_cluster3,Brain.Microglia,Brain_1,GCAGGAGAATTATGATCA
...,...,...,...,...,...
201089,Z72h2.CATCCCCAACAACGTATT,Z72h_cluster1,Z72h.Hatching gland,Z72h2,CATCCCCAACAACGTATT
201090,Z72h2.CCAGACCCGACGAGCGAG,Z72h_cluster1,Z72h.Hatching gland,Z72h2,CCAGACCCGACGAGCGAG
201091,Z72h2.CCGCTAACACCCAAAGTT,Z72h_cluster1,Z72h.Hatching gland,Z72h2,CCGCTAACACCCAAAGTT
201092,Z72h2.CGGCAGGGACATCTGTGT,Z72h_cluster1,Z72h.Hatching gland,Z72h2,CGGCAGGGACATCTGTGT


In [17]:
# Batch label (the part of 'cell barcode' before the '.') to extract
sample_name = 'Brain_8'

# Build the barcode -> cell type table for this batch in a single chain.
# Selecting rows and columns with one .loc call and avoiding inplace=True
# means nothing is mutated on a slice of cell_annots (no SettingWithCopyWarning)
# and re-running the cell always starts from the untouched cell_annots.
sample_cell_annots = (
    cell_annots
    .loc[cell_annots['Batch'] == sample_name, ['cell_barcode', 'Annotation']]
    .rename(columns = {'Annotation': 'celltype'})
    .drop_duplicates(subset = 'cell_barcode')
)

# Fail early instead of writing and registering an empty annotation file
# when sample_name does not match any batch in the workbook.
if sample_cell_annots.empty:
    raise ValueError(f"no cell annotations found for batch {sample_name!r}")

display(sample_cell_annots)

output_filename = '_'.join([prefixify(species_BioFileDocket.species), species_BioFileDocket.conditions, sample_name, 'cellannot.tsv'])

# Register the annotation table as a file derived from the docket's gxc entry.
output_CellAnnotFile = CellAnnotFile(
    filename = output_filename,
    sampledict = species_SampleDict,
    sources = [species_BioFileDocket.gxc]
)

# Tab-separated, without the DataFrame index column.
sample_cell_annots.to_csv(output_CellAnnotFile.path, sep = '\t', index = False)
species_BioFileDocket.add_keyfile(output_CellAnnotFile, 'cellannot')

Unnamed: 0,cell_barcode,celltype
3400,ATCTCTGCTCAAAAAACG,Brain.Microglia
3401,TAAGGGACGTTGATTCCA,Brain.Microglia
3402,GCAGGAACAATACGGCAG,Brain.Microglia
3403,GCGAATTGTGCGGCGTCC,Brain.Microglia
3404,AACCTACTGTGTGAGATC,Brain.Microglia
...,...,...
20786,TGATCAGGGCGAATACAG,Brain.Innate_immune_cell
20787,TTGGACGCTCAAGTCCCG,Brain.Innate_immune_cell
20788,AAAACGTCTACCCTCGCA,Brain.Innate_immune_cell
20789,TAAGGGAGGACTGCGTCC,Brain.Innate_immune_cell


In [12]:
# Sync the docket's files and then the docket itself back to S3.
# Order matters: files first, then serialize the docket, then upload it.

# Uploads the docket's local files; per the logged output, files already in
# the bucket are skipped unless overwrite is requested.
species_BioFileDocket.local_to_s3()
# NOTE(review): presumably writes the docket's .pkl file locally -- confirm
# against biofile_handling.
species_BioFileDocket.pickle()
# NOTE(review): presumably uploads the pickled docket; overwrite = True
# replaces the copy already in the bucket -- confirm.
species_BioFileDocket.push_to_s3(overwrite = True)

GCF_000002035.5_GRCz10_genomic.gff already exists in S3 bucket, skipping upload. set overwrite = True to overwrite the existing file.
GCF_000002035.5_GRCz10_genomic.fna already exists in S3 bucket, skipping upload. set overwrite = True to overwrite the existing file.
GSM3768152_Brain_8_dge.txt already exists in S3 bucket, skipping upload. set overwrite = True to overwrite the existing file.
Drer_adultbrain_uniprot-idmm.tsv already exists in S3 bucket, skipping upload. set overwrite = True to overwrite the existing file.
Drer_adultbrain_gtf-idmm.tsv already exists in S3 bucket, skipping upload. set overwrite = True to overwrite the existing file.
GCF_000002035.5_GRCz10_genomic_cDNA.fna already exists in S3 bucket, skipping upload. set overwrite = True to overwrite the existing file.
GCF_000002035.5_GRCz10_genomic_cDNA.fna.transdecoder.bed already exists in S3 bucket, skipping upload. set overwrite = True to overwrite the existing file.
GCF_000002035.5_GRCz10_genomic_cDNA.fna.transdecode