## FOUNDIN notebook for handling movement of GCS raw files

In [1]:
# notebook-wide settings shared by the cells below
cohort = 'foundin'

# GCP project id and user name
gcp_project_id = 'foundin-pd'
gcp_user = 'gibbsr'

# local directory that holds the key tables, fastq lists, generated scripts and logs
local_work_dir = '/labseq/projects/ppmi/foundin'

#### import libraries and modules

In [2]:
# import modin.pandas as pd
import pandas as pd

### define helper functions for use in notebooks

In [3]:
# function to generate gcloud move commands between buckets with rename
def move_fastqs_between_buckets(in_file, script_name, bucket_name, out_path,
                                ids_dict, id_index=0, snum_index=1, lane_index=2,
                                read_index=3, file_type_index=4):
    """Write a shell script of `gsutil mv` commands that rename staged fastqs.

    Reads one fastq path per line from `in_file`, removes the
    `{bucket_name}/` prefix, and splits the remaining name on '_'. For every
    file whose ID part is a key of `ids_dict`, one line of the form
    `gsutil -mq mv <source> <out_path>/<new ID>_<S#>_<L#>_<R#>_<ext>` is
    written to `{local_work_dir}/{script_name}.sh` (`local_work_dir` is a
    notebook global). The script is made executable and the `nohup` command
    to launch it is printed; this function itself does not move any data.

    Args:
        in_file: path of a text file listing the source fastq paths.
        script_name: base name (no extension) of the script and log files.
        bucket_name: bucket prefix to strip from each listed path.
        out_path: destination prefix for the renamed fastqs.
        ids_dict: maps the ID part of a fastq name to its replacement ID.
        id_index: position of the ID among the '_' separated name parts.
        snum_index: position of the S# part.
        lane_index: position of the L# part.
        read_index: position of the R# part.
        file_type_index: position of the trailing fastq.ext part.

    Returns:
        None.
    """
    # function-scope imports keep this cell self-contained
    import os
    import shlex

    temp_script_file = f'{local_work_dir}/{script_name}.sh'
    bucket_prefix = f'{bucket_name}/'
    kept_part_indices = (snum_index, lane_index, read_index, file_type_index)

    # read from the in_file argument, not from a notebook global
    with open(in_file, 'r') as fastqs_file, \
            open(temp_script_file, 'w') as script_file:
        for line in fastqs_file:
            src_fastq = line.strip()
            if not src_fastq:
                continue

            # expect fastq parts ID, S#, L#, R#, fastq.ext
            fastq_parts = src_fastq.replace(bucket_prefix, '').split('_')

            # names too short to hold an ID cannot match; treat as unmatched
            if not -len(fastq_parts) <= id_index < len(fastq_parts):
                continue
            if fastq_parts[id_index] not in ids_dict:
                continue
            newID = ids_dict[fastq_parts[id_index]]

            try:
                kept_parts = [fastq_parts[index] for index in kept_part_indices]
            except IndexError:
                # a matched ID with a malformed name: report it rather than
                # abort and leave a half-written script behind
                print(f'#skipping {src_fastq}: too few "_" separated name parts')
                continue

            out_fastq = f'{out_path}/{newID}_{"_".join(kept_parts)}'
            # quote both paths so unusual characters cannot break the command;
            # ordinary gs:// paths are left unchanged by shlex.quote
            gcloud_cmd = (f'gsutil -mq mv {shlex.quote(src_fastq)} '
                          f'{shlex.quote(out_fastq)}')
            script_file.write(f'{gcloud_cmd}\n')

    print('#run these commands at terminal:\n')
    # add execute permission without relying on the IPython `!` shell escape
    os.chmod(temp_script_file, os.stat(temp_script_file).st_mode | 0o111)
    print(f'nohup {temp_script_file} > {local_work_dir}/{script_name}.log &')

## Jan 13 2020

#### move Hi-C and scRNA fastqs that USUHS pushed to our pd-genome staging bucket to one in foundin

#### set notebook global variables

## Feb 4, 2020

#### the previous move of scRNA data had to be redone; USUHS previously did the demultiplexing with Illumina's bcl2fastq tool, but it needed to be done with Cell Ranger mkfastq

so move and rename again

## March 3, 2020

move and rename the bulk and single-cell ATACseq

#### make a USUHS staging bucket in the FOUNDIN GCP project

In [4]:
# staging bucket for USUHS deliveries inside the FOUNDIN GCP project
gcs_stagging_bucket = 'gs://usuhs-staging-687dcdd56cc4'

# build the bucket-creation command; it is only printed here
gcloud_cmd = f'gsutil mb -p {gcp_project_id} -c standard {gcs_stagging_bucket}'
print(gcloud_cmd)

# uncomment to actually create the bucket
# !{gcloud_cmd}

gsutil mb -p foundin-pd -c standard gs://usuhs-staging-687dcdd56cc4


#### move the fastqs, i.e. for safety copy and confirm, then delete

In [5]:
# copy first (rather than move) so the originals remain until the copy is confirmed
# paths that USUHS pushed the data to; earlier deliveries are kept for reference
# ori_staging_bucket = 'gs://nihnialng-staging-f745d15a/pC2.NIA.DNA.RNA.N79.fastq.0*'
# ori_staging_bucket = 'gs://nihnialng-staging-f745d15a/pC2.10xGenomics.scRNA.N59.0*'
# ori_staging_bucket = 'gs://nihnialng-staging-f745d15a/pC2.NIA.N268.0*'
ori_staging_bucket = 'gs://nihnialng-staging-f745d15a/pC2.NIA.atacSeq.N33.0*'

# build the copy command; it is only printed here
gcloud_cmd = f'gsutil -mq cp {ori_staging_bucket}/*.fastq.gz {gcs_stagging_bucket}/'
print(gcloud_cmd)

# uncomment to actually run the copy
# !{gcloud_cmd}

gsutil -mq cp gs://nihnialng-staging-f745d15a/pC2.NIA.atacSeq.N33.0*/*.fastq.gz gs://usuhs-staging-687dcdd56cc4/


In [6]:
# compare the last line of the long listing for the source and the staging copy
for bucket_glob in (ori_staging_bucket, gcs_stagging_bucket):
    gcloud_cmd = f'gsutil ls -lh {bucket_glob}/*.fastq.gz | tail -n 1'
    print(gcloud_cmd)
    # uncomment to actually run the listing
    # !{gcloud_cmd}

gsutil ls -lh gs://nihnialng-staging-f745d15a/pC2.NIA.atacSeq.N33.0*/*.fastq.gz | tail -n 1
gsutil ls -lh gs://usuhs-staging-687dcdd56cc4/*.fastq.gz | tail -n 1


#### rename and move from staging to final dest

use naming info from Xylena and Cornelis

In [7]:
# load the key table that maps USUHS sample IDs to the FOUNDIN root names
# name_file='FOUNDIN-name-pC2.DNA.RNA.keyTable.csv'
name_file='FOUNDIN-name-pC2.ATAC.keyTable.csv'

name_mapping_df = pd.read_csv(f'{local_work_dir}/{name_file}')

# report (rows, columns) of the table just read
print(name_mapping_df.shape)

(400, 6)


In [8]:
# how many rows share each Root_Name
name_mapping_df.loc[:, 'Root_Name'].value_counts()

SCAT_PPMI58182_0923_da65_v1    4
SCAT_PPMI51518_8710_da65_v1    4
SCAT_PPMI51714_7806_da65_v1    4
SCAT_PPMI55124_1128_da65_v1    4
SCAT_PPMI50086_8366_da65_v1    4
                              ..
ATAC_PPMI3664_2833_da0_v1      1
ATAC_PPMI4109_6049_da65_v1     1
ATAC_PPMI51330_4636_da25_v1    1
ATAC_PPMI3469_6244_da25_v1     1
ATAC_PPMI3453_7504_da65_v1     1
Name: Root_Name, Length: 301, dtype: int64

In [9]:
# keep only the first row for each Root_Name (modifies name_mapping_df in place)
name_mapping_df.drop_duplicates(subset=['Root_Name'], keep='first', inplace=True)

# report (rows, columns) after de-duplication
print(name_mapping_df.shape)

(301, 6)


In [10]:
# USUHS_ID is the Sample_ID text before its first '_'; the rest is discarded
temp_df = name_mapping_df['Sample_ID'].str.split(pat='_', n=1, expand=True)
name_mapping_df['USUHS_ID'] = temp_df[0]

In [11]:
# keep only the columns used by the rename steps
keep_columns = ['USUHS_ID', 'runFolder', 'Assay', 'Root_Name', 'Notes']
name_mapping_df = name_mapping_df[keep_columns]

# report (rows, columns) after the subset
print(name_mapping_df.shape)

(301, 5)


In [12]:
# number of mapping rows per assay
name_mapping_df.loc[:, 'Assay'].value_counts()

ATAC    268
SCAT     33
Name: Assay, dtype: int64

In [13]:
# get the list of fastqs to rename
# fastqs_list_file = f'{local_work_dir}/temp.scrna_hic.fastq.list'
fastqs_list_file = f'{local_work_dir}/temp.atac.fastq.list'

term_cmd = f'gsutil ls {gcs_stagging_bucket}/*.fastq.gz > {fastqs_list_file}'
print(term_cmd)
!{term_cmd}

gsutil ls gs://usuhs-staging-687dcdd56cc4/*.fastq.gz > /labseq/projects/ppmi/foundin/temp.atac.fastq.list


#### move the scATACseq fastqs

In [14]:
#### generate the move script for the scATACseq fastqs
script_name = 'move_scatac_fastqs'
out_bucket_path = 'gs://foundin-raw-assay/ASSAYS/SCAT'

# NOTE(review): the hics_ prefix is a misnomer here; these hold the SCAT rows
is_scat = name_mapping_df['Assay'] == 'SCAT'
hics_df = name_mapping_df.loc[is_scat]
hics_dict = {usuhs_id: root_name
             for usuhs_id, root_name in zip(hics_df['USUHS_ID'], hics_df['Root_Name'])}

# the name-part indexes are each one higher than the function defaults;
# presumably these names carry an extra '_' field after the ID -- verify
move_fastqs_between_buckets(
    in_file=fastqs_list_file,
    script_name=script_name,
    bucket_name=gcs_stagging_bucket,
    out_path=out_bucket_path,
    ids_dict=hics_dict,
    snum_index=2,
    lane_index=3,
    read_index=4,
    file_type_index=5,
)

#run these commands at terminal:

nohup /labseq/projects/ppmi/foundin/move_scatac_fastqs.sh > /labseq/projects/ppmi/foundin/move_scatac_fastqs.log &


#### move the bulk ATACseq fastqs

In [37]:
#### generate the move script for the bulk ATACseq fastqs
script_name = 'move_atac_fastqs'
out_bucket_path = 'gs://foundin-raw-assay/ASSAYS/ATAC'

# NOTE(review): the hics_ prefix is a misnomer here; these hold the ATAC rows
is_atac = name_mapping_df['Assay'] == 'ATAC'
hics_df = name_mapping_df.loc[is_atac]
hics_dict = {usuhs_id: root_name
             for usuhs_id, root_name in zip(hics_df['USUHS_ID'], hics_df['Root_Name'])}

# default name-part indexes: ID, S#, L#, R#, fastq.ext
move_fastqs_between_buckets(
    in_file=fastqs_list_file,
    script_name=script_name,
    bucket_name=gcs_stagging_bucket,
    out_path=out_bucket_path,
    ids_dict=hics_dict,
)

#run these commands at terminal:

nohup /labseq/projects/ppmi/foundin/move_atac_fastqs.sh > /labseq/projects/ppmi/foundin/move_atac_fastqs.log &


#### move the HiC fastqs

In [170]:
# generate the move script for the HiC fastqs
script_name = 'move_hic_fastqs'
out_bucket_path = 'gs://foundin-raw-assay/ASSAYS/HICS'

# rows of the name mapping whose Assay is HiC, as USUHS_ID -> Root_Name
is_hic = name_mapping_df['Assay'] == 'HiC'
hics_df = name_mapping_df.loc[is_hic]
hics_dict = {usuhs_id: root_name
             for usuhs_id, root_name in zip(hics_df['USUHS_ID'], hics_df['Root_Name'])}

# default name-part indexes: ID, S#, L#, R#, fastq.ext
move_fastqs_between_buckets(
    in_file=fastqs_list_file,
    script_name=script_name,
    bucket_name=gcs_stagging_bucket,
    out_path=out_bucket_path,
    ids_dict=hics_dict,
)

#run these commands at terminal:

chmod +x /labseq/projects/ppmi/foundin/move_hic_fastqs.sh
nohup /labseq/projects/ppmi/foundin/move_hic_fastqs.sh > /labseq/projects/ppmi/foundin/move_hic_fastqs.log &


#### move the scRNA fastqs

In [20]:
# generate the move script for the scRNA fastqs
script_name = 'move_scrna_fastqs'
out_bucket_path = 'gs://foundin-raw-assay/ASSAYS/SCRN/Batch_3_4_5'

# rows of the name mapping whose Assay is scRNAseq, as USUHS_ID -> Root_Name
is_scrna = name_mapping_df['Assay'] == 'scRNAseq'
scrn_df = name_mapping_df.loc[is_scrna]
scrn_dict = {usuhs_id: root_name
             for usuhs_id, root_name in zip(scrn_df['USUHS_ID'], scrn_df['Root_Name'])}

# default name-part indexes: ID, S#, L#, R#, fastq.ext
move_fastqs_between_buckets(
    in_file=fastqs_list_file,
    script_name=script_name,
    bucket_name=gcs_stagging_bucket,
    out_path=out_bucket_path,
    ids_dict=scrn_dict,
)

#run these commands at terminal:

chmod +x /labseq/projects/ppmi/foundin/move_scrna_fastqs.sh
nohup /labseq/projects/ppmi/foundin/move_scrna_fastqs.sh > /labseq/projects/ppmi/foundin/move_scrna_fastqs.log &


In [46]:
# preview the first rows of the scRNAseq subset of the name mapping
scrn_df.head(n=5)

Unnamed: 0,USUHS_ID,runFolder,Assay,Root_Name,Notes
20,,191219_N06_0148_BH2JJYDSXY,scRNAseq,SCRN_PPMI3220_6139_da65_v1,
24,,191219_N06_0148_BH2JJYDSXY,scRNAseq,SCRN_PPMI3411_0083_da65_v1,
28,,191219_N06_0148_BH2JJYDSXY,scRNAseq,SCRN_PPMI3419_0298_da65_v1,
32,,191219_N06_0148_BH2JJYDSXY,scRNAseq,SCRN_PPMI3422_1260_da65_v1,
36,,191219_N06_0148_BH2JJYDSXY,scRNAseq,SCRN_PPMI3448_2397_da65_v1,


In [44]:
# look up the split Sample_ID row(s) for one USUHS ID
temp_df[temp_df[0] == 'C2-49045']

Unnamed: 0,0,1
120,C2-49045,1
