In [1]:
import os
import pandas as pd
import subprocess
import re

In [2]:
# Smoke test: run grep for the string "CARDLG" on the contamination table CSV
cmd = [
    "grep",
    "CARDLG",
    "2020-08-21.presyn-top100-clones-contamination-table-on-pt.csv",
]
# Only stdout is captured, and it stays raw bytes, so a bytes repr is printed
rc = subprocess.run(args=cmd, stdout=subprocess.PIPE)
print(rc.stdout)

b'"42","CARDLGCSGGSCYADYYYYGMDVWGQGTTVT x","RA-min-3 LN99-run81-r2_S6__RA-min-1 LN97-run81-r2_S4__RA-plus-4 LN96-run81-r2_S244"\r\n"70","CARDLGYRAFDYWGQGTLVT x","RA-min-5 LN85-run181-r2_S228__RA-min-5 LN85-run162-r2_S132__RA-min-5 LN85-run160-r4_S107__RA-min-5 LN85-run141-r2_S34__RA-plus-6 LN70-run160-r4_S96"\r\n'


In [3]:
# Get the directory names of the Roche runs and put them in 'runlist'.
# Only entries whose entire name is "run" followed by digits are kept; each one
# is stored as the root path + name + a trailing slash.
mydir = "/mnt/immunogenomics/RUNS/"
# Raw string: "\d" inside a normal string literal is an invalid escape sequence
p = re.compile(r'run\d+')
runlist = list()
for rundir in os.listdir(mydir):
    # fullmatch() only succeeds when the pattern covers the whole entry name
    if p.fullmatch(rundir) is not None:
        runlist.append(mydir + rundir + "/")

In [4]:
# Put the run directories in plain string order (in place) and display them
runlist[:] = sorted(runlist)
runlist

['/mnt/immunogenomics/RUNS/run102/',
 '/mnt/immunogenomics/RUNS/run112/',
 '/mnt/immunogenomics/RUNS/run114/',
 '/mnt/immunogenomics/RUNS/run122/',
 '/mnt/immunogenomics/RUNS/run141/',
 '/mnt/immunogenomics/RUNS/run142/',
 '/mnt/immunogenomics/RUNS/run157/',
 '/mnt/immunogenomics/RUNS/run160/',
 '/mnt/immunogenomics/RUNS/run162/',
 '/mnt/immunogenomics/RUNS/run169/',
 '/mnt/immunogenomics/RUNS/run181/',
 '/mnt/immunogenomics/RUNS/run183/',
 '/mnt/immunogenomics/RUNS/run204/',
 '/mnt/immunogenomics/RUNS/run211/',
 '/mnt/immunogenomics/RUNS/run214/',
 '/mnt/immunogenomics/RUNS/run217/',
 '/mnt/immunogenomics/RUNS/run220/',
 '/mnt/immunogenomics/RUNS/run222/',
 '/mnt/immunogenomics/RUNS/run227/',
 '/mnt/immunogenomics/RUNS/run234/',
 '/mnt/immunogenomics/RUNS/run236/',
 '/mnt/immunogenomics/RUNS/run239/',
 '/mnt/immunogenomics/RUNS/run24/',
 '/mnt/immunogenomics/RUNS/run241/',
 '/mnt/immunogenomics/RUNS/run242/',
 '/mnt/immunogenomics/RUNS/run243/',
 '/mnt/immunogenomics/RUNS/run245/',
 '

In [5]:
# Split the runs by whether their listing contains an entry called "data".
#   data_dict:     "<run dir>data/" -> listing of that data directory
#   to_check_dict: run dir -> its listing, for runs that need a manual check
data_dict = {}
to_check_dict = {}
for mydir in runlist:
    entries = os.listdir(mydir)
    if "data" not in entries:
        # No data entry: report it and remember what the run directory holds
        print("TO CHECK:", entries)
        to_check_dict[mydir] = entries
        continue
    data_dir = mydir + "data/"
    data_dict[data_dir] = os.listdir(data_dir)

TO CHECK: ['README.txt', 'results-tbcell']
TO CHECK: ['results-tbcell']


In [6]:
# Need to download data from these directories manually
# (these are the run directories whose listing had no "data" entry)
to_check_dict

{'/mnt/immunogenomics/RUNS/run214/': ['README.txt', 'results-tbcell'],
 '/mnt/immunogenomics/RUNS/run258/': ['results-tbcell']}

## List the fastq and sff files per run

In [7]:
# Per data directory, pick out the sequence files (names ending in .fastq.gz or .sff).
#   download_files: data dir -> the matching file names
#   check_files:    data dir -> the full listing, when nothing matched
download_files = {}
check_files = {}
for data_dir, myfiles in data_dict.items():
    filelist = [
        myfile for myfile in myfiles
        if myfile.endswith((".fastq.gz", ".sff"))
    ]
    if filelist:
        download_files[data_dir] = filelist
    else:
        check_files[data_dir] = myfiles

In [8]:
# Show the .fastq.gz / .sff files that were found, per data directory
download_files

{'/mnt/immunogenomics/RUNS/run102/data/': ['HA06QXN01.sff', 'HA06QXN02.sff'],
 '/mnt/immunogenomics/RUNS/run112/data/': ['HG10TD301.sff',
  'HG10TD302.sff',
  'run112-HG10TD301_S1_L001.assembled.fastq.gz',
  'run112-HG10TD302_S1_L001.assembled.fastq.gz'],
 '/mnt/immunogenomics/RUNS/run114/data/': ['HHEXEGX01.sff',
  'HHEXEGX02.sff',
  'HHEXEGX03.sff',
  'HHEXEGX04.sff'],
 '/mnt/immunogenomics/RUNS/run122/data/': ['HI7M9GL01.sff',
  'HI7M9GL02.sff',
  'run122-r1-HI7M9GL01_S1_L001.assembled.fastq.gz'],
 '/mnt/immunogenomics/RUNS/run141/data/': ['HPV8MXM01.sff',
  'HPV8MXM02.sff',
  'run141-r1_L001.assembled.fastq.gz',
  'run141-r2_L001.assembled.fastq.gz'],
 '/mnt/immunogenomics/RUNS/run142/data/': ['HPYCFHP01.sff',
  'HPYCFHP02.sff',
  'run142-r1_L001.assembled.fastq.gz'],
 '/mnt/immunogenomics/RUNS/run157/data/': ['HTS5G9E01.sff',
  'HTS5G9E02.sff',
  'run157-r1-1966994-T0-full_S20_L001.assembled.fastq.gz',
  'run157-r1-5734848-T0-full_S18_L001.assembled.fastq.gz',
  'run157-r1-5734848

In [9]:
# Do all data directories contain fastq and/or sff files? If so, this dict should be empty.
# Any entry shown here is a data directory without a single .fastq.gz or .sff file.
check_files

{}

## Notes
I checked the list above manually to see which sff files were already converted to fastq and which were not. I'm going to make two separate lists for this below.

In [10]:
# Runs whose sff files have not been converted to fastq yet (checked manually)
convert_sff = [
    'run102', 'run114', 'run122', 'run162', 'run183', 'run211', 'run239',
    'run24', 'run241', 'run242', 'run243', 'run246', 'run252', 'run254',
    'run259', 'run27', 'run34', 'run37', 'run41', 'run45', 'run49',
    'run51', 'run55', 'run58', 'run61', 'run70', 'run71',
]

# Runs for which converted fastq files already exist (checked manually)
already_converted = [
    'run112', 'run141', 'run157', 'run160', 'run169', 'run181', 'run204',
    'run217', 'run220', 'run222', 'run227', 'run234', 'run236', 'run245',
    'run248', 'run251', 'run66', 'run73', 'run74', 'run81', 'run84',
    'run88', 'run93', 'run98',
]

# NOTE: the correct fastq files for run142 are not in its data subdir but in the run directory itself
run142_fastqdir = "/mnt/immunogenomics/RUNS/run142/"

In [11]:
def f(x):
    """Return the data directory path for a run name such as "run102".

    A value that already starts with the RUNS root is returned unchanged, so
    running this cell a second time does not prepend the root (and append
    "/data/") twice.
    """
    root = "/mnt/immunogenomics/RUNS/"
    if x.startswith(root):
        return x
    return root + x + "/data/"


# Replace the run names by their data directory paths (the list names are reused)
convert_sff = [f(x) for x in convert_sff]
already_converted = [f(x) for x in already_converted]

In [12]:
# List the fastq files of run142, which sit in the run directory itself.
# This cell only displays them; it does not add them to any dictionary.
[fastq for fastq in os.listdir(run142_fastqdir) if fastq.endswith(".fastq.gz")]

['run142-r1_L001.assembled.fastq.gz', 'run142-r2_L001.assembled.fastq.gz']

## Create dictionaries for sff files and for fastq files

In [13]:
# Build the per-directory download lists.
#   download_sff:   data dir -> its .sff files, without those named "454Reads*"
#   download_fastq: data dir -> its .fastq.gz files
# download_files is indexed directly: a directory that is missing from it raises
# a KeyError naming that directory, instead of a TypeError from iterating None.
download_sff = dict()
for mydir in convert_sff:
    download_sff[mydir] = [
        x for x in download_files[mydir]
        if x.endswith(".sff") and not x.startswith("454Reads")
    ]

download_fastq = dict()
for mydir in already_converted:
    download_fastq[mydir] = [
        x for x in download_files[mydir] if x.endswith(".fastq.gz")
    ]

In [14]:
# Show the .sff files kept per data directory
download_sff

{'/mnt/immunogenomics/RUNS/run102/data/': ['HA06QXN01.sff', 'HA06QXN02.sff'],
 '/mnt/immunogenomics/RUNS/run114/data/': ['HHEXEGX01.sff',
  'HHEXEGX02.sff',
  'HHEXEGX03.sff',
  'HHEXEGX04.sff'],
 '/mnt/immunogenomics/RUNS/run122/data/': ['HI7M9GL01.sff', 'HI7M9GL02.sff'],
 '/mnt/immunogenomics/RUNS/run162/data/': ['HV4JT6A01.sff', 'HV4JT6A02.sff'],
 '/mnt/immunogenomics/RUNS/run183/data/': ['H4W1I3K01.sff', 'H4W1I3K02.sff'],
 '/mnt/immunogenomics/RUNS/run211/data/': ['IC7F61C01.sff', 'IC7F61C02.sff'],
 '/mnt/immunogenomics/RUNS/run239/data/': ['ITW40QN01.sff', 'ITW40QN02.sff'],
 '/mnt/immunogenomics/RUNS/run24/data/': ['FVUGYZ001.sff',
  'FVUGYZ002.sff',
  'FVUGYZ003.sff',
  'FVUGYZ004.sff'],
 '/mnt/immunogenomics/RUNS/run241/data/': ['IWAB1T402.sff'],
 '/mnt/immunogenomics/RUNS/run242/data/': ['IXKJMGP01.sff', 'IXKJMGP02.sff'],
 '/mnt/immunogenomics/RUNS/run243/data/': ['I0EK49M01.sff', 'I0EK49M02.sff'],
 '/mnt/immunogenomics/RUNS/run246/data/': ['I4IMECA01.sff', 'I4IMECA02.sff'],
 '

In [15]:
# Write two helper files:
#   SAMPLES-sff     - the full path of every sff file, one per line
#   rename_files.sh - one "mv" command per sff file, renaming it to
#                     <run>-r<region>-<original stem>_S<region>_L001.assembled.sff
# The with-block closes both files even when an error occurs halfway through.
with open("SAMPLES-sff", "w") as fhSamples, open("rename_files.sh", "w") as fhRename:
    for mydir, myfiles in download_sff.items():
        # Keys look like /mnt/immunogenomics/RUNS/<run>/data/, so field 4 is the run name
        run = mydir.split("/")[4]
        for myfile in myfiles:
            # Strip only the last extension; unpacking split(".") into two names
            # raised a ValueError for file names with more than one dot
            stem = myfile.rsplit(".", 1)[0]
            # The last character of the stem is used as the region number
            region = stem[-1]
            mypath = mydir + myfile
            print(run, region, mypath)
            print(mypath, file=fhSamples)
            print("mv", myfile, run + "-r" + region + "-" + stem + "_S" + region + "_L001.assembled.sff", file=fhRename)

run102 1 /mnt/immunogenomics/RUNS/run102/data/HA06QXN01.sff
run102 2 /mnt/immunogenomics/RUNS/run102/data/HA06QXN02.sff
run114 1 /mnt/immunogenomics/RUNS/run114/data/HHEXEGX01.sff
run114 2 /mnt/immunogenomics/RUNS/run114/data/HHEXEGX02.sff
run114 3 /mnt/immunogenomics/RUNS/run114/data/HHEXEGX03.sff
run114 4 /mnt/immunogenomics/RUNS/run114/data/HHEXEGX04.sff
run122 1 /mnt/immunogenomics/RUNS/run122/data/HI7M9GL01.sff
run122 2 /mnt/immunogenomics/RUNS/run122/data/HI7M9GL02.sff
run162 1 /mnt/immunogenomics/RUNS/run162/data/HV4JT6A01.sff
run162 2 /mnt/immunogenomics/RUNS/run162/data/HV4JT6A02.sff
run183 1 /mnt/immunogenomics/RUNS/run183/data/H4W1I3K01.sff
run183 2 /mnt/immunogenomics/RUNS/run183/data/H4W1I3K02.sff
run211 1 /mnt/immunogenomics/RUNS/run211/data/IC7F61C01.sff
run211 2 /mnt/immunogenomics/RUNS/run211/data/IC7F61C02.sff
run239 1 /mnt/immunogenomics/RUNS/run239/data/ITW40QN01.sff
run239 2 /mnt/immunogenomics/RUNS/run239/data/ITW40QN02.sff
run24 1 /mnt/immunogenomics/RUNS/run24/d

## Notes
The SAMPLES-sff file can be renamed to SAMPLES. Then run ../copy-from-beehub.sh

The downloaded files can be renamed with the script that was produced above: bash rename_files.sh

TO DO: generate a script that will convert the sff file to fastq and upload this to the ResearchDrive in the right directory: python2 ../SeqToFastq.py sff *.sff