In [1]:
import os
import pandas as pd
import subprocess
import re

In [2]:
# Smoke test for calling grep through subprocess: search the contamination
# table for "CARDLG" and print the raw bytes captured from grep's stdout.
cmd = [
    "grep",
    "CARDLG",
    "2020-08-21.presyn-top100-clones-contamination-table-on-pt.csv",
]
rc = subprocess.run(cmd, stdout=subprocess.PIPE)
print(rc.stdout)

b'"42","CARDLGCSGGSCYADYYYYGMDVWGQGTTVT x","RA-min-3 LN99-run81-r2_S6__RA-min-1 LN97-run81-r2_S4__RA-plus-4 LN96-run81-r2_S244"\r\n"70","CARDLGYRAFDYWGQGTLVT x","RA-min-5 LN85-run181-r2_S228__RA-min-5 LN85-run162-r2_S132__RA-min-5 LN85-run160-r4_S107__RA-min-5 LN85-run141-r2_S34__RA-plus-6 LN70-run160-r4_S96"\r\n'


In [3]:
# Get the directory names of the Roche runs and put it in 'runlist'.
# Only entries whose complete name is "run" followed by digits are kept; each
# stored path is the root directory + entry name + a trailing "/".
mydir = "/mnt/immunogenomics/RUNS/"
# Raw string: in a plain literal '\d' is an invalid escape sequence, which
# newer Python versions warn about.
p = re.compile(r'run\d+')
runlist = list()
for rundir in os.listdir(mydir):
    # fullmatch() succeeds only when the whole name matches, which is what the
    # former match() + "m.group() == rundir" comparison checked by hand.
    if p.fullmatch(rundir) is not None:
        runlist.append(mydir + rundir + "/")

In [4]:
# Sort the run paths in place (plain string ordering) and display them.
runlist[:] = sorted(runlist)
runlist

['/mnt/immunogenomics/RUNS/run102/',
 '/mnt/immunogenomics/RUNS/run112/',
 '/mnt/immunogenomics/RUNS/run114/',
 '/mnt/immunogenomics/RUNS/run122/',
 '/mnt/immunogenomics/RUNS/run141/',
 '/mnt/immunogenomics/RUNS/run142/',
 '/mnt/immunogenomics/RUNS/run157/',
 '/mnt/immunogenomics/RUNS/run160/',
 '/mnt/immunogenomics/RUNS/run162/',
 '/mnt/immunogenomics/RUNS/run169/',
 '/mnt/immunogenomics/RUNS/run181/',
 '/mnt/immunogenomics/RUNS/run183/',
 '/mnt/immunogenomics/RUNS/run204/',
 '/mnt/immunogenomics/RUNS/run211/',
 '/mnt/immunogenomics/RUNS/run214/',
 '/mnt/immunogenomics/RUNS/run217/',
 '/mnt/immunogenomics/RUNS/run220/',
 '/mnt/immunogenomics/RUNS/run222/',
 '/mnt/immunogenomics/RUNS/run227/',
 '/mnt/immunogenomics/RUNS/run234/',
 '/mnt/immunogenomics/RUNS/run236/',
 '/mnt/immunogenomics/RUNS/run239/',
 '/mnt/immunogenomics/RUNS/run24/',
 '/mnt/immunogenomics/RUNS/run241/',
 '/mnt/immunogenomics/RUNS/run242/',
 '/mnt/immunogenomics/RUNS/run243/',
 '/mnt/immunogenomics/RUNS/run245/',
 '

In [5]:
# Is there a "data" subdirectory? If so, store it. If not, put it in another dictionary for manual check.
#   data_dict:     data directory path -> listing of that data directory
#   to_check_dict: run directory path  -> listing of the run directory (no "data" entry found)
data_dict = dict()
to_check_dict = dict()
for mydir in runlist:
    # List the run directory once and reuse the result, so the printed listing
    # and the stored listing always agree and the directory is not read up to
    # three times.
    run_entries = os.listdir(mydir)
    if "data" in run_entries:
        data_dir = mydir + "data/"
        data_dict[data_dir] = os.listdir(data_dir)
    else:
        print("TO CHECK:", run_entries)
        to_check_dict[mydir] = run_entries

TO CHECK: ['README.txt', 'results-tbcell']
TO CHECK: ['results-tbcell']


In [6]:
# Show the run directories that have no "data" entry, with their listings.
# Need to download data from these directories manually
to_check_dict

{'/mnt/immunogenomics/RUNS/run214/': ['README.txt', 'results-tbcell'],
 '/mnt/immunogenomics/RUNS/run258/': ['results-tbcell']}

## List the fastq files per run

In [7]:
# Split the data directories into two dictionaries:
#   download_files: data directory -> the names in it ending in ".fastq.gz"
#   check_files:    data directory -> its full listing, when no such name exists
download_files = {}
check_files = {}
for data_dir, myfiles in data_dict.items():
    fastq_names = [name for name in myfiles if name.endswith(".fastq.gz")]
    if fastq_names:
        download_files[data_dir] = fastq_names
    else:
        check_files[data_dir] = myfiles

In [8]:
# Show the fastq.gz file names collected per data directory
download_files

{'/mnt/immunogenomics/RUNS/run102/data/': ['run102-r1-HA06QXN01_S1_L001.assembled.fastq.gz',
  'run102-r2-HA06QXN02_S2_L001.assembled.fastq.gz'],
 '/mnt/immunogenomics/RUNS/run112/data/': ['run112-HG10TD301_S1_L001.assembled.fastq.gz',
  'run112-HG10TD302_S1_L001.assembled.fastq.gz'],
 '/mnt/immunogenomics/RUNS/run114/data/': ['run114-r1-HHEXEGX01_S1_L001.assembled.fastq.gz',
  'run114-r2-HHEXEGX02_S2_L001.assembled.fastq.gz',
  'run114-r3-HHEXEGX03_S3_L001.assembled.fastq.gz',
  'run114-r4-HHEXEGX04_S4_L001.assembled.fastq.gz'],
 '/mnt/immunogenomics/RUNS/run122/data/': ['run122-r2-HI7M9GL02_S2_L001.assembled.fastq.gz',
  'run122-r1-HI7M9GL01_S1_L001.assembled.fastq.gz'],
 '/mnt/immunogenomics/RUNS/run141/data/': ['run141-r1_L001.assembled.fastq.gz',
  'run141-r2_L001.assembled.fastq.gz'],
 '/mnt/immunogenomics/RUNS/run142/data/': ['run142-r1_L001.assembled.fastq.gz'],
 '/mnt/immunogenomics/RUNS/run157/data/': ['run157-r1-1966994-T0-full_S20_L001.assembled.fastq.gz',
  'run157-r1-5734

In [9]:
# Show the data directories in which no ".fastq.gz" file name was found.
# Do all data directories contain fastq files? If so, this dict should be empty
check_files

{}

## Create the list of fastq files to download
Manually check the list with fastq files. Keep the raw converted fastq files. Ignore the fastq files belonging to one sample.

In [10]:
# Register the fastq.gz files found directly in the run142 directory under
# that directory's own path, then display the stored list.
run142_fastqdir = "/mnt/immunogenomics/RUNS/run142/"
download_files[run142_fastqdir] = list(
    filter(lambda name: name.endswith(".fastq.gz"), os.listdir(run142_fastqdir))
)
download_files[run142_fastqdir]

['run142-r1_L001.assembled.fastq.gz', 'run142-r2_L001.assembled.fastq.gz']

In [11]:
# Remove the "data" directory of run142 from the dictionary, printing the entry
# before and after so the removal is visible in the cell output.
run142_datadir = "/mnt/immunogenomics/RUNS/run142/data/"
print("before:", download_files.get(run142_datadir, "not present"))
# pop() with a default instead of del: re-running this cell once the key is
# already gone no longer raises KeyError.
download_files.pop(run142_datadir, None)
print("after:", download_files.get(run142_datadir, "not present"))

before: ['run142-r1_L001.assembled.fastq.gz']
after: not present


In [12]:
# Create the list: write the full path of every fastq file to download to
# "SAMPLES-fastq", one path per line, echoing each written path to the cell output.
# File names containing any substring from exclude_list are not written; their
# full paths are collected in check_exclude instead.
exclude_list = ["full", "IgA", "IgD", "IgG", "IgM", "assembled-"]
check_exclude = list()
# The with-block closes the output file even if an error interrupts the loop.
with open("SAMPLES-fastq", "w") as fhOut:
    for mydir, myfiles in download_files.items():
        # In-place sort: the lists stored in download_files end up sorted as well.
        myfiles.sort()
        for myfile in myfiles:
            if any(excl in myfile for excl in exclude_list):
                check_exclude.append(mydir + myfile)
            else:
                print(mydir + myfile)
                print(mydir + myfile, file=fhOut)
print("Wrote SAMPLES-fastq to disk")

/mnt/immunogenomics/RUNS/run102/data/run102-r1-HA06QXN01_S1_L001.assembled.fastq.gz
/mnt/immunogenomics/RUNS/run102/data/run102-r2-HA06QXN02_S2_L001.assembled.fastq.gz
/mnt/immunogenomics/RUNS/run112/data/run112-HG10TD301_S1_L001.assembled.fastq.gz
/mnt/immunogenomics/RUNS/run112/data/run112-HG10TD302_S1_L001.assembled.fastq.gz
/mnt/immunogenomics/RUNS/run114/data/run114-r1-HHEXEGX01_S1_L001.assembled.fastq.gz
/mnt/immunogenomics/RUNS/run114/data/run114-r2-HHEXEGX02_S2_L001.assembled.fastq.gz
/mnt/immunogenomics/RUNS/run114/data/run114-r3-HHEXEGX03_S3_L001.assembled.fastq.gz
/mnt/immunogenomics/RUNS/run114/data/run114-r4-HHEXEGX04_S4_L001.assembled.fastq.gz
/mnt/immunogenomics/RUNS/run122/data/run122-r1-HI7M9GL01_S1_L001.assembled.fastq.gz
/mnt/immunogenomics/RUNS/run122/data/run122-r2-HI7M9GL02_S2_L001.assembled.fastq.gz
/mnt/immunogenomics/RUNS/run141/data/run141-r1_L001.assembled.fastq.gz
/mnt/immunogenomics/RUNS/run141/data/run141-r2_L001.assembled.fastq.gz
/mnt/immunogenomics/RUNS

In [13]:
# Check what was excluded: full paths of the files whose name contains one of
# the substrings in exclude_list
check_exclude

['/mnt/immunogenomics/RUNS/run157/data/run157-r1-1966994-T0-full_S20_L001.assembled.fastq.gz',
 '/mnt/immunogenomics/RUNS/run157/data/run157-r1-5734848-T0-full_S18_L001.assembled.fastq.gz',
 '/mnt/immunogenomics/RUNS/run157/data/run157-r1-5734848-T1-full_S23_L001.assembled.fastq.gz',
 '/mnt/immunogenomics/RUNS/run157/data/run157-r1-7713715-T0-full_S17_L001.assembled.fastq.gz',
 '/mnt/immunogenomics/RUNS/run157/data/run157-r1-7713715-T1-full_S22_L001.assembled.fastq.gz',
 '/mnt/immunogenomics/RUNS/run157/data/run157-r1-G4030-Oxford-full_S21_L001.assembled.fastq.gz',
 '/mnt/immunogenomics/RUNS/run157/data/run157-r1-G4066-Oxford-full_S19_L001.assembled.fastq.gz',
 '/mnt/immunogenomics/RUNS/run157/data/run157-r2-1966994-T0-IgA_S24_L001.assembled.fastq.gz',
 '/mnt/immunogenomics/RUNS/run157/data/run157-r2-1966994-T0-IgD_S32_L001.assembled.fastq.gz',
 '/mnt/immunogenomics/RUNS/run157/data/run157-r2-1966994-T0-IgG_S28_L001.assembled.fastq.gz',
 '/mnt/immunogenomics/RUNS/run157/data/run157-r2-