In [None]:
%%bash

# Demultiplex every raw library found in 1_raw/.
#
# For each R1 archive the library name is the part of the file name before the
# first underscore. The barcode sheet is read from 0_info/<library>.tsv and the
# output goes to 2_demultiplexed/<library>/.
#
# The R2 mate is derived from the R1 file name itself instead of rebuilding the
# name from a running _S<n>_ counter, so the pairing no longer depends on the
# directory listing order matching the sequencer's sample numbering.
shopt -s nullglob   # no R1 files -> zero iterations, not a literal pattern

for r1 in 1_raw/*_R1_*.fastq.gz; do
    r1_name=${r1##*/}
    sample=${r1_name%%_*}
    r2="1_raw/${r1_name/_R1_/_R2_}"

    if [[ ! -f "$r2" ]]; then
        printf 'warning: no R2 mate for %s, skipping\n' "$r1" >&2
        continue
    fi

    # -p keeps the cell re-runnable when the directory already exists.
    mkdir -p "2_demultiplexed/${sample}"
    python demultiplex.py \
        "0_info/${sample}.tsv" \
        "$r1" "$r2" \
        "2_demultiplexed/${sample}/"
done

In [None]:
%%bash

# Quality-filter, trim and merge each demultiplexed read pair with fastp, then
# convert the per-sample result to FASTA.
#
# Inputs : 2_demultiplexed/<library>/<sample>.R1.fastq and .R2.fastq
#          (files whose name contains "invalid" are skipped)
# Outputs: 3_qc/<library>/           filtered, unpaired and failed reads
#          4_merged_fastq/<library>/ merged reads plus the appended R1 reads
#          5_merged_fasta/<library>/ FASTA copy of the merged FASTQ
#          fastp_out/                per-sample HTML/JSON reports
#
# mkdir -p keeps the cell re-runnable, and fastp_out/ is created up front
# because the -h/-j report paths below point into it.
mkdir -p 3_qc 4_merged_fastq 5_merged_fasta fastp_out

shopt -s nullglob   # empty input directory -> zero iterations

for library_dir in 2_demultiplexed/*/; do
    library=${library_dir%/}
    library=${library##*/}
    mkdir -p "3_qc/${library}" "4_merged_fastq/${library}" "5_merged_fasta/${library}"

    for r1 in "2_demultiplexed/${library}"/*.R1.fastq; do
        r1_name=${r1##*/}
        if [[ "$r1_name" == *invalid* ]]; then
            continue
        fi
        # Sample name is everything before the first dot of the file name.
        sample=${r1_name%%.*}

        fastp -i "2_demultiplexed/${library}/${sample}.R1.fastq" -I "2_demultiplexed/${library}/${sample}.R2.fastq" \
            -o "3_qc/${library}/${sample}.R1.fastq" -O "3_qc/${library}/${sample}.R2.fastq" \
            --unpaired1 "3_qc/${library}/${sample}_unpaired.R1.fastq" \
            --unpaired2 "3_qc/${library}/${sample}_unpaired.R2.fastq" \
            --failed_out "3_qc/${library}/${sample}_failed.fastq" \
            -q 30 \
            --cut_tail \
            --trim_front1 20 \
            --trim_front2 20 \
            --max_len1 106 \
            --max_len2 106 \
            -l 90 \
            --merge \
            --overlap_len_require 90 \
            --correction \
            --merged_out "4_merged_fastq/${library}/${sample}.fastq" \
            -w 6 \
            -h "./fastp_out/${sample}_fastp.html" \
            -j "./fastp_out/${sample}_fastp.json"

        # Append the R1 reads fastp wrote to -o and --unpaired1 to the merged
        # file, so they are carried into the FASTA as well.
        cat "3_qc/${library}/${sample}.R1.fastq" >> "4_merged_fastq/${library}/${sample}.fastq"
        cat "3_qc/${library}/${sample}_unpaired.R1.fastq" >> "4_merged_fastq/${library}/${sample}.fastq"
        seqkit fq2fa "4_merged_fastq/${library}/${sample}.fastq" -o "5_merged_fasta/${library}/${sample}.fasta"
    done
done

#--correction \


# KRAKEN2 TIME!!!

In [None]:
%%bash

# Classify every merged FASTA with Kraken 2 against the fish_kraken_02 database.
#
# Input  : 5_merged_fasta/<library>/<sample>.fasta
# Outputs: 6_kraken_outputs/<sample>_output.tsv  (per-read output)
#          7_kraken_reports/<sample>_report.txt  (per-sample report)
# Output names are keyed by sample only, so samples sharing a name across
# libraries overwrite each other (unchanged from the original layout).
mkdir -p 6_kraken_outputs 7_kraken_reports   # -p: safe to re-run

shopt -s nullglob   # nothing to classify -> zero iterations

for fasta in 5_merged_fasta/*/*.fasta; do
    # Sample name is the file name up to its first dot.
    sample=${fasta##*/}
    sample=${sample%%.*}

    kraken2 --db fish_kraken_02 \
        "$fasta" \
        --use-names \
        --memory-mapping \
        --threads 6 \
        --report-zero-counts \
        --confidence 0.0 \
        --output "6_kraken_outputs/${sample}_output.tsv" \
        --report "7_kraken_reports/${sample}_report.txt"
done


In [None]:
%%bash

# Build one abundance table from all Kraken reports, relabel its rows with
# readable taxon names and append an 'unassigned' row. testing.tsv is rewritten
# in place at the end; test.txt, total_reads.txt and testingx.tsv are scratch.
# NOTE(review): '--max F' presumably sets the highest rank kraken-biom reports
# (family) - confirm against the kraken-biom documentation.
kraken-biom 7_kraken_reports/*.txt --max F -o testing.biom
biom convert -i testing.biom -o testing.tsv --to-tsv --header-key taxonomy

# total_reads.txt is a single line seeded with '0'; the loop below appends one
# tab-separated field per report, so field k+1 belongs to the k-th report.
echo -e '0' > total_reads.txt

# Report names with everything after the first '.' removed.
files=$(ls -1 7_kraken_reports/ | cut -d'.' -f1)
for sample in $files
do
    # Sum column 3 over the whole report and append it to the single line.
    a=$(awk '{s+=$3}END{print s}' 7_kraken_reports/${sample}.txt)
    sed -i "s/$/\t$a/" total_reads.txt
done

# One label per data row (rows after the two header lines): take the last three
# ';'-separated fields of the row, drop the first four characters of each, then
#   fields 2 and 3 empty -> field 1
#   only field 3 empty   -> "<field 2>_spp."
#   otherwise            -> "<field 2>_<field 3>"
# The second awk splits on whitespace, and the for-loop below word-splits $d,
# so labels containing spaces would shift the row numbering.
d=$(awk -F ";" 'NR>2 {print substr($(NF-2),5)"\t"substr($(NF-1),5)"\t"substr($NF,5)}' testing.tsv | \
awk '{
if ($3 =="" && $2 =="")
    print $1;
else if ($3 =="" && $2 !="")
    print $2"_spp.";
else
    print $2"_"$3;
}')

# Start the relabelled table with the two header lines.
awk 'NR<=2{print}' testing.tsv > testingx.tsv

# n is incremented before use, so the first label goes with row 3 (NR == 1+n).
# The label is prepended and the row's original first column is then removed
# by 'cut -f2 --complement'.
n=1
for i in $d
do
n=$((n+1))
awk -v i=$i -v n=$n 'NR==(1+n) {print i"\t"$0}' testing.tsv | cut -f2 --complement >> testingx.tsv
done

# test.txt line 1: the totals line (unquoted echo collapses tabs to spaces).
# test.txt line 2: per-column sums over the data rows of testing.tsv.
# NOTE(review): 'for (i in sum)' has no guaranteed order in awk; the
# subtraction below assumes the sums come out in column order - confirm.
a=$(cat total_reads.txt)
echo $a >test.txt
b=$(awk -F "\t" 'NR>2{for (i=1;i<=NF;i++) sum[i]+=$i;}; END{for (i in sum) print sum[i]}' testing.tsv)
echo $b >>test.txt
# For fields 2..NF-1: (value on line 1) - (value on line 2), tab-separated.
c=$(awk '{ if (NR == 1) { for (i = 2; i <= NF; i++){ first_row[i] = $i} } else { for (i = 2; i <= NF-1; i++){ printf "%s\t", first_row[i] - $i }; printf "\n"}}' test.txt)

# Assemble the extra row: label, differences, taxonomy 'u__unassigned';
# then turn every space into a tab.
echo unassigned $c u__unassigned > test.txt
sed -i "s/ /\t/g" test.txt

# Replace testing.tsv with the relabelled table followed by the extra row.
awk '{print}' testingx.tsv > testing.tsv
awk '{print}' test.txt >> testing.tsv

# Remove the scratch files.
rm test.txt
rm total_reads.txt
rm testingx.tsv


In [None]:
%%bash

# Round-trip the edited table: testing.tsv -> test2.biom (HDF5) -> test2.tsv,
# keeping the 'taxonomy' header key in both conversions.
biom convert -i testing.tsv -o test2.biom --to-hdf5 --header-key taxonomy
biom convert -i test2.biom -o test2.tsv --to-tsv --header-key taxonomy

# DONE!
### you can go home now :)

In [None]:
%%bash

# Show the kraken2-build usage text (reference only; builds nothing).
kraken2-build --help

In [None]:
%%bash

# From the BLEL05 Kraken output, write column 2 of every tab-separated row
# whose column 3 matches 'Capreolus' to dodgy.txt.
# NOTE(review): columns 2 and 3 are presumably the read ID and the assigned
# taxon name - confirm against the Kraken 2 output format.
awk -F'\t' '$3 ~ /Capreolus/ {print $2}' 6_kraken_outputs/BLEL05_output.tsv > dodgy.txt

In [None]:
from Bio import SeqIO

# Copy the reads listed in dodgy.txt from the BLEL05 merged FASTA into
# Rangifer.fasta.
#
# The IDs are loaded once into a set and the FASTA is parsed a single time;
# the original re-parsed the whole FASTA for every line of dodgy.txt.
# Blank lines are skipped (an empty string is "in" every record name and would
# have selected every record), and names are compared for equality so that an
# ID such as 'read1' no longer also selects 'read10'.
with open('dodgy.txt', 'r') as txt:
    wanted_ids = {line.strip() for line in txt if line.strip()}

with open('5_merged_fasta/EA01/BLEL05.fasta', 'r') as fastin:
    sequences = [record for record in SeqIO.parse(fastin, 'fasta')
                 if record.name in wanted_ids]

with open('Rangifer.fasta', 'w') as fasta:
    SeqIO.write(sequences, fasta, 'fasta')

In [None]:
# Echo each line of dodgy.txt with surrounding whitespace removed.
with open('dodgy.txt', 'r') as txt:
    for line in map(str.strip, txt):
        print(line)
#list_d=

In [None]:
%%bash


# Convert the R1 reads of every demultiplexed sample to FASTA under
# dirt/<library>/<sample>.fasta.
#
# Iterating over the *.R1.fastq files directly converts each sample once; the
# original listed every file in the library directory, so a sample was
# converted again for each extra file (e.g. its R2) sharing its name. The
# output directory is created first because seqkit is handed a path inside it.
shopt -s nullglob   # no inputs -> zero iterations

for r1 in 2_demultiplexed/*/*.R1.fastq; do
    library=${r1%/*}
    library=${library##*/}
    sample=${r1##*/}
    sample=${sample%%.*}

    mkdir -p "dirt/${library}"
    seqkit fq2fa "$r1" -o "dirt/${library}/${sample}.fasta"
done

In [None]:
%%bash

# Classify the FASTA files under dirt/<library>/ with Kraken 2 and write one
# report per sample to terp/<sample>.txt.
#
# terp/ is created first because --report points into it. No --output file is
# given, which is unchanged from the original command.
mkdir -p terp

shopt -s nullglob   # no inputs -> zero iterations

for fasta in dirt/*/*.fasta; do
    # Sample name is the file name up to its first dot.
    sample=${fasta##*/}
    sample=${sample%%.*}

    kraken2 --db ../database_stuff/fish_kraken \
        "$fasta" \
        --use-names \
        --report-zero-counts \
        --report "terp/${sample}.txt"
    #--output 7_kraken_outputs
done

In [None]:
%%bash

# Scratch copy of the step that builds the 'unassigned' row.
# Reads total_reads.txt and testing.tsv from the working directory and leaves
# the result in test.txt.
# test.txt line 1: the totals line (unquoted echo collapses tabs to spaces).
# test.txt line 2: per-column sums over the rows of testing.tsv after line 2.
# NOTE(review): 'for (i in sum)' has no guaranteed order in awk; the
# subtraction assumes column order - confirm.
a=$(cat total_reads.txt)
echo $a >test.txt
b=$(awk -F "\t" 'NR>2{for (i=1;i<=NF;i++) sum[i]+=$i;}; END{for (i in sum) print sum[i]}' testing.tsv)
echo $b >>test.txt
# For fields 2..NF-1: (value on line 1) - (value on line 2), tab-separated.
c=$(awk '{ if (NR == 1) { for (i = 2; i <= NF; i++){ first_row[i] = $i} } else { for (i = 2; i <= NF-1; i++){ printf "%s\t", first_row[i] - $i }; printf "\n"}}' test.txt)

# Overwrite test.txt with the finished row and turn every space into a tab.
echo unassigned $c u__unassigned > test.txt
sed -i "s/ /\t/g" test.txt

In [None]:
%%bash

# Scratch copy of the row-relabelling step: writes testingx.tsv from testing.tsv.
# One label per row after the two header lines, built from the last three
# ';'-separated fields with the first four characters of each removed:
#   fields 2 and 3 empty -> field 1
#   only field 3 empty   -> "<field 2>_spp."
#   otherwise            -> "<field 2>_<field 3>"
d=$(awk -F ";" 'NR>2 {print substr($(NF-2),5)"\t"substr($(NF-1),5)"\t"substr($NF,5)}' testing.tsv | \
awk '{
if ($3 =="" && $2 =="")
    print $1;
else if ($3 =="" && $2 !="")
    print $2"_spp.";
else
    print $2"_"$3;
}')

# Start the relabelled table with the two header lines.
awk 'NR<=2{print}' testing.tsv > testingx.tsv

# n is incremented before use, so the first label pairs with row 3 (NR == 1+n).
# The label is prepended and the row's original first column is removed.
# $d is word-split here, so a label containing whitespace would shift rows.
n=1
for i in $d
do
n=$((n+1))
awk -v i=$i -v n=$n 'NR==(1+n) {print i"\t"$0}' testing.tsv | cut -f2 --complement >> testingx.tsv
done

#awk 'NR<=2 {print}' testing.tsv > testingx.tsv


In [None]:
import numpy as np

# Element-wise difference of two equal-length integer vectors.
a = np.array([2, 4, 6, 8])
b = np.array([1, 3, 5, 7])
c = np.subtract(a, b)
print(c)




In [None]:
%%bash

# Scratch attempt at mapping the IDs in testing.tsv to names.
# NOTE(review): 'set --' only assigns positional parameters inside the command
# substitution and prints nothing, so $files ends up empty; 'ls-1' is also
# missing a space. Probably meant to list 6_kraken_reports/ - confirm.
files=$(set -- ls-1 6_kraken_reports/ $2)

# NOTE(review): no input file is given, so awk reads stdin; the \t sits outside
# a string literal - probably meant $3"\t"$6.
awk 'NR>1 {print $3\t$6}'

# Column 1 of every row after the two header lines of testing.tsv.
awk 'NR>2 {print $1}' testing.tsv > finaltest.txt

taxid=$(cat finaltest.txt)
for id in $taxid
do
     # NOTE(review): inside single quotes $id is not the shell variable; awk
     # sees an unset variable 'id'. No input file is named either. Passing the
     # value with -v and naming a file is probably what was intended.
     awk '$1 == $id {print $2}'
done
 
#awk 'NR>2 $1="[replace]"' FS=, OFS=, testing.tsv > finaltest.tsv

In [None]:
%%bash

# Names of everything in 6_kraken_reports/.
files=$(ls -1 6_kraken_reports/)

#l=

# Print columns 5 and 6 (tab-separated) of the first listed report: the inner
# 'set --' loads the file names into the positional parameters and $1 is echoed
# back as the file name.
awk '{print $5"\t"$6}' 6_kraken_reports/$(set -- $files; echo -e $1)

#files=$(ls -1 6_kraken_reports/)
#echo -e $files

In [None]:
import glob

#dictionary of taxid and scientific names

# Print "<column 5>\t<name>" for every row of the second file matched under
# test_data/output/. The name is the last piece of column 6 after splitting on
# double spaces.
lst1 = glob.glob('test_data/output/*')
with open(lst1[1], 'r') as inp:
    for line in inp:
        fifth_column = line.split('\t')[4]
        sixth_column = line.strip().split('\t')[5]
        print('%s\t%s' % (fifth_column, sixth_column.split('  ')[-1]))

In [None]:
%%bash

# Scratch copy of the per-report total: for each name in $files, sum column 3
# of 6_kraken_reports/<name>.txt and append it as a new tab-separated field to
# every line of total_reads.txt.
# NOTE(review): $files is not assigned in this cell; unless it comes from the
# environment the loop body never runs.
for sample in $files
do
    a=$(awk '{s+=$3}END{print s}' 6_kraken_reports/${sample}.txt)
    sed -i "s/$/\t$a/" total_reads.txt
done

# Same column-3 sum for a single report, printed to stdout.
awk '{s+=$3}END{print s}' 6_kraken_reports/BLEL01.txt

In [None]:
# For each name in $files, take field 3 of the first line of
# test_data/output/<name>.txt and append it as a tab-separated field to every
# line of text.txt.
# NOTE(review): the body is shell, but this cell does not start with %%bash,
# and $files is not assigned here - confirm how it was meant to be run.
for sample in $files
do
    a=$(awk 'NR==1{print $3}' test_data/output/${sample}.txt)
    sed -i "s/$/\t$a/" text.txt
    #echo -e ${sample}'\t' >> text.txt
done

In [None]:
# Two primer strings and the length of the second one.
# NOTE(review): presumably the forward and reverse primers of the 12S-fish
# marker; the same forward string appears in the EAXX.tsv rows written
# elsewhere in this notebook - confirm.
f_primer='ACTGGGATTAGATACCCC'
r_primer='CTAGAGGAGCCTGTTCTA'

len(r_primer)

In [None]:
%%bash
# Run fastp with no arguments (used here to look at its usage output).
fastp

In [None]:
%%bash

# Print the part before the first dot of every entry in ../raw_data/EA01/ that
# matches the pattern '.R1', one per line.
names=$(ls -1 ../raw_data/EA01/ | grep .R1 | cut -d'.' -f1)
for name in $names; do
    echo $name
done

In [None]:
# Reverse complement of a DNA sequence

# Watson-Crick partner of each upper-case base.
_COMPLEMENT = {'A':'T','C':'G','G':'C','T':'A'}


def revcom(x):
    """Return the reverse complement of ``x``.

    Only the upper-case bases A, C, G and T are known; any other character
    raises KeyError.
    """
    return ''.join(_COMPLEMENT[B] for B in reversed(x))


print (revcom('CTAGAGGAGCCTGTTCTA'))

In [None]:
# Length of one example sequence (shown as the cell's output).
len('ACTATGCATGGCCATAAATTTTGATAAAAATATACAATTTTATCCGCCAGGGAACTTCAAGCATCAGCTTAAAACCCAAAGGACTTTGCCGTTCTTCTGTCCCACC')

In [None]:
# Fragment of a paired-read processing loop.
# NOTE(review): it ends in a top-level `return` and uses names that are not
# defined in this cell (touch_files, f1, f2, find_bcs, sample_data,
# search_until, invalid_recs, write_out, time), so it cannot run as a
# standalone cell. It looks like a pasted function body - confirm.
count = 0
touch_files()
while True:
    lines = []
    # Read four lines from f1; an empty first line ends the loop.
    line = f1.readline()
    if line.strip() == "":
        break
    lines.append(line.strip())

    for i in range(3):
        lines.append(f1.readline().strip())

    # Followed by four lines from f2, so `lines` holds eight entries.
    for i in range(4):
        lines.append(f2.readline().strip())

    # A truthy result is stored as invalid: first four entries under 'R1',
    # the rest under 'R2'.
    temp = find_bcs(lines, sample_data, search_until)
    if temp:
        invalid_recs['R1'].extend(temp[:4])
        invalid_recs['R2'].extend(temp[4:])
    count+=1
    # Progress message and write_out(0) on iterations 1, 100001, 200001, ...
    if (count % 100000) == 1:
        print("["+time.strftime("%c")+"] - %i read pairs processed" %(count/2*2))
        write_out(0)
return count


In [None]:
%%bash

# Demultiplex the EA01 library only: barcode sheet EA01.tsv, reads from
# ../Fastq/, output to ../raw_data/EA01/.
# mkdir -p keeps the cell re-runnable; all paths are quoted.
#files=$(ls -1 ../Fastq | grep _R1 | cut -d'_' -f1)
files='EA01'
i=1
for sample in $files; do
    mkdir -p "../raw_data/${sample}/"
    python demultiplex.py \
        "${sample}.tsv" \
        "../Fastq/${sample}_S${i}_L001_R1_001.fastq.gz" "../Fastq/${sample}_S${i}_L001_R2_001.fastq.gz" \
        "../raw_data/${sample}/"
    # i is the number used in the _S<i>_ part of the file names.
    i=$((i+1))
done

In [None]:
# Print every line of EA01.tsv. The context manager closes the file handle,
# which the original left open.
with open('EA01.tsv','r') as sample_test:
    for sample in sample_test:
        print(sample)

In [None]:
# Rewrite EA01.tsv as EA01.txt with one "<col 2>\t<bc 1>\t<bc 2>" line per
# input row, where column 3 of the input holds the two barcodes joined by ':'.
# Both files are managed by `with`, so they are closed even when a malformed
# row raises; the original never closed the input handle.
#sample_data = {}
with open('EA01.tsv','r') as fh, open('EA01.txt','w+') as test:
    for l in fh:
        cols = l.strip().split("\t")
        sample = cols[1]
        bcs = cols[2].split(":")
        #sample_data[sample] = {'count': 0, 'bcs':[], 'seqs':{ 'R1': [], 'R2': []}}
        #sample_data[sample]['bcs'] = bcs
        test.write('%s\t%s\t%s\n' %(sample,bcs[0],bcs[1]))

In [None]:
%%bash

# Run demultiplex_obi.py on the EA01 reads with the EA0100.tsv sheet, writing
# to ../raw_data/EA0100/.
# mkdir -p keeps the cell re-runnable; all paths are quoted.
#files=$(ls -1 ../Fastq | grep _R1 | cut -d'_' -f1)
files='EA01'
i=1
for sample in $files; do
    mkdir -p ../raw_data/EA0100/
    python demultiplex_obi.py \
        EA0100.tsv \
        "../Fastq/${sample}_S${i}_L001_R1_001.fastq.gz" "../Fastq/${sample}_S${i}_L001_R2_001.fastq.gz" \
        ../raw_data/EA0100/
    # i is the number used in the _S<i>_ part of the file names.
    i=$((i+1))
done

In [None]:
# Probe: does barcode `bc` occur, with at most `mm` errors, within the first
# 30 characters of the test read `recordit`?
# The original tested `record.seq` (whatever record an earlier cell left
# behind) and never looked at `recordit`.
# `mm` and `regex` must already be defined by an earlier cell.
recordit='GAATTCGTGCTCACTGGGATTAGATACCCCACTATGCATAGCATAAATTTTGATAAAAATATACAATTTTATCCGCCAGGGAACTACAAGCATCAGCTTAAAACCCAAAGGACTTGGCGGTGCTTCAGACCCACCTAGAGGAGCCTGTTCTAGAGCACGAGGCT'
#recordit=(in_seqs)[1]
bc='GCGACGTG'
#recordit=(in_seqs[1])
if regex.search('('+bc+'){e<='+mm+'}',recordit[:30]):
    print('fish')


In [None]:
# Show the sequence of the `record` object left behind by an earlier cell.
#in_seqs[1]
str(record.seq)

In [None]:
import gzip
# Restored: SeqIO is used below, so the cell no longer depends on an earlier
# cell having imported it.
from Bio import SeqIO
import regex

# Collect every R1 record whose first 30 bases contain the barcode exactly
# ({e<=0} allows no errors) and write them to <sample>.fastq.
sample='BLEL01'
mm='1'  # only used by the commented-out fuzzy pattern below
bc='ACACACAC'

sequences=[]
with gzip.open('../Fastq/EA01_S1_L001_R1_001.fastq.gz','rt') as handle:
    for record in SeqIO.parse(handle, "fastq"):
        #if regex.search('('+bc+'){e<='+mm+'}',str(record.seq)[:30]):
        if regex.search('('+bc+'){e<=0}',str(record.seq)[:30]):
            sequences.append(record)

# Opened only once the records are collected and closed by the context
# manager; the original never closed this handle.
with open(sample+'.fastq','w') as fastq:
    SeqIO.write(sequences,fastq,'fastq')


In [None]:
###GOOD BIT###

import gzip
import regex

# Write every line of the R1 archive whose first 30 characters contain the
# barcode with at most `mm` errors to <sample>.fastq.
# The original looped over lines but tested `record.seq`, a leftover from an
# earlier cell; the test now looks at the current line. The discarded
# `line.strip()` call is gone, and the output file is closed by `with`.
# NOTE(review): only the matching lines are written, not whole four-line
# records, so the output is not a complete FASTQ file - confirm intent.
sample='BLEL01'
mm='1'
bc='ACACACAC'

sequences=[]
with gzip.open('../Fastq/EA01_S1_L001_R1_001.fastq.gz','rt') as handle, \
        open(sample+'.fastq','w+') as fastq:
    for line in handle:
        if regex.search('('+bc+'){e<='+mm+'}',line[:30]):
            fastq.write('%s' %(line))


In [None]:
###GOOD BIT REVISITED###

import gzip
import regex

# For every line of the R1 archive that has the exact barcode within its first
# 30 characters, write the part of the line after the first barcode occurrence
# to <sample>.fastq.
# The output file is now closed by `with` (the original never closed it) and
# the discarded `line.strip()` call is gone.
sample='BLEL01'
mm='1'
bc='ACACACAC'

sequences=[]
with gzip.open('../Fastq/EA01_S1_L001_R1_001.fastq.gz','rt') as handle, \
        open(sample+'.fastq','w+') as fastq:
    for line in handle:
        #if regex.search('('+bc+'){e<='+mm+'}',str(record.seq)[:30]):
        #if regex.search('('+bc+'){e<=0}',str(line)[:30]):
        if bc in line[:30]:
            fastq.write('%s' %(line.split(bc,1)[1]))


In [None]:
###GOOD BIT WITH SEQIO###

import gzip
import regex
from Bio import SeqIO

# Write every R1 record whose first 30 bases contain the exact barcode to
# <sample>.fastq as complete FASTQ records.
# The archive is opened in text mode ('rt'): gzip.open(..., 'r') yields bytes,
# while the FASTQ parser works on text. The output file is closed by `with`.
sample='BLEL01'
mm='1'
bc='ACACACAC'

sequences=[]
with gzip.open('../Fastq/EA01_S1_L001_R1_001.fastq.gz','rt') as handle, \
        open(sample+'.fastq','w+') as fastq:
    for record in SeqIO.parse(handle, "fastq"):
        #if regex.search('('+bc+'){e<='+mm+'}',str(record.seq)[:30]):
        if bc in str(record.seq)[:30]:
            SeqIO.write(record,fastq,'fastq')


In [None]:
# Show the second element of the `sequences` list left by an earlier cell.
# NOTE(review): several cells reset `sequences` to [] without appending to it;
# after one of those this raises IndexError.
sequences[1]


In [None]:
# Write EAXX.tsv with one '12S-fish' row for every ordered pair of entries
# from testing.txt (each entry is also paired with itself), joined as
# "<first>:<second>" in the third column.
# testing.txt is read through a context manager so its handle is closed; the
# original opened it inside a list comprehension and never closed it.
with open('testing.txt') as source:
    testing = [line.strip() for line in source]

with open ('EAXX.tsv','w+') as liff:
    for test in testing:
        for test2 in testing:
            #print('12S-fish\tN\t%s\t%s\tACTGGGATTAGATACCCC\tTAGAACAGGCTCCTCTAG\tF\t@' %(test,test2))
            liff.write('12S-fish\tN\t%s:%s\tACTGGGATTAGATACCCC\tTAGAACAGGCTCCTCTAG\tF\t@\n' %(test,test2))



In [None]:
# Open the R1 archive in text mode and do nothing with it; `handle` on its own
# line is a bare expression with no effect inside the `with` block.
with gzip.open('../Fastq/EA01_S1_L001_R1_001.fastq.gz','rt') as handle:
    handle

In [None]:

# For every line of the R1 archive that contains the barcode, write the line
# four positions earlier to <sample>.fastq.
# The original subscripted the file object (`f1[i-4]`), which raises
# TypeError; the list returned by readlines() is indexed instead. Matches in
# the first four lines are skipped because i-4 would be negative and wrap
# around to the end of the list. The output file is closed by `with`.
# NOTE(review): the -4 offset and the extra '\n' (lines already end with one)
# are kept as written - confirm both are intended.
sample='BLEL01'
mm='1'
bc='ACACACAC'

with gzip.open('../Fastq/EA01_S1_L001_R1_001.fastq.gz','rt') as f1:
    my_file = f1.readlines()

with open(sample+'.fastq','w+') as fastq:
    for i,line in enumerate(my_file):
        if bc in line and i >= 4:
            fastq.write('%s\n' %(my_file[i-4]))

In [None]:
# Open the R1 and R2 archives together and read one four-line chunk from each
# into `lines` (eight stripped lines in total).
# NOTE(review): `break` is not inside a loop here, which Python rejects as a
# syntax error; an enclosing `while True:` was probably lost - confirm.
with gzip.open('../Fastq/EA01_S1_L001_R1_001.fastq.gz','rt') as f1:
    with gzip.open('../Fastq/EA01_S1_L001_R2_001.fastq.gz','rt') as f2:
        lines = []
        line = f1.readline()
        if line.strip() == "":
            break
        lines.append(line.strip())

        for i in range(3):
            lines.append(f1.readline().strip())

        for i in range(4):
            lines.append(f2.readline().strip())

In [None]:
%%bash

# Line counts of the locally written BLEL01.fastq and of the BLEL01 R1 file
# under ../raw_data/EA01/, for comparison.
wc -l BLEL01.fastq

wc -l ../raw_data/EA01/BLEL01.R1.fastq


In [None]:
from multiprocessing import Pool

def f(x):
    """Return ``x`` multiplied by itself."""
    return x*x

if __name__ == '__main__':
    # The context manager shuts the five workers down once the blocking map()
    # call has returned; the original left the pool open.
    with Pool(5) as p:
        print(p.map(f, [1, 2, 3]))

In [None]:
# Built-in map over the same inputs as the Pool.map call, without a pool.
# NOTE(review): under Python 3 this yields a lazy map object, so the cell shows
# that object rather than the squares - wrap in list() to see the values.
map(f, [1, 2, 3])

In [None]:
import regex

# Probe of the `regex` module's fuzzy matching: look for 'test' with at most
# one error ({e<=1}) anywhere in '123 taast'; prints 'fish' only on a match.
if regex.search('(test){e<=1}', '123 taast'):
    print('fish')

In [None]:
%%bash

# Print a running number (1, 2, 3, ...) for each entry of ../Fastq whose name
# matches '_R1'; the names themselves are not printed.
libs=$(ls -1 ../Fastq | grep _R1 | cut -d'_' -f1)
counter=1
for lib in $libs; do
    echo $counter
    counter=$((counter + 1))
done