In [77]:
from Bio import Entrez
import pandas as pd
import time

In [78]:
Entrez.email = "etholleman@ucdavis.edu"

BLAST tabular output headers

In [79]:
COL_NAMES = (
    "qseqid",
    "sseqid",
    "pident",
    "length",
    "mismatch",
    "gapopen",
    "qstart",
    "qend",
    "sstart",
    "send",
    "evalue",
    "bitscore",
)


Outside of this notebook I BLASTed all sequences (from `all_text.fas` file) using BLASTn and then downloaded the
results of all alignments.

In [80]:
latest_download = '2022-02-11_751722_138389_data (1)/0JX3GD7A016-Alignment-HitTable.csv'
latest_df = pd.read_csv(latest_download, header=None)
latest_df.columns = COL_NAMES

In [81]:
latest_df.head()

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
0,pFC8_tac_T1T2_Vrmix#3_10_pFC8tac_tac_promoter_...,CP046087.1,100.0,674,0,0,42,715,910768,910095,0.0,1245.0
1,pFC8_tac_T1T2_Vrmix#3_10_pFC8tac_tac_promoter_...,CP036485.1,100.0,674,0,0,42,715,878955,878282,0.0,1245.0
2,pFC8_tac_T1T2_Vrmix#3_10_pFC8tac_tac_promoter_...,CP033477.1,100.0,674,0,0,42,715,872028,871355,0.0,1245.0
3,pFC8_tac_T1T2_Vrmix#3_10_pFC8tac_tac_promoter_...,CP033494.1,100.0,674,0,0,42,715,860629,859956,0.0,1245.0
4,pFC8_tac_T1T2_Vrmix#3_10_pFC8tac_tac_promoter_...,CP029160.1,100.0,674,0,0,42,715,9954961,9954288,0.0,1245.0


Get the best alignment for read read by percent identity, exclude any alignments that are less that 200 bp

In [93]:
def parse_by_read(df, min_align_length=200):
    reads = list(set(df['qseqid']))
    rep_aligns = []
    for r in reads:
        rows = df.loc[df['qseqid'] == r]
        rows = rows.loc[rows['length'] > min_align_length]
        best_pident = rows['pident'].idxmax()
        rep_aligns.append(rows.loc[best_pident])
    return pd.DataFrame(rep_aligns)

align = parse_by_read(latest_df)

In [95]:
align.head()

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
1002,pFC8_tac_T1T2_Vrmix#3_3_pFC8tac_tac_promoter_P...,MF084286.1,100.0,256,0,0,80,335,10729,10474,1.28e-128,473.0
1113,pFC8_tac_T1T2_Vrmix#3_4_pFC8tac_tac_promoter_P...,CP046087.1,100.0,674,0,0,44,717,910768,910095,0.0,1245.0
318,pFC8_tac_T1T2_Vrmix#3_14_pFC8tac_tac_promoter_...,MF084286.1,100.0,256,0,0,86,341,10729,10474,1.4300000000000002e-128,473.0
100,pFC8_tac_T1T2_Vrmix#3_11_pFC8tac_tac_promoter_...,MF084286.1,100.0,256,0,0,82,337,10729,10474,1.4300000000000002e-128,473.0
1225,pFC8_tac_T1T2_Vrmix#3_5_pFC8tac_tac_promoter_P...,MF084286.1,100.0,256,0,0,82,337,10729,10474,1.28e-128,473.0


In [96]:
len(align['qseqid'])

14

Use biopython to search entrez for the accessions included in BLAST results and download genbank files of those
accessions. Extract descriptions to get human readable name for alignments.

In [97]:
def add_accession_description(df):
    descriptions = []
    for i, row in df.iterrows():
        print(f'Searching for {row["sseqid"]}')
        accession = Entrez.efetch(db="nucleotide", id=row['sseqid'], rettype="gb", retmode="text")
        descriptions.append(
            SeqIO.read(accession, 'gb').description
        )
        time.sleep(2)
    df['description'] = descriptions
    return df

In [86]:
align = add_accession_description(align)

Searching for MF084286.1
Searching for CP046087.1
Searching for MF084286.1
Searching for MF084286.1
Searching for MF084286.1
Searching for CP046087.1
Searching for CP029160.1
Searching for NM_005244.5
Searching for XM_005260327.2
Searching for CP046087.1
Searching for CP046087.1
Searching for MF084286.1
Searching for CP046087.1
Searching for MF084286.1


In [90]:
align[['qseqid', 'description', 'pident', 'length']]

Unnamed: 0,qseqid,description,pident,length
1002,pFC8_tac_T1T2_Vrmix#3_3_pFC8tac_tac_promoter_P...,"Expression vector pYES2-Os1-HH, complete sequence",100.0,256
1113,pFC8_tac_T1T2_Vrmix#3_4_pFC8tac_tac_promoter_P...,Saccharomyces cerevisiae strain CEN.PK113-7D c...,100.0,674
318,pFC8_tac_T1T2_Vrmix#3_14_pFC8tac_tac_promoter_...,"Expression vector pYES2-Os1-HH, complete sequence",100.0,256
100,pFC8_tac_T1T2_Vrmix#3_11_pFC8tac_tac_promoter_...,"Expression vector pYES2-Os1-HH, complete sequence",100.0,256
1225,pFC8_tac_T1T2_Vrmix#3_5_pFC8tac_tac_promoter_P...,"Expression vector pYES2-Os1-HH, complete sequence",100.0,256
1381,pFC8_tac_T1T2_Vrmix#3_6_pFC8tac_tac_promoter_P...,Saccharomyces cerevisiae strain CEN.PK113-7D c...,100.0,674
222,pFC8_tac_T1T2_Vrmix#3_12_pFC8tac_tac_promoter_...,Saccharomyces cerevisiae strain SY14 chromosom...,100.0,244
697,pFC8_tac_T1T2_Vrmix#3_23_pFC8tac_tac_promoter_...,Homo sapiens EYA transcriptional coactivator a...,99.897,967
810,pFC8_tac_T1T2_Vrmix#3_24_pFC8tac_tac_promoter_...,PREDICTED: Homo sapiens EYA transcriptional co...,99.686,954
897,pFC8_tac_T1T2_Vrmix#3_2_pFC8tac_tac_promoter_P...,Saccharomyces cerevisiae strain CEN.PK113-7D c...,100.0,674


Reads show very strong alignment to yeast genes, and many expression vectors; we do not have yeast in the lab and are not using any of these expression vectors. Further the primer used for these sequencing reactions would 
not be capable of binding to any of these sequences.

In [None]:
align.to_csv('aligned-reads-blast.tsv', sep='\t', index=None)