# Is duckDB ammenible to our data structure? Should we build a class around it?

***

In [2]:
import duckdb as ddb
import pandas as pd
import numpy as np

## 1. How quick can we query protein files?

In [2]:
# in memory db - lost when process ends
conn = ddb.connect()

In [3]:
# run query on set of protein files
df = conn.execute("""
    SELECT *
    FROM read_csv_auto('../data/taxa/proteins/taxa*.csv', header=True)
    LIMIT 10
""").df()

In [14]:
df

Unnamed: 0,seq_id,protein_seq,protein_desc,protein_len
0,4417.0,MSKYDEALARMQLRSNEAFERITGQTVEQYQQEQRQNRMPTTAQLA...,hypothetical protein,4602
1,4417.1,MTEEETLGILSENPEASPMKINPKHLEKFFKDNHTPNTKISISIAN...,hypothetical protein,53
2,4417.2,MTQHLSHLSTTPNTQKPITLYISGLTHDGRGIATYDDTHGDKSGKK...,23S rRNA (uracil(1939)-C(5))-methyltransferase...,476
3,4417.3,MSNNIQHLRNIAIIAHVDHGKTTLVDKLLQQSGALGERAGEIERVM...,translational GTPase TypA,615
4,4417.4,MKLNKIALALIATATTAATAGVTVTPLIGYQHQNEAHKKQREIFHT...,OmpA family protein,423
5,4417.5,MRAPRLGVNIDHIATLRNVRGVDYPCPIKGALVCQRAGADGITLHL...,pyridoxine 5'-phosphate synthase,244
6,4417.6,MSLQSLKIQSLPRFSLNFAANIVAALWMLVGSVRAFNWVKPTFGQF...,caspase family protein,479
7,4417.7,MTSIYLLIPLSLMLFVVAIWAIAYAVKSNQFEDLDNAPDQIILDDR...,cbb3-type cytochrome oxidase assembly protein ...,70
8,4417.8,MSIVLLAPALAMGFLGSPHCMGMCGGIVTAFGISMKNLSPQKRGML...,sulfite exporter TauE/SafE family protein,280
9,4417.9,MLEIRHLQMLSILARHGSLVMTADELNLTASAISHQLKELESYYDI...,LysR family transcriptional regulator,295


## 2. Make it a virtual table

In [15]:
# all protein files
df = conn.execute("""
    SELECT *
    FROM read_csv_auto('../data/taxa/proteins/taxa*.csv', header=True)
""").df()
conn.register('proteins', df)

its too big to be a virtual table...

## 3. Make a database of file and add an actual table

In [4]:
conn = ddb.connect(database="./dbs/test_db")

In [7]:
conn.execute("""
    CREATE OR REPLACE TABLE proteins AS
        SELECT
            "seq_id"::STRING AS seq_id.m
            "protein_seq"::STRING AS protein_seq,
            "protein_desc"::STRING AS protein_desc,
            "protein_len"::INT AS protein_len
        FROM read_csv_auto('../data/taxa/proteins/taxa*.csv', header=True)
""")

<duckdb.DuckDBPyConnection at 0x117839b30>

In [8]:
conn.execute("FROM proteins LIMIT 10").df()

Unnamed: 0,seq_id,protein_seq,protein_desc,protein_len
0,4417.0,MSKYDEALARMQLRSNEAFERITGQTVEQYQQEQRQNRMPTTAQLA...,hypothetical protein,4602
1,4417.1,MTEEETLGILSENPEASPMKINPKHLEKFFKDNHTPNTKISISIAN...,hypothetical protein,53
2,4417.2,MTQHLSHLSTTPNTQKPITLYISGLTHDGRGIATYDDTHGDKSGKK...,23S rRNA (uracil(1939)-C(5))-methyltransferase...,476
3,4417.3,MSNNIQHLRNIAIIAHVDHGKTTLVDKLLQQSGALGERAGEIERVM...,translational GTPase TypA,615
4,4417.4,MKLNKIALALIATATTAATAGVTVTPLIGYQHQNEAHKKQREIFHT...,OmpA family protein,423
5,4417.5,MRAPRLGVNIDHIATLRNVRGVDYPCPIKGALVCQRAGADGITLHL...,pyridoxine 5'-phosphate synthase,244
6,4417.6,MSLQSLKIQSLPRFSLNFAANIVAALWMLVGSVRAFNWVKPTFGQF...,caspase family protein,479
7,4417.7,MTSIYLLIPLSLMLFVVAIWAIAYAVKSNQFEDLDNAPDQIILDDR...,cbb3-type cytochrome oxidase assembly protein ...,70
8,4417.8,MSIVLLAPALAMGFLGSPHCMGMCGGIVTAFGISMKNLSPQKRGML...,sulfite exporter TauE/SafE family protein,280
9,4417.9,MLEIRHLQMLSILARHGSLVMTADELNLTASAISHQLKELESYYDI...,LysR family transcriptional regulator,295


In [9]:
conn.execute("FROM proteins SELECT * WHERE protein_len<50").df()

Unnamed: 0,seq_id,protein_seq,protein_desc,protein_len
0,4417.94,MFVSEWVYNLLVVMGGGEVWGFGGLGFDKMTWCIKAYVGRASLPS,hypothetical protein,45
1,4417.114,MPLHDNIRKFREQKQWSQEYMAEQLGLSKNGYAKIERGESRPSLDRLE,helix-turn-helix transcriptional regulator,48
2,4417.116,MQVLSSLKSAKNRHEDCQVVRRRGRTFVICKSNPRFKAVQGGKKRK,type B 50S ribosomal protein L36,46
3,4417.121,MSKFLVSTRINRYEKGVYQPNFGILEKMAMALDVPVIYFLAMMNWHK,helix-turn-helix transcriptional regulator,47
4,4417.185,MKRTFQPSVLKRKRTHGFRARMATKNGRQVLARRRAKGRHRLTV,50S ribosomal protein L34,44
...,...,...,...,...
405400,3361.3628,MKVRASVKKLCRNCKIVRREGVVRVICSAEPRHKQRQG,50S ribosomal protein L36,38
405401,3361.3861,MNAQSKIEQHSPIRTDGFEIVEYRASTTAGIAGSLPYLAYRVLGA,hypothetical protein,45
405402,3361.3879,MWYFAWILGVLLACSFGIVNALWLETTQDLDEGEAGDD,cytochrome bd-I oxidase subunit CydX,38
405403,3361.3889,MACSIDRSYIGRIERGEVNITVEKLYRIASLLSCDPASLLPLVSELQG,helix-turn-helix domain-containing protein,48


## 4. create table with primary keys

Summary: it sucks. YOu cannot modify a column to be foreign key, and you cant create the scheme with foreign key and then import data from CSV unless the headers match percetly, so have to parse, rename, save, create scheme then parse again

In [30]:
conn.execute("""
    CREATE OR REPLACE TABLE taxa AS
        SELECT
            "taxid"::INT AS ncbi_taxid,
            "record_name"::STRING AS record_name,
            "filepath"::STRING AS filepath,
            "taxonomy"::STRING AS taxonomy,
            "organism"::STRING AS organism,
            "bacdive_id"::INT AS bacdive_id,
            "ogt_raw":: STRING AS ogt_scraped_string
        FROM read_csv_auto('../data/taxa/taxa_info_and_ogt.csv', header=True)
""")

<duckdb.DuckDBPyConnection at 0x117839b30>

In [31]:
conn.execute("DESCRIBE taxa").df()

Unnamed: 0,column_name,column_type,null,key,default,extra
0,ncbi_taxid,INTEGER,YES,,,
1,record_name,VARCHAR,YES,,,
2,filepath,VARCHAR,YES,,,
3,taxonomy,VARCHAR,YES,,,
4,organism,VARCHAR,YES,,,
5,bacdive_id,INTEGER,YES,,,
6,ogt_scraped_string,VARCHAR,YES,,,


In [2]:
conn = ddb.connect(database="./dbs/test_db")

In [3]:

conn.execute("""
CREATE SEQUENCE seq_taxa_index START 1
""")
conn.execute("""
CREATE OR REPLACE TABLE taxa(
    taxa_index INT PRIMARY KEY default nextval('seq_taxa_index')-1,
    ncbi_taxid INT
)
""")


<duckdb.DuckDBPyConnection at 0x1a76a9bf0>

In [4]:
conn.execute("INSERT INTO taxa(ncbi_taxid) VALUES (1)")

<duckdb.DuckDBPyConnection at 0x1a76a9bf0>

In [5]:
conn.execute("SELECT * FROM taxa").df(
)

Unnamed: 0,taxa_index,ncbi_taxid
0,0,1


Try creating a virtual table with the correct names, THEN using the copy on the actual table with the correct scheme

In [2]:
conn = ddb.connect(database="./dbs/test_db")

In [14]:
conn.execute("""
CREATE OR REPLACE TABLE taxa(
    taxa_index INT PRIMARY KEY NOT NULL,
    ncbi_taxid INT NOT NULL,
    record_name STRING,
    filepath STRING,
    taxonomy STRING,
    organism STRING,
    bacdive_id INT,
    ogt_scraped_string STRING
)
""")

<duckdb.DuckDBPyConnection at 0x1ae194070>

In [7]:
conn.execute("""CREATE OR REPLACE TEMP TABLE taxa_tmp AS 
        SELECT
            "column0"::INT AS taxa_index,
            "taxid"::INT AS ncbi_taxid,
            "record_name"::STRING AS record_name,
            "filepath"::STRING AS filepath,
            "taxonomy"::STRING AS taxonomy,
            "organism"::STRING AS organism,
            "bacdive_id"::INT AS bacdive_id,
            "ogt_raw":: STRING AS ogt_scraped_string
        FROM read_csv_auto('../data/taxa/taxa_info_and_ogt.csv', header=True)""")
conn.execute("COPY taxa_tmp TO 'taxa_tmp.csv' (HEADER)")

<duckdb.DuckDBPyConnection at 0x1ae194070>

In [15]:
conn.execute("COPY taxa FROM 'taxa_tmp.csv' ( HEADER )")

<duckdb.DuckDBPyConnection at 0x1ae194070>

In [17]:
conn.execute("DESCRIBE taxa").df()

Unnamed: 0,column_name,column_type,null,key,default,extra
0,taxa_index,INTEGER,NO,,,
1,ncbi_taxid,INTEGER,NO,,,
2,record_name,VARCHAR,YES,,,
3,filepath,VARCHAR,YES,,,
4,taxonomy,VARCHAR,YES,,,
5,organism,VARCHAR,YES,,,
6,bacdive_id,INTEGER,YES,,,
7,ogt_scraped_string,VARCHAR,YES,,,


In [25]:
conn.execute("CREATE INDEX pkey ON taxa (taxa_index)")

<duckdb.DuckDBPyConnection at 0x1ae194070>

In [26]:
conn.execute("SELECT * FROM duckdb_indexes()").df()

Unnamed: 0,schema_name,schema_oid,index_name,index_oid,table_name,table_oid,is_unique,is_primary,expressions,sql
0,main,1,pkey,1334,taxa,1294,False,False,,CREATE INDEX pkey ON taxa (taxa_index);


In [27]:
conn.execute("SELECT * FROM duckdb_constraints()").df()

Unnamed: 0,schema_name,schema_oid,table_name,table_oid,constraint_index,constraint_type,constraint_text,expression,constraint_column_indexes,constraint_column_names
0,main,1,taxa,1294,0,PRIMARY KEY,PRIMARY KEY(taxa_index),,[0],[taxa_index]
1,main,1,taxa,1294,1,NOT NULL,NOT NULL,,[0],[taxa_index]
2,main,1,taxa,1294,2,NOT NULL,NOT NULL,,[1],[ncbi_taxid]


## 5. Do a foreign key assignment

In [30]:
conn.execute("""
CREATE OR REPLACE TABLE taxa(
    taxa_index INT PRIMARY KEY NOT NULL,
    ncbi_taxid INT NOT NULL,
    record_name STRING,
    filepath STRING,
    taxonomy STRING,
    organism STRING,
    bacdive_id INT,
    ogt_scraped_string STRING
)
""")
conn.execute("""
CREATE OR REPLACE TABLE taxa_16s(
    taxa_index INT PRIMARY KEY NOT NULL,
    seq_16srRNA STRING,
    FOREIGN KEY (taxa_index) REFERENCES taxa(taxa_index) 
)""")

<duckdb.DuckDBPyConnection at 0x1ae194070>

Fill in the taxa table

In [31]:
conn.execute("""CREATE OR REPLACE TEMP TABLE taxa_tmp AS 
        SELECT
            "column0"::INT AS taxa_index,
            "taxid"::INT AS ncbi_taxid,
            "record_name"::STRING AS record_name,
            "filepath"::STRING AS filepath,
            "taxonomy"::STRING AS taxonomy,
            "organism"::STRING AS organism,
            "bacdive_id"::INT AS bacdive_id,
            "ogt_raw":: STRING AS ogt_scraped_string
        FROM read_csv_auto('../data/taxa/taxa_info_and_ogt.csv', header=True)""")
conn.execute("COPY taxa_tmp TO 'taxa_tmp.csv' (HEADER)")
conn.execute("COPY taxa FROM 'taxa_tmp.csv' ( HEADER )")

<duckdb.DuckDBPyConnection at 0x1ae194070>

now fill in the 16s table

In [33]:
conn.execute("COPY taxa_16s FROM '../data/taxa/16s_rRNA.csv' ( HEADER , NULLSTR 'None')")

<duckdb.DuckDBPyConnection at 0x1ae194070>

Try to execute an inner join

In [49]:
df = conn.execute("""
    SELECT (taxa_16s.seq_16srRNA, taxa.ncbi_taxid)
        FROM taxa_16s INNER JOIN taxa
        ON taxa_16s.taxa_index = taxa.taxa_index
    WHERE taxa.ncbi_taxid<100000
""").df()

In [53]:
df = df['main.row(taxa_16s."seq_16srRNA", taxa.ncbi_taxid)'].apply(pd.Series)

In [57]:
df.isna().sum()

seq_16srRNA    0
ncbi_taxid     0
dtype: int64

In [67]:
df = conn.execute("""
    SELECT (taxa_16s.seq_16srRNA)
        FROM taxa_16s INNER JOIN taxa
        ON taxa_16s.taxa_index = taxa.taxa_index
    WHERE taxa.ncbi_taxid<1000000
""").df()

In [68]:
(df['seq_16srRNA'].apply(len) == 0).sum()

0

where are the taxa without sequances?

In [77]:
conn.execute("""
    SELECT * FROM taxa_16s
    LIMIT 10 
""").df()

Unnamed: 0,taxa_index,seq_16srRNA
0,0,ACGATGGAGAGTTTGATCCTGGCTCAGGATGAACGCTAGCGGCAGG...
1,2,CAACTTGAGAGTTTGATCCTGGCTCAGAGCGAACGCTGGCGGCAGG...
2,1,TCAACGGAGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTG...
3,4,CAAATGGAGAGTTTGATCCTGGCTCAGGATGAACGCTGGCGGCATG...
4,11,GAACCTGAGAGTTTGATCCTGGCTCAGAGCGAACGCTGGCGGCATG...
5,8,
6,10,CAACTAAAGAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGCATG...
7,7,ACAATGGAGAGTTTGATCCTGGCTCAGGATGAACGCTAGCGGGAGG...
8,5,TTGTTGGAGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTG...
9,6,GAACTGAAGAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGCATG...


they are there, but dissappeared when we inner joined?

In [79]:
conn.execute("""
    SELECT (taxa_16s.seq_16srRNA)
        FROM taxa_16s INNER JOIN taxa
        ON taxa_16s.taxa_index = taxa.taxa_index
    WHERE taxa_16s.taxa_index=8
""").df()

Unnamed: 0,seq_16srRNA
0,


In [85]:
df = conn.execute("""
    SELECT taxa.taxa_index, taxa_16s.seq_16srRNA
        FROM taxa_16s INNER JOIN taxa
        ON taxa_16s.taxa_index = taxa.taxa_index
""").df()

In [86]:
df.head(10)

Unnamed: 0,taxa_index,seq_16srRNA
0,0,ACGATGGAGAGTTTGATCCTGGCTCAGGATGAACGCTAGCGGCAGG...
1,2,CAACTTGAGAGTTTGATCCTGGCTCAGAGCGAACGCTGGCGGCAGG...
2,1,TCAACGGAGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTG...
3,4,CAAATGGAGAGTTTGATCCTGGCTCAGGATGAACGCTGGCGGCATG...
4,11,GAACCTGAGAGTTTGATCCTGGCTCAGAGCGAACGCTGGCGGCATG...
5,8,
6,10,CAACTAAAGAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGCATG...
7,7,ACAATGGAGAGTTTGATCCTGGCTCAGGATGAACGCTAGCGGGAGG...
8,5,TTGTTGGAGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTG...
9,6,GAACTGAAGAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGCATG...


All looks good...

## 6. Do operations on proteins to extract taxa id

In [38]:
test = conn.execute("""SELECT 
    substr(seq_id, 0, strpos(seq_id, '.'))::INT AS col 
FROM read_csv('../data/taxa/proteins/*.csv', auto_detect=False, header=True, sep=';', columns={'seq_id': 'VARCHAR', 'protein_seq': 'STRING', 'protein_desc': 'STRING', 'protein_len': 'INT'})
LIMIT 11
""").df()

In [39]:
test

Unnamed: 0,col
0,4417
1,4417
2,4417
3,4417
4,4417
5,4417
6,4417
7,4417
8,4417
9,4417


## 7. Put together entire schema

In [58]:
conn = ddb.connect(database="./dbs/test_db")

### TAXA

#### Schema for final table

In [4]:
# schema for taxa
conn.execute("""
CREATE OR REPLACE TABLE taxa(
    taxa_index INT PRIMARY KEY NOT NULL,
    ncbi_taxid INT NOT NULL,
    record_name STRING,
    filepath STRING,
    taxonomy STRING,
    organism STRING,
    bacdive_id INT,
    ogt_scraped_string STRING,
    seq_16srRNA STRING,
    len_16s INT,
    ogt FLOAT,
    thermophile_label BOOL,
)
""")

<duckdb.DuckDBPyConnection at 0x118ac9e70>

#### Create temp tables for each of the three taxa related files

In [5]:
conn.execute("""CREATE OR REPLACE TEMP TABLE taxa_tmp AS 
    SELECT
        "column0"::INT AS taxa_index,
        "taxid"::INT AS ncbi_taxid,
        "record_name"::STRING AS record_name,
        "filepath"::STRING AS filepath,
        "taxonomy"::STRING AS taxonomy,
        "organism"::STRING AS organism,
        "bacdive_id"::INT AS bacdive_id,
        "ogt_raw":: STRING AS ogt_scraped_string
    FROM read_csv_auto('../data/taxa/taxa_info_and_ogt.csv', header=True)""")

<duckdb.DuckDBPyConnection at 0x118ac9e70>

In [6]:
conn.execute("""CREATE OR REPLACE TEMP TABLE taxa_16s_tmp AS 
    SELECT 
        "taxa_index"::INT AS taxa_index_1,
        "seq_16srRNA"::STRING AS seq_16srRNA,
        length(seq_16srRNA)::INT as len_16s
    FROM read_csv_auto('../data/taxa/16s_rRNA.csv', header=True, nullstr='None')""")

<duckdb.DuckDBPyConnection at 0x118ac9e70>

In [7]:
conn.execute("""CREATE OR REPLACE TEMP TABLE taxa_labels_tmp AS 
    SELECT *
    FROM read_csv_auto('../data/taxa/labels.csv', header=True)""")

<duckdb.DuckDBPyConnection at 0x118ac9e70>

#### inner join table into taxa file

In [8]:
conn.execute("""
    COPY (SELECT * EXCLUDE (column0, taxa_index_1) FROM taxa_tmp
        INNER JOIN taxa_16s_tmp ON (taxa_tmp.taxa_index=taxa_16s_tmp.taxa_index_1)
        INNER JOIN taxa_labels_tmp ON (taxa_tmp.taxa_index=taxa_labels_tmp.column0))
    TO 'taxa_joined.csv' WITH (HEADER 1, DELIMITER '|')
""").df()

Unnamed: 0,Count
0,16664


In [9]:
conn.execute("""
    COPY taxa FROM 'taxa_joined.csv' ( HEADER, DELIMITER '|' )""")

<duckdb.DuckDBPyConnection at 0x118ac9e70>

In [10]:
conn.execute("SELECT * FROM taxa LIMIT 10").df()

Unnamed: 0,taxa_index,ncbi_taxid,record_name,filepath,taxonomy,organism,bacdive_id,ogt_scraped_string,seq_16srRNA,len_16s,ogt,thermophile_label
0,0,1968276,NZ_JAAFZH010000001,./data/refseq/bacteria/GCF_010435915.1_ASM1043...,Bacteria Bacteroidetes Cytophagia Cytophagales...,Spirosoma terrae,164169.0,"{'growth': ['25'], 'max': None, 'min': None}",ACGATGGAGAGTTTGATCCTGGCTCAGGATGAACGCTAGCGGCAGG...,79.0,,
1,1,132919,NZ_FNTL01000005,./data/refseq/bacteria/GCF_900105375.1_IMG-tax...,Bacteria Actinobacteria Corynebacteriales Noca...,Rhodococcus jostii,11034.0,"{'growth': ['28', '30', '28'], 'max': None, 'm...",TCAACGGAGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTG...,1518.0,,
2,2,1654716,NZ_LGTB01000001,./data/refseq/bacteria/GCF_001238275.1_ASM1238...,Bacteria Proteobacteria Alphaproteobacteria Hy...,Bradyrhizobium viridifuturi,168770.0,,CAACTTGAGAGTTTGATCCTGGCTCAGAGCGAACGCTGGCGGCAGG...,1489.0,,
3,3,1181879,NZ_JACHMG010000001,./data/refseq/bacteria/GCF_014204865.1_ASM1420...,Bacteria Actinobacteria Pseudonocardiales Pseu...,Amycolatopsis jiangsuensis,23214.0,"{'growth': ['28', '28'], 'max': None, 'min': N...",TTGTTGGAGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTG...,1519.0,,
4,4,31971,NZ_CABKNA010000001,./data/refseq/bacteria/GCF_902373665.1_MGYG-HG...,Bacteria Firmicutes Erysipelotrichia Erysipelo...,Amedibacillus dolichus,5424.0,"{'growth': ['37', '37'], 'max': None, 'min': N...",CAAATGGAGAGTTTGATCCTGGCTCAGGATGAACGCTGGCGGCATG...,1540.0,,
5,5,1763543,NZ_BMNB01000001,./data/refseq/bacteria/GCF_014646235.1_ASM1464...,Bacteria Actinobacteria Micromonosporales Micr...,Verrucosispora sonchi,132025.0,28,TTGTTGGAGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTG...,1517.0,28.0,False
6,6,305,NZ_CP088237,./data/refseq/bacteria/GCF_021117135.1_ASM2111...,Bacteria Proteobacteria Betaproteobacteria Bur...,Ralstonia solanacearum,,,GAACTGAAGAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGCATG...,1536.0,,
7,7,1807141,NZ_VTRU01000001,./data/refseq/bacteria/GCF_008274625.1_ASM8274...,Bacteria Bacteroidetes Flavobacteriia Flavobac...,Chryseobacterium panacisoli,166547.0,,ACAATGGAGAGTTTGATCCTGGCTCAGGATGAACGCTAGCGGGAGG...,1517.0,,
8,8,1121477,NZ_FQVC01000024,./data/refseq/bacteria/GCF_900128975.1_IMG-tax...,Bacteria Proteobacteria Alphaproteobacteria Hy...,Devosia limi DSM 17137,6165.0,28,,,28.0,False
9,9,1178482,NZ_CP013106,./data/refseq/bacteria/GCF_001431725.1_ASM1431...,Bacteria Proteobacteria Gammaproteobacteria Oc...,Halomonas huangheensis,168874.0,,CAACTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGG...,1540.0,,


### PROTEINS

#### create a table no primary keys this time unfortunately

In [11]:
conn.execute("""
CREATE OR REPLACE TABLE proteins AS SELECT 
    substr(seq_id, 0, strpos(seq_id, '.'))::INT AS taxa_index,
    "seq_id"::STRING AS protein_index,
    "protein_seq"::STRING AS protein_seq,
    "protein_desc"::STRING AS protein_desc,
    "protein_len"::INT AS protein_len
FROM read_csv('../data/taxa/proteins/taxa_*.csv', auto_detect=False, header=True, sep=';', columns={'seq_id': 'VARCHAR', 'protein_seq': 'STRING', 'protein_desc': 'STRING', 'protein_len': 'INT'})
""").df()

Unnamed: 0,Count
0,65427472


In [12]:
conn.execute("CREATE SEQUENCE protein_int_index_seq START 1")
conn.execute("""
    ALTER TABLE proteins ADD COLUMN protein_int_index INT DEFAULT nextval('protein_int_index_seq')-1
""")

<duckdb.DuckDBPyConnection at 0x118ac9e70>

In [13]:
conn.execute("""SELECT COUNT(*) FROM proteins""").df()

Unnamed: 0,count_star()
0,65427472


In [14]:
conn.execute("""SELECT COUNT(DISTINCT taxa_index) FROM proteins""").df()

Unnamed: 0,count(DISTINCT taxa_index)
0,16664


In [15]:
conn.execute("""SELECT COUNT(DISTINCT protein_int_index) FROM proteins""").df()

Unnamed: 0,count(DISTINCT protein_int_index)
0,65427472


All is well.

### TAXA PAIRS

### Create temp tables and inner join to make pairs

In [16]:
conn.execute("""
    CREATE OR REPLACE TEMP TABLE taxa_pairs_tmp AS SELECT 
        *
    FROM read_csv_auto('../data/taxa_pairs/pairwise_16s_blast.csv', header=True)
""")

<duckdb.DuckDBPyConnection at 0x118ac9e70>

In [17]:
# now get labels, we might as well make this one table now
conn.execute("""
    CREATE OR REPLACE TEMP TABLE taxa_pair_labels_tmp AS SELECT 
        column0:: INT AS taxa_pair_index,
        is_pair:: BOOL AS is_pair
    FROM read_csv_auto('../data/taxa_pairs/pair_labels.csv', header=True)
""")

<duckdb.DuckDBPyConnection at 0x118ac9e70>

In [18]:
conn.execute("CREATE SEQUENCE taxa_pair_id_seq START 1")

<duckdb.DuckDBPyConnection at 0x118ac9e70>

In [19]:
# add an index to the taxa pairs table which doesn't have one
conn.execute("""
    ALTER TABLE taxa_pairs_tmp ADD COLUMN taxa_pair_index INT DEFAULT nextval('taxa_pair_id_seq')-1
""")

<duckdb.DuckDBPyConnection at 0x118ac9e70>

In [20]:
# join the two tables and create a true table
conn.execute("""
    CREATE OR REPLACE TABLE taxa_pairs AS SELECT * FROM taxa_pairs_tmp
        INNER JOIN taxa_pair_labels_tmp ON (taxa_pairs_tmp.taxa_pair_index=taxa_pair_labels_tmp.taxa_pair_index)
    
""")

<duckdb.DuckDBPyConnection at 0x118ac9e70>

In [21]:
# join the two tables and create a true table
conn.execute("""
    ALTER TABLE taxa_pairs DROP "taxa_pair_index:1"
""").df()

Unnamed: 0,Success


In [22]:
conn.execute("""SELECT * FROM taxa_pairs WHERE is_pair LIMIT 12""").df()

Unnamed: 0,thermo_index,meso_index,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,local_E_value,query_align_start,query_align_end,subject_align_end,subject_align_start,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,taxa_pair_index,is_pair
0,16361,16481,0.944407,0.946975,0.94601,0.0,1,1471,1474,1,1471,1.0,1474,1.0,1222.0,740239,True
1,16361,16385,0.943729,0.946295,0.945331,0.0,1,1471,1474,1,1471,1.0,1474,1.0,1219.0,740240,True
2,16361,16460,0.933018,0.937458,0.936821,0.0,1,1471,1473,1,1471,1.0,1473,1.0,1167.0,740241,True
3,16361,16345,0.934371,0.938817,0.938179,0.0,1,1471,1473,1,1471,1.0,1473,1.0,1167.0,740242,True
4,16361,16211,0.931525,0.934058,0.933107,0.0,1,1471,1474,1,1471,1.0,1474,1.0,1161.0,740243,True
5,16361,16315,0.924068,0.926581,0.925322,0.0,1,1471,1475,1,1471,1.0,1475,1.0,1121.0,740244,True
6,16361,16272,0.905341,0.910265,0.909647,0.0,1,1470,1472,1,1470,0.99932,1472,0.999321,1039.0,740245,True
7,16361,16296,0.905213,0.908906,0.908597,0.0,1,1470,1471,1,1470,0.99932,1471,0.999321,1036.0,740246,True
8,16361,16340,0.906567,0.910265,0.909338,0.0,1,1471,1474,1,1471,1.0,1474,1.0,1026.0,740247,True
9,16361,16115,0.899324,0.904827,0.903598,0.0,1,1470,1474,1,1470,0.99932,1474,0.999322,1015.0,740248,True


### PROTEIN PAIRS

In [23]:
conn.execute("""
CREATE OR REPLACE TABLE protein_pairs AS SELECT  
    "thermo_protein_id"::STRING AS thermo_protein_index,
    "meso_protein_id"::STRING AS meso_protein_index,
    "local_gap_compressed_percent_id"::FLOAT AS local_gap_compressed_percent_id,
    "scaled_local_query_percent_id"::FLOAT AS scaled_local_query_percent_id,
    "scaled_local_symmetric_percent_id"::FLOAT AS scaled_local_symmetric_percent_id,
    "local_E_value"::FLOAT AS local_E_value,
    "query_align_start"::INT AS query_align_start,
    "query_align_end"::INT AS query_align_end,
    "subject_align_end"::INT AS subject_align_end,
    "subject_align_start"::INT AS subject_align_start,
    "query_align_len"::INT AS query_align_len,
    "query_align_cov"::FLOAT AS query_align_cov,
    "subject_align_len"::INT AS subject_align_len,
    "subject_align_cov"::FLOAT AS subject_align_cov,
    "bit_score"::INT AS bit_score,
    substr(thermo_protein_id, 0, strpos(thermo_protein_id, '.'))::INT AS thermo_index,
    substr(meso_protein_id, 0, strpos(meso_protein_id, '.'))::INT AS meso_index,
FROM read_csv('../data/taxa_pairs/protein_alignment/taxa_pair*.csv', auto_detect=False, header=True, sep=',', columns={
    'column0': 'INT',
    'thermo_protein_id': 'STRING',
    'meso_protein_id': 'STRING',
    'local_gap_compressed_percent_id': 'FLOAT',
    'scaled_local_query_percent_id': 'FLOAT',
    'scaled_local_symmetric_percent_id': 'FLOAT',
    'local_E_value': 'FLOAT',
    'query_align_start': 'INT',
    'query_align_end': 'INT',
    'subject_align_end': 'INT',
    'subject_align_start': 'INT',
    'query_align_len': 'INT',
    'query_align_cov': 'FLOAT',
    'subject_align_len': 'INT',
    'subject_align_cov': 'FLOAT',
    'bit_score': 'INT',
})
""").df()

Unnamed: 0,Count
0,181500502


create a new id - the 0th column in these files doesn't mean anything

In [24]:
conn.execute("CREATE SEQUENCE protein_pair_id_seq START 1")

<duckdb.DuckDBPyConnection at 0x118ac9e70>

In [25]:
conn.execute("""
    ALTER TABLE protein_pairs ADD COLUMN prot_pair_index INT DEFAULT nextval('protein_pair_id_seq')-1
""")

<duckdb.DuckDBPyConnection at 0x118ac9e70>

Need to add meso and thermo protein indexes that are not strings to actually make this fast

In [26]:
conn.execute("""
    ALTER TABLE protein_pairs ADD COLUMN meso_protein_int_index INT;
    ALTER TABLE protein_pairs ADD COLUMN thermo_protein_int_index INT;
""")

<duckdb.DuckDBPyConnection at 0x118ac9e70>

In [27]:
conn.execute("""
    UPDATE protein_pairs SET meso_protein_int_index=(
        SELECT proteins.protein_int_index
        FROM proteins
        WHERE protein_pairs.meso_protein_index=proteins.protein_index
    )
""")

<duckdb.DuckDBPyConnection at 0x118ac9e70>

In [28]:
conn.execute("""
    UPDATE protein_pairs SET thermo_protein_int_index=(
        SELECT proteins.protein_int_index
        FROM proteins
        WHERE protein_pairs.thermo_protein_index=proteins.protein_index
    )
""")

<duckdb.DuckDBPyConnection at 0x118ac9e70>

In [29]:
conn.execute("""SELECT COUNT(*) FROM protein_pairs""").df()

Unnamed: 0,count_star()
0,181500502


In [30]:
conn.execute("""SELECT COUNT(DISTINCT prot_pair_index) FROM protein_pairs""").df()

Unnamed: 0,count(DISTINCT prot_pair_index)
0,181500502


In [31]:
conn.execute("""SELECT COUNT(DISTINCT meso_protein_int_index) FROM protein_pairs""").df()

Unnamed: 0,count(DISTINCT meso_protein_int_index)
0,6410621


In [33]:
conn.execute("""SELECT COUNT(DISTINCT thermo_protein_int_index) FROM protein_pairs""").df()

Unnamed: 0,count(DISTINCT thermo_protein_int_index)
0,301597


### CREATE INDEXES

#### primaries

In [34]:
conn.execute("CREATE UNIQUE INDEX taxa_primary ON taxa (taxa_index)")

<duckdb.DuckDBPyConnection at 0x118ac9e70>

In [35]:
conn.execute("CREATE UNIQUE INDEX protein_primary ON proteins (protein_int_index)")

<duckdb.DuckDBPyConnection at 0x118ac9e70>

In [36]:
conn.execute("CREATE UNIQUE INDEX taxa_pair_primary ON taxa_pairs (taxa_pair_index)")

<duckdb.DuckDBPyConnection at 0x118ac9e70>

In [37]:
conn.execute("CREATE UNIQUE INDEX prot_pair_primary ON protein_pairs (prot_pair_index)")

<duckdb.DuckDBPyConnection at 0x118ac9e70>

#### foreign indexes

In [38]:
conn.execute("CREATE INDEX protein_to_taxa ON proteins (taxa_index)")

<duckdb.DuckDBPyConnection at 0x118ac9e70>

In [39]:
conn.execute("CREATE INDEX taxa_pair_to_meso ON taxa_pairs (meso_index)")
conn.execute("CREATE INDEX taxa_pair_to_thermo ON taxa_pairs (thermo_index)")

<duckdb.DuckDBPyConnection at 0x118ac9e70>

In [60]:
conn.execute("CREATE INDEX taxa_pair_both ON taxa_pairs (meso_index, thermo_index)")

<duckdb.DuckDBPyConnection at 0x1af149a70>

In [40]:
conn.execute("CREATE INDEX prot_pair_to_meso ON protein_pairs (meso_index)")
conn.execute("CREATE INDEX prot_pair_to_thermo ON protein_pairs (thermo_index)")

conn.execute("CREATE INDEX prot_pair_to_meso_prot ON protein_pairs (meso_protein_int_index)")
conn.execute("CREATE INDEX prot_pair_to_thermo_prot ON protein_pairs (thermo_protein_int_index)")

<duckdb.DuckDBPyConnection at 0x118ac9e70>

In [82]:
conn.execute("SELECT * FROM duckdb_indexes()").df()

Unnamed: 0,schema_name,schema_oid,index_name,index_oid,table_name,table_oid,is_unique,is_primary,expressions,sql
0,main,1,prot_pair_to_meso_prot,1425,protein_pairs,1422,False,False,,CREATE INDEX prot_pair_to_meso_prot ON protein...
1,main,1,prot_pair_primary,1423,protein_pairs,1422,True,False,,CREATE UNIQUE INDEX prot_pair_primary ON prote...
2,main,1,taxa_pair_to_thermo,1388,taxa_pairs,837,False,False,,CREATE INDEX taxa_pair_to_thermo ON taxa_pairs...
3,main,1,taxa_pair_to_meso,1386,taxa_pairs,837,False,False,,CREATE INDEX taxa_pair_to_meso ON taxa_pairs (...
4,main,1,protein_to_taxa,1384,proteins,831,False,False,,CREATE INDEX protein_to_taxa ON proteins (taxa...
5,main,1,prot_pair_to_thermo,1431,protein_pairs,1422,False,False,,CREATE INDEX prot_pair_to_thermo ON protein_pa...
6,main,1,taxa_pair_primary,1372,taxa_pairs,837,True,False,,CREATE UNIQUE INDEX taxa_pair_primary ON taxa_...
7,main,1,prot_pair_to_meso,1429,protein_pairs,1422,False,False,,CREATE INDEX prot_pair_to_meso ON protein_pair...
8,main,1,protein_primary,1370,proteins,831,True,False,,CREATE UNIQUE INDEX protein_primary ON protein...
9,main,1,prot_pair_to_thermo_prot,1427,protein_pairs,1422,False,False,,CREATE INDEX prot_pair_to_thermo_prot ON prote...


## Test some queries

#### taxa pairs and protein pairs

In [42]:
# join taxa and protein pairs
test = conn.execute("""
    SELECT taxa_pair_index, prot_pair_index, protein_pairs.local_E_value, taxa_pairs.is_pair FROM protein_pairs
    INNER JOIN taxa_pairs ON (protein_pairs.meso_index=taxa_pairs.meso_index AND protein_pairs.thermo_index=taxa_pairs.thermo_index)
    LIMIT 10
""").df()
test

Unnamed: 0,taxa_pair_index,prot_pair_index,local_E_value,is_pair
0,652803,0,0.0,True
1,652803,1,7.4e-33,True
2,652803,2,0.0,True
3,652803,3,0.408,True
4,652803,4,0.364,True
5,652803,5,0.0,True
6,652803,6,0.0,True
7,652803,7,0.0,True
8,652803,8,0.0,True
9,652803,9,2.326155e-43,True


In [43]:
# should have number equal to number of protein pairs
conn.execute("""
    SELECT COUNT(taxa_pair_index) FROM protein_pairs
    INNER JOIN taxa_pairs ON (protein_pairs.meso_index=taxa_pairs.meso_index AND protein_pairs.thermo_index=taxa_pairs.thermo_index)
""").df()

Unnamed: 0,count(taxa_pair_index)
0,181500502


In [44]:
# should have number equal to or less than number taxa pairs
conn.execute("""
    SELECT COUNT(DISTINCT taxa_pair_index) FROM protein_pairs
    INNER JOIN taxa_pairs ON (protein_pairs.meso_index=taxa_pairs.meso_index AND protein_pairs.thermo_index=taxa_pairs.thermo_index)
""").df()

Unnamed: 0,count(DISTINCT taxa_pair_index)
0,13781


In [45]:
conn.execute("SELECT COUNT(taxa_pair_index) FROM taxa_pairs WHERE is_pair").df()

Unnamed: 0,count(taxa_pair_index)
0,13784


close, looks like 3 taxa pairs failed or got no proteins


#### protein pairs to proteins

In [46]:
# run a basic filter of protein pair metrics
conn.execute("""
    SELECT COUNT(prot_pair_index) FROM protein_pairs
    WHERE
        protein_pairs.local_E_Value< 1e-46
        AND protein_pairs.query_align_cov > 0.95
        AND protein_pairs.subject_align_cov > 0.95
        AND protein_pairs.scaled_local_symmetric_percent_id > 0.7
""").df()

Unnamed: 0,count(prot_pair_index)
0,1758286


In [47]:
# look at the protein indexes of the filter
conn.execute("""
    SELECT 
        protein_pairs.meso_protein_index,
        protein_pairs.thermo_protein_index,
        protein_pairs.meso_protein_int_index,
        protein_pairs.thermo_protein_int_index
    FROM protein_pairs
    WHERE
        protein_pairs.local_E_Value< 1e-46
        AND protein_pairs.query_align_cov > 0.95
        AND protein_pairs.subject_align_cov > 0.95
        AND protein_pairs.scaled_local_symmetric_percent_id > 0.7
    LIMIT 10
""").df()

Unnamed: 0,meso_protein_index,thermo_protein_index,meso_protein_int_index,thermo_protein_int_index
0,4886.1509,14963.324,17226524,28678489
1,4886.1507,14963.327,17226522,28678492
2,4886.1129,14963.649,17226144,28678814
3,4886.2204,14963.1008,17227219,28679173
4,4886.1765,14963.1048,17226780,28679213
5,4886.279,14963.1352,17225294,28679517
6,4886.3305,14963.1814,17228320,28679979
7,4886.1277,14963.209,17226292,28680255
8,4886.2503,14963.2668,17227518,28680833
9,4886.132,14963.316,17225147,28681325


In [48]:
# show the protein sequences for the filter
test = conn.execute("""
    SELECT 
        pr_t.protein_seq AS thermo_seq,
        pr_m.protein_seq AS meso_seq,
        protein_pairs.meso_protein_index,
        protein_pairs.thermo_protein_index,
        protein_pairs.query_align_cov,
        protein_pairs.subject_align_cov
    FROM protein_pairs
        INNER JOIN proteins AS pr_t ON protein_pairs.thermo_protein_int_index=pr_t.protein_int_index
        INNER JOIN proteins AS pr_m ON protein_pairs.meso_protein_int_index=pr_m.protein_int_index
    WHERE
        protein_pairs.local_E_Value< 1e-46
        AND protein_pairs.query_align_cov > 0.95
        AND protein_pairs.subject_align_cov > 0.95
        AND protein_pairs.scaled_local_symmetric_percent_id > 0.7
    LIMIT 10
""").df()

In [49]:
test.iloc[0]

thermo_seq              MKITEIELFTVPPRWLFLKISTDEGITGWGEPVVEGRADTVAAAVR...
meso_seq                MKITRFETFIVPPRWLFLKIETDEGISGWGEPVVEGKAHTVQAAVE...
meso_protein_index                                              3950.3409
thermo_protein_index                                            10208.908
query_align_cov                                                       1.0
subject_align_cov                                                     1.0
Name: 0, dtype: object

Let's go check in the files that we did not screw everything up too bad...

In [50]:
pd.Series( 
    index='thermo_protein_id,meso_protein_id,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,local_E_value,query_align_start,query_align_end,subject_align_end,subject_align_start,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score'.split(','),
    data='10208.908,3950.3409,0.7356020942408377,0.7356020942408377,0.7356020942408377,2.71e-217,1,382,382,1,382,1.0,382,1.0,1537.0'.split(','))

thermo_protein_id                             10208.908
meso_protein_id                               3950.3409
local_gap_compressed_percent_id      0.7356020942408377
scaled_local_query_percent_id        0.7356020942408377
scaled_local_symmetric_percent_id    0.7356020942408377
local_E_value                                 2.71e-217
query_align_start                                     1
query_align_end                                     382
subject_align_end                                   382
subject_align_start                                   1
query_align_len                                     382
query_align_cov                                     1.0
subject_align_len                                   382
subject_align_cov                                   1.0
bit_score                                        1537.0
dtype: object

The scores are definately within the filter. Let's look at the original sequence.

MKITEIELFTVPPRWLFLKISTDEGITGWGEPVVEGRADTVAAAVRELEPCLIGNDPSRIEDLWQVLYRGGFYRGGPVMMSAIAGIDQALWDIKGQRYGLPVYEFLGGAAREKVRVYSWIGGDRPTDVGRAAAEKKEQGFNAIKMNASGEMNYIDSFSKVEAIVERVAAVREATGPDFGIAVDFHGRIHRAMAKVVAKELEPYRLMFIEEPVLPENNEALREVARHTSTPIATGERMYSRWDFKGLLEDGYVDIIQPDLSHAGGISEVKKIAAMAEAYDVTVAPHCPLGPVALASCLQLDACTPNVFIQEQSLGIHYNQESDLLDYLEDPTVFQYEDGYVAIPEGPGLGIRVDESVVREAARRGHKWKNPIWRNADGTVAEW

MKITRFETFIVPPRWLFLKIETDEGISGWGEPVVEGKAHTVQAAVEELMDYLIGQDPQRIEDLWQLMYRGGFYRGGAILMSAIAGIDQALWDIKGKIYNAPVYQLLGGACRNTMRVYSWVGGDRPIDVVQAALEKKAAGFTAIKMNASEEMQFIDTHDKIYAIVERVAAIREACGPEFGIAVDFHGRLHKPMARGLARELDPYRLMFIEEPVLPENNEVLREIAHHTSTPIATGERMYSRWEFKNLLKDGVVDIIQPDLSHAGGITECKKIFAMAEAFDVAVAPHCPLGPIALAACLQVDATSYNAVIQEQSLGIHYNQGNDLLDYITDPTVFAYSDGHVHIPSGPGLGITVNEEYVRKMAEAGHRWRNPVWRHRDGSIAEW

They look the same to me!

###  filter by both taxa pair and protein pair

In [51]:
conn.execute("""
    SELECT 
        protein_pairs.prot_pair_index
    FROM protein_pairs
    INNER JOIN taxa_pairs ON (protein_pairs.meso_index=taxa_pairs.meso_index AND protein_pairs.thermo_index=taxa_pairs.thermo_index)
    WHERE
        taxa_pairs.query_align_len>1300
        AND taxa_pairs.subject_align_len>1300
        AND protein_pairs.local_E_Value< 1e-46
        AND protein_pairs.query_align_cov > 0.95
        AND protein_pairs.subject_align_cov > 0.95
        AND protein_pairs.scaled_local_symmetric_percent_id > 0.7
""").df()

Unnamed: 0,prot_pair_index
0,10299
1,10300
2,10316
3,10318
4,10320
...,...
1738393,179880601
1738394,179880788
1738395,179880791
1738396,179880792


In [52]:
# repeat above but print out the 16s sequence
test = conn.execute("""
    SELECT 
        protein_pairs.prot_pair_index,
        taxa_m.seq_16srRNA AS meso_16s,
        taxa_t.seq_16srRNA AS thermo_16s,
    FROM protein_pairs
    INNER JOIN taxa_pairs ON (protein_pairs.meso_index=taxa_pairs.meso_index AND protein_pairs.thermo_index=taxa_pairs.thermo_index)
    INNER JOIN taxa AS taxa_m ON (taxa_pairs.meso_index=taxa_m.taxa_index)
    INNER JOIN taxa AS taxa_t ON (taxa_pairs.thermo_index=taxa_t.taxa_index)
    WHERE
        taxa_pairs.query_align_len>1300
        AND taxa_pairs.subject_align_len>1300
        AND protein_pairs.local_E_Value< 1e-46
        AND protein_pairs.query_align_cov > 0.95
        AND protein_pairs.subject_align_cov > 0.95
        AND protein_pairs.scaled_local_symmetric_percent_id > 0.7
""").df()

In [53]:
test['thermo_16s'].apply(len).min()

1348

In [54]:
test['meso_16s'].apply(len).min()

1318

Dope, we sucessfully filtered out anything with meso and thermo 16s alignments using less that 1300 BPs

In [56]:
# repeat above but print out what we would need for ML, eg. protein sequence and OGT
test = conn.execute("""
    SELECT 
        protein_pairs.prot_pair_index,
        proteins_m.protein_seq AS meso_seq,
        proteins_t.protein_seq AS thermo_seq,
        taxa_m.ogt AS meso_ogt,
        taxa_t.ogt AS thermo_ogt,
        protein_pairs.scaled_local_symmetric_percent_id
    FROM protein_pairs
    INNER JOIN taxa AS taxa_m ON (protein_pairs.meso_index=taxa_m.taxa_index)
    INNER JOIN taxa AS taxa_t ON (protein_pairs.thermo_index=taxa_t.taxa_index)
    INNER JOIN proteins AS proteins_m ON (protein_pairs.meso_protein_int_index=proteins_m.protein_int_index)
    INNER JOIN proteins AS proteins_t ON (protein_pairs.thermo_protein_int_index=proteins_t.protein_int_index)
    WHERE
        taxa_m.len_16s>1300
        AND taxa_t.len_16s>1300
        AND protein_pairs.local_E_Value< 1e-46
        AND protein_pairs.query_align_cov > 0.95
        AND protein_pairs.subject_align_cov > 0.95
        AND protein_pairs.scaled_local_symmetric_percent_id > 0.7
""").df()

In [57]:
test

Unnamed: 0,prot_pair_index,meso_seq,thermo_seq,meso_ogt,thermo_ogt,scaled_local_symmetric_percent_id
0,173553002,MPTLNEPKLIAGNANLPLAQSITRRMSMHRGVDQGLVDARVERFND...,MPAVTEPKLIAGNANLSLAKSIARRMSMHRGMSVNLVDARVERFND...,24.0,42.5,0.758112
1,173552983,MPDSNGADIKVGIIMGSQSDWPTMKEAATILDELGVAYEAKIVSAH...,MAVSVGIIMGSQSDWPTMKAAAEILDELGIAYEAKIVSAHRTPDRL...,24.0,42.5,0.815951
2,173551535,MIGRLNHVAIAVPDLDAASAQYKNTLGANVGAPQDEPDHGVTVVFI...,MIGRLNHVAIAVPDLDAAADQYRNTLGAKVGAPQPEPDHGVTVVFI...,24.0,42.5,0.917910
3,173551557,MSWTDERVELLKKMWGEGQSASQIAKELGGVTRNAVIGKVHRLGLS...,MSWTDERVETLKRMWGEGQSASQIAKELGGVTRNAVIGKVHRLGLS...,24.0,42.5,0.738155
4,15222340,MPKINGNEIRPGNVLEHNGGLWAAVKVDHVKPGKGGAFAQVEMRNL...,MPKINGNEIRPGNVLEHNGGLWSVMKVEHVKPGKGGAYAQVEMRNL...,24.0,54.0,0.844920
...,...,...,...,...,...,...
1738489,16259326,MTVLRTIVLFGLAAVAEIGGAWLIRQGVREQRGWGWMGSGVIALGL...,MLILRSAALFVLAAILEIGGAWLVWQGVREHRGWMWAAGGVLALGA...,28.0,45.0,0.702703
1738490,58837791,MKYSGWPDSGELVVGEVDEITDFGVFVDLEEYEDKRGLCHISEVAN...,MKYEGWPEPGELVVGKVDEIEDFGVFVDLEEYRDKRGLVHISEVAS...,30.0,42.5,0.726592
1738491,43501488,MTHVVTENCIKCRYTDCVDVCPVDCFREGPNFLAIDPDECIDCAVC...,MTHVVTENCINCKYTDCVDVCPVDCFKEGPNFLVIDPDECIDCAVC...,30.0,48.0,0.768519
1738492,143509651,MTHIVTEACIKCKYTDCVDVCPVDCFREGPNFLTIDPDECIDCAVC...,MTHVVTENCINCKYTDCVDVCPVDCFKEGPNFLVIDPDECIDCAVC...,25.0,48.0,0.779817
