In [3]:
import glob
import psycopg2
from Bio import SeqIO
import gzip
import os


# Debug switch. A `global` statement is a no-op at module (cell) level, so the
# name is simply assigned here.
# NOTE(review): `testing` is not read in the visible cells — presumably used by
# cells outside this view; confirm before removing.
testing = 0

# Anchor the relative paths below to the directory the kernel is running in.
# The earlier expression, os.path.dirname(os.path.abspath("__file__")), passed
# "__file__" as a plain string (not the __file__ variable), so it always
# resolved to the current working directory; os.getcwd() states that directly.
current_directory = os.getcwd()
# No-op while current_directory is the cwd; kept so that pointing
# current_directory at another folder re-anchors every relative path below.
os.chdir(current_directory)

# Input file locations (relative to current_directory)
pathways_input = './create_SQL_databases/input_files/metacyc_pathways'
rxn_input = './create_SQL_databases/input_files/metacyc_reactions_level4ec_only.uniref'
chocophlan_input = '.../chocophlan_v201901_v31/chocophlan/'  # placeholder: replace with the real location after downloading the database
ip2mc_input = './create_SQL_databases/input_files/interpro2metacyc.txt'
ip2ec_input = './create_SQL_databases/input_files/interpro2ec.txt'
metadata_input = './create_SQL_databases/input_files/Curatedmetadata.txt'
abundance_input = './create_SQL_databases/input_files/CuratedAbundance.txt'

In [2]:
# update taxonomy with species that are in abundance and not in chocophlan
def update_taxonomy(abundance_input, conn):
    """Insert the species listed in the abundance file into ds_mgpt_mgpt.taxonomy.

    The file at ``abundance_input`` is read as tab-separated text. Its first line
    is treated as a header and skipped. Every other non-blank line must hold
    exactly four fields: a ``|``-separated lineage string, a sample id, a count
    and a species label. Each lineage element and the species label are expected
    in ``<prefix>__<Name>`` form; the part after ``__`` is kept and underscores
    are replaced by spaces.

    One row is collected per distinct species label (first occurrence wins),
    built from lineage positions 6, 5, 4, 1 and 0, which are written to the
    columns species, genus, family, phylum and kingdom respectively. Rows are
    inserted with ``ON CONFLICT (species) DO NOTHING`` and committed once at
    the end.

    Parameters
    ----------
    abundance_input : str
        Path of the tab-separated abundance file.
    conn
        Open database connection providing ``cursor()`` and ``commit()``
        (a psycopg2 connection in this notebook).

    Raises
    ------
    ValueError
        If a non-blank data line does not split into exactly four fields.
    IndexError
        If a lineage has fewer than seven elements or an element lacks ``__``.
    """
    def _rank_name(field):
        # "s__Blautia_obeum" -> "Blautia obeum"
        return field.split('__')[1].replace('_', ' ')

    seen_species = set()
    taxa_rows = []

    with open(abundance_input, 'r') as file:
        next(file, None)  # skip the header line (tolerates an empty file)
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at the end of the file)
                # carry no record and would otherwise break the unpacking below.
                continue
            lineage, _sample_id, _count, species = line.split('\t')
            species = _rank_name(species)
            if species in seen_species:
                continue
            seen_species.add(species)
            ranks = lineage.split('|')
            # Column order of the INSERT: species, genus, family, phylum, kingdom
            taxa_rows.append(tuple(_rank_name(ranks[i]) for i in (6, 5, 4, 1, 0)))

    # Parameterised insert; species already present in the table are left untouched
    insert_query = """
        INSERT INTO ds_mgpt_mgpt.taxonomy (species, genus, family, phylum, kingdom)
        VALUES (%s, %s, %s, %s, %s)
        ON CONFLICT (species) DO NOTHING;
    """

    cur = conn.cursor()
    try:
        for row in taxa_rows:
            cur.execute(insert_query, row)
        conn.commit()
    finally:
        # Release the cursor even when an insert fails
        cur.close()


In [7]:
# Update taxonomy with species that are in CHOCOPHLAN and not in taxonomy yet.
# These have been identified with the query:
#
#     SELECT DISTINCT c.taxa_string
#     FROM ds_mgpt_mgpt.chocophlan AS c
#     LEFT JOIN ds_mgpt_mgpt.taxonomy AS t ON t.species = c.species
#     WHERE t.species IS NULL;
#
# This step is necessary to allow the foreign key settings.
# NOTE(review): presumably the query result was saved to the file below — confirm.

extra_taxa_input = "./create_SQL_databases/postgreSQL_database_script/extra_taxa.txt"

def update_taxonomy2(extra_taxa_input, conn):
    """Insert the lineages listed in ``extra_taxa_input`` into ds_mgpt_mgpt.taxonomy.

    The file is read line by line with no header. Every non-blank line is one
    lineage string whose elements are separated by ``.``. Each element is
    expected in ``<prefix>__<Name>`` form; the part after ``__`` is kept and
    underscores are replaced by spaces.

    Lineage positions 6, 5, 4, 1 and 0 are written to the columns species,
    genus, family, phylum and kingdom respectively. Lines are not de-duplicated
    here; repeated species are absorbed by ``ON CONFLICT (species) DO NOTHING``.
    All inserts are committed once at the end.

    Parameters
    ----------
    extra_taxa_input : str
        Path of the text file holding one ``.``-separated lineage per line.
    conn
        Open database connection providing ``cursor()`` and ``commit()``
        (a psycopg2 connection in this notebook).

    Raises
    ------
    IndexError
        If a lineage has fewer than seven elements or an element lacks ``__``.
    """
    def _rank_name(field):
        # "s__Blautia_obeum" -> "Blautia obeum"
        return field.split('__')[1].replace('_', ' ')

    taxa_rows = []

    with open(extra_taxa_input, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at the end of the file)
                # hold no lineage and would otherwise fail on the indexing below.
                continue
            ranks = line.split('.')
            # Column order of the INSERT: species, genus, family, phylum, kingdom
            taxa_rows.append(tuple(_rank_name(ranks[i]) for i in (6, 5, 4, 1, 0)))

    # Parameterised insert; species already present in the table are left untouched
    insert_query = """
        INSERT INTO ds_mgpt_mgpt.taxonomy (species, genus, family, phylum, kingdom)
        VALUES (%s, %s, %s, %s, %s)
        ON CONFLICT (species) DO NOTHING;
    """

    cur = conn.cursor()
    try:
        for row in taxa_rows:
            cur.execute(insert_query, row)
        conn.commit()
    finally:
        # Release the cursor even when an insert fails
        cur.close()

In [8]:
# Connect to the database and run the function that fills the table of interest.

hostname = '' # replace with hostname
database = 'datalake' # replace with database name
username = '' # replace with username
port_id = 5434
conn = None
# Nothing in this cell assigns a cursor: update_taxonomy2 opens and closes its
# own. The name is kept (as None) in case other cells reference it.
cur = None
try:
    conn = psycopg2.connect(host = hostname,
                           dbname = database,
                           user = username,
                           port = port_id)

    update_taxonomy2(extra_taxa_input, conn)

# Catch any error so the reason for a failed connection or insert is shown
except Exception as error:
    print(error)
else:
    # Report success only when no exception was raised; previously 'done' was
    # printed even after a failure, which made a failed run look successful.
    print('done')
finally:
    # Always release the connection; closing discards any uncommitted work
    if conn is not None:
        conn.close()

done
