In [103]:
import sqlite3

# Open (or create) the SQLite database file and get a cursor for the DDL below.
conn = sqlite3.connect('my.db')
c = conn.cursor()

# Start from a clean slate on every run. IF EXISTS lets this cell run against
# a database file that does not contain the tables yet; a plain DROP TABLE
# raises sqlite3.OperationalError in that case.
c.execute("""DROP TABLE IF EXISTS Gene_Table""")
c.execute("""DROP TABLE IF EXISTS Pathway_Table""")
c.execute("""DROP TABLE IF EXISTS Enzyme_Table""")

# The id columns are declared INTEGER PRIMARY KEY (not INT PRIMARY KEY): only
# the exact type name INTEGER makes the column an alias for SQLite's rowid, so
# rows inserted without an explicit id receive one automatically instead of NULL.
# NOTE(review): transseq and ECnum are declared INT; if they are meant to hold a
# translated sequence / dotted EC number, TEXT would describe them better - confirm.
c.execute("""CREATE TABLE Gene_Table (id INTEGER PRIMARY KEY ASC,
                                        name TEXT, 
                                        description TEXT, 
                                        organism TEXT,
                                        ntsequence TEXT, 
                                        chromosome TEXT, 
                                        startpos INT, 
                                        endpos INT, 
                                        strand VARCHAR(1), 
                                        transseq INT);""")

c.execute("""CREATE TABLE Pathway_Table (id INTEGER PRIMARY KEY ASC,
                                        name TEXT, 
                                        description TEXT);""")

c.execute("""CREATE TABLE Enzyme_Table (id INTEGER PRIMARY KEY ASC,
                                        name TEXT, 
                                        function TEXT, 
                                        ECnum INT);""")

# Persist the schema changes to disk.
conn.commit()

In [106]:
from Bio import Entrez

# Contact e-mail that Biopython sends along with Entrez requests.
Entrez.email = "ngoodwin97@berkeley.edu"

# Adding data to Pathway_Table: look up each pathway in the "biosystems"
# database and print the XML record of the most relevant hit.

# Search terms, one per pathway ("citric" was previously misspelled "citic").
pathways = ["glycolysis", "TCA cycle (citric acid cycle)", "pentose phosphate cycle, pathway"]

for pathway_term in pathways:
    # Search first; the result's "IdList" holds the matching record ids.
    search_handle = Entrez.esearch(db="biosystems", term=pathway_term, sort="relevance", idtype="acc")
    try:
        id_list = Entrez.read(search_handle)["IdList"]
    finally:
        # Always release the network handle, even if parsing fails.
        search_handle.close()

    # An empty search would otherwise raise IndexError on id_list[0].
    if not id_list:
        print("No biosystems record found for {!r}".format(pathway_term))
        continue

    # Fetch and print the top-ranked record only.
    fetch_handle = Entrez.efetch(db="biosystems", id=id_list[0], rettype="xml", retmode="text")
    try:
        print(fetch_handle.read())
    finally:
        fetch_handle.close()

    

<?xml version="1.0" ?>
<!DOCTYPE Sys-set SYSTEM "ftp://ftp.ncbi.nih.gov/pub/biosystems/schema/NCBI_Systems.dtd">
<Sys-set>


<System>
  <System_sysid>
    <Sys-id>
      <Sys-id_bsid>1458176</Sys-id_bsid>
      <Sys-id_version>4</Sys-id_version>
    </Sys-id>
  </System_sysid>
  <System_source>
    <Sys-source>
      <Sys-source_source>
        <Dbtag>
          <Dbtag_db>WikiPathways</Dbtag_db>
          <Dbtag_tag>
            <Object-id>
              <Object-id_id>5</Object-id_id>
            </Object-id>
          </Dbtag_tag>
        </Dbtag>
      </Sys-source_source>
    </Sys-source>
  </System_source>
  <System_externalaccn>WP2862</System_externalaccn>
  <System_recordurl>http://www.wikipathways.org/index.php/Pathway:WP2862</System_recordurl>
  <System_names>
    <System_names_E>Glycolysis</System_names_E>
  </System_names>
  <System_description>This plant pathway represents the Glycolysis (cytosol). Glucose, originating from sucrose, and triose phosphates (originating from t

<?xml version="1.0" ?>
<!DOCTYPE Sys-set SYSTEM "ftp://ftp.ncbi.nih.gov/pub/biosystems/schema/NCBI_Systems.dtd">
<Sys-set>


<System>
  <System_sysid>
    <Sys-id>
      <Sys-id_bsid>855834</Sys-id_bsid>
      <Sys-id_version>5</Sys-id_version>
    </Sys-id>
  </System_sysid>
  <System_source>
    <Sys-source>
      <Sys-source_source>
        <Dbtag>
          <Dbtag_db>KEGG</Dbtag_db>
          <Dbtag_tag>
            <Object-id>
              <Object-id_id>2</Object-id_id>
            </Object-id>
          </Dbtag_tag>
        </Dbtag>
      </Sys-source_source>
    </Sys-source>
  </System_source>
  <System_externalaccn>ppa_M00009</System_externalaccn>
  <System_recordurl>http://www.kegg.jp/module/ppa_M00009</System_recordurl>
  <System_names>
    <System_names_E>Citrate cycle (TCA cycle, Krebs cycle)</System_names_E>
  </System_names>
  <System_description>Pathway module; Carbohydrate and lipid metabolism; Central carbohydrate metabolism</System_description>
  <System_interactors

<?xml version="1.0" ?>
<!DOCTYPE Sys-set SYSTEM "ftp://ftp.ncbi.nih.gov/pub/biosystems/schema/NCBI_Systems.dtd">
<Sys-set>


<System>
  <System_sysid>
    <Sys-id>
      <Sys-id_bsid>518076</Sys-id_bsid>
      <Sys-id_version>2</Sys-id_version>
    </Sys-id>
  </System_sysid>
  <System_source>
    <Sys-source>
      <Sys-source_source>
        <Dbtag>
          <Dbtag_db>GO</Dbtag_db>
          <Dbtag_tag>
            <Object-id>
              <Object-id_id>6</Object-id_id>
            </Object-id>
          </Dbtag_tag>
        </Dbtag>
      </Sys-source_source>
    </Sys-source>
  </System_source>
  <System_externalaccn>GO:0080152</System_externalaccn>
  <System_recordurl>http://amigo.geneontology.org/cgi-bin/amigo/term_details?term=GO:0080152</System_recordurl>
  <System_names>
    <System_names_E>regulation of reductive pentose-phosphate cycle</System_names_E>
  </System_names>
  <System_description>Any process that modulates the frequency, rate or extent of reductive pentose-phos

In [107]:
# The XML records printed above are not parsed programmatically. Because there
# are only three pathways, their names and descriptions are entered by hand.

# (name, description) pairs for Pathway_Table. The descriptions are written as
# single-line strings so no newlines or indentation whitespace end up in the
# stored text.
pathway_rows = [
    ("Glycolysis",
     "This plant pathway represents the Glycolysis (cytosol). "
     "Glucose, originating from sucrose, and triose phosphates "
     "(originating from the Calvin cycle) enter the pathway to "
     "be converted into pyruvate. The series of reactions produced "
     "ATP and NADH."),
    ("Citrate cycle (TCA cycle, Krebs cycle)",
     "Pathway module; Carbohydrate and lipid metabolism; Central "
     "carbohydrate metabolism"),
    ("Pentose phosphate pathway (Pentose phosphate cycle)",
     "Pathway module; Carbohydrate and lipid metabolism; "
     "Central carbohydrate metabolism"),
]

# Bound parameters (?) replace the double-quoted SQL literals, which SQLite
# only accepts as a compatibility quirk. "with conn" commits the inserts on
# success and rolls them back if any statement raises.
with conn:
    conn.executemany(
        "INSERT INTO Pathway_Table (name, description) VALUES (?, ?)",
        pathway_rows,
    )


<sqlite3.Cursor at 0x15b78c64b20>

In [117]:
# Define relevant species: these were given in lab instructions
# ("homo sapiens" was previously misspelled "homo sapien").
species = ["homo sapiens", "drosophila", "e coli"]
# Defining the list of 4 genes per pathway: these were chosen from flow diagram at :
# https://rgd.mcw.edu/rgdweb/pathway/pathwayRecord.html?acc_id=PW:0000640&species=Rat
genes_glyc = ["phosphoglycerate kinase", "glucose-6-phosphate isomerase", "glucokinase", "triosephosphate isomerase 1"]

# For every gene/species combination, search the protein database for
# "<gene> <species>" and print the FASTA record of the most relevant hit.
for gene_name in genes_glyc:
    for organism in species:
        query = gene_name + " " + organism

        search_handle = Entrez.esearch(db="protein", term=query, sort="relevance", idtype="acc")
        try:
            id_list = Entrez.read(search_handle)["IdList"]
        finally:
            # Always release the network handle, even if parsing fails.
            search_handle.close()

        # An empty search would otherwise raise IndexError on id_list[0].
        if not id_list:
            print("No protein record found for {!r}".format(query))
            continue

        fetch_handle = Entrez.efetch(db="protein", id=id_list[0], rettype="fasta", retmode="text")
        try:
            print(fetch_handle.read())
        finally:
            fetch_handle.close()

>AAB36381.2 phosphoglycerate kinase, partial [Homo sapiens]
KVADKIQLINNMLDKVNEMIIGGGMAFTFLKVLNNMAVGNKCQVDVK


>XP_023165869.1 phosphoglycerate kinase [Drosophila hydei]
MAFNKLSIESLDLEGKRVLMRVDFNVPIKEGKITSNQRIVAAVDSIKLALSKKAKSVVLMSHLGRPDGNK
NLKYTLAPVAEELTTLLGQKVIFLNDCVGAEVEAACKDPAAGSVILLENVRFYLEEEGKGVDACGAKVKA
NADKVKEFRTSLAKLGDVYVNDAFGTAHRAHSSMMGEGFEKRAAGLLLNKELKYFSQALDKPPNPFLAIL
GGAKVADKIQLIENLLDKVNEMIIGGGMAFTFLKVLNNMKIGGSLFDEEGSKIVQKLVDKAKKNNVQLHL
PVDFVCGDKFAEDAAVSEATVEAGIPDGHMGLDVGPKTRELFAAPIARAKLIVWNGPPGVFEFPNFAKGT
KSIMDGVVAATKNGTVSIIGGGDTASCCAKWNTEALVSHVSTGGGASLELLEGKTLPGVAALSNA


>WP_077826021.1 phosphoglycerate kinase [Escherichia coli]
MAAQGHDVGKSLYEADLVDEAKRLLTTCNIPVPSDVRVATEFSETAPATLKSVNDVKADEQILDIGDASA
QELAEILKNAKTILWNGPVGVFEFPNFRKGTEIVANAIADSEAFSIAGGGDTLAAIDLFGIADKISYIST
GGGAFLEFVEGKVLPAVAMLEERAKK


>sp|P06744.4|G6PI_HUMAN RecName: Full=Glucose-6-phosphate isomerase; Short=GPI; AltName: Full=Autocrine motility factor; Short=AMF; AltName: Full=Neuroleukin; Short=NLK; AltName:

In [116]:
# Defining the list of 4 genes per pathway: these were chosen from flow diagram at :
# https://rgd.mcw.edu/rgdweb/pathway/pathwayRecord.html?processType=view&species=Rat&acc_id=PW:0000045
genes_pentose = ["glucose-6-phosphate dehydrogenase", "ribose 5-phosphate isomerase A", "6-phosphogluconolactonase", "ribulose-5-phosphate-3-epimerase" ]

# Retrieve the top hit for every gene/species pair, first from the protein
# database and then from the gene database (same order as two separate loops).
# The search handle is deliberately NOT named `c`: that name holds the sqlite3
# cursor created in the first cell, and overwriting it breaks later c.execute calls.
# NOTE(review): rettype="fasta" is requested for the gene database as well;
# confirm that db="gene" actually supports a FASTA return type.
for database in ("protein", "gene"):
    for gene_name in genes_pentose:
        for organism in species:
            query = gene_name + " " + organism

            search_handle = Entrez.esearch(db=database, term=query, sort="relevance", idtype="acc")
            try:
                id_list = Entrez.read(search_handle)["IdList"]
            finally:
                # Always release the network handle, even if parsing fails.
                search_handle.close()

            # An empty search would otherwise raise IndexError on id_list[0].
            if not id_list:
                print("No {} record found for {!r}".format(database, query))
                continue

            fetch_handle = Entrez.efetch(db=database, id=id_list[0], rettype="fasta", retmode="text")
            try:
                print(fetch_handle.read())
            finally:
                fetch_handle.close()

>AAB26169.1 glucose-6-phosphate dehydrogenase, partial [Homo sapiens]
MGRRGSAPGNGRTLRGCERGGRRRRSADSVMAEQVALSRTQVCGILREELFQGDAFHQSDTHIFIIMGAS
GDLAKKKIYPTIWWLFRDGLLPENTFIVGYA


>AAR12953.1 glucose-6-phosphate dehydrogenase, partial [Drosophila mojavensis]
GWNRVIVEKPFGRDDVTSKALSDHLAGLFEEEQLYRIDHYLGKEMVQNLMTIRFGNKILSSTWNRENIAC
VLITFKEPFGTQGRGGYFDEFGIIRDVMQNHLLQILSLVAMEKPCSCHPDDIRDEKVKVLKCIKPLELSD
MVLGQYVGNPAGSTAEEQCGYLDDPTVSKTSNTPTYAMAVIQINNERWQGVPFILRCGKALNERKAEVRI
QYQDVPGDIFEGNSKRNELVIRVQPGEALYFKMMTKSPGITFDIEETELDLTYEHRYKHSVLPDAYERLI
LDVFCGSQMHFVRSDELREAWRIFTPILHKIEHERIPPIPYPYGSRGPTEADRKCVENNFIYSASYKWHG
NKADASHL


>WP_085454991.1 glucose-6-phosphate dehydrogenase, partial [Escherichia coli]
QVLNKVPGLDHKHNLQITKLDLSYSETFNQTHLADAYERLLLETMRGIQALFVRRDEVEEAWKWVDSITE
AWAMDNDAPKPYQAGTWGPVASVAMITRDGRSWNEFE


>AAH15529.2 Ribose 5-phosphate isomerase A [Homo sapiens]
MQRPGPFSTLYGRVLAPLPGRAGGAASGGGGNSWDLPGSHVRLPGRAQSGTRGGAGNTSTSCGDSNSICP
APSTMSKAEEAKKLAGRAAVENHVRNNQVLGIGSGSTIVHAVQRIAERVKQENLNLVCIPTS

RuntimeError: Search Backend failed: read request has timed out. peer: 130.14.18.61:7011

In [None]:
# Genes for the TCA pathway (only citrate synthase is listed so far); chosen from flow diagram at :
# https://rgd.mcw.edu/rgdweb/pathway/pathwayRecord.html?processType=view&species=Rat&acc_id=PW:0000026
genes_TCA = ["citrate synthase"]

# For every gene/species combination, search the protein database and print
# the FASTA record of the most relevant hit.
for gene_name in genes_TCA:
    for organism in species:
        query = gene_name + " " + organism

        search_handle = Entrez.esearch(db="protein", term=query, sort="relevance", idtype="acc")
        try:
            # Parse THIS search's handle; the earlier code read a stale handle
            # named `c` left over from a different cell.
            id_list = Entrez.read(search_handle)["IdList"]
        finally:
            # Always release the network handle, even if parsing fails.
            search_handle.close()

        # An empty search would otherwise raise IndexError on id_list[0].
        if not id_list:
            print("No protein record found for {!r}".format(query))
            continue

        fetch_handle = Entrez.efetch(db="protein", id=id_list[0], rettype="fasta", retmode="text")
        try:
            print(fetch_handle.read())
        finally:
            fetch_handle.close()

In [108]:
## I have worked on this lab since the day it was assigned, but I am having a really hard time. Although I have read a lot of
## the Entrez documentation, I cannot figure out how to parse and splice the outputs so that I can add them to the table.

In [None]:
#check