<a href="https://colab.research.google.com/github/FouadAIAzar/Flurine/blob/main/get_eDGAR_DB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Get a Database, like eDGAR

In [None]:
# Importing the required modules
import os
import sys
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

In [None]:
# Get Table from eDGAR
URL = "http://edgar.biocomp.unibo.it/cgi-bin/gene_disease_db/main_table.py"

# Send the GET request and keep the response as r. The timeout stops the
# cell from hanging forever if the server never answers.
r = requests.get(url=URL, timeout=60)
# Fail loudly on an HTTP error instead of saving an error page that the
# next cell would then try to parse as the gene/disease table.
r.raise_for_status()
# Persist the raw response bytes; write-only binary mode is all that is needed.
with open('./response.html', 'wb') as f:
    f.write(r.content)

In [None]:
## Convert HTML table to .csv

path = 'response.html'


def _cell_texts(row):
    """Return the text of each <th>/<td> cell that is a direct child of *row*."""
    # Selecting only the cell tags skips the text nodes that sit between
    # cells, so no blanket try/except is needed to filter them out.
    return [cell.get_text() for cell in row.find_all(["th", "td"], recursive=False)]


# Parse the saved page. The file was written as raw bytes, so it is read back
# as bytes and BeautifulSoup is left to work out the encoding; the context
# manager closes the handle as soon as parsing is done.
with open(path, 'rb') as html_file:
    sp = BeautifulSoup(html_file, 'html.parser')

# Only the first table on the page is used (IndexError if the page has none).
all_rows = sp.find_all("table")[0].find_all("tr")

# The first row supplies the column names (IndexError if the table has no rows).
tableRow = all_rows[0]
header = _cell_texts(tableRow)

# Every remaining row is a data record.
HTML_data = all_rows[1:]
data = [_cell_texts(row) for row in HTML_data]

# Store the results in a pandas DataFrame for further processing
df = pd.DataFrame(data=data, columns=header)

# Write the DataFrame out as CSV
df.to_csv('disease2gene.csv')

# Look up a disease in eDGAR

In [None]:
term = "KERATOCONUS"  # Change to desired disease name

# All rows of the desired disease. The term is matched as a literal substring
# (regex=False) so names containing characters such as "(" or "+" are not
# misread as a pattern, and rows with a missing disease name count as
# non-matches (na=False) instead of breaking the boolean mask.
rows = df[df['Disease Name'].str.contains(term, regex=False, na=False)]

if rows.empty:
    print("No genes found")
else:
    print(rows)

# Collect the values of the first column (the gene names) for the matched rows.
genes = rows.iloc[:, 0].values

print(genes)

    Gene Name   Disease Name  Disease ID                   Database
1       VSX1    KERATOCONUS    PS148300    ClinVar, OMIM, HUMSAVAR 
593   ZNF469    KERATOCONUS    PS148300                    ClinVar 
[' VSX1 ' ' ZNF469 ']


# Query FASTA gene sequence from NCBI

In [None]:
import requests
import json
from pandas import json_normalize

def fetch_gene_seq(genes) -> pd.DataFrame:
    """Query the NLM Clinical Tables ``ncbi_genes`` search API for each gene.

    NOTE(review): despite the name, this endpoint is a gene search and is not
    expected to return FASTA sequences -- confirm against the API docs.

    Args:
        genes: A single gene symbol, or an iterable of gene symbols. Each
            entry is converted to ``str`` and stripped of surrounding
            whitespace; blank entries are skipped. ``None`` is treated as
            "no genes".

    Returns:
        A DataFrame with one row per record returned by the service. The
        ``query`` column holds the search term, ``code`` the record code, and
        the remaining integer-named columns the display fields in the order
        the service returned them. An empty DataFrame is returned when there
        is nothing to query or nothing matched.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    if genes is None:
        return pd.DataFrame()
    # A bare string would otherwise be iterated character by character.
    if isinstance(genes, str):
        genes = [genes]

    url = "https://clinicaltables.nlm.nih.gov/api/ncbi_genes/v3/search"
    frames = []
    for g in genes:
        term = str(g).strip()
        if not term:
            continue
        # Let requests build the query string so the term is URL-encoded.
        response = requests.get(url, params={"terms": term}, timeout=30)
        response.raise_for_status()
        # NOTE(review): layout assumed from the Clinical Table Search Service
        # docs -- [total, codes, extra_fields, display_rows] -- confirm.
        payload = response.json()
        codes, display_rows = payload[1], payload[3]
        frame = pd.DataFrame(display_rows)
        frame.insert(0, "code", codes)
        frame.insert(0, "query", term)
        frames.append(frame)

    # pd.concat refuses an empty list, so handle "nothing fetched" explicitly.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)



In [None]:
# Figure out a way to grab gene sequence from NCBI 

# Translating a Gene Sequence without Biopython

In [None]:
inputfile = 'dna.txt'
# Read the whole sequence file; the context manager closes the handle even
# if reading fails, instead of leaving it open for the rest of the session.
with open(inputfile, "r") as f:
    dna = f.read()
# Remove line breaks so the sequence is one continuous string.
dna = dna.replace("\n", "").replace("\r", "")
# Codon -> one-letter amino-acid lookup used by translate().
# '_' marks the three stop codons (TAA, TAG, TGA).
table = {
    # Codons starting with A
    'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',
    'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
    'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K',
    'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',
    # Codons starting with C
    'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
    'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
    'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q',
    'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
    # Codons starting with G
    'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
    'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
    'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E',
    'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
    # Codons starting with T
    'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
    'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
    'TAC': 'Y', 'TAT': 'Y', 'TAA': '_', 'TAG': '_',
    'TGC': 'C', 'TGT': 'C', 'TGA': '_', 'TGG': 'W',
}

In [None]:
def translate(dna, table):
    """Convert a nucleotide string into its amino-acid string.

    The sequence is read three nucleotides (one codon) at a time and each
    codon is looked up in ``table``, which maps codons to amino-acid strings.
    If the length of ``dna`` is not a multiple of 3, nothing is translated and
    an empty string is returned. A codon missing from ``table`` raises
    ``KeyError``.
    """
    # Guard: only sequences made of whole codons are translated.
    if len(dna) % 3 != 0:
        return ""
    # Walk the sequence in steps of three, yielding one codon per step.
    codons = (dna[start:start + 3] for start in range(0, len(dna), 3))
    return "".join(table[codon] for codon in codons)

In [None]:
# Translate the slice dna[20:935] (at most 915 nucleotides, i.e. 305 codons)
# with the codon table. The result is not assigned: it is a bare expression
# at the end of the cell.
translate (dna[20:935], table)

# Translating DNA using Biopython

In [None]:
# Install biopython
%%capture
!pip install biopython

In [None]:
# Import libs
import Bio
from Bio.Seq import Seq

In [None]:
# Coding DNA -> polypeptide translation using Biopython.
# Wrap the DNA string in a Biopython Seq object.
coding_dna = Seq(dna)
# translate() is called with no arguments, so Biopython's defaults apply
# (presumably the standard genetic code -- confirm in the Biopython docs).
polypeptide = coding_dna.translate()

# Sending the amino-acid sequence to AlphaFold for 3D folding