<h3>Goals</h3>

- obtain sequences and 2D structures of aptamers used in paper (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6975895/)

- write a function that can convert an aptamer's 3D structure (file.pdb or PDB ID) to its sequence and secondary structure

In [29]:
import pandas as pd

import time
from selenium import webdriver
from selenium.webdriver.common.by import By

from bs4 import BeautifulSoup

<h3>Reconstruction of dataset</h3>

Reference_results.csv - table of the results of Mfold, RNAfold and CentroidFold on the dataset of 69 DNA aptamers provided in the paper https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6975895/

In [30]:
# Load the reference table and collect the PDB identifiers listed in it
ref_table = pd.read_csv('Reference_results.csv')
pdb_ids = ref_table['PDB_ID'].tolist()
pdb_ids[:5]

['1JVE', '1NGU', '2OEY', '2VWJ', '3THW']

<h3>Converting function </h3>

In [34]:
#PDB ID to sequence and secondary structure using 3DNA DSSR via RNApdbee web-server
def pdbid_2_2d(pdbid, fetch_wait=10, result_wait=5): #-->[pdbid, sequence, 2D]
    """Look up a PDB ID on the RNApdbee web server and scrape sequence and 2D structure.

    A Chrome session is opened, the PDB ID is typed into the form and fetched,
    the "nonCanonicalHandling2" option (non-canonical inclusion) is selected,
    and the job is run. The resulting page source is then parsed: the text of
    every ``<span class="mono">`` is treated as one symbol; nucleotide letters
    are concatenated into the sequence and dot/bracket characters into the
    secondary structure.

    Parameters
    ----------
    pdbid : str
        PDB identifier typed into the web form.
    fetch_wait : float, optional
        Seconds to pause after clicking "Get" before continuing (default 10).
    result_wait : float, optional
        Seconds to pause after clicking Run before reading the page (default 5).

    Returns
    -------
    list
        ``[pdbid, sequence, secondary]``. ``sequence`` and ``secondary`` are
        empty strings when the page contains no matching symbols.

    Notes
    -----
    The pauses are fixed delays, not readiness checks: if the server responds
    more slowly than the given waits, an element lookup may raise or the
    scraped strings may be empty. The browser is always closed, even when a
    step raises.
    """
    # Characters recognised in the result page, one per "mono" span.
    nucleotides = ('a', 't', 'g', 'c', 'u')
    structure_marks = ('(', ')', '.', '[', ']', '{', '}')

    driver = webdriver.Chrome()
    try:
        # Open web-server
        driver.get('http://rnapdbee.cs.put.poznan.pl/')

        # type pdbid
        driver.find_element(By.ID, "pdbId").send_keys(pdbid)

        # click Get button
        driver.find_element(
            By.XPATH,
            "//input[@type='button' and @value='Get' and @onclick='fetchPdb()']",
        ).click()

        # Give the server time to fetch the structure
        time.sleep(fetch_wait)

        # select non-canonical inclusion
        driver.find_element(By.ID, "nonCanonicalHandling2").click()

        # click Run button
        driver.find_element(By.ID, 'commitPdb').click()

        # Wait for the result page to load
        time.sleep(result_wait)

        # Get the resulting HTML
        result_html = driver.page_source
    finally:
        # Always release the browser, including when a lookup above fails.
        driver.quit()

    soup = BeautifulSoup(result_html, "html.parser")
    symbols = [span.get_text() for span in soup.find_all('span', class_='mono')]

    # Tuple membership (not substring search) so only exact single symbols match.
    sequence = ''.join(sym for sym in symbols if sym in nucleotides)
    secondary = ''.join(sym for sym in symbols if sym in structure_marks)

    return [pdbid, sequence, secondary]






In [32]:
# Settings verification: print the expected dot-bracket string for 2F1Q,
# then the converter's result for the same entry, one item per line.
expected_2d = '(((((((((((..)))(((..)))((((..))))))))))))'
print(expected_2d)
print('\n'.join(pdbid_2_2d('2F1Q')))


(((((((((((..)))(((..)))((((..))))))))))))
2F1Q
gcactgcatccttggacgcttgcgccacttgtggtgcagtgc
(((((((((((..)))(((..)))((((..))))))))))))


Database Collecting

In [35]:
# Run the converter for every PDB ID, printing a running count as progress.
fullaptabase = []
for done, pdb_id in enumerate(pdb_ids, start=1):
    fullaptabase.append(pdbid_2_2d(pdb_id))
    print(done, end='--')

1--2--3--4--5--6--7--8--9--10--11--12--13--14--15--16--17--18--19--20--21--22--23--24--25--26--27--28--29--30--31--32--33--34--35--36--37--38--39--40--41--42--43--44--45--46--47--48--49--50--51--52--53--54--55--56--57--58--59--60--61--62--63--64--65--66--67--68--69--

In [38]:
# Display the collected [pdbid, sequence, secondary] records for inspection.
fullaptabase

[['1JVE', 'cctaattataacgaagttataattagg', '(((((((((((((.)))))))))))))'],
 ['1NGU', 'ctctccttgtatttcttacaaaaagag', '(((((((((((.....)))))))))))'],
 ['2OEY', 'ccatcgtctacctttggtaggatgg', '(((((..(((((...))))))))))'],
 ['2VWJ', 'aauggagacacggcttttgccgtgtc', '......((((((((....))))))))'],
 ['3THW',
  'cctctatctgaagccgatcgatgaagcatcgatcgcacagcttcagatagagga',
  '((((((((((((((((((((((((.))))))))))....)))))))))))))).'],
 ['4HT4', 'cgcgaacggaacgttcgcataagtgcgc', '.((((((((.))))))))........()'],
 ['1NGO', 'ctctttttgtaagaaatacaaggagag', '(((((((((((.....)))))))))))'],
 ['5N2Q', 'actttatgaaaataaagtatagtgtg', '(((((((....)))))))........'],
 ['3H25', 'cctttccccctacccgaagggtggggg', '......(((((((((...)))))))))'],
 ['1AW4',
  'acctgggggagtattgcggaggaaggtaaacctgggggagtattgcggaggaaggtaa',
  '(((((()([(((...))))..{)))))]}(((((()([(((...))))..{)))))]}'],
 ['6CCE',
  'gcttgacaaaagtgttaaattgtgctatactagcacaatttaacacttttgtcaagc',
  '((((((((((((((((((((((((((.....))))))))))))))))))))))))))'],
 ['1B4Y', 'tctt

In [36]:
# Load the reference results table (the same CSV file that was read into
# ref_table earlier) and preview its first rows.
reftab = pd.read_csv('Reference_results.csv')
reftab.head()

Unnamed: 0,CentroidFold,RNAfold,Mfold,PDB_ID,Structure,Len
0,1.0,1.0,1.0,1JVE,Triplex-DNA,27
1,1.0,1.0,1.0,1NGU,Hairpin with pseudoknots,27
2,1.0,1.0,1.0,2OEY,Hairpin with loops,25
3,1.0,1.0,1.0,2VWJ,Hairpin with dangling ends,26
4,1.0,1.0,1.0,3THW,Hairpin with loops,53


In [39]:
# Build a DataFrame from the scraped records; each record is
# [pdbid, sequence, secondary], matching the three column labels below.
cols = ['PDB_ID', 'Sequence', '2D']
aptabase = pd.DataFrame(fullaptabase, columns=cols)
aptabase.head()

Unnamed: 0,PDB_ID,Sequence,2D
0,1JVE,cctaattataacgaagttataattagg,(((((((((((((.)))))))))))))
1,1NGU,ctctccttgtatttcttacaaaaagag,(((((((((((.....)))))))))))
2,2OEY,ccatcgtctacctttggtaggatgg,(((((..(((((...))))))))))
3,2VWJ,aauggagacacggcttttgccgtgtc,......((((((((....))))))))
4,3THW,cctctatctgaagccgatcgatgaagcatcgatcgcacagcttcag...,((((((((((((((((((((((((.))))))))))....)))))))...


In [40]:
# Copy selected reference columns into aptabase (aligned on the row index);
# each pair is (column name in aptabase, column name in reftab).
for target_col, source_col in (
    ('Len', 'Len'),
    ('Structure', 'Structure'),
    ('RNAfold_ref', 'RNAfold'),
):
    aptabase[target_col] = reftab[source_col]
aptabase.head()

Unnamed: 0,PDB_ID,Sequence,2D,Len,Structure,RNAfold_ref
0,1JVE,cctaattataacgaagttataattagg,(((((((((((((.))))))))))))),27,Triplex-DNA,1.0
1,1NGU,ctctccttgtatttcttacaaaaagag,(((((((((((.....))))))))))),27,Hairpin with pseudoknots,1.0
2,2OEY,ccatcgtctacctttggtaggatgg,(((((..(((((...)))))))))),25,Hairpin with loops,1.0
3,2VWJ,aauggagacacggcttttgccgtgtc,......((((((((....)))))))),26,Hairpin with dangling ends,1.0
4,3THW,cctctatctgaagccgatcgatgaagcatcgatcgcacagcttcag...,((((((((((((((((((((((((.))))))))))....)))))))...,53,Hairpin with loops,1.0


In [41]:
# Save the assembled table to CSV without writing the index column.
aptabase.to_csv('Aptamers_2D.csv', index=False)

In [28]:
# Single-ID spot check of the converter.
pdbid_2_2d('2M92')

['2M92',
 'agggtgggtgctggggcgcgaagcattcgcgagg',
 '(([{.)]}.[.)]([(((((((...))))))))]']