In [1]:
import numpy as np
import pandas as pd
import os 
import tqdm
import urllib

from Bio import SeqIO
import seaborn as sns
import matplotlib.pyplot as plt
from bioservices import UniProt

In [2]:
df_g = pd.read_csv('genes.csv')
genes = df_g['Gene'].tolist()

In [4]:
with open('genes.txt', 'w') as f:
    for item in genes:
        f.write("%s\n" % item)

In [1]:
gene_name = 'mb'

base_url = 'https://gnomad.broadinstitute.org/gene/' + gene_name + '?dataset=exac'

In [2]:
base_url

'https://gnomad.broadinstitute.org/gene/mb?dataset=exac'

In [2]:
import lxml
import bs4
import requests as req
from bs4 import BeautifulSoup

In [86]:
def find_right_id(acc):

    #acc = 'P13380'
    url = "https://www.uniprot.org/uniprot/" + acc
    resp = req.get(url)
 
    soup = BeautifulSoup(resp.text, 'lxml')
 
    metadata = soup.title.parent
    flag_obsolete = False
    flag_obsolete2 = False
    flag_pdb = False
    protein_analogues_list = []
    protein_with_structure = []
    pdb_elem = 'None'
    k = 0

    for line in metadata:
        if 'var isObsolete = true' in str(line):
            flag_obsolete = True
            break

    data = soup.prettify().split('\n')

    if flag_obsolete:
        for i, line in enumerate(data):
            if 'this entry became obsolete' in str(line):
                flag_obsolete2 = True
                break

    if flag_obsolete2:            
        for k, line in enumerate(data[i+1:]):
            if '<a href="/uniprot/' in str(line):
                protein_analogues_list.append(str.strip(data[i+k+2]))
                #k += 1
                if '<p>' in str(line):
                    break    
                
    if len(protein_analogues_list) > 1:
        protein_analogues_list = protein_analogues_list[:-1]
    
    
    for j, acc in enumerate(protein_analogues_list):
        url = "https://www.uniprot.org/uniprot/" + acc 
        resp = req.get(url)
 
        soup = BeautifulSoup(resp.text, 'lxml')
        data2 = soup.prettify().split('\n')
    
        for line in data2:
        
            if 'class="pdb"' in str(line):
                flag_pdb = True
                break

        data2 = soup.prettify().split('\n')
        if flag_pdb:
            for line in data2:
                if 'structurePresent = true' in str(line):
                    protein_with_structure.append(acc)
    
    if len(protein_with_structure) > 0:
        pdb_elem = protein_with_structure[0]
    
    return pdb_elem

In [28]:
df_unpr = pd.read_csv("uniprot.csv")
df_unpr = df_unpr.drop(['Unnamed: 0'], axis=1)
df_unpr = df_unpr.set_index('Filename')

df = pd.read_csv("df.csv")
df = df.drop(['Unnamed: 0'], axis=1)
df = df.set_index('Filename')

df = df.join(df_unpr)

uniprot_seqids = df['Uniprot_id'].tolist()

df_g = pd.read_csv('genes.csv')
uniprot_genes = df_g['Uniprot_id'].tolist()

df_new_id = pd.read_csv('genes_to_update.csv')
old_id_list = df_new_id['Uniprot_id_old'].tolist()
new_id_list = df_new_id['Uniprot_id_valid'].tolist()

In [104]:
#dict_fail = {}
#dict_succ = {}

for i, acc in tqdm.tqdm_notebook(enumerate(uniprot_seqids[10000:20000])):
    if acc not in uniprot_genes:
        try:
            pdb_elem = find_right_id(acc)
        except ChunkedEncodingError:
            print("Error: ", i)
        
        if pdb_elem == 'None':
            dict_fail[acc] = pdb_elem
        else:
            dict_succ[acc] = pdb_elem

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [105]:
len(dict_succ)

209

In [106]:
len(dict_fail)

3864

In [107]:
df_ = pd.DataFrame(dict_succ.items(), columns=['Uniprot_id_old', 'Uniprot_id_valid'])
display(df_)

df_.to_csv('genes_to_update.csv')

Unnamed: 0,Uniprot_id_old,Uniprot_id_valid
0,P24991,P0AEG4
1,P24930,P0C918
2,P06128,P0AG82
3,P09097,P0AES4
4,P06982,P0AES6
...,...,...
204,P65726,P9WI83
205,P62161,P0DP29
206,P0A4Y0,P9WQ35
207,P32132,P0DTT0


In [108]:
dict_fail

{'P00718': 'None',
 'P07378': 'None',
 'P06492': 'None',
 'P00969': 'None',
 'P98119': 'None',
 'P00327': 'None',
 'Q51760': 'None',
 'P10762': 'None',
 'P23694': 'None',
 'P16094': 'None',
 'P25779': 'None',
 'P24289': 'None',
 'P05414': 'None',
 'P56272': 'None',
 'P48499': 'None',
 'Q01693': 'None',
 'Q03464': 'None',
 'P15636': 'None',
 'P28313': 'None',
 'P28316': 'None',
 'P07584': 'None',
 'Q7SIH2': 'None',
 'P00760': 'None',
 'P16404': 'None',
 'P21505': 'None',
 '220253': 'None',
 'P56588': 'None',
 '708676': 'None',
 'P38197': 'None',
 '22948': 'None',
 'P08515': 'None',
 'Q26609': 'None',
 'Q8K1Z5': 'None',
 'P51541': 'None',
 'P07229': 'None',
 'S31776': 'None',
 'Q28133': 'None',
 'P23540': 'None',
 'P08056': 'None',
 'P00784': 'None',
 'P02879': 'None',
 'P14559': 'None',
 'Q9PW35': 'None',
 'P00798': 'None',
 ':Z25878': 'None',
 '8810': 'None',
 'P04067': 'None',
 '983524': 'None',
 'P55915': 'None',
 'P29600': 'None',
 'P16184': 'None',
 'P15555': 'None',
 'Q27743': 'No

In [109]:
def get_genes(U, Uniprot_seqids): 
    gene_list = []
    gene_line_list = []
    Un_list = []
    blacklist = []
    df_gene = pd.DataFrame(columns = ["Uniprot_id", "Gene", "Gene (line)"])
    
    for i, un_id in tqdm.tqdm_notebook(enumerate(Uniprot_seqids)):
       
        if un_id.isdigit():
            continue
        
        if i in blacklist: 
            continue
                
        else:
            data = u.search(un_id, frmt="txt") 
  
            if not str(data).isdigit(): # If it is digit, then it is the code for an error

                flag = 0
                for line in data.split("\n"):
                    if line.startswith('GN'):
                        #flag = 1
  
                        if (line.split()[1][0:5] == 'Name=') and flag == 0:

                            gene_name = line.split()[1][5:len(line.split()[1])]
                            if gene_name[-1] == ";":
                                gene_name = gene_name[0:len(gene_name)-1]
                            gene_list.append(gene_name)
                            gene_line_list.append(line)
                            Un_list.append(un_id)
                            flag = 1
                            continue
                        continue

            else:
                continue
            
        if (i+1) % 1000 == 0 :
            print("Iteration " + str(i+1) + ", len(gene_list) = " + str(len(gene_list)))
        
    
    return df, gene_list, gene_line_list, Un_list

In [110]:
u = UniProt(verbose=False)
df_gene, gene_list, gene_line_list, un_list = get_genes(u, list(dict_succ.values()))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [115]:
df_ = pd.DataFrame(list(zip(list(dict_succ.keys()), list(dict_succ.values()), gene_list)), columns=['Uniprot_id_old', 'Uniprot_id_valid',  "Gene"])
display(df_)

df_.to_csv('genes_to_update.csv')

Unnamed: 0,Uniprot_id_old,Uniprot_id_valid,Gene
0,P24991,P0AEG4,dsbA
1,P24930,P0C918,rus
2,P06128,P0AG82,pstS
3,P09097,P0AES4,gyrA
4,P06982,P0AES6,gyrB
...,...,...,...
191,Q9FH83,P0DKH5,pknA
192,P62204,P0DP26,Calm1
193,P72056,P9WJF1,cya
194,Q07702,P9WG47,bipA


In [116]:
gene_list

['dsbA',
 'rus',
 'pstS',
 'gyrA',
 'gyrB',
 'speC',
 'malE',
 'thyA1',
 'thyA',
 'dtxR',
 'map',
 'plc',
 'celC',
 'celA',
 'SULT1A3',
 'linB',
 'inlB',
 'CTS1',
 'folA',
 'mog',
 'pdxH',
 'fbpC',
 'sucB',
 'fucA',
 'gpmA',
 'cyp51',
 'SAP2',
 'tufA',
 'rlmE',
 'inhA',
 'sodC',
 'tufA',
 'folP1',
 'fbpB',
 'PGA4',
 'glpF',
 'recA',
 'tmk',
 'MB',
 'mglB',
 'ino1',
 'argB',
 'fbaA',
 'hisG',
 'dacA',
 'HSPA1A',
 'C4A',
 'trmI',
 'leuB',
 'tesA',
 'ppiA',
 'pelA',
 'yciO',
 'purA',
 'cmaA2',
 'aroK',
 'def',
 'MB',
 'dut',
 'cyp121',
 'aphA',
 'N',
 'phzD2',
 'hisG',
 'ygfZ',
 'msrA',
 'inlA',
 'pknB',
 'bla',
 'yibA',
 'mshD',
 'pfo',
 'potD',
 'sodC',
 'sipA',
 'purE',
 'ompA',
 'rmpM',
 'glnL',
 'lcrV',
 'proX',
 'rnhA',
 'narL',
 'pelL',
 'pknD',
 'gmk',
 'pdtaR',
 'ppa',
 'ethR',
 'coaD',
 'dsbC',
 'cyp51',
 'ptpA',
 'citE',
 'dps',
 'yggS',
 'eda',
 'sspA',
 'glnH',
 'tyrS',
 'frr',
 'metK',
 'tpx',
 'map',
 'cya',
 'mtcA2',
 'livJ',
 'mtnN',
 'CSH1',
 'desA2',
 'pgk',
 'aroC',
 '

In [None]:
# несоответсвие!!!!!!!!!!!!!!!!!!!! переписать скрипт через парсинг html

In [53]:
url = 'https://www.uniprot.org/uniprot/P24991'
resp = req.get(url)
 
soup = BeautifulSoup(resp.text, 'html.parser')

In [54]:
html_txt = soup.prettify()
data = html_txt.split('\n')
for i, line in enumerate(data):
    if 'content-gene' in line:
        break
        
#gene = str.strip(data[i+2])


In [138]:
soup.find_all('fa')

[]

In [84]:
def find_gene_html(acc):    
    
    flag = False
    na = 'N/A'
    url = 'https://www.uniprot.org/uniprot/' + acc
    resp = req.get(url)
 
    soup = BeautifulSoup(resp.text, 'html.parser')
    
    html_txt = soup.prettify()
    data = html_txt.split('\n')
    for i, line in enumerate(data):
        if 'content-gene' in line:
            flag = True
            break
        
    if flag:
        gene = str.strip(data[i+2])
    else:
        gene = na
        
    return gene

In [85]:
gene_new_dict = {}

for i, acc in tqdm.tqdm_notebook(enumerate(uniprot_seqids)):
    if acc not in uniprot_genes:
        
        if acc in old_id_list:
            
            index = old_id_list.index(acc)
            acc_valid = new_id_list[index]
            
            gene_new_dict[acc] = find_gene_html(acc_valid)
        
        else:
            
            new_gene = find_gene_html(acc) 
            
            if (new_gene == 'N/A'):
                gene_new_dict[acc] = ''
            else:
                gene_new_dict[acc] = new_gene

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [87]:
len(gene_new_dict)

4073

In [89]:
k = 0
k_na = 0
k_none = 0
gene_list_towrite = {}

for key, value in gene_new_dict.items():
    if value == '':
        k_none += 1
    else:
        gene_list_towrite[key] = value


In [90]:
len(gene_list_towrite)

2357

In [91]:
gene_list_towrite

{'P06492': 'UL48',
 'P00969': '1.3',
 'P24991': 'dsbA',
 'P24930': 'rus',
 'P06128': 'pstS',
 'P09097': 'gyrA',
 'P06982': 'gyrB',
 'P13380': 'speC',
 'P02928': 'malE',
 'P42326': 'thyA1',
 'P38197': 'YBL036C',
 'P00470': 'thyA',
 'P33120': 'dtxR',
 'P07906': 'map',
 'P15310': 'plc',
 'P07985': 'celC',
 'P04955': 'celA',
 'P50224': 'SULT1A3',
 'P80561': 'SGR_5809',
 'P03692': '4',
 'P51698': 'linB',
 'P11922': 'YPTB1668',
 'P25147': 'inlB',
 'P54196': 'CTS1',
 'P0A546': 'folA',
 'P28694': 'mog',
 'P28225': 'pdxH',
 'P0A4V4': 'fbpC',
 'Q58292': 'MJ0882',
 'P07016': 'sucB',
 'P11550': 'fucA',
 'P31217': 'gpmA',
 'P77901': 'cyp51',
 'P28871': 'SAP2',
 'P46076': 'AO090010000493',
 'P02990': 'tufA',
 'P28692': 'rlmE',
 'O26253': 'MTH_150',
 'O26255': 'MTH_152',
 'P0A5Y6': 'inhA',
 'P53635': 'sodC',
 'P0A6N1': 'tufA',
 'O51923': 'PF1739',
 'P0A578': 'folP1',
 'P31952': 'fbpB',
 'Q58206': 'MJ0796',
 'P00790': 'PGA4',
 'P11244': 'glpF',
 'P0A5U4': 'recA',
 'Q8U3I1': 'PF0485',
 'O05891': 'tmk',

In [92]:
df_ = pd.DataFrame(list(zip(list(gene_list_towrite.keys()), list(gene_list_towrite.values()))), columns=['Uniprot_id_valid',  "Gene"])
display(df_)

df_.to_csv('genes_to_update.csv')

Unnamed: 0,Uniprot_id_valid,Gene
0,P06492,UL48
1,P00969,1.3
2,P24991,dsbA
3,P24930,rus
4,P06128,pstS
...,...,...
2352,G2R014,THITE_2112714
2353,B2UR60,Amuc_1119
2354,Q53W80,TTHB082
2355,C7PL61,Cpin_0907


In [73]:
k = 0

for i, acc in tqdm.tqdm_notebook(enumerate(uniprot_seqids)):
    if acc not in uniprot_genes:
        
        if acc in old_id_list:
            
            k += 1
        
        else:
            
            k += 1

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [76]:
gene_full = {}

for i, acc in tqdm.tqdm_notebook(enumerate(uniprot_seqids)):
        
        if acc in old_id_list:
            
            index = old_id_list.index(acc)
            acc_valid = new_id_list[index]
            
            gene_full[acc] = find_gene_html(acc_valid)
        
        else:
            
            new_gene = find_gene_html(acc) 
            
            if (new_gene == 'N/A'):
                gene_full[acc] = ''
            else:
                gene_full[acc] = new_gene

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

In [93]:
import selenium

In [128]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome(ChromeDriverManager().install())

acc = 'mb'
url = 'https://gnomad.broadinstitute.org/gene/' + acc + '?dataset=gnomad_r2_1'
xpath = '/html/body/div[1]/div/div/div[2]/div/div[5]/section/div[2]/button'
class_ = 'Button__BaseButton-sc-1eobygi-0 Button-sc-1eobygi-1 indcWT'
driver.get(url)
assert "gnomAD" in driver.title


driver.switch_to.default_content()
#content = driver.find_elements(By.CLASS_NAME , class_)
#button = driver.find_elements(By.XPATH, '//button')

#button[0].click()
#button[1].click()
assert "No results found." not in driver.page_source
driver.close()

[WDM] - Current google-chrome version is 86.0.4240
[WDM] - Get LATEST driver version for 86.0.4240
[WDM] - Driver [C:\Users\alexh\.wdm\drivers\chromedriver\win32\86.0.4240.22\chromedriver.exe] found in cache


 


IndexError: list index out of range

In [126]:
content

[]

In [None]:
'''
<
button type="button" 
class="Button__BaseButton-sc-1eobygi-0 Button-sc-1eobygi-1 indcWT"
>
Export variants to CSV
</button>

copy selector:

#root > div > div > div.Page-w7h773-0.TrackPage-sc-1kiz7uj-0.hADDWA > div > div:nth-child(5) > section > div:nth-child(2) > button


copy js path:

document.querySelector("#root > div > div > div.Page-w7h773-0.TrackPage-sc-1kiz7uj-0.hADDWA > div > div:nth-child(5) > section > div:nth-child(2) > button")


copy element:

<button type="button" class="Button__BaseButton-sc-1eobygi-0 Button-sc-1eobygi-1 indcWT">Export variants to CSV</button>
'''