In [None]:
"""A script to reterive Uniprot-ID and the associated name of proteins from RCSB's REST API"""
#could also be used to retrieve other information about PDB structures.#
#for more uses of the RCSB's REST API visit: https://data.rcsb.org/redoc/index.html#
#==================================================================================#
import requests
import warnings
import json
import pandas as pd
from tqdm import tqdm
import time
# Silence all Python warnings for the rest of the session.
# NOTE(review): this also hides urllib3's InsecureRequestWarning, which is emitted
# whenever a request is sent with TLS certificate verification switched off.
warnings.filterwarnings('ignore')

## Part1- Data Collection ##

In [None]:
#Access check to RCSB#
# TLS certificate verification is left at requests' default (enabled). If a proxy
# needs a private CA, pass verify='/path/to/ca.pem' instead of disabling the check.
# The timeout makes an unreachable server fail fast instead of hanging the cell.
a = requests.get('https://data.rcsb.org/redoc/index.html', timeout=30)
print(a.status_code)  # status code 200 means OK

In [None]:
# Load the input table of PDB entries; a CSV file with a 'PDB-ID' column is expected.
df = pd.read_csv(filepath_or_buffer='CSV_pdbandligand.csv', encoding='utf-8')

In [None]:
# Drop rows that are exact repeats of an earlier row (the first occurrence is kept).
df2 = df.drop_duplicates(keep='first')

In [None]:
# Write the PDB-ID column to a list, keeping each ID once in first-seen order.
# Row-level drop_duplicates() leaves repeated IDs whenever other columns differ,
# which would otherwise trigger the same API request several times.
pdblist = list(dict.fromkeys(df2['PDB-ID'].tolist()))

In [None]:
##Uniprot-ID extractor##
# For every PDB entry, query RCSB's core/uniprot endpoint (the trailing "/1" in the URL
# selects entity 1) and append one line to output_id.txt:
#   "<pdb> <rcsb_id>"      when the lookup succeeds
#   "<pdb> NaN <error>"    when it fails for any reason
# TLS verification stays at requests' default (enabled); for a proxy with a private CA
# pass verify='/path/to/ca.pem' rather than verify=False.
with open('output_id.txt', "a") as f:  # opened once; append mode keeps earlier runs
    for pdb in tqdm(pdblist):
        try:
            # The request lives inside the try so that a timeout, connection error or
            # non-JSON body is logged for this entry instead of aborting the whole run.
            ldata = requests.get(
                url=f'https://data.rcsb.org/rest/v1/core/uniprot/{pdb}/1',
                timeout=30,
            ).json()
            id_data = ldata[0]
            print(pdb, id_data['rcsb_id'], file=f)
        except Exception as e:
            print(pdb, 'NaN', e, file=f)

In [None]:
##Uniprot-Name extractor##
# For every PDB entry, query RCSB's core/uniprot endpoint (the trailing "/1" in the URL
# selects entity 1) and append one line to output_name.txt:
#   "<pdb> <protein name>"  when the lookup succeeds
#   "<pdb> NaN <error>"     when it fails for any reason
# TLS verification stays at requests' default (enabled); for a proxy with a private CA
# pass verify='/path/to/ca.pem' rather than verify=False.
with open('output_name.txt', "a") as f:  # opened once; append mode keeps earlier runs
    for pdb in tqdm(pdblist):
        try:
            # The request lives inside the try so that a timeout, connection error or
            # non-JSON body is logged for this entry instead of aborting the whole run.
            ldata = requests.get(
                url=f'https://data.rcsb.org/rest/v1/core/uniprot/{pdb}/1',
                timeout=30,
            ).json()
            name_data = ldata[0]
            print(pdb, name_data['rcsb_uniprot_protein']['name']['value'], file=f)
        except Exception as e:
            print(pdb, 'NaN', e, file=f)

## Part2- String Manipulation ## 
I used KNIME with its Line Reader and String Manipulation nodes to read in and edit the generated text file.<br>
`In the String Manipulation node, the following syntax: substr(str, start[0], length[4]) extracts the PDB-ID into a new column.`
> e.g. if your line is "1q0k P80734", the output would be "1q0k".<br>
Next, use `substr(str, start[5])` to extract the Uniprot-ID into another column.<br>
Finally, filter out the original columns of data and export the file as a .csv.

## Part3- Data Mapping ##

In [None]:
# Load the .csv produced in Part 2 that holds the Uniprot-IDs or Names.
Dictdf = pd.read_csv(filepath_or_buffer='NewNameDict/Mapped.csv', encoding='utf-8')

### Only use `one` of the mappings below at any time ###

In [None]:
# Map each PDB-ID to its corresponding Uniprot-ID.
# The lookup is built from Dictdf (the mapping table read from Mapped.csv); `df` is the
# input list of entries to annotate and is not the source of the Uniprot column.
# Assumes Mapped.csv has 'PDB-ID' and 'Uniprot-ID' columns -- confirm against your export.
map_dict = dict(zip(Dictdf['PDB-ID'], Dictdf['Uniprot-ID']))

In [None]:
# Map each PDB-ID to its corresponding Uniprot-Name.
# The lookup is built from Dictdf (the mapping table read from Mapped.csv); `df` is the
# input list of entries to annotate and is not the source of the Uniprot column.
# Assumes Mapped.csv has 'PDB-ID' and 'Uniprot-Name' columns -- confirm against your export.
map_dict = dict(zip(Dictdf['PDB-ID'], Dictdf['Uniprot-Name']))

### ____________________________________ ###

In [None]:
# Optional: keep a plain-text copy of the dictionary so it can be reused later.
# Check which of the two mappings map_dict currently holds before exporting.
with open('MyDictionary.txt', "w") as f:
    f.write(f"{map_dict}\n")

In [None]:
# Take the PDB-ID column straight from the original table, duplicates included.
pdblist_mapping = df.loc[:, 'PDB-ID'].tolist()

In [None]:
# Look up the Uniprot ID/Name of every PDB entry, in input order.
# Entries that have no key in map_dict are skipped rather than reported.
mapped_list = list(
    map(map_dict.get, filter(lambda pdb_id: pdb_id in map_dict, pdblist_mapping))
)

In [None]:
#Parse the newly made list into a dataframe#
# DataFrame() expects row labels for `index`, not a boolean: index=False is a to_csv
# option and makes the constructor raise, so the default integer index is used here.
Mapped_df = pd.DataFrame(mapped_list, columns=['Uniprot-ID/Name'])

In [None]:
#Export the results as a new .csv file#
# Keep '/' out of the file name: it is read as a directory separator, and to_csv does
# not create missing directories. index=False leaves the row counter out of the file.
Mapped_df.to_csv('Mapped_Uniprot-ID_Names.csv', index=False)