In [86]:
## Import Libraries
import re
import os
from os import path as p
import pandas as pd
from bioservices import UniProt


In [87]:
# utilities
def ifnotmkdir(dir):
    """Make sure the directory ``dir`` exists and return its path.

    Missing parent directories are created as well, and an already existing
    directory is left untouched, so the call is safe to repeat.

    Parameters
    ----------
    dir : str or path-like
        Directory to create if it is not there yet. (The name shadows the
        ``dir`` builtin; it is kept because callers may pass it by keyword.)

    Returns
    -------
    The same ``dir`` value that was passed in, so the call can be chained.

    Raises
    ------
    FileExistsError
        If ``dir`` exists but is not a directory.
    """
    # makedirs + exist_ok replaces the isdir()/mkdir() pair: it also builds
    # missing parents and has no window between the check and the creation.
    os.makedirs(dir, exist_ok=True)
    return dir

In [90]:
## Load the datasets as pandas DataFrames
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere
excelDir = "/home/esp/dataset_generation/flavin_dataset/Excel_and_Fasta"

# FAD dataset
dataUnfilteredFAD = pd.read_excel(p.join(excelDir, "FAD_unfiltered.xlsx"))
print(f"FAD unfiltered dataset{dataUnfilteredFAD.shape}")

# FMN dataset
dataUnfilteredFMN = pd.read_excel(p.join(excelDir, "FMN_unfiltered.xlsx"))
print(f"FMN unfiltered dataset{dataUnfilteredFMN.shape}")

# Stack FAD on top of FMN (original row indices are kept)
dataUnfiltered = pd.concat([dataUnfilteredFAD, dataUnfilteredFMN])

## Add photoenzymes to the dataset
# FAP and ene-reductase UniProt search results
knownFlavinPhotoezymes = pd.read_excel(p.join(excelDir, "Known_Photoenzymes.xlsx"))
photoenzymeIDs = knownFlavinPhotoezymes["Entry"]

# Append the photoenzymes after the FAD/FMN rows and flag every row whose
# "Entry" appears in the photoenzyme table with a True/False column
dataUnfiltered = pd.concat([dataUnfiltered, knownFlavinPhotoezymes]).assign(
    Known_Photoenzyme=lambda frame: frame["Entry"].isin(photoenzymeIDs)
)

# Remove duplicates: keep only the first row seen for each distinct sequence
dataNonRedundant = dataUnfiltered.drop_duplicates(subset="Sequence")
print(f"non-redundant dataset{dataNonRedundant.shape}")
dataNonRedundant.to_excel(p.join(excelDir, "flavins_non-redundant.xlsx"))

# Remove entries without AlphaFold predictions (empty "AlphaFoldDB" cell)
dataAlphaFold = dataNonRedundant[dataNonRedundant["AlphaFoldDB"].notna()]
dataAlphaFold.to_excel(p.join(excelDir, "flavins_alphaFold.xlsx"))
print(f"has alphafold predictions dataset{dataAlphaFold.shape}")

# Show the known photoenzymes that survived both filters
print(dataAlphaFold[dataAlphaFold["Known_Photoenzyme"]])


  warn("Workbook contains no default style, apply openpyxl's default")


FAD unfiltered dataset(14395, 14)
FMN unfiltered dataset(16815, 14)
non-redundant dataset(27970, 15)
has alphafold predictions dataset(24969, 15)
         Entry    Reviewed        Entry Name  \
0   A0A248QE08    reviewed         FAP_CHLVA   
1       A8JHB7    reviewed         FAP_CHLRE   
2       W0T2Q3  unreviewed      W0T2Q3_9BACI   
3   A0A125RY21  unreviewed  A0A125RY21_RHORH   
4   A0A125RY22  unreviewed  A0A125RY22_RHORH   
5   A0A125RY23  unreviewed  A0A125RY23_RHORH   
6   A0A8A1QR26  unreviewed  A0A8A1QR26_CYCAE   
9       B0JDW3  unreviewed      B0JDW3_THESC   
10      B0KAH1  unreviewed      B0KAH1_THEP3   
11      B8G5D6  unreviewed      B8G5D6_CHLAD   
12      E8PRF1  unreviewed      E8PRF1_THESS   
13      G9YLX2  unreviewed      G9YLX2_FLAPL   
14      O31246  unreviewed      O31246_RHIRD   
15      Q1LDQ5  unreviewed      Q1LDQ5_CUPMC   
16      Q5NLA1  unreviewed      Q5NLA1_ZYMMO   
17      Q72HI0  unreviewed      Q72HI0_THET2   
18      Q9R9V9  unreviewed      Q9R9V9

In [91]:
## grab pdb files from alphafold
def alpha_fold_scrape(dataSet, outDir, modelVersion="v4", timeout=30):
    """Download AlphaFold model PDB files for the accessions in ``dataSet``.

    For every entry of ``dataSet["AlphaFoldDB"]`` the file
    ``AF-<accession>-F1-model_<modelVersion>.pdb`` is requested from
    alphafold.ebi.ac.uk and, on HTTP 200, saved as ``<outDir>/<accession>.pdb``.
    Failed downloads are reported with ``print`` and skipped.

    Parameters
    ----------
    dataSet : mapping / DataFrame
        Anything where ``dataSet["AlphaFoldDB"]`` yields the accession cells.
        Cells are expected to be ';'-separated id lists such as ``"P12345;"``
        (presumably the UniProt export format; the original code stripped one
        trailing character). Non-string cells (e.g. NaN) are skipped.
    outDir : str
        Directory the PDB files are written to; created if missing.
    modelVersion : str, optional
        Version tag used in the file name, ``"v4"`` by default.
    timeout : float, optional
        Seconds to wait for the server before giving up on one request.

    Returns
    -------
    None
    """
    # imported locally, as in the original cell
    import requests

    # The files are written into outDir, so that is the directory that has to
    # exist (the original built an unused path from an undefined ``dataDir``).
    os.makedirs(outDir, exist_ok=True)

    # base address of the AlphaFold database
    url = "https://alphafold.ebi.ac.uk/"

    # run through all accessions and get pdb files
    for rawAccession in dataSet["AlphaFoldDB"]:
        # missing cross-references are not strings; nothing to download
        if not isinstance(rawAccession, str):
            continue
        # take the first non-empty id instead of blindly dropping the last
        # character, so "P12345;", "P12345" and "P12345;Q1;" all give "P12345"
        ids = [part.strip() for part in rawAccession.split(";") if part.strip()]
        if not ids:
            continue
        accession = ids[0]
        print(accession)

        # Construct the URL for the PDB file
        pdb_url = f"{url}files/AF-{accession}-F1-model_{modelVersion}.pdb"

        # Send a GET request; the timeout keeps one dead connection from
        # stalling the whole run, and a network error only skips this entry
        try:
            response = requests.get(pdb_url, timeout=timeout)
        except requests.RequestException as err:
            print(f"request failed for {accession}: {err}")
            continue

        if response.status_code == 200:
            # Save the raw bytes so no encoding guess or newline translation
            # can alter the PDB file
            pdbFile = p.join(outDir, f"{accession}.pdb")
            with open(pdbFile, "wb") as f:
                f.write(response.content)
        else:
            print(f"response code from request: {response.status_code}")
