In [1]:
# Standard library imports
import json
import os
import time

# Third-party imports
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import MDAnalysis as mda
import requests
from tqdm.notebook import tqdm

In [2]:
# Run every later cell from the pipeline root so the relative "data/raw/..." paths resolve.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not run on
# another machine without editing it.
os.chdir("/home/adri/Projects/phd/pipeline")

In [3]:
# Input JSON files, given relative to the current working directory.
# Both are loaded below and indexed as lists of records that carry a "dyn_id" entry.
class_a_path = "data/raw/class_a.json"
ligand_type_path = "data/raw/lig_type.json"

In [4]:
# Parse the two metadata tables into plain Python objects.
with open(class_a_path) as class_a_file:
    class_a = json.load(class_a_file)

with open(ligand_type_path) as ligand_type_file:
    ligand_type = json.load(ligand_type_file)


In [5]:
# dyn_ids stored under the second ligand-type record and the first class-A record.
complex_dynids = ligand_type[1]['dyn_id']
class_a_dyn_ids = class_a[0]["dyn_id"]
# Keep the ids that appear in both lists; order and duplicates follow complex_dynids.
# Membership is tested against a set so each lookup is constant time instead of a
# scan over the whole class-A list.
# assumes the ids are hashable JSON scalars (numbers or strings) -- TODO confirm
class_a_dyn_id_set = set(class_a_dyn_ids)
overlapping = [dyn_id for dyn_id in complex_dynids if dyn_id in class_a_dyn_id_set]
len(overlapping)

305

In [7]:
# def get_simulation_files(simulation_id, session):
#     url = f"{base_url}{simulation_id}"
#     print(f'fetching {url}')
#     # Using the provided session to make a request
#     r = session.get(url)
    
#     # Parse with the stdlib 'html.parser' (switch to 'lxml' for faster parsing if it is installed)
#     soup = BeautifulSoup(r.text, 'html.parser')
    
#     allfiles = soup.find(id="allfiles")
#     if not allfiles:
#         return []
    
#     links = allfiles.find_all("a")
#     hrefs = [link.get("href") for link in links]
#     return hrefs

# base_url = "https://www.gpcrmd.org/dynadb/dynamics/id/"
# urls = []
# filenames = []

# # Using requests.Session for better performance
# with requests.Session() as session:
#     with ThreadPoolExecutor(max_workers=15) as executor:
#         # Partial function application to pass the session object
#         from functools import partial
#         fetch_func = partial(get_simulation_files, session=session)
        
#         # Execute the partial function with the executor
#         results = list(tqdm(executor.map(fetch_func, overlapping), total=len(overlapping)))

#     for i, hrefs in zip(overlapping, results):
#         dest_folder = f"data/raw/simulations/{i}"
#         for href in hrefs:
#             file_name = href.split("/")[-1]
#             output_path = f"{dest_folder}/{file_name}"
#             urls.append(href)
#             filenames.append(output_path)

In [8]:
# # save the urls and filenames lists
# with open("data/raw/urls.json", "w") as f:
#     json.dump(urls, f)

# with open("data/raw/filenames.json", "w") as f:
#     json.dump(filenames, f)

In [6]:
# Restore the download manifest: two parallel lists, a remote path and its local target.
with open("data/raw/urls.json") as urls_file:
    urls = json.load(urls_file)

with open("data/raw/filenames.json") as filenames_file:
    filenames = json.load(filenames_file)

In [7]:
from tqdm import tqdm

In [8]:
# State shared with the download/validation loop in the next cell. It lives in its own
# cell, so re-running only the loop cell keeps what was accumulated so far.
# Paths that passed the corruption check; they are not checked again.
non_corrupted_files = []
# Running total of files deleted as corrupted (the loop never resets it).
deleted_files_counter = 0
# Maps a file path to the number of times it was found corrupted and deleted.
three_tries = {}

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import hashlib

def is_file_corrupted(file_path, expected_hash=None):
    """Report whether ``file_path`` looks corrupted.

    Args:
        file_path: Path of the file to check.
        expected_hash: Optional MD5 hex digest. When given (and non-empty), the
            file is considered corrupted if its digest differs from it.

    Returns:
        True if the file is considered corrupted, False otherwise.

    Without ``expected_hash`` the file is handed to ``mda.Universe``; any
    exception raised while loading it is printed and treated as corruption.
    """
    if expected_hash:
        # Hash in fixed-size chunks inside a context manager: the handle is always
        # closed and large trajectory files are never held in memory in one piece.
        digest = hashlib.md5()
        with open(file_path, 'rb') as handle:
            for chunk in iter(lambda: handle.read(1024 * 1024), b''):
                digest.update(chunk)
        return digest.hexdigest() != expected_hash

    # Default check: the file counts as intact if MDAnalysis can build a Universe from it.
    # NOTE(review): MDAnalysis reads the first Universe argument as a topology, so a
    # trajectory-only path (.xtc/.dcd) may be reported as corrupted even when it is
    # intact -- confirm against the MDAnalysis version in use.
    try:
        mda.Universe(file_path)
    except Exception as e:
        print(f"Error: {e}")
        return True
    return False

def download_file(url, filename, timeout=60):
    """Download ``url`` from gpcrmd.org into ``filename`` unless the file already exists.

    Args:
        url: Path on the server; it is appended to "https://www.gpcrmd.org/".
        filename: Local destination path. Missing parent directories are created.
        timeout: Seconds passed to ``requests.get`` so that a stalled connection
            raises instead of blocking the calling worker forever.

    Returns:
        None. An existing ``filename`` is left untouched and nothing is requested.

    The HTTP status is deliberately not checked: the response body is written
    as received, and no hash or integrity check is performed here.
    """
    if os.path.exists(filename):
        return
    print(f"Downloading {filename}")
    # The context manager closes the streamed response even when writing fails.
    with requests.get("https://www.gpcrmd.org/" + url, stream=True, timeout=timeout) as r:
        # A bare file name has no directory component, and os.makedirs("") raises.
        parent_dir = os.path.dirname(filename)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)

# Download/validation loop. Each pass (1) validates every trajectory and topology file
# already on disk and deletes the ones reported as corrupted, (2) stops if every manifest
# entry is accounted for, and otherwise (3) downloads whatever is missing on disk.
# It relies on non_corrupted_files, deleted_files_counter and three_tries being
# initialised in an earlier cell, and on urls/filenames being loaded.
while True:

    print("Starting download iteration")

    print("Checking for corrupted files")

    list_of_files = os.listdir("data/raw/simulations")
    for system_folder in tqdm(list_of_files):
        files = os.listdir(f"data/raw/simulations/{system_folder}")

        # Only these five extensions are ever validated; other files are ignored here.
        traj_files = [f for f in files if f.endswith(('.xtc', '.dcd'))]
        topologies = [f for f in files if f.endswith(('.psf', '.top', '.pdb'))]

        for file in traj_files + topologies:
            file_path = f"data/raw/simulations/{system_folder}/{file}"
            # Skip files already validated, and files that were deleted three times.
            # NOTE(review): a file that reached three tries is still re-downloaded below
            # (it no longer exists on disk) and is then never checked again -- confirm
            # that keeping that unchecked copy is intended.
            if file_path in non_corrupted_files or three_tries.get(file_path, 0) >= 3:
                continue
            if is_file_corrupted(file_path):
                print(f"File {file_path} is corrupted")
                # Removing the file is what makes the download step fetch it again.
                os.remove(file_path)
                if file_path not in three_tries:
                    three_tries[file_path] = 1
                else:
                    three_tries[file_path] += 1

                # Not reset per pass, so the message below reports a running total.
                deleted_files_counter += 1
            else:
                non_corrupted_files.append(file_path)

    print(f"Deleted {deleted_files_counter} corrupted files")

    # Stop once every manifest entry is either validated or has been deleted three times.
    # NOTE(review): only files with the five extensions above are ever counted, and a
    # file that never appears on disk is never counted; if `filenames` holds such
    # entries this equality may never be reached -- confirm.
    files_with_tries = [k for k, v in three_tries.items() if v >= 3]
    if len(non_corrupted_files) + len(files_with_tries) == len(filenames):
        break

    # Queue every manifest entry whose target file is currently missing on disk.
    files_to_download = []
    urls_to_download = []
    for url, filename in zip(urls, filenames):
        if os.path.exists(filename):
            continue
        files_to_download.append(filename)
        urls_to_download.append(url)

    print(f"Downloading {len(files_to_download)} files")

    # Fetch the missing files in parallel; leaving the `with` block waits for all workers.
    # The iterator returned by executor.map is never consumed, so an exception raised
    # inside download_file is not re-raised here.
    with ThreadPoolExecutor(max_workers=10) as executor:
        executor.map(download_file, urls_to_download, files_to_download)

In [9]:
# Tally the system folders that hold at least one trajectory and at least one topology,
# then print: complete systems, total systems, percentage complete.
systems = 0
all_systems = os.listdir("data/raw/simulations")
for system_folder in all_systems:
    contents = os.listdir(f"data/raw/simulations/{system_folder}")

    # A folder counts as soon as one file of each kind is present.
    has_trajectory = any(name.endswith(('.xtc', '.dcd')) for name in contents)
    has_topology = any(name.endswith(('.psf', '.top', '.pdb')) for name in contents)

    if has_trajectory and has_topology:
        systems += 1
print(systems , len(all_systems), systems/len(all_systems)*100)

247 250 98.8


In [10]:
# Number of remote files listed in the download manifest.
len(urls)

1215

In [12]:
def download_file(url, filename, timeout=60):
    """Download ``url`` from gpcrmd.org into ``filename`` unless the file already exists.

    NOTE(review): this redefines ``download_file`` from an earlier cell; whichever
    cell ran last is the definition the executor calls.

    Args:
        url: Path on the server; it is appended to "https://www.gpcrmd.org/".
        filename: Local destination path. Missing parent directories are created.
        timeout: Seconds passed to ``requests.get`` so that a stalled connection
            raises instead of blocking the calling worker forever.

    Returns:
        None. An existing ``filename`` is reported and left untouched.

    The HTTP status is not checked; the response body is written as received.
    """
    if os.path.exists(filename):
        print(f"File {filename} already exists")
        return
    # The context manager closes the streamed response even when writing fails.
    with requests.get("https://www.gpcrmd.org/" + url, stream=True, timeout=timeout) as r:
        # Create the destination folder; without it open() fails for every new system.
        # A bare file name has no directory component, and os.makedirs("") raises.
        parent_dir = os.path.dirname(filename)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    print(f"Downloaded {filename}")

In [13]:
# Reset the list of trajectory paths known to load correctly. The validation loop in the
# next cell appends to it and skips any path already listed.
non_corrupted_files = []


In [None]:
# Validate every trajectory against its system's topology, delete the ones that fail
# with EOFError, then re-run the downloads so the deleted files are fetched again.
# tries_dict maps a trajectory path to the number of times it was deleted as corrupted.
tries_dict = {}
while True:
    deleted_files_counter = 0
    # Delete corrupted trajectories so the download step below fetches them again.
    for system_folder in tqdm(os.listdir("data/raw/simulations")):
        # List the folder once and classify its entries by extension.
        folder_files = os.listdir(f"data/raw/simulations/{system_folder}")

        xtc_files = [file for file in folder_files if file.endswith(".xtc")]
        dcd_files = [file for file in folder_files if file.endswith(".dcd")]

        # Topology candidates in order of preference: .psf, then .top, then .pdb.
        psf_files = [file for file in folder_files if file.endswith(".psf")]
        top_files = [file for file in folder_files if file.endswith(".top")]
        pdb_files = [file for file in folder_files if file.endswith(".pdb")]
        topology_candidates = psf_files + top_files + pdb_files
        if not topology_candidates:
            # Without a topology no trajectory of this system can be loaded; skip the
            # folder instead of failing with IndexError on an empty list.
            print(f"No topology file found in {system_folder}, skipping")
            continue
        topology = topology_candidates[0]
        topology_path = f"data/raw/simulations/{system_folder}/{topology}"

        for traj in xtc_files + dcd_files:
            traj_path = f"data/raw/simulations/{system_folder}/{traj}"

            if traj_path in non_corrupted_files:
                continue

            # A trajectory counts as intact if MDAnalysis can open it with the topology.
            try:
                mda.Universe(topology_path, traj_path)
            except EOFError:
                # Treated as a truncated download: remove it so it is fetched again.
                print(f"File {traj_path} is corrupted")
                os.remove(traj_path)
                deleted_files_counter += 1
                tries_dict[traj_path] = tries_dict.get(traj_path, 0) + 1
                continue
            except (ValueError, OSError) as e:
                # Reported but neither deleted nor marked as valid.
                print(e)
                continue
            print("file is not corrupted")
            # pop() rather than del: a file that was never corrupted has no entry,
            # and del would raise KeyError on the first intact trajectory.
            tries_dict.pop(traj_path, None)
            non_corrupted_files.append(traj_path)

    print(f"Deleted {deleted_files_counter} corrupted files")
    # True when every trajectory still recorded as failing has failed at least 3 times.
    all_files_tried = all(tries >= 3 for tries in tries_dict.values())
    # NOTE(review): this exits only when nothing was deleted in this pass AND tries_dict
    # is non-empty. If no trajectory is ever corrupted, tries_dict stays empty and the
    # loop keeps running -- confirm the intended stop condition.
    if deleted_files_counter == 0 and all_files_tried and tries_dict:
        break

    # Re-run all downloads; download_file returns early for files that already exist.
    # The iterator returned by executor.map is never consumed, so an exception raised
    # inside download_file is not re-raised here.
    with ThreadPoolExecutor(max_workers=10) as executor:
        executor.map(download_file, urls, filenames)

    

In [None]:
    # NOTE(review): this cell is an indented fragment with no enclosing statement, and
    # `i` and `hrefs` are not assigned in any cell visible here -- it looks like the
    # body of a removed loop and will not run as-is. Confirm before reusing.
    # Destination folder for the files of entry `i`.
    dest_folder = f"data/raw/lig_type/{i}"

    os.makedirs(dest_folder, exist_ok=True)

    for href in hrefs:
        # The file name is the last path segment of the link.
        file_name = href.split("/")[-1]
        print(file_name)

        # Download the whole file into memory (no streaming, timeout or status check).
        r = requests.get("https://www.gpcrmd.org/"+href)

        # Write the downloaded bytes into the destination folder.
        with open(f"{dest_folder}/{file_name}", "wb") as f:
            f.write(r.content)

In [None]:
# Start indices for walking complex_dynids in steps of 5 (0, 5, 10, ...).
# presumably used to batch ids into groups of five per request -- verify
list(range(0, len(complex_dynids), 5))

In [14]:
# NOTE(review): `api_call` and `dyn_ids` are not assigned in any cell visible here, so
# this cell depends on leftover kernel state. The recorded output shows a
# "downloader_all" URL followed by comma-separated ids.
api_call + dyn_ids

'https://www.gpcrmd.org/api/downloader_all/9,10,15,16,19'

In [None]:
# https://www.gpcrmd.org/dynadb/tmp/GPCRmd_downloads/download_all_14.zip