In [None]:
pip install osfclient

In [None]:
# 1) Search an OSF project for a file or folder by name

from osfclient.api import OSF

# What to look for, and in which OSF project
target_name = 'data.prep.R'  # exact name of the file or folder
project_id = 'rs6un'  # OSF project identifier

# Open the project's 'osfstorage' provider (no credentials are passed to OSF())
osf = OSF()
project = osf.project(project_id)
storage = project.storage('osfstorage')

def find_item(storage, target_name):
    """
    Depth-first search of an OSF storage (or folder) for an item named ``target_name``.

    Files directly inside ``storage`` are checked first. Each folder is then
    compared by name and, when it is not the target itself, searched
    recursively. The first match is reported on stdout and ends the search.

    Returns:
        True if a file or folder with that name was found, otherwise False.
    """
    # Files at this level are examined before anything deeper in the tree
    for entry in storage.files:
        if entry.name != target_name:
            continue
        print(f"Found file: {entry.name} at path: {entry.path}")
        return True

    for subfolder in storage.folders:
        if subfolder.name == target_name:
            print(f"Found folder: {subfolder.name} at path: {subfolder.path}")
            return True

        # Not the target itself, so descend into it
        print(f"Entering folder: {subfolder.name}")
        if find_item(subfolder, target_name):
            return True

    return False

# Search from the storage root and report a miss
item_found = find_item(storage, target_name)
if not item_found:
    print(f"The item '{target_name}' was not found in the project.")


In [None]:
# 2) Search an OSF project for a file or folder by name and download it locally

from osfclient.api import OSF
import os

# What to fetch, and from which OSF project
target_name = 'data.prep.R'  # The specific file or folder you want to download
project_id = 'rs6un'  # Replace with your actual project ID

# Downloads are written beneath the directory the notebook is running in
download_directory = os.getcwd()

# Open the project's 'osfstorage' provider (no credentials are passed to OSF())
osf = OSF()
project = osf.project(project_id)
storage = project.storage('osfstorage')

def download_file(file, download_path, osf_path):
    """
    Save one OSF file into ``download_path`` and report progress on stdout.

    Parameters:
        file: object exposing ``name`` and ``write_to(fileobj)``.
        download_path: local directory that receives the file; it must
            already exist because the file is opened for writing directly.
        osf_path: location of the file inside the OSF project. It is used
            only in the progress message.

    When ``google.colab`` can be imported, a browser download of the saved
    file is triggered as well; otherwise a note is printed instead.
    """
    local_target = os.path.join(download_path, file.name)

    print(f"Found file '{file.name}' in OSF path: '{osf_path}'")
    print(f"Downloading file to {local_target}...")
    with open(local_target, 'wb') as sink:
        file.write_to(sink)
    print(f"Downloaded '{file.name}' successfully to {local_target}.")

    # Outside Colab the import fails and the file simply stays on disk
    try:
        from google.colab import files as colab_files
        colab_files.download(local_target)
        print(f"Triggered download of '{file.name}' to your local machine.")
    except ImportError:
        print("Not running in Google Colab. The file is saved locally.")

def download_folder(folder, download_path, osf_path):
    """
    Recursively downloads all contents of a folder to the specified path.

    Parameters:
        folder: object exposing ``name``, ``files`` and ``folders``.
        download_path: local directory under which a directory named after
            ``folder`` is created.
        osf_path: location of ``folder`` inside the OSF project; it is
            extended with each child's name and passed on for reporting.

    Raises:
        FileExistsError: if the local target path already exists but is not
            a directory.
    """
    folder_path = os.path.join(download_path, folder.name)
    # exist_ok=True replaces the separate exists() check: there is no window
    # between checking and creating, and re-running over an existing tree works.
    os.makedirs(folder_path, exist_ok=True)

    print(f"Found folder '{folder.name}' in OSF path: '{osf_path}'")

    # Files directly inside this folder
    for file in folder.files:
        download_file(file, folder_path, f"{osf_path}/{file.name}")

    # Then every subfolder, mirroring the hierarchy locally
    for subfolder in folder.folders:
        download_folder(subfolder, folder_path, f"{osf_path}/{subfolder.name}")

def find_and_download_item(storage, target_name, download_path, osf_path=""):
    """
    Locate ``target_name`` in an OSF storage tree and download the first match.

    Files directly inside ``storage`` are checked first and a match is
    downloaded with ``download_file``. Otherwise each folder is compared by
    name: a matching folder is downloaded in full with ``download_folder``,
    and a non-matching folder is searched recursively.

    Parameters:
        storage: object exposing ``files`` and ``folders``.
        target_name: exact name of the file or folder to download.
        download_path: local directory that receives the download.
        osf_path: path prefix of ``storage`` inside the project, used for
            the progress messages of the download helpers.

    Returns:
        True if a match was found and handed to a download helper, else False.
    """
    # First file at this level carrying the requested name, if any
    file_match = next(
        (entry for entry in storage.files if entry.name == target_name),
        None,
    )
    if file_match is not None:
        download_file(file_match, download_path, osf_path + '/' + file_match.name)
        return True

    for subfolder in storage.folders:
        child_path = osf_path + '/' + subfolder.name

        if subfolder.name == target_name:
            print(f"Found folder: {subfolder.name}. Downloading all contents to {download_path}...")
            download_folder(subfolder, download_path, child_path)
            return True

        # Not the target itself, so keep looking inside it
        if find_and_download_item(subfolder, target_name, download_path, child_path):
            return True

    return False

# Search from the storage root; a match is downloaded into download_directory
item_downloaded = find_and_download_item(storage, target_name, download_directory)
if not item_downloaded:
    print(f"The item '{target_name}' was not found in the project.")


In [None]:
# 3) Search an OSF project for common dependency files
from osfclient.api import OSF

# OSF project whose storage will be scanned
project_id = '7h94n'  # Replace with your actual project ID

# File names that count as a dependency file; matching is by exact name
dependency_files = [
    'renv.lock',
    'sessionInfo.txt',
    'sessionInfo.RData',
    '.Rprofile',
    'DESCRIPTION',
    'NAMESPACE',
    'requirements.txt',
    'environment.yml',
    'Dockerfile',
    'README.md',
    'README.txt',
    'Makefile',
    'metadata.yml',
    'metadata.json',
]

# Open the project's 'osfstorage' provider (no credentials are passed to OSF())
osf = OSF()
project = osf.project(project_id)
storage = project.storage('osfstorage')

def search_dependency_files(storage, dependency_files, current_path="", found_files=None):
    """
    Recursively searches for common dependency files in the OSF storage and records their OSF paths.

    Parameters:
        storage: object exposing ``files`` and ``folders`` (a storage or folder).
        dependency_files: collection of file names that count as a match.
        current_path: OSF path of ``storage`` itself; empty for the root.
        found_files: dict to accumulate results into. A new dict is created
            when omitted, so separate top-level calls never share state.

    Returns:
        The ``found_files`` dict, mapping file name -> OSF path. Because the
        mapping is keyed by file name, a name that occurs in several folders
        keeps only the path that was visited last.
    """
    # A literal {} default would be created once and reused by every call,
    # leaking matches from one project into the next.
    if found_files is None:
        found_files = {}

    # Check for matching files in storage.files
    for file in storage.files:
        if file.name in dependency_files:
            # Construct the full OSF path, including folders
            full_osf_path = f"{current_path}/{file.name}" if current_path else f"/{file.name}"
            # Report and store only a new name or a new path for a known name
            if file.name not in found_files or found_files[file.name] != full_osf_path:
                print(f"Found dependency file: {file.name} in OSF path: {full_osf_path}")
                found_files[file.name] = full_osf_path

    # Recursively search folders, sharing the same accumulator
    for folder in storage.folders:
        # Build the new path by appending the current folder's name
        new_path = f"{current_path}/{folder.name}" if current_path else f"/{folder.name}"
        search_dependency_files(folder, dependency_files, new_path, found_files)

    return found_files

# Scan the whole storage tree of the project selected above
unique_files = search_dependency_files(storage, dependency_files)

# Report either the collected OSF paths or the absence of any match
if unique_files:
    print("\nList of unique dependency files with their OSF paths:")
    for osf_location in unique_files.values():
        print(osf_location)
else:
    print("No dependency files were found in the project.")



In [None]:
import pandas as pd

# Load the OSF metadata table; the relative path is resolved against the working directory
osf_metadata = pd.read_csv("OSF_Meta_Data_SC_v0.csv")
osf_metadata  # last expression of the cell, so the notebook displays the frame

In [None]:
# Distinct non-null entries of the "Source" column as a plain Python list
source_url_list = list(
    osf_metadata["Source"]
    .dropna()
    .unique()
)
source_url_list

In [None]:
# Number of distinct source URLs that the batch scan will visit
len(source_url_list)

In [None]:
from tqdm import tqdm

osf = OSF()

# Map each source URL to the dependency files found in that project's storage
all_unique_files = {}
for url in tqdm(source_url_list):
    # The project ID is taken as the second-to-last '/'-separated piece, which
    # presumably relies on every URL ending in '/' — TODO confirm against the CSV
    project_id = url.split('/')[-2]
    project = osf.project(project_id)
    storage = project.storage('osfstorage')
    # Give every project its own accumulator so that results from different
    # projects never end up in one shared dict.
    unique_files = search_dependency_files(storage, dependency_files, found_files={})

    all_unique_files[url] = unique_files

all_unique_files