In [1]:
pip install osfclient

Collecting osfclient
  Downloading osfclient-0.0.5-py2.py3-none-any.whl.metadata (5.5 kB)
Downloading osfclient-0.0.5-py2.py3-none-any.whl (39 kB)
Installing collected packages: osfclient
Successfully installed osfclient-0.0.5
Note: you may need to restart the kernel to use updated packages.


In [2]:
# 1) Python script to search for the file or a folder through OSF Project

from osfclient.api import OSF

# Set up the OSF project details
project_id = 'rs6un'  # Project ID
target_name = 'data.prep.R'  # The specific file or folder name

# Initialize OSF and access the project
osf = OSF()
project = osf.project(project_id)
storage = project.storage('osfstorage')

def find_item(storage, target_name):
    """
    Searches through the OSF storage to find the specified file or folder.
    """
    # If the target_name is a file name then check for matching files in storage.files
    for file in storage.files:
        if file.name == target_name:
            print(f"Found file: {file.name} at path: {file.path}")
            return True

    # If the target_name is a folder name then check for matching folders in storage.folders
    for folder in storage.folders:
        if folder.name == target_name:
            print(f"Found folder: {folder.name} at path: {folder.path}")
            return True
        else:
            print(f"Entering folder: {folder.name}")
            # Recursively search within each folder
            if find_item(folder, target_name):
                return True

    return False

# Start the search
if not find_item(storage, target_name):
    print(f"The item '{target_name}' was not found in the project.")


Found file: data.prep.R at path: /Code and simulations results first draft - July 2020/Simulation Study II - Predictive Accuracy/data.prep.R


In [None]:
# 2) Python script to search for the file or a folder through OSF Project and download it locally

from osfclient.api import OSF
import os

# Set up the OSF project details
project_id = 'rs6un'  # Replace with your actual project ID
target_name = 'data.prep.R'  # The specific file or folder you want to download

# Get the current working directory as the download path
download_directory = os.getcwd()  # This sets the path to the current directory

# Initialize OSF and access the project
osf = OSF()
project = osf.project(project_id)
storage = project.storage('osfstorage')

def download_file(file, download_path, osf_path):
    """
    Downloads a single file to the specified path and prints where it was found.
    """
    file_path = os.path.join(download_path, file.name)
    print(f"Found file '{file.name}' in OSF path: '{osf_path}'")
    print(f"Downloading file to {file_path}...")
    with open(file_path, 'wb') as f:
        file.write_to(f)
    print(f"Downloaded '{file.name}' successfully to {file_path}.")

    # If running in Google Colab, trigger a download to the local machine
    try:
        from google.colab import files
        files.download(file_path)
        print(f"Triggered download of '{file.name}' to your local machine.")
    except ImportError:
        print("Not running in Google Colab. The file is saved locally.")

def download_folder(folder, download_path, osf_path):
    """
    Recursively downloads all contents of a folder to the specified path.
    """
    folder_path = os.path.join(download_path, folder.name)
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

    print(f"Found folder '{folder.name}' in OSF path: '{osf_path}'")

    # Download all files in the folder
    for file in folder.files:
        download_file(file, folder_path, osf_path + '/' + file.name)

    # Recursively download all subfolders
    for subfolder in folder.folders:
        download_folder(subfolder, folder_path, osf_path + '/' + subfolder.name)

def find_and_download_item(storage, target_name, download_path, osf_path=""):
    """
    Searches through the OSF storage to find and download the specified file or folder.
    """
    # Check for matching files in storage.files
    for file in storage.files:
        if file.name == target_name:
            download_file(file, download_path, osf_path + '/' + file.name)
            return True

    # Check for matching folders in storage.folders
    for folder in storage.folders:
        if folder.name == target_name:
            print(f"Found folder: {folder.name}. Downloading all contents to {download_path}...")
            download_folder(folder, download_path, osf_path + '/' + folder.name)
            return True
        else:
            # Recursively search within each folder
            if find_and_download_item(folder, target_name, download_path, osf_path + '/' + folder.name):
                return True

    return False

# Start the search and download process
if not find_and_download_item(storage, target_name, download_directory):
    print(f"The item '{target_name}' was not found in the project.")


In [28]:
# 3) To search for the dependency file of the OSF project
from osfclient.api import OSF

# Set up the OSF project details
project_id = '7h94n'  # Replace with your actual project ID

# List of common dependency file names to search for
dependency_files = [
    'renv.lock', 'sessionInfo.txt', 'sessionInfo.RData', '.Rprofile', 'DESCRIPTION', 'NAMESPACE',
    'requirements.txt', 'environment.yml', 'Dockerfile', 'README.md', 'README.txt', 'Makefile',
    'metadata.yml', 'metadata.json'
]

# Initialize OSF and access the project
osf = OSF()
project = osf.project(project_id)
storage = project.storage('osfstorage')

def search_dependency_files(storage, dependency_files, current_path="", found_files={}):
    """
    Recursively searches for common dependency files in the OSF storage and records unique OSF paths if found.
    """
    # Check for matching files in storage.files
    for file in storage.files:
        if file.name in dependency_files:
            # Construct the full OSF path, including folders
            full_osf_path = f"{current_path}/{file.name}" if current_path else f"/{file.name}"
            # Store only if file is unique or appears in a different path
            if file.name not in found_files or found_files[file.name] != full_osf_path:
                print(f"Found dependency file: {file.name} in OSF path: {full_osf_path}")
                found_files[file.name] = full_osf_path

    return found_files

# Start the search for dependency files
unique_files = search_dependency_files(storage, dependency_files)

# Print the result
if not unique_files:
    print("No dependency files were found in the project.")
else:
    print("\nList of unique dependency files with their OSF paths:")
    for file_path in unique_files.values():
        print(file_path)



Found dependency file: README.md in OSF path: /README.md

List of unique dependency files with their OSF paths:
/README.md


In [6]:
import pandas as pd

osf_metadata = pd.read_csv("OSF_Meta_Data_SC_v0.csv")
osf_metadata

Unnamed: 0,Author,Datasets,Packages,Output Types,Output Names,Source,Title,Domain,Publication date,Date modified,DOI,File name,License;,Unnamed: 13
0,Alessandra S. Souza,,,,,https://osf.io/9qdkv/,Gaze-based and Attention-based Rehearsal in Sp...,Psychology,3/11/18,3/6/23,,11207_checkFixationsAgainstMatrix_E2_R1.R,No License;,
1,"Alessandra S. Souza,None,""['grid', 'gridExtra'...",,,,,,,,,,,,,
2,Alessandra S. Souza,['dotSpan_E5_toneTask.txt'],,{'TXT'},{'ToneTask_E5_results.txt'},https://osf.io/9qdkv/,Gaze-based and Attention-based Rehearsal in Sp...,Psychology,3/11/18,3/6/23,,11207_Spatial_Span_ToneTask_E5.R,No License;,
3,"Alessandra S. Souza,""['E2_dataMatrix_Loc.txt',...",,,,,,,,,,,,,
4,Alessandra S. Souza,,,,,https://osf.io/9qdkv/,Gaze-based and Attention-based Rehearsal in Sp...,Psychology,3/11/18,3/6/23,,11207_summarySEwithin.R,No License;,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7550,Ying-Yu Hsieh,,,,,https://osf.io/bh9cq/,Impact of noise in coffee shop (HW #7 Open Sci...,Social Science,5/21/17,5/24/17,,631_simulation_script.R,No License;,
7551,Ying-Yu Hsieh,,,,,https://osf.io/bh9cq/,Impact of noise in coffee shop (HW #7 Open Sci...,Social Science,5/21/17,5/24/17,,631_analysis.R,No License;,
7552,"Joshua Conrad Jackson,""['campnet.gexf', '~/Des...",,,,,,,,,,,,,
7553,"Joshua Conrad Jackson,""['https://culturalevo.c...",,,,,,,,,,,,,


In [7]:
source_url_list = list(osf_metadata.Source.dropna().unique())
source_url_list

['https://osf.io/9qdkv/',
 'https://osf.io/ynbwu/',
 'https://osf.io/e3jrc/',
 'https://osf.io/fw7vz/',
 'https://osf.io/cvw8s/',
 'https://osf.io/c4v7g/',
 'https://osf.io/ypcqn/',
 'https://osf.io/zpse3/',
 'https://osf.io/a9bv6/',
 'https://osf.io/g7a49/',
 'https://osf.io/4ya6x/',
 'https://osf.io/qjwht/',
 'https://osf.io/q7nc4/',
 'https://osf.io/s7u23/',
 'https://osf.io/bdw5r/',
 'https://osf.io/5cpts/',
 'https://osf.io/z643c/',
 'https://osf.io/4pfes/',
 'https://osf.io/8vapw/',
 'https://osf.io/ehz9s/',
 'https://osf.io/9qbwv/',
 'https://osf.io/xh36s/',
 'https://osf.io/6v3r9/',
 'https://osf.io/rmkge/',
 'https://osf.io/cxy58/',
 'https://osf.io/9b7mk/',
 'https://osf.io/hfjgw/',
 'https://osf.io/vguey/',
 'https://osf.io/q2zrp/',
 'https://osf.io/94jyp/',
 'https://osf.io/3ztyb/',
 'https://osf.io/et2dz/',
 'https://osf.io/hrf5t/',
 'https://osf.io/uqyff/',
 'https://osf.io/qynhu/',
 'https://osf.io/jk2zf/',
 'https://osf.io/cvwu9/',
 'https://osf.io/n7dw9/',
 'https://os

In [8]:
len(source_url_list)

890

In [None]:
from tqdm import tqdm

osf = OSF()

all_unique_files = {}
for url in tqdm(source_url_list):
    project_id = url.split('/')[-2]
    project = osf.project(project_id)
    storage = project.storage('osfstorage')
    unique_files = search_dependency_files(storage, dependency_files)
    
    all_unique_files[url] = unique_files

all_unique_files

In [44]:
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import pickle
import os

# Define dependencies to search for
dependency_files = [
    'renv.lock', 'sessionInfo.txt', 'sessionInfo.RData', '.Rprofile', 'DESCRIPTION',
    'NAMESPACE', 'requirements.txt', 'environment.yml', 'Dockerfile', 'README.md',
    'README.txt', 'Makefile', 'metadata.yml', 'metadata.json'
]
dependency_files_set = set(dependency_files)  # Using a set for faster lookups

# Read data
osf_metadata = pd.read_csv("OSF_Meta_Data_SC_v0.csv")
source_url_list = list(osf_metadata.Source.dropna().unique())

# Initialize OSF client
osf = OSF()

# Define caching mechanism to avoid re-searching projects
#cache_file = 'osf_file_cache.pkl'
#if os.path.exists(cache_file):
#    with open(cache_file, 'rb') as f:
#        file_cache = pickle.load(f)
#else:
file_cache = {}

# Function to search for dependency files in the project
def search_dependency_files(storage, dependency_files_set, max_depth=2, current_depth=0):
    unique_files = []

    # Limit depth to avoid deep recursion
    if current_depth > max_depth:
        return unique_files

    try:
        # Search for files in the current storage level
        for file in storage.files:
            if file.name in dependency_files_set:
                unique_files.append(file.name)
                if len(unique_files) == len(dependency_files_set):
                    return unique_files  # Early exit if all dependencies are found

    except Exception as e:
        print(f"Error accessing storage: {e}")

    return unique_files

# Fetch unique files with project ID and caching
def fetch_unique_files(url):
    project_id = url.split('/')[-2]

    # Check if project is already cached
    if project_id in file_cache:
        unique_files = file_cache[project_id]
    else:
        try:
            # Access project storage
            project = osf.project(project_id)
            storage = project.storage('osfstorage')
            
            # Perform optimized search for dependency files
            unique_files = search_dependency_files(storage, dependency_files_set)
            
            # Cache the results
            file_cache[project_id] = unique_files
            with open(cache_file, 'wb') as f:
                pickle.dump(file_cache, f)
        except Exception as e:
            print(f"Error processing project {project_id}: {e}")
            unique_files = []

    return url, project_id, unique_files

# Initialize dictionary to store all results
all_unique_files = {}

# Use ThreadPoolExecutor for parallel processing
with ThreadPoolExecutor(max_workers=5) as executor:
    futures = {executor.submit(fetch_unique_files, url): url for url in source_url_list}
    for future in tqdm(as_completed(futures), total=len(futures), mininterval=0.5):
        try:
            url, project_id, unique_files = future.result()
            # Store results as a dictionary entry with project_id and found files
            all_unique_files[url] = {
                "project_id": project_id,
                "files": unique_files
            }
        except Exception as e:
            print(f"Error retrieving result: {e}")

# Print the result
print("\nList of unique dependency files found in each project:")
for url, details in all_unique_files.items():
    print(f"\nProject URL: {url}")
    print(f"Project ID: {details['project_id']}")
    print("Files Found:", details["files"])


 32%|█████████████▏                           | 285/890 [15:53<19:25,  1.93s/it]

Error processing project cnqta: Response has status code 410 not (200,)


 36%|██████████████▌                          | 316/890 [18:30<37:21,  3.90s/it]

Error processing project p3eu8: Response has status code 410 not (200,)


 41%|████████████████▉                        | 369/890 [21:53<51:11,  5.90s/it]

Error processing project qg8xw: 


 51%|████████████████████▉                    | 455/890 [26:31<21:16,  2.93s/it]

Error retrieving result: list index out of range


 52%|█████████████████████▍                   | 466/890 [26:57<16:13,  2.30s/it]

Error processing project uq4pd: Response has status code 410 not (200,)


100%|█████████████████████████████████████████| 890/890 [49:14<00:00,  3.32s/it]


List of unique dependency files found in each project:

Project URL: https://osf.io/e3jrc/
Project ID: e3jrc
Files Found: []

Project URL: https://osf.io/fw7vz/
Project ID: fw7vz
Files Found: []

Project URL: https://osf.io/9qdkv/
Project ID: 9qdkv
Files Found: []

Project URL: https://osf.io/c4v7g/
Project ID: c4v7g
Files Found: []

Project URL: https://osf.io/zpse3/
Project ID: zpse3
Files Found: []

Project URL: https://osf.io/cvw8s/
Project ID: cvw8s
Files Found: []

Project URL: https://osf.io/g7a49/
Project ID: g7a49
Files Found: []

Project URL: https://osf.io/a9bv6/
Project ID: a9bv6
Files Found: []

Project URL: https://osf.io/4ya6x/
Project ID: 4ya6x
Files Found: []

Project URL: https://osf.io/s7u23/
Project ID: s7u23
Files Found: []

Project URL: https://osf.io/ynbwu/
Project ID: ynbwu
Files Found: []

Project URL: https://osf.io/bdw5r/
Project ID: bdw5r
Files Found: []

Project URL: https://osf.io/5cpts/
Project ID: 5cpts
Files Found: []

Project URL: https://osf.io/q7nc4




In [45]:
import csv

output_file = "osf_dependency_results.csv"

with open(output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Project URL", "Project ID", "Files Found"])

    for url, details in all_unique_files.items():
        project_id = details.get("project_id", "N/A")  # Get project ID, default to "N/A" if missing
        files = ", ".join(details.get("files", []))   # Join list of files into a single string
        writer.writerow([url, project_id, files])
