In [1]:
import json
from pathlib import Path
from tqdm.autonotebook import tqdm
import pickle
import ramanspy as rp
import numpy as np
import json
from pathlib import Path

import requests
import time
import os
from typing import List, Dict, Optional
import json
BASE_URL = "https://api.ramanbase.org"

class RamanbaseAPIN:
    def __init__(self, api_token: str):
        self.token = api_token.strip()
        self.session = requests.Session()
        self.session.headers.update({
            "Authorization": f"Token {self.token}",
            "Accept": "application/json"
        })

        print("initted")
    def fetch_and_save_available_spectra(self, output_dir: str = "."):
        """Fetch available public spectra index (up to API limit) and save it."""
        url = f"{BASE_URL}/api/v1/public/spectra/list"
        page_size = 100
        all_spectra: List[Dict] = []
        page = 1
        max_pages = 1400  # Safety cap to avoid infinite loop if bug

        os.makedirs(output_dir, exist_ok=True)
        full_path = os.path.join(output_dir, "ramanbase_available_spectra_full.json")
        simple_path = os.path.join(output_dir, "ramanbase_spectra_id_name.json")
        #print('running here')
        while page <= max_pages:
            print(f'run page {page}')
            params = {"page": page, "page_size": page_size}
            try:
                response = self.session.get(url, params=params)
                response.raise_for_status()
                data = response.json()
            except requests.exceptions.HTTPError as e:
                print(f"Error on page {page}: {e} (likely API limit reached)")
                break

            results = data.get("results", [])
            if not results:
                print(f"Page {page}: Empty results. API limit reached.")
                break

            all_spectra.extend(results)
            with open(full_path, "w", encoding="utf-8") as f:
                json.dump(all_spectra, f, indent=2, ensure_ascii=False)
            print(f"Page {page}: Fetched {len(results)} spectra (cumulative: {len(all_spectra)})")

            page += 1
            time.sleep(1)  # Delay

        print(f"\nFetching complete! Total spectra downloaded: {len(all_spectra)}")
        print("Note: If only 100, that's the current API limit for public list access.")

        # Save full data
        with open(full_path, "w", encoding="utf-8") as f:
            json.dump(all_spectra, f, indent=2, ensure_ascii=False)
        print(f"Full details saved to: {full_path}")

        # Save simple ID/Name list
        simple_list = [
            {
                "id": spec["id"],
                "name": spec.get("name") or "Unnamed",
                "identifier": spec.get("identifier", "N/A")
            }
            for spec in all_spectra
        ]
        with open(simple_path, "w", encoding="utf-8") as f:
            json.dump(simple_list, f, indent=2, ensure_ascii=False)
        print(f"Simple ID/Name list saved to: {simple_path}")

        return all_spectra

    def get_processed_data(self, spectrum_id: int) -> Dict:
        url = f"{BASE_URL}/api/v1/public/spectra/{spectrum_id}/download/processed"
        response = self.session.get(url)
        response.raise_for_status()
        return response.json()

    def download_raw_file(self, spectrum_id: int, save_path: Optional[str] = None):
        url = f"{BASE_URL}/api/v1/public/spectra/{spectrum_id}/download/raw"
        response = self.session.get(url, stream=True)
        response.raise_for_status()

        if save_path is None:
            disposition = response.headers.get('Content-Disposition', '')
            filename = disposition.split('filename=')[-1].strip('"') if 'filename=' in disposition else f"spectrum_{spectrum_id}_raw"
            save_path = filename

        os.makedirs(os.path.dirname(save_path) or '.', exist_ok=True)
        with open(save_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"Raw file saved to {save_path}")

json_path = Path("ramanbase_available_spectra_full.json") #RamanBase index (in the repository)

with json_path.open("r", encoding="utf-8") as f:
    spectra_index = json.load(f)

  from tqdm.autonotebook import tqdm


# Dataset assembly based on RamanBase resource

# 1. Selecting the spectra of interest

In [7]:
#example querywords for dataset assembly
querywords = [
    # --- solvents & small organics ---
    "water",
    "methanol",
    "ethanol",
    "isopropanol",
    "isopropyl alcohol",
    "propanol",
    "butanol",
    "glycerol",
    "ethylene glycol",
    "acetone",
    "methyl ethyl ketone",
    "acetonitrile",
    "dimethyl sulfoxide",
    "DMSO",
    "dimethylformamide",
    "DMF",
    "tetrahydrofuran",
    "THF",
    "diethyl ether",
    "dioxane",
    "chloroform",
    "dichloromethane",
    "carbon tetrachloride"]

In [8]:
index_fordset=[]
names=[]
for dict in spectra_index:
    for word in querywords:
        try:
            if word in dict['name'].lower():
                index_fordset.append(dict)
                names.append(dict['name'])
        except:
            pass
print(f'Found {len(index_fordset)} dataset entries')

Found 1155 dataset entries


# 2. Fetching and saving the dataset

In [None]:
tok="" #your RamanBase API (https://ramanbase.org/) token
api = RamanbaseAPIN(api_token=tok)

initted


In [None]:
from tqdm.autonotebook import tqdm
dsetpath='ramanbaseprocspec_ex.json'
all_spectra = []
ind=0
start=False
for spec in tqdm(index_fordset):
    specid=spec['id']
    specname=spec['name']
    specidentifier=spec['identifier']
    url=spec['url']
    try:
        res=api.get_processed_data(specid)
        res['id']=specid
        res['identifier']=specidentifier
        res['name']=specname
        res['url']=url
        res['series']=spec['series']
        res['spectroscopy_type']=spec['spectroscopy_type']
        all_spectra.append(res)
        time.sleep(1)
        ind+=1
        if ind%100==0:
            with open(dsetpath, "w", encoding="utf-8") as f:
                json.dump(all_spectra, f, indent=2, ensure_ascii=False)
    except Exception as e:
        print(e)

In [None]:
json_path = Path("ramanbaseprocspec_ex.json")

with json_path.open("r", encoding="utf-8") as f:
    rbase = json.load(f)

rbase_specdict={}
for spec in tqdm(rbase):
    try:
        specax=spec['x']
        specint=spec['y']
        id=spec['id']
        name=spec['name']
        url=spec['url']
        spectrum=rp.Spectrum(np.array(specint[0]), np.array(specax))

        rbase_specdict[id]={'spectrum':spectrum,
                            'name':name,
                            'url':url,
                            'identifier':spec['identifier']}
    except Exception as e:
        print(e)
with open("rbase_specdictcur.pkl", "wb") as f: #this file is used by the app
    pickle.dump(rbase_specdict, f)

  0%|          | 0/1100 [00:00<?, ?it/s]

In [3]:
with open("rbase_specdictcur.pkl", "rb") as f:
    specdict=pickle.load(f)

In [4]:
sw='sulfoxide'
[specdict[i] for i  in list(specdict.keys()) if sw in specdict[i]['name'].lower()]

[{'spectrum': <ramanspy.core.Spectrum at 0x7cd1c9e7d210>,
  'name': 'Dimethyl sulfoxide-d6 CAS 2206-27-1',
  'url': None,
  'identifier': 'EsO7C3'}]