In [7]:
# Run settings shared by the cells below: the Scopus search query, the folder
# that receives the downloaded JSON files, and the first and last year to process.
config = dict(
    query="hi",
    output_folder="/content/output",
    start_year=2022,
    end_year=2023,
)

In [8]:
# Credentials must not be stored in the notebook. The key is read from the
# PYB_API_KEY environment variable, which has to be set before this cell runs.
# The key that used to be hard-coded on this line is exposed and should be revoked.
import os

api_key = os.environ.get("PYB_API_KEY", "")
if not api_key:
    print("WARNING: PYB_API_KEY is not set; Scopus API calls will fail without a key.")

In [9]:
import os
from pybliometrics.scopus.utils import create_config
import traceback
from pybliometrics.scopus import ScopusSearch, AbstractRetrieval
import json
from tqdm import tqdm
from collections import Counter
import time

# Publish the key under PYB_API_KEY in this process's environment.
# NOTE(review): presumably pybliometrics picks it up from there - confirm.
os.environ.update({"PYB_API_KEY": api_key})

# Interactive configuration is currently switched off:
# create_config()

def find_json_files(directory):
    """
    Recursively collects the paths of all ``.json`` files below a folder.

    Parameters:
    - directory (str): Root folder to walk.

    Returns:
    - list[str]: One entry per file whose name ends with ".json" (case-sensitive),
      formed by joining the containing folder with the file name, in the order
      ``os.walk`` yields them.
    """
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(directory)
        for name in names
        if name.endswith(".json")
    ]


In [14]:
def _format_ref_authors(authors):
    """Returns a reference's authors as one "; "-separated string ("" when missing)."""
    if not authors:
        return ""
    if isinstance(authors, str):
        # Already a single string: joining it would put "; " between every character.
        return authors
    return "; ".join(str(author) for author in authors)


def download_scopus_data(api_key, config, rate_limit=0):
    """
    Downloads Scopus search hits plus their reference lists, one JSON file per document.

    For every year from ``start_year`` to ``end_year`` (inclusive) the configured
    query is run restricted to that publication year, and each hit is written to
    ``<output_folder>/<year>/<eid>.json``. Documents whose file already exists are
    skipped, so an interrupted run can be resumed.

    Parameters:
    - api_key (str): API key passed to the reference retrieval call.
    - config (dict): Must contain "query", "output_folder", "start_year" and "end_year".
    - rate_limit (float): Seconds to sleep after each retrieved document. Default is 0.

    Returns:
    - None. Results are written to disk; per-document errors are printed with a
      traceback and the loop continues with the next document.
    """
    query = config["query"]
    output_folder = config["output_folder"]
    start_year = config["start_year"]
    end_year = config["end_year"]

    for year in range(start_year, end_year + 1):
        print(f"Processing documents for year {year}")
        folder_path = os.path.join(output_folder, str(year))
        os.makedirs(folder_path, exist_ok=True)

        # Restrict the search to the current year; without this every year
        # folder would receive the same, unfiltered result set. The parentheses
        # keep an OR inside the user query from escaping the year filter.
        year_query = f"({query}) AND PUBYEAR IS {year}"
        search = ScopusSearch(year_query, view="STANDARD", subscriber=True)

        # The results attribute may be None rather than an empty list when
        # nothing matches, so normalise it before len() and iteration.
        results = search.results or []
        print(f"Query: {year_query} Results count: {len(results)}")

        for doc in tqdm(results, desc=f"Processing documents for year {year}"):
            print(f"Processing document: {doc.eid}")
            try:
                doc_dict = doc._asdict()
                eid = doc_dict["eid"]
                file_path = os.path.join(folder_path, f"{eid}.json")

                if os.path.exists(file_path):
                    print("SKIP (File already exists)")
                    continue

                document = AbstractRetrieval(eid, view="REF", apikey=api_key)

                # references may be None for documents without a reference list.
                doc_dict["ref_docs"] = [
                    {
                        # Stored so that extract_references can tell references apart.
                        "id": getattr(ref, "id", None),
                        "doi": ref.doi,
                        "title": ref.title,
                        # Kept as a one-element list for compatibility with
                        # files written by earlier runs.
                        "authors": [_format_ref_authors(ref.authors)],
                        "source": ref.sourcetitle,
                    }
                    for ref in (document.references or [])
                ]

                # Serialise before opening the file so that a failure cannot
                # leave a truncated JSON file that later runs would skip.
                payload = json.dumps(doc_dict)
                with open(file_path, "w", encoding="utf-8") as json_file:
                    json_file.write(payload)

                # Pause only after a document was actually retrieved.
                time.sleep(rate_limit)

            except Exception as e:
                print(f"An error occurred: {e}")
                traceback.print_exc()

In [15]:
def extract_references(json_folder, top_n=25):
    """
    Counts how often each reference occurs across the downloaded papers.

    Every JSON file found below ``json_folder`` is read and each entry of its
    "ref_docs" list is reduced to the string "<doi>\\t<title>\\t<id>". The
    ``top_n`` most frequent strings are printed together with their counts.

    Parameters:
    - json_folder (str): Folder that is searched recursively for JSON files.
    - top_n (int): Number of most common references to report. Default is 25.

    Returns:
    - list[tuple[str, int]]: The reported (reference string, count) pairs,
      most frequent first.
    """
    json_files = find_json_files(json_folder)
    all_refs = Counter()

    for json_file in tqdm(json_files, desc="Processing JSON files"):
        print(f"Processing {json_file}")
        try:
            with open(json_file, "r", encoding="utf-8") as file:
                data = json.load(file)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            print(f"Error reading {json_file}: {e}")
            continue

        # Files without a usable "ref_docs" list (other JSON files in the
        # folder, or papers without references) contribute nothing.
        ref_docs = data.get("ref_docs") if isinstance(data, dict) else None
        if not ref_docs:
            continue

        for ref in ref_docs:
            # "or ''" maps stored nulls to an empty field instead of the text "None".
            doi = ref.get("doi") or ""
            # The downloader stores the source title under "source";
            # "sourcetitle" is still honoured in case files use that key.
            title = ref.get("title") or ref.get("sourcetitle") or ref.get("source") or ""
            refid = ref.get("id") or ""
            all_refs[f"{doi}\t{title}\t{refid}"] += 1

    top_items = all_refs.most_common(top_n)

    print("Most common references:")
    for item, count in top_items:
        print(f"{item}\t{count}")

    return top_items

In [16]:
if __name__ == "__main__":
    # Fetch the search hits and their reference lists for every configured year
    download_scopus_data(api_key, config)

    # The downloader writes its files below the configured output folder
    json_folder = config["output_folder"]

    # Tally the references found in those files and print the most common ones
    extract_references(json_folder)

Processing documents for year 2022


KeyboardInterrupt: 

In [18]:
import requests
import json

# Credentials must not be stored in the notebook. The key is read from the
# PYB_API_KEY environment variable, which has to be set before this cell runs.
# The key that used to be hard-coded on this line is exposed and should be revoked.
import os

api_key = os.environ.get("PYB_API_KEY", "")
if not api_key:
    print("WARNING: PYB_API_KEY is not set; Scopus API calls will fail without a key.")

def fetch_scopus_abstracts(api_key, query, count=25, view='STANDARD', response_format='json', timeout=30):
    """
    Fetches search results from the Scopus Search API.

    Parameters:
    - api_key (str): Your API key for authentication (sent in the X-ELS-APIKey header).
    - query (str): Search query.
    - count (int): Number of results to fetch. Default is 25.
    - view (str): Level of detail in the response ('STANDARD' or 'COMPLETE').
    - response_format (str): Format of the response ('json' or 'xml'). Default is 'json'.
    - timeout (float): Seconds to wait for the server before giving up. Default is 30.

    Returns:
    - dict or str: Search results as a parsed JSON dictionary, or as XML text.

    Raises:
    - requests.HTTPError: If the API answers with an error status (for example an
      invalid key or an exceeded quota), instead of returning the error payload
      as if it were a result.
    """
    # HTTPS so that the API key in the header is not sent in clear text.
    base_url = 'https://api.elsevier.com/content/search/scopus'
    wants_json = response_format == 'json'
    headers = {
        'X-ELS-APIKey': api_key,
        'Accept': 'application/json' if wants_json else 'application/atom+xml'
    }
    params = {
        'query': query,
        'count': count,
        'view': view
    }
    # Without a timeout a stalled connection would hang the notebook cell forever.
    response = requests.get(base_url, headers=headers, params=params, timeout=timeout)
    response.raise_for_status()
    if wants_json:
        return response.json()  # Returns a dictionary
    return response.text  # Returns a string (XML)

# Example usage: run one search and save the response to disk
query = 'heart sutra'
# Request 10 results as JSON using the COMPLETE view
results = fetch_scopus_abstracts(api_key, query, count=10, view='COMPLETE')

# Serialise the response dictionary as indented, human-readable JSON text
pretty_results = json.dumps(results, indent=4)

# Write the formatted text to a file in the current working directory
with open('scopus_abstracts.json', 'w') as file:
    file.write(pretty_results)

print("Scopus abstracts stored in 'scopus_abstracts.json' file.")

Scopus abstracts stored in 'scopus_abstracts.json' file.
