### Notebook to convert PDFs to Markdown and then generate synopses for all the PDFs en masse via the Google Gemini API

In [None]:
!pip install google-generativeai

In [1]:
import os
import subprocess

# Define the source directory containing the PDFs and the output directory
source_directory = r""
output_directory = r""

# Convert every PDF in the source directory with the Nougat command-line tool.
# sorted() makes the processing order deterministic regardless of the file system.
for filename in sorted(os.listdir(source_directory)):
    # Compare the extension case-insensitively so files like "paper.PDF" are not skipped
    if not filename.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(source_directory, filename)

    # Pass the arguments as a list (no shell involved): file names containing
    # quotes, spaces or shell metacharacters reach Nougat unchanged and cannot
    # be interpreted as shell syntax. The flag is spelled out in full
    # ("--no-skipping") instead of relying on argparse prefix matching.
    command = ["nougat", pdf_path, "-o", output_directory, "--no-skipping"]

    # check=True raises CalledProcessError if Nougat exits with a non-zero status
    subprocess.run(command, check=True)
    print(f"Processed {filename}")

Processed 1-s2.0-S0014292117302003-main.pdf
Processed 1-s2.0-S0020025519310904-main.pdf
Processed 1-s2.0-S0040162520309811-main.pdf
Processed 1-s2.0-S0148296318304740-main.pdf
Processed 3402707.3402736.pdf
Processed A_survey_of_heterogeneous_information_network_analysis.pdf
Processed Complexity - 2018 - Luo - Research on Decision‐Making of Complex Venture Capital Based on Financial Big Data Platform.pdf
Processed Complexity - 2019 - Zhang - Modeling Risk Contagion in the Venture Capital Market  A Multilayer Network Approach.pdf
Processed Complexity - 2020 - Wen - Heterogeneous Information Network‐Based Scientific Workflow Recommendation for Complex.pdf
Processed Complexity - 2020 - Wu - A Social Network Analysis on Venture Capital Alliance s Exit from an Emerging Market.pdf
Processed lerner-nanda-2020-venture-capital-s-role-in-financing-innovation-what-we-know-and-how-much-we-still-need-to-learn.pdf
Processed PhysRevE.76.046115.pdf
Processed s10994-010-5205-8.pdf
Processed ssrn-2628861

In [4]:
import os
import time
import random
import google.generativeai as genai
import numpy as np

# Define directories
mmd_directory = r""
synopsis_directory = os.path.join(mmd_directory, "synopsis")


def load_api_keys(env_var="GOOGLE_AI_API_KEYS"):
    """Read Gemini API keys from an environment variable.

    Keeping the keys out of the notebook source prevents them from being
    leaked when the notebook is shared or committed.

    Args:
        env_var: Name of the environment variable holding the keys as a
            comma-separated string, e.g. "key1,key2,key3".

    Returns:
        A list of the non-empty keys with surrounding whitespace removed;
        an empty list when the variable is unset or blank.
    """
    raw_value = os.environ.get(env_var, "")
    return [key.strip() for key in raw_value.split(",") if key.strip()]


# List of Google Gemini API keys
# TODO: set the GOOGLE_AI_API_KEYS environment variable (comma-separated keys)
#       before starting the kernel instead of pasting keys into this cell
GOOGLE_AI_API_KEYS = load_api_keys()

# Ensure the synopsis directory exists
os.makedirs(synopsis_directory, exist_ok=True)

# Define function to configure and send prompt with retries
def process_prompt_with_retries(user_prompt, api_key, max_retries=3, retry_delay=25):
    """Send a prompt to Gemini, rotating API keys between failed attempts.

    Args:
        user_prompt: The full prompt text passed to ``generate_content``.
        api_key: The API key used for the first attempt.
        max_retries: Maximum number of attempts before giving up.
        retry_delay: Seconds to wait between two attempts.

    Returns:
        The text of the model response, or ``None`` if every attempt failed.
    """
    for attempt in range(max_retries):
        # Configure the client on every attempt; configuring only once before
        # the loop would mean a rotated key is chosen but never actually used.
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-flash')

        try:
            response = model.generate_content(user_prompt)
            return response.text
        except Exception as e:
            # Show only the tail of the key so the secret does not end up in
            # the notebook's saved output.
            masked_key = f"...{str(api_key)[-4:]}" if api_key else "<none>"
            print(f"Error on attempt {attempt + 1} with API key {masked_key}: {e}")

        # Nothing left to retry: skip the key rotation and the wait
        if attempt + 1 >= max_retries:
            break

        # Rotate to a new API key (when any are configured) and retry after a short delay
        if GOOGLE_AI_API_KEYS:
            api_key = random.choice(GOOGLE_AI_API_KEYS)
        time.sleep(retry_delay)  # Wait before retrying
    return None

# Instruction text placed in front of every paper before it is sent to Gemini
summary_instructions = (
    "Summarize this paper for me. Include all major algorithms, math formulas and concepts "
    "with terms, definitions, and conceptualizations used, results and outcomes, logic, and overall "
    "methodology in your extensive summary. Include code or LaTeX code snippets where applicable, include the outcomes, discussion, reasoning, literature review, results and tables, and all other data related information from the research paper. Be detailed, elaborate, comprehensive, highly thorough, accurate, objective, and complete in your analysis of "
    "the work presented in the paper below:\n\n"
)

# Collect the Nougat outputs (.mmd files) and summarise them one by one
mmd_filenames = [name for name in os.listdir(mmd_directory) if name.endswith(".mmd")]

for filename in mmd_filenames:
    # Load the converted paper text
    with open(os.path.join(mmd_directory, filename), 'r', encoding='utf-8') as mmd_file:
        mmd_content = mmd_file.read()

    # Build the prompt, pick a random starting key and let the helper handle retries
    response_text = process_prompt_with_retries(
        summary_instructions + mmd_content,
        random.choice(GOOGLE_AI_API_KEYS),
    )

    # No usable answer came back: report it and move on to the next paper
    if not response_text:
        print(f"Failed to process {filename} after multiple attempts.")
        continue

    # Store the synopsis under the same base name with an .md extension
    output_filename = os.path.splitext(filename)[0] + ".md"
    with open(os.path.join(synopsis_directory, output_filename), 'w', encoding='utf-8') as output_file:
        output_file.write(response_text)
    print(f"Synopsis saved for {filename} as {output_filename}")

Synopsis saved for 1-s2.0-S0014292117302003-main.mmd as 1-s2.0-S0014292117302003-main.md
Synopsis saved for 1-s2.0-S0020025519310904-main.mmd as 1-s2.0-S0020025519310904-main.md
Synopsis saved for 1-s2.0-S0040162520309811-main.mmd as 1-s2.0-S0040162520309811-main.md
Synopsis saved for 1-s2.0-S0148296318304740-main.mmd as 1-s2.0-S0148296318304740-main.md
Synopsis saved for 2020-sac-los.mmd as 2020-sac-los.md
Synopsis saved for 3402707.3402736.mmd as 3402707.3402736.md
Synopsis saved for A_survey_of_heterogeneous_information_network_analysis.mmd as A_survey_of_heterogeneous_information_network_analysis.md
Synopsis saved for Complexity - 2018 - Luo - Research on Decision‐Making of Complex Venture Capital Based on Financial Big Data Platform.mmd as Complexity - 2018 - Luo - Research on Decision‐Making of Complex Venture Capital Based on Financial Big Data Platform.md
Synopsis saved for Complexity - 2019 - Zhang - Modeling Risk Contagion in the Venture Capital Market  A Multilayer Network A

In [7]:
import os
import nbformat as nbf

# Locations: the synopsis files are read from <base>/synopsis and the
# generated notebook is written next to that folder.
base_directory = r""
synopsis_directory = os.path.join(base_directory, "synopsis")
notebook_path = os.path.join(base_directory, "Literature Review.ipynb")

# Synopsis file name -> citation, kept as (filename, citation) pairs and
# expanded into the list of {"filename": ..., "citation": ...} dictionaries.
# The order of the pairs is the order of the sections in the generated notebook.
papers = [
    {"filename": synopsis_name, "citation": reference}
    for synopsis_name, reference in (
        (
            "1-s2.0-S0014292117302003-main.md",
            "Hellmann, T., & Thiele, V. (2019). Fostering entrepreneurship: Promoting founding or funding?. European Economic Review, 113, 47-68.",
        ),
        (
            "1-s2.0-S0020025519310904-main.md",
            "Zhang, Y., & Zhao, X. (2020). Recommending investors for new startups by integrating network diffusion and investors' domain preference. Information Sciences, 516, 182-196.",
        ),
        (
            "1-s2.0-S0040162520309811-main.md",
            "Liang, X., & Hu, X. (2020). Geographic distance, venture capital and technological performance: Evidence from Chinese enterprises. Technological Forecasting and Social Change, 158, 120155.",
        ),
        (
            "1-s2.0-S0148296318304740-main.md",
            "Zheng, Y., & Liu, S. (2020). Partner-selection effects on venture capital investment performance with uncertainties. Journal of Business Research, 108, 201-212.",
        ),
        (
            "2020-sac-los.md",
            "Luef, J., Ohrfandl, C., Sacharidis, D., & Werthner, H. (2020). A recommender system for investing in early-stage enterprises. Proceedings of the 35th Annual ACM Symposium on Applied Computing, 1262-1271.",
        ),
        (
            "3402707.3402736.md",
            "Sun, Y., Han, J., Yan, X., Yu, P. S., & Wu, T. (2011). PathSim: Meta path-based top-k similarity search in heterogeneous information networks. Proceedings of the VLDB Endowment, 4(11), 992-1003.",
        ),
        (
            "A_survey_of_heterogeneous_information_network_analysis.md",
            "Shi, C., Li, Y., Zhang, J., Sun, Y., & Yu, P. S. (2017). A survey of heterogeneous information network analysis. IEEE Transactions on Knowledge and Data Engineering, 29(1), 17-37.",
        ),
        (
            "Complexity - 2018 - Luo - Research on Decision‐Making of Complex Venture Capital Based on Financial Big Data Platform.md",
            "Luo, S., Lin, Y., & Chen, H. (2018). Research on Decision-Making of Complex Venture Capital Based on Financial Big Data Platform. Complexity, 2018, 5170281.",
        ),
        (
            "Complexity - 2019 - Zhang - Modeling Risk Contagion in the Venture Capital Market  A Multilayer Network Approach.md",
            "Zhang, Y., & Zhang, Z. (2019). Modeling Risk Contagion in the Venture Capital Market: A Multilayer Network Approach. Complexity, 2019, 9209345.",
        ),
        (
            "Complexity - 2020 - Wen - Heterogeneous Information Network‐Based Scientific Workflow Recommendation for Complex.md",
            "Wen, Y., Chen, J., Chen, H., & Jiang, Y. (2020). Heterogeneous Information Network-Based Scientific Workflow Recommendation for Complex Applications. Complexity, 2020, 4129063.",
        ),
        (
            "Complexity - 2020 - Wu - A Social Network Analysis on Venture Capital Alliance s Exit from an Emerging Market.md",
            "Wu, W., & Zhu, Y. (2020). A Social Network Analysis on Venture Capital Alliance's Exit from an Emerging Market. Complexity, 2020, 4650160.",
        ),
        (
            "eif_working_paper_2023_91.md",
            "Kraemer-Eis, H., Botsari, A., Lang, F., & Pal, K. (2023). Using machine learning to map the European Cleantech sector. EIF Working Paper 2023/91, European Investment Fund.",
        ),
        (
            "lerner-nanda-2020-venture-capital-s-role-in-financing-innovation-what-we-know-and-how-much-we-still-need-to-learn.md",
            "Lerner, J., & Nanda, R. (2020). Venture Capital's Role in Financing Innovation: What We Know and How Much We Still Need to Learn. Journal of Economic Perspectives, 34(3), 237-261.",
        ),
        (
            "PhysRevE.76.046115.md",
            "Zhou, T., Ren, J., Medo, M., & Zhang, Y. C. (2007). Bipartite network projection and personal recommendation. Physical Review E, 76(4), 046115.",
        ),
        (
            "s10994-010-5205-8.md",
            "Lao, N., & Cohen, W. W. (2010). Relational retrieval using a combination of path-constrained random walks. Machine Learning, 81(1), 53-67.",
        ),
        (
            "ssrn-2628861.md",
            "Salamzadeh, A., & Kawamorita Kesim, H. (2015). Startup Companies: Life Cycle and Challenges. 4th International Conference on Employment, Education and Entrepreneurship (EEE), Belgrade, Serbia.",
        ),
        (
            "sustainability-12-03447-v2.md",
            "Jang, H., Lee, K., & Yoon, B. (2020). The Role of Venture Capital Investment in Startups' Sustainable Growth and Performance: Focusing on Absorptive Capacity and Venture Capitalists' Reputation. Sustainability, 12(8), 3447.",
        ),
        (
            "vc-recsys.md",
            "Liu, Y., Zhao, G., & Thirunarayan, K. (2020). A venture capital recommendation algorithm based on heterogeneous information network. International Journal of Computers Communications & Control, 15(1), 3779.",
        ),
    )
]

# Start from an empty v4 notebook whose first cell is the review's title
nb = nbf.v4.new_notebook()
title_markdown = "# Literature Review of Recommender Systems for Investing"
nb.cells.append(nbf.v4.new_markdown_cell(title_markdown))

# One numbered heading cell plus one synopsis cell per paper; the number is the
# paper's position in `papers`, so a missing synopsis leaves a gap in the numbering
for idx, paper in enumerate(papers, start=1):
    filename, citation = paper["filename"], paper["citation"]
    synopsis_path = os.path.join(synopsis_directory, filename)

    if os.path.isfile(synopsis_path):
        # Pull in the generated synopsis text
        with open(synopsis_path, 'r', encoding='utf-8') as synopsis_file:
            synopsis_content = synopsis_file.read()

        # Heading first, synopsis body right after it
        nb.cells.extend([
            nbf.v4.new_markdown_cell(f"## {idx}. {citation}"),
            nbf.v4.new_markdown_cell(synopsis_content),
        ])
        print(f"Added section for: {filename}")
    else:
        # Papers without a synopsis are reported and left out of the notebook
        print(f"Synopsis file not found: {synopsis_path}")

# Serialise the assembled notebook to disk
with open(notebook_path, 'w', encoding='utf-8') as notebook_file:
    nbf.write(nb, notebook_file)

print(f"\nJupyter Notebook 'Literature Review.ipynb' has been created at {notebook_path}")

Added section for: 1-s2.0-S0014292117302003-main.md
Added section for: 1-s2.0-S0020025519310904-main.md
Added section for: 1-s2.0-S0040162520309811-main.md
Added section for: 1-s2.0-S0148296318304740-main.md
Added section for: 2020-sac-los.md
Added section for: 3402707.3402736.md
Added section for: A_survey_of_heterogeneous_information_network_analysis.md
Added section for: Complexity - 2018 - Luo - Research on Decision‐Making of Complex Venture Capital Based on Financial Big Data Platform.md
Added section for: Complexity - 2019 - Zhang - Modeling Risk Contagion in the Venture Capital Market  A Multilayer Network Approach.md
Added section for: Complexity - 2020 - Wen - Heterogeneous Information Network‐Based Scientific Workflow Recommendation for Complex.md
Added section for: Complexity - 2020 - Wu - A Social Network Analysis on Venture Capital Alliance s Exit from an Emerging Market.md
Added section for: eif_working_paper_2023_91.md
Added section for: lerner-nanda-2020-venture-capital