### Notebook to convert PDFs to Markdown with Nougat OCR and then generate synopses for all of the PDFs en masse via the Google Gemini API

In [None]:
!pip install transformers==4.38.2

In [None]:
!pip install google-generativeai nougat-ocr

In [1]:
import os
import subprocess

# Define the source directory containing the PDFs and the output directory
source_directory = r""
output_directory = r""

# Convert every PDF in the source directory with the Nougat CLI.
# sorted() makes the processing order deterministic (os.listdir order is unspecified).
for filename in sorted(os.listdir(source_directory)):
    # Match the extension case-insensitively so files such as "paper.PDF" are not skipped
    if not filename.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(source_directory, filename)

    # Pass the arguments as a list and run without a shell: paths containing
    # quotes, "$", backticks or other shell metacharacters are then handed to
    # nougat verbatim instead of being interpreted by the shell.
    # "--no-skipping" is the full spelling of the flag (the previous
    # "--no-skippin" only worked through argparse prefix matching).
    command = ["nougat", pdf_path, "-o", output_directory, "--no-skipping"]

    # check=True raises CalledProcessError if nougat exits with a non-zero status
    subprocess.run(command, check=True)
    print(f"Processed {filename}")

Processed DeepSeekMath Pushing the Limits of Mathematical Reasoning in Open Language Models.pdf
Processed Integrative Decoding Improve Factuality via Implicit Self-consistency.pdf
Processed PEDAL Enhancing Greedy Decoding with Large Language Models using Diverse Exemplars.pdf
Processed Self-Para-Consistency Improving Reasoning Tasks at Low Cost for Large Language Models.pdf
Processed Soft Self-Consistency Improves Language Model Agents.pdf
Processed To Know or Not To Know Analyzing Self-Consistency of Large Language Models under Ambiguity.pdf
Processed Toolformer Language Models Can Teach Themselves to Use Tools.pdf
Processed Universal Self-Consistency for Large Language Model Generation.pdf


In [2]:
import os
import time
import random
import google.generativeai as genai
import numpy as np

# Define directories
mmd_directory = output_directory  # same directory that was passed to nougat via -o
synopsis_directory = r""

# List of Google Gemini API keys.
# Read from the GOOGLE_AI_API_KEYS environment variable (comma-separated) instead
# of a string literal, so secrets are never stored in the notebook file.
# Surrounding whitespace is stripped and blank entries are dropped so that an
# empty string can never be picked as a key.
GOOGLE_AI_API_KEYS = [
    key.strip()
    for key in os.environ.get("GOOGLE_AI_API_KEYS", "").split(',')
    if key.strip()
]
if not GOOGLE_AI_API_KEYS:
    print("Warning: no Gemini API keys configured; set the GOOGLE_AI_API_KEYS environment variable.")

# Ensure the synopsis directory exists
os.makedirs(synopsis_directory, exist_ok=True)

# Define function to configure and send prompt with retries
def process_prompt_with_retries(user_prompt, api_key, max_retries=3, retry_delay=25):
    """Send ``user_prompt`` to Gemini, switching API keys between failed attempts.

    Args:
        user_prompt: Prompt text passed to ``generate_content``.
        api_key: API key used for the first attempt.
        max_retries: Maximum number of attempts before giving up.
        retry_delay: Seconds to wait between two attempts.

    Returns:
        The text of the first successful response, or ``None`` if every
        attempt raised an exception.
    """
    for attempt in range(max_retries):
        try:
            # Configure the client inside the loop: a key chosen after a failure
            # only takes effect if the client is re-configured with it.
            genai.configure(api_key=api_key)
            model = genai.GenerativeModel('gemini-1.5-flash')
            response = model.generate_content(user_prompt)
            return response.text
        except Exception as e:
            # Show only the tail of the key so the secret does not end up in
            # the saved notebook output.
            key_hint = f"...{api_key[-4:]}" if api_key else "<empty>"
            print(f"Error on attempt {attempt + 1} with API key {key_hint}: {e}")

        # No point in rotating or waiting once the last attempt has failed
        if attempt + 1 < max_retries:
            # Prefer a key different from the one that just failed; when there
            # is no alternative, retry with the same key.
            alternatives = [key for key in GOOGLE_AI_API_KEYS if key != api_key]
            if alternatives:
                api_key = random.choice(alternatives)
            time.sleep(retry_delay)  # Wait before retrying
    return None

# Instruction text placed in front of every paper; the paper body is appended per file
SYNOPSIS_PROMPT_PREFIX = (
    "Summarize this paper for me. Include all major algorithms, math formulas, notation, and concepts "
    "with terms, definitions, and conceptualizations used, results and outcomes, logic, and overall "
    "methodology in your extensive summary. Include code or LaTeX code snippets where applicable, include the outcomes, discussion, reasoning, literature review, results and tables, and all other data related information from the research paper. Be detailed, elaborate, comprehensive, highly thorough, accurate, objective, and complete in your analysis of "
    "the work presented in the paper below:\n\n"
)

# Walk the .mmd directory and write one synopsis per .mmd file
for filename in os.listdir(mmd_directory):
    if not filename.endswith(".mmd"):
        continue

    stem, _extension = os.path.splitext(filename)
    output_filename = stem + ".md"
    output_path = os.path.join(synopsis_directory, output_filename)

    # A synopsis already on disk is left untouched, so the cell can be re-run
    if os.path.exists(output_path):
        continue

    # Load the full text of the converted paper
    mmd_path = os.path.join(mmd_directory, filename)
    with open(mmd_path, 'r', encoding='utf-8') as mmd_file:
        mmd_content = mmd_file.read()

    user_prompt = SYNOPSIS_PROMPT_PREFIX + mmd_content

    # Start from a randomly picked key; the helper handles the retries
    api_key = random.choice(GOOGLE_AI_API_KEYS)
    response_text = process_prompt_with_retries(user_prompt, api_key)

    # None or empty text counts as a failure and nothing is written
    if not response_text:
        print(f"Failed to process {filename} after multiple attempts.")
        continue

    with open(output_path, 'w', encoding='utf-8') as synopsis_file:
        synopsis_file.write(response_text)
    print(f"Synopsis saved for {filename} as {output_filename}")

Synopsis saved for DeepSeekMath Pushing the Limits of Mathematical Reasoning in Open Language Models.mmd as DeepSeekMath Pushing the Limits of Mathematical Reasoning in Open Language Models.md
Synopsis saved for Integrative Decoding Improve Factuality via Implicit Self-consistency.mmd as Integrative Decoding Improve Factuality via Implicit Self-consistency.md
Synopsis saved for PEDAL Enhancing Greedy Decoding with Large Language Models using Diverse Exemplars.mmd as PEDAL Enhancing Greedy Decoding with Large Language Models using Diverse Exemplars.md
Synopsis saved for Self-Para-Consistency Improving Reasoning Tasks at Low Cost for Large Language Models.mmd as Self-Para-Consistency Improving Reasoning Tasks at Low Cost for Large Language Models.md
Synopsis saved for Soft Self-Consistency Improves Language Model Agents.mmd as Soft Self-Consistency Improves Language Model Agents.md
Synopsis saved for To Know or Not To Know Analyzing Self-Consistency of Large Language Models under Ambigui

In [11]:
synopsis_directory = r""

# Gather the synopses whose filename contains "self" or "putnam" (case-insensitive)
selected_synopses = []
for filename in os.listdir(synopsis_directory):
    lowered_name = filename.lower()
    if 'self' not in lowered_name and 'putnam' not in lowered_name:
        continue

    synopsis_path = os.path.join(synopsis_directory, filename)

    # Only regular files are read; anything else is reported and skipped
    if os.path.isfile(synopsis_path):
        with open(synopsis_path, 'r', encoding='utf-8') as synopsis_file:
            selected_synopses.append(synopsis_file.read())
    else:
        print(f"Synopsis file not found: {synopsis_path}")

# Every synopsis is followed by three newlines in the combined text
clr = "".join(synopsis_text + '\n\n\n' for synopsis_text in selected_synopses)

print(clr)

## Extensive Summary of "Integrative Decoding: Improve Factuality via Implicit Self-consistency"

This paper introduces Integrative Decoding (ID), a novel decoding strategy designed to enhance the factuality of Large Language Models (LLMs) in open-ended generation tasks.  The core idea is to implicitly incorporate self-consistency into the decoding process, overcoming limitations of existing self-consistency methods that often restrict task formats or are computationally expensive.

**1. Literature Review and Problem Statement:**

The paper begins by highlighting the issue of "hallucinations" in LLMs – the generation of factually incorrect information.  Existing research demonstrates that repeated sampling, generating multiple outputs for the same prompt, significantly improves factuality.  Self-consistency (SC), measuring the consistency among these multiple outputs, serves as a valuable indicator of truthfulness.  However, most SC-based methods are limited to tasks with easily defina

In [6]:
import os
import nbformat as nbf

# Define directories
synopsis_directory = r""
notebook_path = os.path.join(r"", "Literature Review.ipynb")

# Mapping of synopsis filenames to their citations.
# Every entry needs a "filename"; a "citation" is optional and, when present,
# is used as the section heading. Example of a hand-written mapping:
# papers = [
#     {
#         "filename": "1-s2.0-S0014292117302003-main.md",
#         "citation": "Hellmann, T., & Thiele, V. (2019). Fostering entrepreneurship: Promoting founding or funding?. European Economic Review, 113, 47-68."
#     },
#     {
#         "filename": "1-s2.0-S0020025519310904-main.md",
#         "citation": "Zhang, Y., & Zhao, X. (2020). Recommending investors for new startups by integrating network diffusion and investors' domain preference. Information Sciences, 516, 182-196."
#     },
# ]

# sorted() keeps the section order deterministic (os.listdir order is unspecified)
papers = [{"filename": f} for f in sorted(os.listdir(r""))]

# Create a new notebook
nb = nbf.v4.new_notebook()

# Add the main heading
main_heading = "# Literature Review"
nb.cells.append(nbf.v4.new_markdown_cell(main_heading))

# Iterate through each paper and add a heading cell followed by the synopsis cell.
# Sections are numbered only when they are actually added, so skipped files do
# not leave gaps in the numbering.
section_number = 0
for paper in papers:
    filename = paper["filename"]
    synopsis_path = os.path.join(synopsis_directory, filename)

    # Check if the synopsis file exists
    if not os.path.isfile(synopsis_path):
        print(f"Synopsis file not found: {synopsis_path}")
        continue

    # Read the synopsis content
    with open(synopsis_path, 'r', encoding='utf-8') as f:
        synopsis_content = f.read()

    section_number += 1

    # Use the citation when the entry provides one; otherwise fall back to the
    # filename without its extension so the heading is never left blank.
    citation = paper.get("citation") or os.path.splitext(filename)[0]

    # Create citation markdown
    citation_md = f"## {section_number}. {citation}"

    # Add citation and synopsis to the notebook
    nb.cells.append(nbf.v4.new_markdown_cell(citation_md))
    nb.cells.append(nbf.v4.new_markdown_cell(synopsis_content))
    print(f"Added section for: {filename}")

# Write the notebook to a file
with open(notebook_path, 'w', encoding='utf-8') as f:
    nbf.write(nb, f)

print(f"\nJupyter Notebook 'Literature Review.ipynb' has been created at {notebook_path}")

Added section for: Chain-of-Thought Prompting Elicits Reasoning in Large Language Models.md
Added section for: DeepSeek-Prover-V1.5 Harnessing Proof Assistant Feedback.md
Added section for: DeepSeekMath Pushing the Limits of Mathematical Reasoning in Open Language Models.md
Added section for: Integrative Decoding Improve Factuality via Implicit Self-consistency.md
Added section for: Measuring Mathematical Problem Solving With the MATH Dataset.md
Added section for: OlympiadBench A Challenging Benchmark for Promoting AGI with Olympiad-Level Bilingual Multimodal Scientific Problems.md
Added section for: PEDAL Enhancing Greedy Decoding with Large Language Models using Diverse Exemplars.md
Added section for: Plan-and-Solve Prompting Improving Zero-Shot Chain-of-Thought Reasoning by Large Language Models.md
Added section for: Program of Thoughts Prompting Disentangling Computation from Reasoning for Numerical Reasoning Tasks.md
Added section for: Putnam-AXIOM A Functional and Static Benchmar