# In [None]:
import os
import fitz  # PyMuPDF for PDF extraction # use pip install pymupdf if not found
import google.generativeai as genai
import re
from tqdm import tqdm
from docx import Document # pip install python-docx

# Configure Gemini API.
# A key supplied through the GOOGLE_API_KEY environment variable takes
# precedence so the secret does not have to be stored in this file; when the
# variable is unset or empty, the placeholder below is used and must be
# replaced with a real key.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY") or "Add_your_API_key_here")  # Add your API key here

# Define theme folders (Ensure the user places PDFs here)
base_folder = "Add/Path/to/PDFs/Folder"  # Add your base folder path here

# Each theme name doubles as the name of its sub-folder under base_folder, so
# the mapping is derived from a single list instead of repeating every name.
theme_names = [
    "Digital Twin Architecture and Frameworks",
    "Data Processing and Simulation Techniques",
    "Artificial Intelligence and Machine Learning in Traffic Control",
    "Safety and Vulnerable Road User Protection",
    "Applications of Digital Twins in Smart Infrastructure",
]

# Maps theme name -> folder path containing that theme's PDFs.
theme_folders = {name: os.path.join(base_folder, name) for name in theme_names}

# Output folder for synthesized findings (created on first run if missing)
synthesis_output_folder = os.path.join(base_folder, "synthesized_reports")
os.makedirs(synthesis_output_folder, exist_ok=True)

def extract_text_and_metadata(pdf_path):
    """Extract the text and basic citation metadata from a PDF.

    The title is chosen from the first source that yields a non-blank value:
    the PDF's embedded ``title`` metadata, the first text line longer than
    five characters, then the PDF file name without its extension.  The
    BibTeX key is the title with every non-alphanumeric ASCII character
    removed, truncated to 15 characters.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A ``(text, metadata)`` tuple.  ``text`` is the text of all pages
        joined by newlines, or ``""`` if the file could not be read.
        ``metadata`` is a dict with the keys ``"Title"`` and ``"BibTeX Key"``;
        when reading fails they keep the defaults ``"Unknown Title"`` and
        ``"Unknown"``.  Read errors are printed, not raised.
    """
    text = ""
    metadata = {"Title": "Unknown Title", "BibTeX Key": "Unknown"}

    try:
        with fitz.open(pdf_path) as doc:
            text = "\n".join(page.get_text("text") for page in doc)
            meta = doc.metadata or {}

            # Strip before testing so a whitespace-only metadata title is
            # treated as missing instead of producing an empty title.
            title = (meta.get("title") or "").strip()

            if not title:
                # Fall back to the first line that is long enough to be a
                # meaningful title.
                title = next(
                    (line.strip() for line in text.split("\n") if len(line.strip()) > 5),
                    "",
                )

            if not title:
                # Last resort: the file name, so untitled papers still get
                # distinguishable citations.
                title = os.path.splitext(os.path.basename(pdf_path))[0].strip()

            if title:
                metadata["Title"] = title

            # Generate a simple BibTeX key; keep the default when the title
            # has no ASCII alphanumerics (e.g. a non-Latin title).
            bibtex_key = re.sub(r'[^a-zA-Z0-9]', '', metadata["Title"])[:15]
            if bibtex_key:
                metadata["BibTeX Key"] = bibtex_key
    except Exception as e:
        # Best-effort batch processing: report the unreadable file and let
        # the caller skip it (text stays empty).
        print(f"Error reading {pdf_path}: {e}")

    return text, metadata

def generate_synthesis_prompt(theme, extracted_texts, citations, max_articles=5):
    """Create a synthesis prompt for Gemini with citations.

    Only the first ``max_articles`` articles are included.  The citation list
    is limited to the same articles, and each excerpt is labelled with its
    title and BibTeX key, so the model is never asked to cite a paper whose
    text it was not given and can tell which excerpt belongs to which
    citation.

    Args:
        theme: Name of the research theme being synthesized.
        extracted_texts: Full text of each article, in the same order as
            ``citations``.
        citations: One dict per article with the keys ``"Title"`` and
            ``"BibTeX Key"``.
        max_articles: Maximum number of articles to include; ``None``
            includes all of them.

    Returns:
        The prompt string to send to the model.
    """
    selected_texts = extracted_texts[:max_articles]
    selected_citations = citations[:max_articles]

    excerpts = []
    for index, article_text in enumerate(selected_texts):
        if index < len(selected_citations):
            meta = selected_citations[index]
            # Label the excerpt so in-text citations can be attributed.
            excerpts.append(
                f"[Article: {meta['Title']} ({meta['BibTeX Key']})]\n{article_text}"
            )
        else:
            # No citation entry for this text: include it unlabelled rather
            # than dropping it.
            excerpts.append(article_text)

    combined_text = "\n\n---\n\n".join(excerpts)

    citation_text = "\n".join(
        f"- {meta['Title']} ({meta['BibTeX Key']})" for meta in selected_citations
    )

    return f"""
    You are analyzing research papers related to the theme: **{theme}**.

    Below are excerpts from several research articles:

    {combined_text}

    Below are the citation details of the articles:

    {citation_text}

    Based on these articles, generate a synthesis including:
    1. Common findings and key takeaways.
    2. Promising research directions based on trends.
    3. Potential research gaps that future studies should address.

    **Important:** Integrate in-text citations within the synthesis using the title (or pdf name if title is not found) of the article placed in a pair of parentheses.

    Structure the response as follows:
    - **Common Findings**: (List key findings across studies, with citations)
    - **Promising Directions**: (Highlight emerging areas of research, with citations)
    - **Research Gaps**: (Identify areas needing further investigation, with citations)
    """

# Process each theme folder: extract the text of every PDF in the folder, ask
# Gemini for a synthesis, and save the answer as a DOCX report.

# One model handle is reused for all themes; temperature 0 keeps the output
# as repeatable as the API allows.
synthesis_model = genai.GenerativeModel("gemini-1.5-pro-latest", generation_config={"temperature": 0})

for theme, folder_path in theme_folders.items():
    # isdir (not exists) so a stray regular file with the theme's name is
    # skipped instead of crashing os.listdir below.
    if not os.path.isdir(folder_path):
        print(f"Skipping {theme}, folder does not exist: {folder_path}")
        continue

    print(f"\nProcessing Theme: {theme}")
    extracted_texts = []
    citations = []

    # Match the extension case-insensitively so ".PDF" files are not missed,
    # and sort the names because os.listdir returns them in arbitrary order;
    # this makes the order of papers passed to the prompt reproducible.
    pdf_files = sorted(
        name for name in os.listdir(folder_path) if name.lower().endswith(".pdf")
    )

    # Extract text from each PDF in the theme folder
    for pdf_file in tqdm(pdf_files, desc=f"Extracting PDFs from {theme}"):
        pdf_path = os.path.join(folder_path, pdf_file)
        text, metadata = extract_text_and_metadata(pdf_path)
        if text:
            extracted_texts.append(text)
            citations.append(metadata)  # Store metadata for citations

    # If no text was extracted, skip
    if not extracted_texts:
        print(f"No valid text extracted for {theme}, skipping synthesis.")
        continue

    # Generate synthesis prompt with citations
    synthesis_prompt = generate_synthesis_prompt(theme, extracted_texts, citations)

    try:
        print(f"Generating synthesis for {theme}...")
        response = synthesis_model.generate_content([synthesis_prompt])
        synthesis_text = response.text.strip()

        # Save synthesis result to DOCX
        doc = Document()
        doc.add_heading(f"Synthesis Report: {theme}", level=1)
        doc.add_paragraph(synthesis_text)
        doc_path = os.path.join(synthesis_output_folder, f"{theme.replace(' ', '_')}_synthesis.docx")
        doc.save(doc_path)
        print(f"Synthesis saved to {doc_path}")

    except Exception as e:
        # Best-effort per theme: report the failure and continue with the
        # remaining themes.
        print(f"Error generating synthesis for {theme}: {e}")