In [None]:
# To run this notebook locally, you must install gcloud 
# and authenticate with your Google Cloud account by running `gcloud auth application-default login`.

import os
import base64
import csv
import google.generativeai as genai
import re
import time

# Configure the API.
# The key is taken from the GOOGLE_API_KEY environment variable when it is set, so a
# real key never has to be saved inside the notebook. The placeholder is only the
# fallback and must be replaced if the environment variable is not used.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", "Add_your_API_key_here"))

# Define the generative model (temperature 0 is requested for every call)
model = genai.GenerativeModel("gemini-1.5-pro-latest", generation_config={"temperature": 0})

# Folder containing the PDFs
pdf_folder = "Add/Your/Path/Here"

# Output CSV file (written next to the PDFs)
output_csv = os.path.join(pdf_folder, "responses.csv")

# Define the prompt. The fixed sentence and the blank line it asks for are what the
# title regex and the "\n\n" split below rely on when parsing the answer.
prompt = (
    "Please provide the contributions of this article in the following exact format, ensuring no deviations:\n\n"
    "Here's a paragraph summarizing the contributions of the article, \"<Title of the Article>\":\n\n"
    "<Detailed Contributions in paragraph form (no enumeration, no bullet points, and no additional formatting)>\n\n"
    "IMPORTANT: Ensure there are exactly two newline breaks (`\\n\\n`) before the contributions section. Do not modify the format."
)

# One [filename, title, contributions] row per processed PDF
responses = []

# Retry settings
MAX_RETRIES = 5  # Maximum retry attempts per file
INITIAL_WAIT = 2  # Initial wait time in seconds (for exponential backoff)

# Loop through all PDF files in the folder.
# sorted() makes the row order of the CSV reproducible (os.listdir order is arbitrary);
# the lower-cased comparison also picks up files named "*.PDF".
for pdf_file in sorted(os.listdir(pdf_folder)):
    if not pdf_file.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(pdf_folder, pdf_file)
    print(f"Processing: {pdf_path}")

    doc_data = None  # base64 payload, read once and reused by every retry
    retry_count = 0
    while retry_count < MAX_RETRIES:
        try:
            # Read and encode the PDF file in base64 (first attempt only). This stays
            # inside the try so that an unreadable file is retried/recorded like any
            # other failure instead of aborting the whole run.
            if doc_data is None:
                with open(pdf_path, "rb") as doc_file:
                    doc_data = base64.standard_b64encode(doc_file.read()).decode("utf-8")

            # Generate content using the model
            response = model.generate_content([{'mime_type': 'application/pdf', 'data': doc_data}, prompt])
            summary_text = response.text  # read once; the accessor may raise, which is caught below
            print(summary_text)

            # Extract title from the fixed lead-in sentence requested in the prompt
            title_match = re.search(r'Here\'s a paragraph summarizing the contributions of the article, "(.*?)"', summary_text)
            title = title_match.group(1) if title_match else "Title not found."

            # Extract contributions: everything after the first blank line
            contributions = summary_text.split('\n\n', 1)[1] if "\n\n" in summary_text else "Contributions not found."

            # Append to responses
            responses.append([pdf_file, title, contributions])
            break  # Exit retry loop if successful

        except Exception as e:
            retry_count += 1
            wait_time = INITIAL_WAIT * (2 ** (retry_count - 1))  # Exponential backoff: 2, 4, 8, ...
            print(f"Error processing {pdf_file} (Attempt {retry_count}/{MAX_RETRIES}): {e}")
            if retry_count < MAX_RETRIES:
                print(f"Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
            else:
                # Keep a row for the failed file so it is visible in the output CSV
                print(f"Skipping {pdf_file} after {MAX_RETRIES} failed attempts.")
                responses.append([pdf_file, "Error", f"Failed after {MAX_RETRIES} attempts: {e}"])

# Save responses to a CSV file
with open(output_csv, mode="w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["Filename", "Title", "Contributions"])
    writer.writerows(responses)

print(f"Responses saved to {output_csv}")

In [None]:
# Files used by the relevance-screening step; all of them live in the PDF folder.
input_csv, output_csv, irrelevant_csv = (
    os.path.join(pdf_folder, csv_name)
    for csv_name in (
        "responses.csv",            # input: summarized responses
        "relevant_articles.csv",    # output: articles judged relevant
        "irrelevant_articles.csv",  # output: articles judged irrelevant
    )
)

# Function to generate a relevance check prompt
def generate_relevance_prompt(article_title, article_response):
    """Build the yes/no relevance question for one article.

    Args:
        article_title: Title placed in double quotes in the first sentence.
        article_response: Summary text inserted between the lead-in and the question.

    Returns:
        A multi-line prompt string; every non-blank line is indented by four
        spaces and the text starts with a newline.
    """
    pad = " " * 4
    prompt_lines = [
        "",
        f'{pad}Based on the following summarized response of the article titled "{article_title}":',
        "",
        f"{pad}{article_response}",
        "",
        f"{pad}Is this article relevant to the topic of digital twins for intelligent traffic intersections? Answer with Yes or No and explain briefly.",
        pad,
    ]
    return "\n".join(prompt_lines)

# Read the input CSV and process each article
relevant_data = []
irrelevant_data = []


def _is_relevant_answer(answer_text):
    """Return True when the first line of the model's answer says "yes".

    The decision is taken from the first standalone "yes" or "no" word on the
    first line. A plain substring test is not used because "yes" also occurs
    inside ordinary words (for example "Bayesian"), which would turn a "No"
    answer into a false positive.
    """
    first_line = answer_text.strip().split("\n")[0].lower()
    verdict = re.search(r"\b(yes|no)\b", first_line)
    return verdict is not None and verdict.group(1) == "yes"


# Build the model object once instead of once per article
relevance_model = genai.GenerativeModel("gemini-1.5-pro-latest", generation_config={"temperature": 0})

with open(input_csv, mode="r", encoding="utf-8") as csv_file:
    reader = csv.DictReader(csv_file)
    for row in reader:
        filename = row.get("Filename", "")
        title = row.get("Title", "Unknown Title")

        # If the title is "Title not found.", use the filename as the title
        if title == "Title not found.":
            title = filename

        response = row.get("Contributions", "")  # Use 'Contributions' as the description column

        # Generate relevance prompt
        relevance_prompt = generate_relevance_prompt(title, response)

        print(f"Checking relevance for article: {title}")
        try:
            # Check relevance
            relevance_response = relevance_model.generate_content([relevance_prompt])
            relevance_text = relevance_response.text

            if _is_relevant_answer(relevance_text):
                relevant_data.append({"Filename": filename, "Title": title, "Contributions": response, "Response": relevance_text})
            else:
                irrelevant_data.append({
                    "Title": title,
                    "Reason": relevance_text
                })

        except Exception as e:
            # A failed request is recorded with the irrelevant articles so the
            # article is not silently lost; the reason column carries the error.
            print(f"Error processing article '{title}': {e}")
            irrelevant_data.append({
                "Title": title,
                "Reason": f"Error: {e}"
            })

# Save relevant articles to a CSV
with open(output_csv, mode="w", newline="", encoding="utf-8") as csv_file:
    fieldnames = ["Filename", "Title", "Contributions", "Response"]
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(relevant_data)

# Save irrelevant articles to a separate CSV
with open(irrelevant_csv, mode="w", newline="", encoding="utf-8") as csv_file:
    fieldnames = ["Title", "Reason"]
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(irrelevant_data)

print(f"Relevant articles saved to {output_csv}")
print(f"Irrelevant articles saved to {irrelevant_csv}")

In [None]:
# Input: the relevant articles produced by the screening step
relevant_csv = os.path.join(pdf_folder, "relevant_articles.csv")

# Outputs: per-theme scores, final category per article, and failed articles
error_log_csv = os.path.join(pdf_folder, "error_log.csv")
final_categorized_csv = os.path.join(pdf_folder, "final_categorized_articles.csv")
categorized_scores_csv = os.path.join(pdf_folder, "categorized_articles_with_scores.csv")

# Theme name -> list of criteria describing the theme.
# Insertion order matters: it is the column order of the score CSV and the
# order in which themes are listed in the scoring prompt.
themes = {}

themes["Digital Twin Architecture and Frameworks"] = [
    "Focus on architectural/design considerations for digital twins",
    "Emphasis on integration of diverse/heterogeneous data sources",
    "Discusses scalability, interoperability, or standardization of frameworks",
    "Addresses critical architectural considerations for seamless data integration",
]

themes["Data Processing and Simulation Techniques"] = [
    "Emphasis on data fusion, probabilistic modeling, or filtering methods",
    "Focus on handling uncertainties (e.g., human behavior, environmental variability)",
    "Proposes/evaluates advanced simulation techniques for safety/reliability",
    "Discusses detailed methods of simulation development including calibration and validation",
]

themes["Artificial Intelligence and Machine Learning in Traffic Control"] = [
    "Focuses on AI/ML techniques for traffic management",
    "Discusses creation of adaptive and intelligent systems",
    "Explores robustness and scalability of AI/ML systems",
    "Focuses on adaptive responses to real-world traffic complexities",
]

themes["Safety and Vulnerable Road User Protection"] = [
    "Centers on safety concerns for vulnerable road users",
    "Discusses the role of digital twins in improving safety outcomes",
    "Addresses ethical and societal considerations (e.g., equity, privacy)",
    "Emphasizes policies/strategies for inclusive and ethical deployment",
]

themes["Applications of Digital Twins in Smart Infrastructure"] = [
    "Explores large-scale implementation of digital twins",
    "Focuses on integration with smart infrastructure (e.g., IoT, connected vehicles)",
    "Discusses challenges/advancements for scalability",
    "Focuses on modernization of infrastructure networks with digital twins",
]

def generate_scoring_prompt(article_title, article_response, themes):
    """Build the prompt that asks the model to score an article against every theme.

    Args:
        article_title: Title of the article, quoted in the first sentence.
        article_response: Summary text of the article.
        themes: Mapping of theme name -> iterable of criterion strings. Any
            number of themes and any number of criteria per theme is accepted.

    Returns:
        The prompt string. Both the criteria section and the "Theme: X" answer
        template are generated from ``themes``, so the requested answer format
        always names exactly the themes being scored.
    """
    # One "**Theme**:" header followed by one "- criterion" bullet per criterion.
    # Iterating over the criteria (instead of indexing 0..3) avoids an IndexError
    # for themes with fewer than four criteria and keeps any extra ones.
    theme_prompts = "\n\n".join(
        "\n".join([f"**{theme}**:"] + [f"- {criterion}" for criterion in criteria])
        for theme, criteria in themes.items()
    )

    # Answer template: one "Theme: X" line per theme. The two trailing spaces and the
    # four-space continuation indent reproduce the layout of the surrounding text.
    response_format = "\n    ".join(f"{theme}: X  " for theme in themes)

    return f"""
    Based on the summarized response of the article titled "{article_title}":

    {article_response}

    Evaluate the relevance of this article for each theme based on the following criteria:

    {theme_prompts}

    Assign a numerical score from 1 to 10 for each theme, where:
    - 1 means 'Not relevant at all'
    - 10 means 'Highly relevant'

    **Response Format:**  
    Each theme must be followed by a score **on the same line** in this exact format:

    ```
    {response_format}
    ```

    IMPORTANT: Only return the theme name and score. Do not add explanations, bullets, or extra text.
    """

# Function to extract scores from LLM response
def extract_scores(response_text, themes):
    """Extract one integer score per theme from the model's scoring answer.

    Args:
        response_text: Raw text returned by the model (may be empty or None).
        themes: Iterable of theme names (a dict keyed by theme name works).

    Returns:
        Dict mapping every theme to its score. A theme that is not found
        in the text keeps the default score 0.

    Each theme is searched for by name, followed on the same line by optional
    punctuation/whitespace and then the digits. This accepts the plain
    "Theme: 7" layout as well as common markdown decorations such as
    "**Theme:** 7", "- Theme: 7" or "Theme: **7**", which would otherwise be
    scored 0. When a theme appears several times, the last occurrence wins.
    """
    scores = {theme: 0 for theme in themes}  # Default all themes to 0
    text = response_text or ""

    for theme in themes:
        # (?<!\w) keeps the theme from matching as the tail of a longer word;
        # [^\w\n]* allows ":", "*", "-", spaces, ... but never crosses a line break.
        score_pattern = re.compile(r"(?<!\w)" + re.escape(theme) + r"[^\w\n]*(\d+)")
        found = score_pattern.findall(text)
        if found:
            scores[theme] = int(found[-1])

    return scores

# Function to extract only the theme name from the tie-breaking response
def extract_theme_name(response_text, themes):
    """Return the first theme, in mapping order, whose name occurs in the text.

    Args:
        response_text: Text to search (the tie-breaking answer).
        themes: Dict keyed by theme name.

    Returns:
        The matching theme name, or "Unknown Theme" when none is mentioned.
    """
    mentioned = (name for name in themes.keys() if name in response_text)
    return next(mentioned, "Unknown Theme")

# Lists to store results
categorized_scores = []  # one row per scored article: title, tie-breaker, per-theme scores
final_categorized = []   # one row per article with its final category
error_log = []           # articles whose scoring or tie-breaking raised

# Build the model object once; it serves both the scoring and the tie-breaking requests
scoring_model = genai.GenerativeModel("gemini-1.5-pro-latest", generation_config={"temperature": 0})

with open(relevant_csv, mode="r", encoding="utf-8") as csv_file:
    reader = csv.DictReader(csv_file)

    for row in reader:
        title = row.get("Title", "Unknown Title")
        response = row.get("Contributions", "")
        pdf_file = row.get("Filename", "")

        # Generate the scoring prompt
        scoring_prompt = generate_scoring_prompt(title, response, themes)

        print(f"Scoring article: {title}")
        try:
            # Generate scoring response
            scoring_response = scoring_model.generate_content([scoring_prompt])
            scoring_text = scoring_response.text.strip()

            # Extract scores using the function
            scores = extract_scores(scoring_text, themes)

            # Save scores to categorized_scores. This happens before tie-breaking, so
            # the scores are kept even if the tie-break request below fails.
            categorized_scores.append({
                "Title": title,
                "Tie-breaker": "",  # Default empty, will be updated if a tie occurs
                **scores
            })

            # Determine the final category
            max_score = max(scores.values())
            top_categories = [theme for theme, score in scores.items() if score == max_score]

            if len(top_categories) == 1:
                final_category = top_categories[0]
            else:
                # Tie detected, ask Gemini to break the tie. The summary is included in
                # the prompt because the question explicitly refers to it.
                tie_prompt = f"""
                The following article has multiple themes with the highest score:

                Title: {title}

                Summarized response: {response}

                Top themes: {', '.join(top_categories)}

                Based on the summarized response, which theme is the best fit? Provide only the theme name, no explanation.
                """
                tie_response = scoring_model.generate_content([tie_prompt])

                # Extract only the theme name, considering the tied themes only so that
                # a theme with a lower score can never be selected.
                tied_themes = {theme: themes[theme] for theme in top_categories}
                final_category = extract_theme_name(tie_response.text.strip(), tied_themes)

                # Save tie-breaker decision
                categorized_scores[-1]["Tie-breaker"] = final_category  # Update last entry with the tie-breaker result

            # Save final categorization
            final_categorized.append({
                "Filename": pdf_file,
                "Title": title,
                "Final Category": final_category
            })

        except Exception as e:
            print(f"Error processing article '{title}': {e}")
            error_log.append({
                "Title": title,
                "Error": str(e)
            })

# Save scored articles to a CSV
with open(categorized_scores_csv, mode="w", newline="", encoding="utf-8") as csv_file:
    fieldnames = ["Title", "Tie-breaker"] + list(themes.keys())
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(categorized_scores)

# Save final categorized articles to a CSV
with open(final_categorized_csv, mode="w", newline="", encoding="utf-8") as csv_file:
    fieldnames = ["Filename", "Title", "Final Category"]
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(final_categorized)

# Save errors to a separate CSV
with open(error_log_csv, mode="w", newline="", encoding="utf-8") as csv_file:
    fieldnames = ["Title", "Error"]
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(error_log)

print(f"Categorized articles with scores saved to {categorized_scores_csv}")
print(f"Final categorized articles saved to {final_categorized_csv}")
print(f"Errors logged in {error_log_csv}")