In [15]:
from openai import OpenAI
import dotenv
import requests
import xmltodict
import sqlite3
import pandas as pd
from PyPDF2 import PdfReader
import os
import re

In [16]:
def search_arxiv(query: str, num_results: int = 5):
    """
    Search the arXiv API for papers matching a query. Returns a list of dictionaries containing the paper information.
    :param query:
    :param num_results:
    :return:
    """
    url = f"http://export.arxiv.org/api/query?search_query={query}&start=0&max_results={num_results}"
    response = requests.get(url)
    xml_data = response.text
    dict_data = xmltodict.parse(xml_data)
    arxiv_papers = []
    # Iterate through the entries and print relevant information
    for entry in dict_data['feed']['entry']:
        if isinstance(entry['author'], list):
            authors = ', '.join([author['name'] for author in entry['author']])
        else:
            authors = entry['author']['name']

        paper = {
            'title': entry['title'].replace('\n', ' '),
            'id': entry['id'].split('/abs/')[-1],
            'published': entry['published'],
            'updated': entry['updated'],
            'authors': authors,
            'summary': entry['summary'].replace('\n', ' '),
            'link': entry['link'][0]['@href']
        }
        arxiv_papers.append(paper)
    return arxiv_papers

def clean_text(text):
    """
    Cleans the extracted text to make it suitable for use in a prompt.
    """
    # Remove page numbers, excessive whitespace, and non-ASCII characters
    text = re.sub(r'\n+', '\n', text)  # Replace multiple newlines with a single newline
    text = re.sub(r'\s{2,}', ' ', text)  # Replace multiple spaces with a single space
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Remove non-ASCII characters
    text = text.strip()  # Trim leading and trailing spaces

    # Remove common non-informative sections (e.g., references, acknowledgments)
    sections_to_exclude = ["References", "Acknowledgments", "Appendix"]
    for section in sections_to_exclude:
        text = re.sub(f"{section}.*", "", text, flags=re.IGNORECASE | re.DOTALL)

    return text

def read_papers(arxiv_url):
    # Construct the PDF URL
    pdf_url = arxiv_url.replace("abs", "pdf")

    # File name for saving the PDF
    output_file = arxiv_url.split("/")[-1] + ".pdf"
    output_path = os.path.join('papers', output_file)

    # Ensure the directory exists
    os.makedirs('papers', exist_ok=True)

    # Download the PDF
    try:
        response = requests.get(pdf_url)
        response.raise_for_status()  # Raise an error for bad status codes

        # Save the PDF to a file
        with open(output_path, "wb") as file:
            file.write(response.content)
        print(f"PDF saved to {output_path}")

    except requests.exceptions.RequestException as e:
        print(f"Error downloading PDF: {e}")
        return None

    # Load text from the saved PDF
    try:
        reader = PdfReader(output_path)
        text = ""
        for page in reader.pages:
            text += page.extract_text()

        # Clean the extracted text
        cleaned_text = clean_text(text)
        return cleaned_text
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return None


In [17]:
# List of queries to run
queries = [
    'ti: "Generative AI" OR abs: "Generative AI"',
    'ti: "Large Language Models" OR abs: "Large Language Models"',
    'ti: "Machine Learning" OR abs: "Machine Learning"',
    'ti: "Prompt Engineering" OR abs: "Prompt Engineering"',
    'ti: "Artificial Intelligence" OR abs: "Artificial Intelligence"',
    'ti: "Deep Learning" OR abs: "Deep Learning"',
    'ti: "Generative Adversarial Networks" OR abs: "Generative Adversarial Networks" OR ti: GANs OR abs: GANs',
    'ti: "Natural Language Processing" OR abs: "Natural Language Processing" OR ti: NLP OR abs: NLP',
    'ti: "Computer Vision" OR abs: "Computer Vision"',
    'ti: "Reinforcement Learning" OR abs: "Reinforcement Learning"',
    'ti: "Transformer Networks" OR abs: "Transformer Networks" OR ti: Transformers OR abs: Transformers',
    '(ti: "Generative AI" OR abs: "Generative AI") AND (ti: "Natural Language Processing" OR abs: "Natural Language Processing")',
    '(ti: "Large Language Models" OR abs: "Large Language Models") AND (ti: "Machine Translation" OR abs: "Machine Translation")',
    '(ti: "Deep Learning" OR abs: "Deep Learning") AND (ti: "Computer Vision" OR abs: "Computer Vision")',
    '(ti: "Generative AI" OR abs: "Generative AI") AND (ti: "Drug Discovery" OR abs: "Drug Discovery")',
    '(ti: "Machine Learning" OR abs: "Machine Learning") AND (ti: "Healthcare" OR abs: "Healthcare")',
    'ti: "Ethical AI" OR abs: "Ethical AI" OR ti: "AI Ethics" OR abs: "AI Ethics"',
    'ti: "Explainable AI" OR abs: "Explainable AI" OR ti: XAI OR abs: XAI',
    'ti: "AI Safety" OR abs: "AI Safety"',
    'ti: "Federated Learning" OR abs: "Federated Learning"',
    '(ti: "Artificial Intelligence" OR abs: "Artificial Intelligence") AND (ti: "Climate Change" OR abs: "Climate Change")'
]

# Database setup
conn = sqlite3.connect('arxiv_papers.db')
cursor = conn.cursor()

# Create table if it doesn't exist
cursor.execute('''
    CREATE TABLE IF NOT EXISTS papers (
        id TEXT PRIMARY KEY,
        title TEXT,
        published TEXT,
        updated TEXT,
        authors TEXT,
        summary TEXT,
        link TEXT,
        query TEXT
    )
''')
# Loop through the queries and fetch data
for query in queries:
    print(f"Fetching results for query: {query}")
    try:
        papers = search_arxiv(query, num_results=50)

        for paper in papers:
            try:
                # Insert data into the database
                cursor.execute('''
                    INSERT INTO papers (id, title, published, updated, authors, summary, link, query)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                ''', (
                    paper['id'], paper['title'], paper['published'], paper['updated'],
                    paper['authors'], paper['summary'], paper['link'], query
                ))
            except sqlite3.IntegrityError:
                print(f"  - Paper with id '{paper['id']}' already exists. Skipping.")

    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")

# Commit changes and close the connection
conn.commit()
conn.close()


Fetching results for query: ti: "Generative AI" OR abs: "Generative AI"
  - Paper with id '2309.07930v1' already exists. Skipping.
  - Paper with id '2307.02796v2' already exists. Skipping.
  - Paper with id '2412.10337v2' already exists. Skipping.
  - Paper with id '2409.19120v1' already exists. Skipping.
  - Paper with id '2304.11771v2' already exists. Skipping.
  - Paper with id '2403.02726v1' already exists. Skipping.
  - Paper with id '2305.12013v1' already exists. Skipping.
  - Paper with id '2408.11441v1' already exists. Skipping.
  - Paper with id '2407.11001v1' already exists. Skipping.
  - Paper with id '2311.01193v2' already exists. Skipping.
  - Paper with id '2408.16798v1' already exists. Skipping.
  - Paper with id '2406.04554v1' already exists. Skipping.
  - Paper with id '2402.12617v2' already exists. Skipping.
  - Paper with id '2501.10383v1' already exists. Skipping.
  - Paper with id '2406.00477v1' already exists. Skipping.
  - Paper with id '2306.02781v2' already ex

In [19]:
# Connect to the database
conn = sqlite3.connect('arxiv_papers.db')

# Example 1: Load all papers into a DataFrame
df = pd.read_sql_query("SELECT * FROM papers ORDER BY published DESC", conn)
df

Unnamed: 0,id,title,published,updated,authors,summary,link,query
0,2501.11496v1,Generative AI and Large Language Models in Lan...,2025-01-20T14:03:40Z,2025-01-20T14:03:40Z,Vincent Koc,Generative AI and large-scale language models ...,http://arxiv.org/abs/2501.11496v1,"(ti: ""Generative AI"" OR abs: ""Generative AI"") ..."
1,2501.10091v1,How Do Programming Students Use Generative AI?,2025-01-17T10:25:41Z,2025-01-17T10:25:41Z,"Christian Rahe, Walid Maalej",Programming students have a widespread access ...,http://arxiv.org/abs/2501.10091v1,"ti: ""Generative AI"" OR abs: ""Generative AI"""
2,2501.09223v1,Foundations of Large Language Models,2025-01-16T01:03:56Z,2025-01-16T01:03:56Z,"Tong Xiao, Jingbo Zhu",This is a book about large language models. As...,http://arxiv.org/abs/2501.09223v1,"ti: ""Large Language Models"" OR abs: ""Large Lan..."
3,2501.05765v1,Deontic Temporal Logic for Formal Verification...,2025-01-10T07:48:40Z,2025-01-10T07:48:40Z,"Priya T. V., Shrisha Rao",Ensuring ethical behavior in Artificial Intell...,http://arxiv.org/abs/2501.05765v1,"ti: ""Ethical AI"" OR abs: ""Ethical AI"" OR ti: ""..."
4,2501.06271v1,Large Language Models for Bioinformatics,2025-01-10T01:43:05Z,2025-01-10T01:43:05Z,"Wei Ruan, Yanjun Lyu, Jing Zhang, Jiazhang Cai...",With the rapid advancements in large language ...,http://arxiv.org/abs/2501.06271v1,"ti: ""Large Language Models"" OR abs: ""Large Lan..."
...,...,...,...,...,...,...,...,...
996,1107.2875v1,A Hilbert Scheme in Computer Vision,2011-07-14T17:36:59Z,2011-07-14T17:36:59Z,"Chris Aholt, Bernd Sturmfels, Rekha Thomas",Multiview geometry is the study of two-dimensi...,http://dx.doi.org/10.4153/CJM-2012-023-2,"ti: ""Computer Vision"" OR abs: ""Computer Vision"""
997,1106.4509v1,Machine Learning Markets,2011-06-22T17:12:42Z,2011-06-22T17:12:42Z,Amos Storkey,Prediction markets show considerable promise f...,http://arxiv.org/abs/1106.4509v1,"ti: ""Machine Learning"" OR abs: ""Machine Learning"""
998,cmp-lg/9704010v1,The Theoretical Status of Ontologies in Natura...,1997-04-25T13:00:14Z,1997-04-25T13:00:14Z,John A. Bateman,This paper discusses the use of `ontologies' i...,http://arxiv.org/abs/cmp-lg/9704010v1,"ti: ""Natural Language Processing"" OR abs: ""Nat..."
999,cmp-lg/9607017v1,Natural Language Processing: Structure and Com...,1996-07-13T21:31:43Z,1996-07-13T21:31:43Z,Wlodek Zadrozny,We introduce a method for analyzing the comple...,http://arxiv.org/abs/cmp-lg/9607017v1,"ti: ""Natural Language Processing"" OR abs: ""Nat..."


In [20]:
example = """
<example>
Prompt Engineering: A Comprehensive Overview of Techniques and Applications

Prompt engineering is a rapidly evolving field that is transforming the way we interact with large language models (LLMs) and vision-language models (VLMs). By providing carefully crafted instructions, known as prompts, we can guide these powerful models to perform a wide range of tasks, from generating creative text formats to solving complex reasoning problems. This approach allows us to leverage the vast knowledge encoded in these models and adapt them to specific applications without the need for extensive retraining.

Essentially, prompt engineering acts as a bridge between human intentions and machine capabilities. Instead of fine-tuning the model's internal parameters, we provide explicit instructions in the form of prompts, shaping the model's output and behavior. This technique has opened up exciting new possibilities for utilizing LLMs and VLMs across various domains.

The Core Principles of Prompt Engineering

At its core, prompt engineering relies on the ability of LLMs and VLMs to understand and respond to natural language instructions. By formulating prompts that are clear, concise, and contextually relevant, we can effectively guide the model towards the desired output. This process involves understanding the model's strengths and limitations, as well as the specific requirements of the task at hand.

Key Techniques in Prompt Engineering

The field of prompt engineering encompasses a variety of techniques, each with its own strengths and applications. Some of the most prominent techniques include:

Zero-shot Prompting: This technique involves providing the model with a task description and expecting it to generate a response without any prior examples. This demonstrates the remarkable ability of LLMs to generalize their knowledge to new tasks.

Few-shot Prompting: In this approach, we provide the model with a small number of examples to illustrate the desired task. This can help the model better understand the task and generate more accurate responses.

Chain-of-Thought Prompting: This technique encourages the model to generate a series of intermediate reasoning steps before arriving at the final answer. This can help improve the model's reasoning abilities and generate more comprehensive responses.

Retrieval Augmented Generation (RAG): This technique involves integrating external knowledge sources, such as databases or web pages, into the prompt. This allows the model to access and utilize relevant information that may not be present in its training data.

Prompt Engineering for Reducing Hallucinations:  LLMs can sometimes generate outputs that are factually incorrect or nonsensical, a phenomenon known as "hallucination."  Prompt engineering techniques like Chain-of-Verification (CoVe) and ReAct prompting are being developed to address this issue by encouraging the model to verify its outputs and access external information.

Applications of Prompt Engineering

Prompt engineering has found applications in a wide range of areas, including:

Natural Language Generation: Generating creative text formats, such as poems, code, scripts, musical pieces, email, letters, etc.
Question Answering: Answering questions based on given context or external knowledge sources.
Text Summarization: Condensing large amounts of text into concise summaries.
Code Generation: Generating code in various programming languages.
Machine Translation: Translating text between different languages.
Dialogue Systems: Building chatbots and conversational agents.
The Future of Prompt Engineering

As the field of prompt engineering continues to evolve, we can expect to see even more innovative techniques and applications emerge. Researchers are actively exploring new ways to improve the effectiveness of prompts, automate the prompt engineering process, and address the limitations of current approaches.

Prompt engineering is playing a crucial role in shaping the future of LLMs and VLMs. By providing a flexible and powerful way to interact with these models, prompt engineering is unlocking their full potential and paving the way for new and exciting applications in various fields.
</example>
"""

In [21]:
html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta name="description" content="[Insert Page Description Here]">
    <meta property="og:title" content="[Insert Open Graph Title Here]">
    <meta property="og:description" content="[Insert Open Graph Description Here]">
    <meta property="og:url" content="[Insert Page URL Here]">
    <meta property="og:image" content="[Insert Thumbnail URL Here]">
    <meta property="og:type" content="article">
    <title>[Insert Page Title Here]</title>
    <link rel="stylesheet" href="style.css">
    <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&display=swap" rel="stylesheet">
    <link rel="icon" href="favicon.ico" type="image/x-icon">
</head>
<body>

    <header>
        <h1>[Insert Main Heading Here]</h1>
    </header>

    <main>
        <article>
            <section>
                <h2>[Insert Section 1 Title Here]</h2>
                <p>[Insert Section 1 Content Here]</p>
            </section>

            <section>
                <h2>[Insert Section 2 Title Here]</h2>
                <p>[Insert Section 2 Content Here]</p>
            </section>

            <section>
                <h2>[Insert Section 3 Title Here]</h2>
                <p>[Insert Section 3 Content Here]</p>
            </section>

            <section>
                <h2>[Insert Section 4 Title Here]</h2>
                <p>[Insert Section 4 Content Here]</p>
            </section>

            <section>
                <h2>[Insert Section 5 Title Here]</h2>
                <p>[Insert Section 5 Content Here]</p>
            </section>

            <section>
                <h2>[Insert Conclusion Title Here]</h2>
                <p>[Insert Conclusion Content Here]</p>
            </section>
        </article>
    </main>

    <footer>
        <p>&copy; [Insert Year Here] [Insert Website Name Here]. All rights reserved.</p>
    </footer>

</body>
</html>
"""

In [23]:
# Load the API key from the .env file
dotenv.load_dotenv()

client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ["OPENAI_API_KEY"],
)

# Connect to the database
conn = sqlite3.connect('arxiv_papers.db')

for query in queries:
    print(f"Query: {query}")
    df = pd.read_sql_query(f"SELECT * FROM papers WHERE query = '{query}' ORDER BY published DESC LIMIT 10", conn)
    information = df.to_dict(orient='records')
    in_depth = []
    for paper in information:
        in_depth.append(read_papers(paper['link']))

    brief_summary = []
    for paper in in_depth:
        if paper is not None:
            brief_summary.append(client.chat.completions.create(
                model="gpt-4o-mini",
                messages= [
                    {'role': 'system', 'content': f'You are a friendly AI assistant helping me summarise the latest advancements in AI, Machine Learning, Prompt Engineering, Large Language Models, and other related topics. Your goal is to write a summary of the paper to help a reader understand the key points and findings in a concise manner.'},
                    {'role': 'user', 'content': f'Write a brief summary of this paper {str(paper[:15000])}'},
                ],
                temperature=0.0,
            ).choices[0].message.content)

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages= [
            {'role': 'system', 'content': f'You are a friendly AI assistant helping me summarise the latest advancements in AI, Machine Learning, Prompt Engineering, Large Language Models, and other related topics. Your goal is to write an in depth article being very verbose on the topic based on the latest research papers from arXiv. Use the following as an example of the article style {example}. This needs to go into sufficient detail on what research has been done, what the results were, and what the implications are for the field. Perform a summary and conclusion comparing the outputs of the papers and how the reader can use this information to further their understanding of the topic and apply this knowledge to their own work.'},
            {'role': 'user', 'content': f'Write an in depth article based on this information {str(information + brief_summary)}. Please ensure that all information is accurate and up to date and that the article is well written and engaging. You also need to include all authors, links to the papers, and publication dates. At the end of the article, please ensure appropriate credit is given to the authors of the papers and arxiv for the API access to the content. Please output the format with this HTML template {html_template}'},
        ],
        temperature=0.7,
    )

    # parse the query name to get the topic
    topic = query.split(' OR ')[0].split('"')[1]

    # save the response to a file
    with open(f'articles/{topic}.html', 'w', encoding='utf-8') as f:
        f.write(response.choices[0].message.content[7:-4])


Query: ti: "Generative AI" OR abs: "Generative AI"
PDF saved to papers\2501.10091v1.pdf
PDF saved to papers\2501.05542v1.pdf
PDF saved to papers\2412.14286v1.pdf
PDF saved to papers\2412.10337v2.pdf
PDF saved to papers\2412.08610v1.pdf
PDF saved to papers\3685680.pdf
Error reading PDF: EOF marker not found
PDF saved to papers\2411.14627v1.pdf
PDF saved to papers\2410.19806v2.pdf
PDF saved to papers\2410.13899v1.pdf
PDF saved to papers\2410.03897v2.pdf
Query: ti: "Large Language Models" OR abs: "Large Language Models"
PDF saved to papers\2501.09223v1.pdf



KeyboardInterrupt

