In [2]:
import requests
import tarfile
import io
from io import BytesIO
import re
import os
import json
import fitz
import ast
import time
import sqlite3
import hashlib

ModuleNotFoundError: No module named 'urlib'

In [17]:
import requests
import xml.etree.ElementTree as ET

import time

# Fetch paper metadata (title, date, authors, abstract link) from the arXiv
# Atom API, one page per request, until `total_papers` records are collected,
# the results run out, or a request fails.

ATOM_NS = {'atom': 'http://www.w3.org/2005/Atom'}
REQUEST_TIMEOUT_S = 30   # never let a stalled connection hang the cell
ARXIV_PAGE_DELAY_S = 3   # pause between successive page requests

base_url = 'http://export.arxiv.org/api/query'

search_query = 'all:machine learning'
max_results = 200    # page size requested from the API
start_index = 0      # offset of the next page
total_papers = 200   # stop once this many records are collected
papers_data = []


def _entry_text(entry, tag):
    """Return the whitespace-normalised text of the Atom child ``tag``.

    Returns '' when the child is missing or empty. Collapsing whitespace
    removes the line breaks and indentation the feed puts inside long titles.
    """
    node = entry.find(f'atom:{tag}', ATOM_NS)
    if node is None or node.text is None:
        return ''
    return ' '.join(node.text.split())


def _parse_entries(feed_xml):
    """Parse one Atom feed page into a list of paper dicts."""
    feed_root = ET.fromstring(feed_xml)
    records = []
    for entry in feed_root.findall('atom:entry', ATOM_NS):
        author_names = [
            _entry_text(author, 'name')
            for author in entry.findall('atom:author', ATOM_NS)
        ]
        records.append({
            'title': _entry_text(entry, 'title'),
            'published': _entry_text(entry, 'published'),
            'authors': author_names,
            'abs_link': _entry_text(entry, 'id'),
        })
    return records


while len(papers_data) < total_papers:
    params = {
        'search_query': search_query,
        'start': start_index,
        'max_results': max_results,
        'sortBy': 'relevance',
        'sortOrder': 'descending'
    }

    response = requests.get(base_url, params=params, timeout=REQUEST_TIMEOUT_S)

    if response.status_code != 200:
        print('Failed to retrieve data:', response.status_code)
        break

    page_records = _parse_entries(response.content)
    if not page_records:
        # A successful but empty page means there is nothing left to fetch;
        # without this guard the loop would request the same range forever.
        break

    # Take only as many records as are still needed to reach total_papers.
    still_needed = total_papers - len(papers_data)
    papers_data.extend(page_records[:still_needed])

    start_index += max_results

    if len(papers_data) < total_papers:
        time.sleep(ARXIV_PAGE_DELAY_S)

for paper in papers_data:
    print(f"Title: {paper['title']}")
    print(f"Published: {paper['published']}")
    print(f"Authors: {', '.join(paper['authors'])}")
    print(f"Abstract Link: {paper['abs_link']}\n")


Title: Lecture Notes: Optimization for Machine Learning
Published: 2019-09-08T21:49:42Z
Authors: Elad Hazan
Abstract Link: http://arxiv.org/abs/1909.03550v1

Title: An Optimal Control View of Adversarial Machine Learning
Published: 2018-11-11T14:28:34Z
Authors: Xiaojin Zhu
Abstract Link: http://arxiv.org/abs/1811.04422v1

Title: Minimax deviation strategies for machine learning and recognition with
  short learning samples
Published: 2017-07-16T09:15:08Z
Authors: Michail Schlesinger, Evgeniy Vodolazskiy
Abstract Link: http://arxiv.org/abs/1707.04849v1

Title: Machine Learning for Clinical Predictive Analytics
Published: 2019-09-19T22:02:00Z
Authors: Wei-Hung Weng
Abstract Link: http://arxiv.org/abs/1909.09246v1

Title: Towards Modular Machine Learning Solution Development: Benefits and
  Trade-offs
Published: 2023-01-23T22:54:34Z
Authors: Samiyuru Menik, Lakshmish Ramaswamy
Abstract Link: http://arxiv.org/abs/2301.09753v1

Title: Introduction to Machine Learning: Class Notes 67577
Publ

In [91]:
from dotenv import load_dotenv
# Load variables from the local Key.env file into the process environment.
# NOTE(review): presumably this is where the API_KEY read by the OpenAI client
# cell comes from — confirm, and keep Key.env out of version control.
load_dotenv('Key.env')

True

In [92]:
from openai import OpenAI
# Module-level OpenAI client; prompt_gpt() uses it for chat completions.
client = OpenAI(
    # The key is read from the API_KEY environment variable rather than being
    # hard-coded; os.environ.get() yields None when the variable is unset.
        api_key=os.environ.get("API_KEY"),
    )

In [109]:
def is_valid_python_code(code_str):
    """Return True if ``code_str`` parses as Python source, otherwise False.

    Args:
        code_str: Candidate source text (typically a model reply).

    Returns:
        bool: True when ``ast.parse`` accepts the input. False for anything
        it rejects, including non-source inputs such as ``None``.
    """
    try:
        ast.parse(code_str)
    except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
        # SyntaxError: malformed code.
        # ValueError: source containing NUL bytes on older Python versions.
        # TypeError: input that is not source at all (e.g. None).
        # MemoryError / RecursionError: pathologically nested input.
        return False
    return True
    
def create_unique_hash(input_string, input_list):
    """Derive a deterministic 32-bit integer from a string and a list.

    The string is concatenated with ``str(input_list)``, hashed with SHA-256,
    and the leading four bytes of the digest are read as an unsigned
    big-endian integer (equivalent to the first eight hex characters).

    Args:
        input_string: Text component of the key; must be a ``str``.
        input_list: Second component; only its ``str()`` form is hashed.

    Returns:
        int: A value in the range ``[0, 2**32)``.
    """
    import hashlib

    payload = (input_string + str(input_list)).encode()
    leading_bytes = hashlib.sha256(payload).digest()[:4]
    return int.from_bytes(leading_bytes, "big")


In [110]:
debug = False  # set to True to make prompt_gpt print each raw model reply


def process_pdf(url, timeout=30):
    """Download a PDF and return the text of all its pages as one string.

    Args:
        url: Address of the PDF to download.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        str: Page texts concatenated in page order. Characters that cannot
        be encoded as UTF-8 (e.g. lone surrogates) are replaced with '?', so
        the result is always safe to encode or store as UTF-8.

    Raises:
        Exception: "Request Failed!" when the response status is not 200.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        raise Exception("Request Failed!")

    # Open the PDF with PyMuPDF directly from the in-memory bytes.
    pdf_document = fitz.open(stream=BytesIO(response.content), filetype='pdf')
    try:
        # Join once rather than growing a string page by page.
        pdf_text = "".join(
            pdf_document[page_num].get_text()
            for page_num in range(len(pdf_document))
        )
    finally:
        # Release the document even if extracting a page fails.
        pdf_document.close()

    return pdf_text.encode('utf-8', errors='replace').decode('utf-8')

def prompt_gpt(pdf, prompt_txt, word_start, word_cutoff, format_needed,
               max_attempts=2, model="gpt-4"):
    """Send an excerpt of ``pdf`` to the chat model and return its reply.

    The request is "Without any extra explanation text, " + ``prompt_txt``
    followed by ``pdf[word_start:word_cutoff]``. Despite their names, the two
    bounds are character offsets into ``pdf``, not word counts, so anything
    outside that slice is never seen by the model.

    Args:
        pdf: Full text to take the excerpt from.
        prompt_txt: Instruction placed before the excerpt.
        word_start: Start offset (characters) of the excerpt.
        word_cutoff: End offset (characters, exclusive) of the excerpt.
        format_needed: When True, the reply must be a Python literal and is
            returned parsed; when False, the raw reply text is returned.
        max_attempts: Maximum number of model calls when a literal is needed.
        model: Chat model name passed to the API.

    Returns:
        The raw reply (``format_needed`` False), the parsed literal
        (``format_needed`` True), or ``False`` when no reply could be parsed
        within ``max_attempts`` calls.
    """
    # The excerpt and request text do not change between attempts.
    excerpt = pdf[word_start:word_cutoff]
    request_text = "Without any extra explanation text, " + prompt_txt + " " + excerpt

    for _ in range(max_attempts):
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": request_text,
                }
            ],
            model=model,
        )
        reply = chat_completion.choices[0].message.content
        if debug:
            print(reply)

        if not format_needed:
            return reply

        if reply is None:
            # The API may return a message with no text content; ask again.
            continue

        try:
            # Validate with the same parser that produces the result, so a
            # reply that is valid Python but not a literal (e.g. `x = {...}`)
            # triggers a retry instead of an exception.
            return ast.literal_eval(reply.strip())
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            continue

    # Attempts exhausted without a parseable reply.
    return False

def grab_author_institutions(pdf):
    """Ask the model for an author-to-institution mapping from the paper text.

    Only the first 1500 characters of ``pdf`` are sent, and the reply is
    required to be a Python literal.

    Args:
        pdf: Full extracted text of the paper.

    Returns:
        Whatever ``prompt_gpt`` returns for this request: the parsed reply,
        or ``False`` if no usable reply was obtained.
    """
    # NOTE(review): the instruction text contains a typo ("instiutions") and
    # appears to be cut off at "insti" — confirm the intended wording before
    # changing it, since any edit alters what the model is asked.
    instruction = "find all authors and corresponding instiutions, and return them in a dictionary, author key, insti"
    return prompt_gpt(
        pdf,
        instruction,
        word_start=0,
        word_cutoff=1500,
        format_needed=True,
    )

In [112]:
import sqlite3

# Create the papers.db schema: Paper and Author tables plus the PaperAuthor
# join table linking them. Every statement uses IF NOT EXISTS, so re-running
# this cell leaves an existing database untouched.
# Note: SQLite only enforces the FOREIGN KEY clauses on connections that run
# `PRAGMA foreign_keys = ON`; this cell only declares them.
SCHEMA_STATEMENTS = (
    # Paper table
    '''
CREATE TABLE IF NOT EXISTS Paper (
    ID INTEGER PRIMARY KEY AUTOINCREMENT,
    TITLE TEXT NOT NULL,
    DATE TEXT NOT NULL
)
''',
    # Author table
    '''
CREATE TABLE IF NOT EXISTS Author (
    ID INTEGER PRIMARY KEY AUTOINCREMENT,
    NAME TEXT NOT NULL,
    INSTITUTIONS TEXT NOT NULL
)
''',
    # PaperAuthor join table
    '''
CREATE TABLE IF NOT EXISTS PaperAuthor (
    PaperID INTEGER,
    AuthorID INTEGER,
    FOREIGN KEY (PaperID) REFERENCES Paper(ID),
    FOREIGN KEY (AuthorID) REFERENCES Author(ID),
    PRIMARY KEY (PaperID, AuthorID)
)
''',
)

# Connect to SQLite database (or create it if it doesn't exist)
conn = sqlite3.connect('papers.db')
try:
    cursor = conn.cursor()
    for statement in SCHEMA_STATEMENTS:
        cursor.execute(statement)
    # Commit the changes
    conn.commit()
finally:
    # Close the connection even if a statement fails, so the database file
    # is not left held open by a half-finished cell.
    conn.close()

['Raoyuan Zhao', 'Abdullatif Köksal', 'Yihong Liu', 'Leonie Weissweiler', 'Anna Korhonen', 'Hinrich Schütze'] SYNTHEVAL: Hybrid Behavioral Testing of NLP Models with Synthetic CheckLists 2023 {'Raoyuan Zhao': ['Technical University of Munich', 'LMU Munich'], 'Abdullatif Köksal': ['LMU Munich', 'Munich Center for Machine Learning', 'University of Cambridge'], 'Yihong Liu': ['LMU Munich', 'Munich Center for Machine Learning'], 'Leonie Weissweiler': ['LMU Munich', 'Munich Center for Machine Learning'], 'Anna Korhonen': ['University of Cambridge'], 'Hinrich Schütze': ['LMU Munich', 'Munich Center for Machine Learning']}
Success! 2408.17437 7.908407926559448
