# IMPORT MODULES (always run)

In [None]:
# Mount Google Drive at /content/drive; the /content/drive/MyDrive/... paths used by
# the cells below only resolve after this has run (Colab-specific).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
import json
import re
import time
from pathlib import Path

# arXiv Search Script

The following scripts extract the date on which each paper first became visible on arXiv and store the extracted data as CSV files. arXiv serves static HTML pages, so the data can be scraped directly from the HTML tags of the website.

In [None]:
# Read the conference submission records (ICML/ICLR JSON dump) into `data`
with open("ICML/ICML2023.json", mode='r') as json_file:
    data = json.loads(json_file.read())

In [None]:
# For every ICML 2023 paper in `data`, search arXiv by title and write the first
# visibility date found on the results page (or "Not Found") to a CSV.
with open("arxiv DATA/ICML/ICML2023arxiv.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Title", "Submission Date"])

    for entry in data:
        paper_title = entry['name']
        # Hand the title to requests via `params` so it is percent-encoded; splicing the
        # raw title into the URL let characters such as '&', '#' or '+' corrupt the query.
        response = requests.get(
            "https://arxiv.org/search/",
            params={
                "query": paper_title,
                "searchtype": "title",
                "order": "-announced_date_first",
            },
        )
        soup = BeautifulSoup(response.text, "html.parser")

        # First <p class="is-size-7"> on the page; the regexes below expect it to
        # carry the submission info of the top search hit.
        submission_info = soup.find("p", class_="is-size-7")
        date = "Not Found"

        if submission_info:
            text = submission_info.get_text(" ", strip=True)

            # Prefer the v1 submission date; otherwise fall back to the
            # "originally announced" date.
            date_match = (
                re.search(r"v1 submitted ([^;]+);", text)
                or re.search(r"originally announced ([^.]+)\.", text)
            )
            if date_match:
                date = date_match.group(1).strip()

        writer.writerow([paper_title, date])

## AAAI

In [None]:
# For each AAAI year, read the paper titles from the conference CSV, search arXiv
# by title and write the first visibility date (or "Not Found") to a per-year CSV.
years = [2023, 2024, 2025]
for year in years:
    df = pd.read_csv(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/AAAI/AAAI{year}.csv")
    titles = df["Title"].to_list()
    with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Arxiv/AAAI/AAAI{year}arxiv.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Title", "Submission Date"])

        for paper_title in titles:
            # Hand the title to requests via `params` so it is percent-encoded; splicing the
            # raw title into the URL let characters such as '&', '#' or '+' corrupt the query.
            response = requests.get(
                "https://arxiv.org/search/",
                params={
                    "query": paper_title,
                    "searchtype": "title",
                    "order": "-announced_date_first",
                },
            )
            soup = BeautifulSoup(response.text, "html.parser")

            # First <p class="is-size-7"> on the page; the regexes below expect it to
            # carry the submission info of the top search hit.
            submission_info = soup.find("p", class_="is-size-7")
            date = "Not Found"

            if submission_info:
                text = submission_info.get_text(" ", strip=True)

                # Prefer the v1 submission date; otherwise fall back to the
                # "originally announced" date.
                date_match = (
                    re.search(r"v1 submitted ([^;]+);", text)
                    or re.search(r"originally announced ([^.]+)\.", text)
                )
                if date_match:
                    date = date_match.group(1).strip()

            writer.writerow([paper_title, date])

## KDD

In [None]:
# Conference year interpolated into the KDD input/output file paths in the next cells.
year = 2025

In [None]:
# Load the KDD paper list for the selected year and keep its "Title" column as a list
kdd_papers_csv = f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/KDD/KDD{year}papers.csv"
df = pd.read_csv(kdd_papers_csv)
titles = df["Title"].tolist()

In [None]:

# For every KDD title, search arXiv by title and write the first visibility date
# found on the results page (or "Not Found") to the KDD CSV for `year`.
with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Arxiv/KDD/KDD{year}arxiv.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Title", "Submission Date"])

    for paper_title in titles:
        # Hand the title to requests via `params` so it is percent-encoded; splicing the
        # raw title into the URL let characters such as '&', '#' or '+' corrupt the query.
        response = requests.get(
            "https://arxiv.org/search/",
            params={
                "query": paper_title,
                "searchtype": "title",
                "order": "-announced_date_first",
            },
        )
        soup = BeautifulSoup(response.text, "html.parser")

        # First <p class="is-size-7"> on the page; the regexes below expect it to
        # carry the submission info of the top search hit.
        submission_info = soup.find("p", class_="is-size-7")
        date = "Not Found"

        if submission_info:
            text = submission_info.get_text(" ", strip=True)

            # Prefer the v1 submission date; otherwise fall back to the
            # "originally announced" date.
            date_match = (
                re.search(r"v1 submitted ([^;]+);", text)
                or re.search(r"originally announced ([^.]+)\.", text)
            )
            if date_match:
                date = date_match.group(1).strip()

        writer.writerow([paper_title, date])

## ACL

In [None]:
# Conference year interpolated into the ACL input/output file paths in the next cells.
year = 2025

In [None]:
# Load the ACL paper list for the selected year and keep its "Title" column as a list
acl_papers_csv = f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ACL/ACL{year}papers.csv"
df = pd.read_csv(acl_papers_csv)
titles = df["Title"].tolist()

In [None]:

# For every ACL title, search arXiv by title and write the first visibility date
# found on the results page (or "Not Found") to the ACL CSV for `year`.
with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Arxiv/ACL/ACL{year}arxiv.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Title", "Submission Date"])

    for paper_title in titles:
        # Hand the title to requests via `params` so it is percent-encoded; splicing the
        # raw title into the URL let characters such as '&', '#' or '+' corrupt the query.
        response = requests.get(
            "https://arxiv.org/search/",
            params={
                "query": paper_title,
                "searchtype": "title",
                "order": "-announced_date_first",
            },
        )
        soup = BeautifulSoup(response.text, "html.parser")

        # First <p class="is-size-7"> on the page; the regexes below expect it to
        # carry the submission info of the top search hit.
        submission_info = soup.find("p", class_="is-size-7")
        date = "Not Found"

        if submission_info:
            text = submission_info.get_text(" ", strip=True)

            # Prefer the v1 submission date; otherwise fall back to the
            # "originally announced" date.
            date_match = (
                re.search(r"v1 submitted ([^;]+);", text)
                or re.search(r"originally announced ([^.]+)\.", text)
            )
            if date_match:
                date = date_match.group(1).strip()

        writer.writerow([paper_title, date])

## CVPR

In [None]:
# Conference year interpolated into the CVPR input/output file paths in the next cells.
year = 2025

In [None]:
# Read the CVPR paper records for the selected year into `data`
with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/CVPR/CVPR{year}.json", mode='r') as json_file:
    data = json.loads(json_file.read())

In [None]:
# For every CVPR paper in `data`, search arXiv by title and write the first
# visibility date found on the results page (or "Not Found") to the CVPR CSV.
with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Arxiv/CVPR/CVPR{year}arxiv.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Title", "Submission Date"])

    for entry in data:
        paper_title = entry['name']
        # Hand the title to requests via `params` so it is percent-encoded; splicing the
        # raw title into the URL let characters such as '&', '#' or '+' corrupt the query.
        response = requests.get(
            "https://arxiv.org/search/",
            params={
                "query": paper_title,
                "searchtype": "title",
                "order": "-announced_date_first",
            },
        )
        soup = BeautifulSoup(response.text, "html.parser")

        # First <p class="is-size-7"> on the page; the regexes below expect it to
        # carry the submission info of the top search hit.
        submission_info = soup.find("p", class_="is-size-7")
        date = "Not Found"

        if submission_info:
            text = submission_info.get_text(" ", strip=True)

            # Prefer the v1 submission date; otherwise fall back to the
            # "originally announced" date.
            date_match = (
                re.search(r"v1 submitted ([^;]+);", text)
                or re.search(r"originally announced ([^.]+)\.", text)
            )
            if date_match:
                date = date_match.group(1).strip()

        writer.writerow([paper_title, date])

## ICLR

In [None]:
# Scrape arXiv first-visibility dates for one ICLR 2025 decision category.
# Reads the paged JSON files ICLR2025_oral_page1..N.json from the category folder and
# writes one "Title, Submission Date" row per paper to {category}_arxiv.csv.
category = "accepted(oral)" #["accepted(oral)", "accepted(poster)", "accepted(spotlight)", "desk_rejected", "rejected", "withdrawn"]
directory = Path(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ICLR/ICLR 2025/{category}")
json_count = len(list(directory.glob("*.json")))
with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Arxiv/ICLR/ICLR 2025/{category}/{category}_arxiv.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Title", "Submission Date"])
    for i in range(1, json_count + 1):
        #change json name with every category update
        with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ICLR/ICLR 2025/{category}/ICLR2025_oral_page{i}.json", 'r') as file:
            all_content = json.load(file)
            for note in all_content['notes']:
                paper_title = note['content']['title']['value']
                # Hand the title to requests via `params` so it is percent-encoded; splicing the
                # raw title into the URL let characters such as '&', '#' or '+' corrupt the query.
                search_params = {
                    "query": paper_title,
                    "searchtype": "title",
                    "order": "-announced_date_first",
                }
                response = requests.get("https://arxiv.org/search/", params=search_params)

                #to avoid error 429 : too many requests in short time
                if response.status_code == 429:
                    print("Rate limit hit. Waiting...")
                    time.sleep(10)  # Wait 10 seconds before the single retry
                    response = requests.get("https://arxiv.org/search/", params=search_params)
                response.raise_for_status()

                # Parse only after the retry: previously the soup was built from the first
                # (rate-limited) response, so a successful retry was never used.
                soup = BeautifulSoup(response.text, "html.parser")

                # First <p class="is-size-7"> on the page; the regexes below expect it to
                # carry the submission info of the top search hit.
                submission_info = soup.find("p", class_="is-size-7")
                date = "Not Found"

                if submission_info:
                    text = submission_info.get_text(" ", strip=True)

                    # Prefer the v1 submission date; otherwise fall back to the
                    # "originally announced" date.
                    date_match = (
                        re.search(r"v1 submitted ([^;]+);", text)
                        or re.search(r"originally announced ([^.]+)\.", text)
                    )
                    if date_match:
                        date = date_match.group(1).strip()

                writer.writerow([paper_title, date])

            # NOTE(review): this pause runs once per page file, not once per request.
            time.sleep(1) #to avoid error 429 : too many requests in short time

In [None]:
# Scrape arXiv first-visibility dates for one ICLR 2025 decision category.
# Reads the paged JSON files ICLR2025_poster_page1..N.json from the category folder and
# writes one "Title, Submission Date" row per paper to {category}_arxiv.csv.
category = "accepted(poster)" #["accepted(oral)", "accepted(poster)", "accepted(spotlight)", "desk_rejected", "rejected", "withdrawn"]
directory = Path(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ICLR/ICLR 2025/{category}")
json_count = len(list(directory.glob("*.json")))
with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Arxiv/ICLR/ICLR 2025/{category}/{category}_arxiv.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Title", "Submission Date"])
    for i in range(1, json_count + 1):
        #change json name with every category update
        with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ICLR/ICLR 2025/{category}/ICLR2025_poster_page{i}.json", 'r') as file:
            all_content = json.load(file)
            for note in all_content['notes']:
                paper_title = note['content']['title']['value']
                # Hand the title to requests via `params` so it is percent-encoded; splicing the
                # raw title into the URL let characters such as '&', '#' or '+' corrupt the query.
                search_params = {
                    "query": paper_title,
                    "searchtype": "title",
                    "order": "-announced_date_first",
                }
                response = requests.get("https://arxiv.org/search/", params=search_params)

                #to avoid error 429 : too many requests in short time
                if response.status_code == 429:
                    print("Rate limit hit. Waiting...")
                    time.sleep(10)  # Wait 10 seconds before the single retry
                    response = requests.get("https://arxiv.org/search/", params=search_params)
                response.raise_for_status()

                # Parse only after the retry: previously the soup was built from the first
                # (rate-limited) response, so a successful retry was never used.
                soup = BeautifulSoup(response.text, "html.parser")

                # First <p class="is-size-7"> on the page; the regexes below expect it to
                # carry the submission info of the top search hit.
                submission_info = soup.find("p", class_="is-size-7")
                date = "Not Found"

                if submission_info:
                    text = submission_info.get_text(" ", strip=True)

                    # Prefer the v1 submission date; otherwise fall back to the
                    # "originally announced" date.
                    date_match = (
                        re.search(r"v1 submitted ([^;]+);", text)
                        or re.search(r"originally announced ([^.]+)\.", text)
                    )
                    if date_match:
                        date = date_match.group(1).strip()

                writer.writerow([paper_title, date])

            # NOTE(review): this pause runs once per page file, not once per request.
            time.sleep(1) #to avoid error 429 : too many requests in short time

In [None]:
# Scrape arXiv first-visibility dates for one ICLR 2025 decision category.
# Reads the paged JSON files ICLR2025_spotlight_page1..N.json from the category folder and
# writes one "Title, Submission Date" row per paper to {category}_arxiv.csv.
category = "accepted(spotlight)" #["accepted(oral)", "accepted(poster)", "accepted(spotlight)", "desk_rejected", "rejected", "withdrawn"]
directory = Path(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ICLR/ICLR 2025/{category}")
json_count = len(list(directory.glob("*.json")))
with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Arxiv/ICLR/ICLR 2025/{category}/{category}_arxiv.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Title", "Submission Date"])
    for i in range(1, json_count + 1):
        #change json name with every category update
        with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ICLR/ICLR 2025/{category}/ICLR2025_spotlight_page{i}.json", 'r') as file:
            all_content = json.load(file)
            for note in all_content['notes']:
                paper_title = note['content']['title']['value']
                # Hand the title to requests via `params` so it is percent-encoded; splicing the
                # raw title into the URL let characters such as '&', '#' or '+' corrupt the query.
                search_params = {
                    "query": paper_title,
                    "searchtype": "title",
                    "order": "-announced_date_first",
                }
                response = requests.get("https://arxiv.org/search/", params=search_params)

                #to avoid error 429 : too many requests in short time
                if response.status_code == 429:
                    print("Rate limit hit. Waiting...")
                    time.sleep(10)  # Wait 10 seconds before the single retry
                    response = requests.get("https://arxiv.org/search/", params=search_params)
                response.raise_for_status()

                # Parse only after the retry: previously the soup was built from the first
                # (rate-limited) response, so a successful retry was never used.
                soup = BeautifulSoup(response.text, "html.parser")

                # First <p class="is-size-7"> on the page; the regexes below expect it to
                # carry the submission info of the top search hit.
                submission_info = soup.find("p", class_="is-size-7")
                date = "Not Found"

                if submission_info:
                    text = submission_info.get_text(" ", strip=True)

                    # Prefer the v1 submission date; otherwise fall back to the
                    # "originally announced" date.
                    date_match = (
                        re.search(r"v1 submitted ([^;]+);", text)
                        or re.search(r"originally announced ([^.]+)\.", text)
                    )
                    if date_match:
                        date = date_match.group(1).strip()

                writer.writerow([paper_title, date])

            # NOTE(review): this pause runs once per page file, not once per request.
            time.sleep(1) #to avoid error 429 : too many requests in short time

In [None]:
# Scrape arXiv first-visibility dates for one ICLR 2025 decision category.
# Reads the paged JSON files ICLR2025_{category}_page1..N.json from the category folder and
# writes one "Title, Submission Date" row per paper to {category}_arxiv.csv.
category = "desk_rejected" #["accepted(oral)", "accepted(poster)", "accepted(spotlight)", "desk_rejected", "rejected", "withdrawn"]
directory = Path(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ICLR/ICLR 2025/{category}")
json_count = len(list(directory.glob("*.json")))
with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Arxiv/ICLR/ICLR 2025/{category}/{category}_arxiv.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Title", "Submission Date"])
    for i in range(1, json_count + 1):
        #change json name with every category update
        with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ICLR/ICLR 2025/{category}/ICLR2025_{category}_page{i}.json", 'r') as file:
            all_content = json.load(file)
            for note in all_content['notes']:
                paper_title = note['content']['title']['value']
                # Hand the title to requests via `params` so it is percent-encoded; splicing the
                # raw title into the URL let characters such as '&', '#' or '+' corrupt the query.
                search_params = {
                    "query": paper_title,
                    "searchtype": "title",
                    "order": "-announced_date_first",
                }
                response = requests.get("https://arxiv.org/search/", params=search_params)

                #to avoid error 429 : too many requests in short time
                if response.status_code == 429:
                    print("Rate limit hit. Waiting...")
                    time.sleep(10)  # Wait 10 seconds before the single retry
                    response = requests.get("https://arxiv.org/search/", params=search_params)
                response.raise_for_status()

                # Parse only after the retry: previously the soup was built from the first
                # (rate-limited) response, so a successful retry was never used.
                soup = BeautifulSoup(response.text, "html.parser")

                # First <p class="is-size-7"> on the page; the regexes below expect it to
                # carry the submission info of the top search hit.
                submission_info = soup.find("p", class_="is-size-7")
                date = "Not Found"

                if submission_info:
                    text = submission_info.get_text(" ", strip=True)

                    # Prefer the v1 submission date; otherwise fall back to the
                    # "originally announced" date.
                    date_match = (
                        re.search(r"v1 submitted ([^;]+);", text)
                        or re.search(r"originally announced ([^.]+)\.", text)
                    )
                    if date_match:
                        date = date_match.group(1).strip()

                writer.writerow([paper_title, date])

            # NOTE(review): this pause runs once per page file, not once per request.
            time.sleep(1) #to avoid error 429 : too many requests in short time

In [None]:
# Scrape arXiv first-visibility dates for one ICLR 2025 decision category.
# Reads the paged JSON files ICLR2025_{category}_page1..N.json from the category folder and
# writes one "Title, Submission Date" row per paper to {category}_arxiv.csv.
category = "reject" #["accepted(oral)", "accepted(poster)", "accepted(spotlight)", "desk_rejected", "reject", "withdrawn"]
directory = Path(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ICLR/ICLR 2025/{category}")
json_count = len(list(directory.glob("*.json")))
with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Arxiv/ICLR/ICLR 2025/{category}/{category}_arxiv.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Title", "Submission Date"])
    for i in range(1, json_count + 1):
        #change json name with every category update
        with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ICLR/ICLR 2025/{category}/ICLR2025_{category}_page{i}.json", 'r') as file:
            all_content = json.load(file)
            for note in all_content['notes']:
                paper_title = note['content']['title']['value']
                # Hand the title to requests via `params` so it is percent-encoded; splicing the
                # raw title into the URL let characters such as '&', '#' or '+' corrupt the query.
                search_params = {
                    "query": paper_title,
                    "searchtype": "title",
                    "order": "-announced_date_first",
                }
                response = requests.get("https://arxiv.org/search/", params=search_params)

                #error 400: skip this paper entirely (no row is written for it)
                if response.status_code == 400:
                    continue

                #to avoid error 429 : too many requests in short time
                if response.status_code == 429:
                    print("Rate limit hit. Waiting...")
                    time.sleep(10)  # Wait 10 seconds before the single retry
                    response = requests.get("https://arxiv.org/search/", params=search_params)
                response.raise_for_status()

                # Parse only after the retry: previously the soup was built from the first
                # (rate-limited) response, so a successful retry was never used.
                soup = BeautifulSoup(response.text, "html.parser")

                # First <p class="is-size-7"> on the page; the regexes below expect it to
                # carry the submission info of the top search hit.
                submission_info = soup.find("p", class_="is-size-7")
                date = "Not Found"

                if submission_info:
                    text = submission_info.get_text(" ", strip=True)

                    # Prefer the v1 submission date; otherwise fall back to the
                    # "originally announced" date.
                    date_match = (
                        re.search(r"v1 submitted ([^;]+);", text)
                        or re.search(r"originally announced ([^.]+)\.", text)
                    )
                    if date_match:
                        date = date_match.group(1).strip()

                writer.writerow([paper_title, date])

            # NOTE(review): this pause runs once per page file, not once per request.
            time.sleep(1) #to avoid error 429 : too many requests in short time

In [None]:
# Scrape arXiv first-visibility dates for one ICLR 2025 decision category.
# Reads the paged JSON files ICLR2025_{category}_page1..N.json from the category folder and
# writes one "Title, Submission Date" row per paper to {category}_arxiv.csv.
category = "withdrawn" #["accepted(oral)", "accepted(poster)", "accepted(spotlight)", "desk_rejected", "rejected", "withdrawn"]
directory = Path(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ICLR/ICLR 2025/{category}")
json_count = len(list(directory.glob("*.json")))
with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Arxiv/ICLR/ICLR 2025/{category}/{category}_arxiv.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Title", "Submission Date"])
    for i in range(1, json_count + 1):
        #change json name with every category update
        with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ICLR/ICLR 2025/{category}/ICLR2025_{category}_page{i}.json", 'r') as file:
            all_content = json.load(file)
            for note in all_content['notes']:
                paper_title = note['content']['title']['value']
                # Hand the title to requests via `params` so it is percent-encoded; splicing the
                # raw title into the URL let characters such as '&', '#' or '+' corrupt the query.
                search_params = {
                    "query": paper_title,
                    "searchtype": "title",
                    "order": "-announced_date_first",
                }
                response = requests.get("https://arxiv.org/search/", params=search_params)

                #error 400: skip this paper entirely (no row is written for it)
                if response.status_code == 400:
                    continue

                #to avoid error 429 : too many requests in short time
                if response.status_code == 429:
                    print("Rate limit hit. Waiting...")
                    time.sleep(10)  # Wait 10 seconds before the single retry
                    response = requests.get("https://arxiv.org/search/", params=search_params)
                response.raise_for_status()

                # Parse only after the retry: previously the soup was built from the first
                # (rate-limited) response, so a successful retry was never used.
                soup = BeautifulSoup(response.text, "html.parser")

                # First <p class="is-size-7"> on the page; the regexes below expect it to
                # carry the submission info of the top search hit.
                submission_info = soup.find("p", class_="is-size-7")
                date = "Not Found"

                if submission_info:
                    text = submission_info.get_text(" ", strip=True)

                    # Prefer the v1 submission date; otherwise fall back to the
                    # "originally announced" date.
                    date_match = (
                        re.search(r"v1 submitted ([^;]+);", text)
                        or re.search(r"originally announced ([^.]+)\.", text)
                    )
                    if date_match:
                        date = date_match.group(1).strip()

                writer.writerow([paper_title, date])

            # NOTE(review): this pause runs once per page file, not once per request.
            time.sleep(1) #to avoid error 429 : too many requests in short time

## ICML

In [None]:
# Scrape arXiv first-visibility dates for one ICML 2025 decision category.
# Reads the paged JSON files ICML2025_oral_page1..N.json from the category folder and
# writes one "Title, Submission Date" row per paper to {category}_arxiv.csv.
category = "accept(oral)" #["accept(poster)", "accept(oral)", "accept(spotlight)", "rejected", "retracted acceptance"]
directory = Path(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ICML/ICML 2025/{category}")
json_count = len(list(directory.glob("*.json")))
with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Arxiv/ICML/ICML 2025/{category}/{category}_arxiv.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Title", "Submission Date"])
    for i in range(1, json_count + 1):
        #change json name with every category update
        with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ICML/ICML 2025/{category}/ICML2025_oral_page{i}.json", 'r') as file:
            all_content = json.load(file)
            for note in all_content['notes']:
                paper_title = note['content']['title']['value']
                # Hand the title to requests via `params` so it is percent-encoded; splicing the
                # raw title into the URL let characters such as '&', '#' or '+' corrupt the query.
                search_params = {
                    "query": paper_title,
                    "searchtype": "title",
                    "order": "-announced_date_first",
                }
                response = requests.get("https://arxiv.org/search/", params=search_params)

                #error 400: skip this paper entirely (no row is written for it)
                if response.status_code == 400:
                    continue

                #to avoid error 429 : too many requests in short time
                if response.status_code == 429:
                    print("Rate limit hit. Waiting...")
                    time.sleep(10)  # Wait 10 seconds before the single retry
                    response = requests.get("https://arxiv.org/search/", params=search_params)
                response.raise_for_status()

                # Parse only after the retry: previously the soup was built from the first
                # (rate-limited) response, so a successful retry was never used.
                soup = BeautifulSoup(response.text, "html.parser")

                # First <p class="is-size-7"> on the page; the regexes below expect it to
                # carry the submission info of the top search hit.
                submission_info = soup.find("p", class_="is-size-7")
                date = "Not Found"

                if submission_info:
                    text = submission_info.get_text(" ", strip=True)

                    # Prefer the v1 submission date; otherwise fall back to the
                    # "originally announced" date.
                    date_match = (
                        re.search(r"v1 submitted ([^;]+);", text)
                        or re.search(r"originally announced ([^.]+)\.", text)
                    )
                    if date_match:
                        date = date_match.group(1).strip()

                writer.writerow([paper_title, date])

            # NOTE(review): this pause runs once per page file, not once per request.
            time.sleep(1) #to avoid error 429 : too many requests in short time

In [None]:
# Scrape arXiv first-visibility dates for one ICML 2025 decision category.
# Reads the paged JSON files ICML2025_spotlight_page1..N.json from the category folder and
# writes one "Title, Submission Date" row per paper to {category}_arxiv.csv.
category = "accept(spotlight)" #["accept(poster)", "accept(oral)", "accept(spotlight)", "rejected", "retracted acceptance"]
directory = Path(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ICML/ICML 2025/{category}")
json_count = len(list(directory.glob("*.json")))
with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Arxiv/ICML/ICML 2025/{category}/{category}_arxiv.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Title", "Submission Date"])
    for i in range(1, json_count + 1):
        #change json name with every category update
        with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ICML/ICML 2025/{category}/ICML2025_spotlight_page{i}.json", 'r') as file:
            all_content = json.load(file)
            for note in all_content['notes']:
                paper_title = note['content']['title']['value']
                # Hand the title to requests via `params` so it is percent-encoded; splicing the
                # raw title into the URL let characters such as '&', '#' or '+' corrupt the query.
                search_params = {
                    "query": paper_title,
                    "searchtype": "title",
                    "order": "-announced_date_first",
                }
                response = requests.get("https://arxiv.org/search/", params=search_params)

                #error 400: skip this paper entirely (no row is written for it)
                if response.status_code == 400:
                    continue

                #to avoid error 429 : too many requests in short time
                if response.status_code == 429:
                    print("Rate limit hit. Waiting...")
                    time.sleep(10)  # Wait 10 seconds before the single retry
                    response = requests.get("https://arxiv.org/search/", params=search_params)
                response.raise_for_status()

                # Parse only after the retry: previously the soup was built from the first
                # (rate-limited) response, so a successful retry was never used.
                soup = BeautifulSoup(response.text, "html.parser")

                # First <p class="is-size-7"> on the page; the regexes below expect it to
                # carry the submission info of the top search hit.
                submission_info = soup.find("p", class_="is-size-7")
                date = "Not Found"

                if submission_info:
                    text = submission_info.get_text(" ", strip=True)

                    # Prefer the v1 submission date; otherwise fall back to the
                    # "originally announced" date.
                    date_match = (
                        re.search(r"v1 submitted ([^;]+);", text)
                        or re.search(r"originally announced ([^.]+)\.", text)
                    )
                    if date_match:
                        date = date_match.group(1).strip()

                writer.writerow([paper_title, date])

            # NOTE(review): this pause runs once per page file, not once per request.
            time.sleep(1) #to avoid error 429 : too many requests in short time

In [None]:
# Scrape arXiv first-visibility dates for one ICML 2025 decision category.
# Reads the paged JSON files ICML2025_poster_page1..N.json from the category folder and
# writes one "Title, Submission Date" row per paper to {category}_arxiv.csv.
category = "accept(poster)" #["accept(poster)", "accept(oral)", "accept(spotlight)", "rejected", "retracted acceptance"]
directory = Path(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ICML/ICML 2025/{category}")
json_count = len(list(directory.glob("*.json")))
with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Arxiv/ICML/ICML 2025/{category}/{category}_arxiv.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Title", "Submission Date"])
    for i in range(1, json_count + 1):
        #change json name with every category update
        with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ICML/ICML 2025/{category}/ICML2025_poster_page{i}.json", 'r') as file:
            all_content = json.load(file)
            for note in all_content['notes']:
                paper_title = note['content']['title']['value']
                # Hand the title to requests via `params` so it is percent-encoded; splicing the
                # raw title into the URL let characters such as '&', '#' or '+' corrupt the query.
                search_params = {
                    "query": paper_title,
                    "searchtype": "title",
                    "order": "-announced_date_first",
                }
                response = requests.get("https://arxiv.org/search/", params=search_params)

                #error 400: skip this paper entirely (no row is written for it)
                if response.status_code == 400:
                    continue

                #to avoid error 429 : too many requests in short time
                if response.status_code == 429:
                    print("Rate limit hit. Waiting...")
                    time.sleep(10)  # Wait 10 seconds before the single retry
                    response = requests.get("https://arxiv.org/search/", params=search_params)
                response.raise_for_status()

                # Parse only after the retry: previously the soup was built from the first
                # (rate-limited) response, so a successful retry was never used.
                soup = BeautifulSoup(response.text, "html.parser")

                # First <p class="is-size-7"> on the page; the regexes below expect it to
                # carry the submission info of the top search hit.
                submission_info = soup.find("p", class_="is-size-7")
                date = "Not Found"

                if submission_info:
                    text = submission_info.get_text(" ", strip=True)

                    # Prefer the v1 submission date; otherwise fall back to the
                    # "originally announced" date.
                    date_match = (
                        re.search(r"v1 submitted ([^;]+);", text)
                        or re.search(r"originally announced ([^.]+)\.", text)
                    )
                    if date_match:
                        date = date_match.group(1).strip()

                writer.writerow([paper_title, date])

            # NOTE(review): this pause runs once per page file, not once per request.
            time.sleep(1) #to avoid error 429 : too many requests in short time

In [None]:
# Scrape arXiv first-visibility dates for one ICML 2025 decision category.
# Reads the paged JSON files ICML2025_{category}_page1..N.json from the category folder and
# writes one "Title, Submission Date" row per paper to {category}_arxiv.csv.
category = "rejected" #["accept(poster)", "accept(oral)", "accept(spotlight)", "rejected", "retracted acceptance"]
directory = Path(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ICML/ICML 2025/{category}")
json_count = len(list(directory.glob("*.json")))
with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Arxiv/ICML/ICML 2025/{category}/{category}_arxiv.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Title", "Submission Date"])
    for i in range(1, json_count + 1):
        #change json name with every category update
        with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ICML/ICML 2025/{category}/ICML2025_{category}_page{i}.json", 'r') as file:
            all_content = json.load(file)
            for note in all_content['notes']:
                paper_title = note['content']['title']['value']
                # Hand the title to requests via `params` so it is percent-encoded; splicing the
                # raw title into the URL let characters such as '&', '#' or '+' corrupt the query.
                search_params = {
                    "query": paper_title,
                    "searchtype": "title",
                    "order": "-announced_date_first",
                }
                response = requests.get("https://arxiv.org/search/", params=search_params)

                #error 400: skip this paper entirely (no row is written for it)
                if response.status_code == 400:
                    continue

                #to avoid error 429 : too many requests in short time
                if response.status_code == 429:
                    print("Rate limit hit. Waiting...")
                    time.sleep(10)  # Wait 10 seconds before the single retry
                    response = requests.get("https://arxiv.org/search/", params=search_params)
                response.raise_for_status()

                # Parse only after the retry: previously the soup was built from the first
                # (rate-limited) response, so a successful retry was never used.
                soup = BeautifulSoup(response.text, "html.parser")

                # First <p class="is-size-7"> on the page; the regexes below expect it to
                # carry the submission info of the top search hit.
                submission_info = soup.find("p", class_="is-size-7")
                date = "Not Found"

                if submission_info:
                    text = submission_info.get_text(" ", strip=True)

                    # Prefer the v1 submission date; otherwise fall back to the
                    # "originally announced" date.
                    date_match = (
                        re.search(r"v1 submitted ([^;]+);", text)
                        or re.search(r"originally announced ([^.]+)\.", text)
                    )
                    if date_match:
                        date = date_match.group(1).strip()

                writer.writerow([paper_title, date])

            # NOTE(review): this pause runs once per page file, not once per request.
            time.sleep(1) #to avoid error 429 : too many requests in short time

In [None]:
# Record the first arXiv visibility date of every ICML 2025 paper in one decision category.
category = "retracted acceptance"  # options: "accept(poster)", "accept(oral)", "accept(spotlight)", "rejected", "retracted acceptance"
directory = Path(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ICML/ICML 2025/{category}")
# Pages are opened as page1..page{json_count}; every *.json in the folder is assumed to be one page.
json_count = len(list(directory.glob("*.json")))
with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Arxiv/ICML/ICML 2025/{category}/{category}_arxiv.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Title", "Submission Date"])
    for i in range(1, json_count + 1):
        # change json name with every category update
        with open(directory / f"ICML2025_retracted_page{i}.json", 'r') as file:
            all_content = json.load(file)

        for note in all_content['notes']:
            paper_title = note['content']['title']['value']
            # Pass the title through `params` so requests percent-encodes it; pasting it into
            # the URL by hand lets characters such as "&" or "#" cut the query short.
            search_url = "https://arxiv.org/search/"
            search_params = {
                "query": paper_title,
                "searchtype": "title",
                "order": "-announced_date_first",
            }
            response = requests.get(search_url, params=search_params)

            # error 400 (bad request): the paper is skipped and gets no row in the CSV
            if response.status_code == 400:
                continue

            # error 429 (too many requests in a short time): wait, then retry once
            if response.status_code == 429:
                print("Rate limit hit. Waiting...")
                time.sleep(10)
                response = requests.get(search_url, params=search_params)
            response.raise_for_status()

            # Parse only after the retry so the HTML comes from the final response,
            # not from the rate-limit page.
            soup = BeautifulSoup(response.text, "html.parser")

            # first <p class="is-size-7"> on the page is expected to hold the submission info
            submission_info = soup.find("p", class_="is-size-7")
            date = "Not Found"

            if submission_info:
                text = submission_info.get_text(" ", strip=True)
                # prefer the v1 submission date, fall back to the "originally announced" date
                date_match = (
                    re.search(r"v1 submitted ([^;]+);", text)
                    or re.search(r"originally announced ([^.]+)\.", text)
                )
                if date_match:
                    date = date_match.group(1).strip()

            writer.writerow([paper_title, date])

        time.sleep(1)  # pause between pages to reduce the chance of error 429

## NeurIPS

In [None]:
# Record the first arXiv visibility date of every NeurIPS 2023 paper in one decision category.
category = "accept(poster)"  # options: "accept(poster)", "accept(oral)", "accept(spotlight)", "reject"
directory = Path(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/NeurIPS/NeurIPS 2023/{category}")
# Pages are opened as page1..page{json_count}; every *.json in the folder is assumed to be one page.
json_count = len(list(directory.glob("*.json")))
with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Arxiv/NeurIPS/NeurIPS 2023/{category}/{category}_arxiv.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Title", "Submission Date"])
    for i in range(1, json_count + 1):
        # change json name with every category update
        with open(directory / f"NeurIPS2023_poster_page{i}.json", 'r') as file:
            all_content = json.load(file)

        for note in all_content['notes']:
            paper_title = note['content']['title']['value']
            # Pass the title through `params` so requests percent-encodes it; pasting it into
            # the URL by hand lets characters such as "&" or "#" cut the query short.
            search_url = "https://arxiv.org/search/"
            search_params = {
                "query": paper_title,
                "searchtype": "title",
                "order": "-announced_date_first",
            }
            response = requests.get(search_url, params=search_params)

            # error 400 (bad request): the paper is skipped and gets no row in the CSV
            if response.status_code == 400:
                continue

            # error 429 (too many requests in a short time): wait, then retry once
            if response.status_code == 429:
                print("Rate limit hit. Waiting...")
                time.sleep(10)
                response = requests.get(search_url, params=search_params)
            response.raise_for_status()

            # Parse only after the retry so the HTML comes from the final response,
            # not from the rate-limit page.
            soup = BeautifulSoup(response.text, "html.parser")

            # first <p class="is-size-7"> on the page is expected to hold the submission info
            submission_info = soup.find("p", class_="is-size-7")
            date = "Not Found"

            if submission_info:
                text = submission_info.get_text(" ", strip=True)
                # prefer the v1 submission date, fall back to the "originally announced" date
                date_match = (
                    re.search(r"v1 submitted ([^;]+);", text)
                    or re.search(r"originally announced ([^.]+)\.", text)
                )
                if date_match:
                    date = date_match.group(1).strip()

            writer.writerow([paper_title, date])

        time.sleep(1)  # pause between pages to reduce the chance of error 429

In [None]:
# Record the first arXiv visibility date of every NeurIPS 2024 paper in one decision category.
category = "accept(poster)"  # options: "accept(poster)", "accept(oral)", "accept(spotlight)", "reject"
directory = Path(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/NeurIPS/NeurIPS 2024/{category}")
# Pages are opened as page1..page{json_count}; every *.json in the folder is assumed to be one page.
json_count = len(list(directory.glob("*.json")))
with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Arxiv/NeurIPS/NeurIPS 2024/{category}/{category}_arxiv.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Title", "Submission Date"])
    for i in range(1, json_count + 1):
        # change json name with every category update
        with open(directory / f"NeurIPS2024_poster_page{i}.json", 'r') as file:
            all_content = json.load(file)

        for note in all_content['notes']:
            paper_title = note['content']['title']['value']
            # Pass the title through `params` so requests percent-encodes it; pasting it into
            # the URL by hand lets characters such as "&" or "#" cut the query short.
            search_url = "https://arxiv.org/search/"
            search_params = {
                "query": paper_title,
                "searchtype": "title",
                "order": "-announced_date_first",
            }
            response = requests.get(search_url, params=search_params)

            # error 400 (bad request): the paper is skipped and gets no row in the CSV
            if response.status_code == 400:
                continue

            # error 429 (too many requests in a short time): wait, then retry once
            if response.status_code == 429:
                print("Rate limit hit. Waiting...")
                time.sleep(10)
                response = requests.get(search_url, params=search_params)
            response.raise_for_status()

            # Parse only after the retry so the HTML comes from the final response,
            # not from the rate-limit page.
            soup = BeautifulSoup(response.text, "html.parser")

            # first <p class="is-size-7"> on the page is expected to hold the submission info
            submission_info = soup.find("p", class_="is-size-7")
            date = "Not Found"

            if submission_info:
                text = submission_info.get_text(" ", strip=True)
                # prefer the v1 submission date, fall back to the "originally announced" date
                date_match = (
                    re.search(r"v1 submitted ([^;]+);", text)
                    or re.search(r"originally announced ([^.]+)\.", text)
                )
                if date_match:
                    date = date_match.group(1).strip()

            writer.writerow([paper_title, date])

        time.sleep(1)  # pause between pages to reduce the chance of error 429

In [None]:
# Record the first arXiv visibility date of every NeurIPS 2023 paper in one decision category.
category = "accept(oral)"  # options: "accept(poster)", "accept(oral)", "accept(spotlight)", "reject"
directory = Path(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/NeurIPS/NeurIPS 2023/{category}")
# Pages are opened as page1..page{json_count}; every *.json in the folder is assumed to be one page.
json_count = len(list(directory.glob("*.json")))
with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Arxiv/NeurIPS/NeurIPS 2023/{category}/{category}_arxiv.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Title", "Submission Date"])
    for i in range(1, json_count + 1):
        # change json name with every category update
        with open(directory / f"NeurIPS2023_oral_page{i}.json", 'r') as file:
            all_content = json.load(file)

        for note in all_content['notes']:
            paper_title = note['content']['title']['value']
            # Pass the title through `params` so requests percent-encodes it; pasting it into
            # the URL by hand lets characters such as "&" or "#" cut the query short.
            search_url = "https://arxiv.org/search/"
            search_params = {
                "query": paper_title,
                "searchtype": "title",
                "order": "-announced_date_first",
            }
            response = requests.get(search_url, params=search_params)

            # error 400 (bad request): the paper is skipped and gets no row in the CSV
            if response.status_code == 400:
                continue

            # error 429 (too many requests in a short time): wait, then retry once
            if response.status_code == 429:
                print("Rate limit hit. Waiting...")
                time.sleep(10)
                response = requests.get(search_url, params=search_params)
            response.raise_for_status()

            # Parse only after the retry so the HTML comes from the final response,
            # not from the rate-limit page.
            soup = BeautifulSoup(response.text, "html.parser")

            # first <p class="is-size-7"> on the page is expected to hold the submission info
            submission_info = soup.find("p", class_="is-size-7")
            date = "Not Found"

            if submission_info:
                text = submission_info.get_text(" ", strip=True)
                # prefer the v1 submission date, fall back to the "originally announced" date
                date_match = (
                    re.search(r"v1 submitted ([^;]+);", text)
                    or re.search(r"originally announced ([^.]+)\.", text)
                )
                if date_match:
                    date = date_match.group(1).strip()

            writer.writerow([paper_title, date])

        time.sleep(1)  # pause between pages to reduce the chance of error 429

In [None]:
# Record the first arXiv visibility date of every NeurIPS 2024 paper in one decision category.
category = "accept(oral)"  # options: "accept(poster)", "accept(oral)", "accept(spotlight)", "reject"
directory = Path(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/NeurIPS/NeurIPS 2024/{category}")
# Pages are opened as page1..page{json_count}; every *.json in the folder is assumed to be one page.
json_count = len(list(directory.glob("*.json")))
with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Arxiv/NeurIPS/NeurIPS 2024/{category}/{category}_arxiv.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Title", "Submission Date"])
    for i in range(1, json_count + 1):
        # change json name with every category update
        with open(directory / f"NeurIPS2024_oral_page{i}.json", 'r') as file:
            all_content = json.load(file)

        for note in all_content['notes']:
            paper_title = note['content']['title']['value']
            # Pass the title through `params` so requests percent-encodes it; pasting it into
            # the URL by hand lets characters such as "&" or "#" cut the query short.
            search_url = "https://arxiv.org/search/"
            search_params = {
                "query": paper_title,
                "searchtype": "title",
                "order": "-announced_date_first",
            }
            response = requests.get(search_url, params=search_params)

            # error 400 (bad request): the paper is skipped and gets no row in the CSV
            if response.status_code == 400:
                continue

            # error 429 (too many requests in a short time): wait, then retry once
            if response.status_code == 429:
                print("Rate limit hit. Waiting...")
                time.sleep(10)
                response = requests.get(search_url, params=search_params)
            response.raise_for_status()

            # Parse only after the retry so the HTML comes from the final response,
            # not from the rate-limit page.
            soup = BeautifulSoup(response.text, "html.parser")

            # first <p class="is-size-7"> on the page is expected to hold the submission info
            submission_info = soup.find("p", class_="is-size-7")
            date = "Not Found"

            if submission_info:
                text = submission_info.get_text(" ", strip=True)
                # prefer the v1 submission date, fall back to the "originally announced" date
                date_match = (
                    re.search(r"v1 submitted ([^;]+);", text)
                    or re.search(r"originally announced ([^.]+)\.", text)
                )
                if date_match:
                    date = date_match.group(1).strip()

            writer.writerow([paper_title, date])

        time.sleep(1)  # pause between pages to reduce the chance of error 429

In [None]:
# Record the first arXiv visibility date of every NeurIPS 2023 paper in one decision category.
category = "accept(spotlight)"  # options: "accept(poster)", "accept(oral)", "accept(spotlight)", "reject"
directory = Path(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/NeurIPS/NeurIPS 2023/{category}")
# Pages are opened as page1..page{json_count}; every *.json in the folder is assumed to be one page.
json_count = len(list(directory.glob("*.json")))
with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Arxiv/NeurIPS/NeurIPS 2023/{category}/{category}_arxiv.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Title", "Submission Date"])
    for i in range(1, json_count + 1):
        # change json name with every category update
        with open(directory / f"NeurIPS2023_spotlight_page{i}.json", 'r') as file:
            all_content = json.load(file)

        for note in all_content['notes']:
            paper_title = note['content']['title']['value']
            # Pass the title through `params` so requests percent-encodes it; pasting it into
            # the URL by hand lets characters such as "&" or "#" cut the query short.
            search_url = "https://arxiv.org/search/"
            search_params = {
                "query": paper_title,
                "searchtype": "title",
                "order": "-announced_date_first",
            }
            response = requests.get(search_url, params=search_params)

            # error 400 (bad request): the paper is skipped and gets no row in the CSV
            if response.status_code == 400:
                continue

            # error 429 (too many requests in a short time): wait, then retry once
            if response.status_code == 429:
                print("Rate limit hit. Waiting...")
                time.sleep(10)
                response = requests.get(search_url, params=search_params)
            response.raise_for_status()

            # Parse only after the retry so the HTML comes from the final response,
            # not from the rate-limit page.
            soup = BeautifulSoup(response.text, "html.parser")

            # first <p class="is-size-7"> on the page is expected to hold the submission info
            submission_info = soup.find("p", class_="is-size-7")
            date = "Not Found"

            if submission_info:
                text = submission_info.get_text(" ", strip=True)
                # prefer the v1 submission date, fall back to the "originally announced" date
                date_match = (
                    re.search(r"v1 submitted ([^;]+);", text)
                    or re.search(r"originally announced ([^.]+)\.", text)
                )
                if date_match:
                    date = date_match.group(1).strip()

            writer.writerow([paper_title, date])

        time.sleep(1)  # pause between pages to reduce the chance of error 429

In [None]:
# Record the first arXiv visibility date of every NeurIPS 2024 paper in one decision category.
category = "accept(spotlight)"  # options: "accept(poster)", "accept(oral)", "accept(spotlight)", "reject"
directory = Path(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/NeurIPS/NeurIPS 2024/{category}")
# Pages are opened as page1..page{json_count}; every *.json in the folder is assumed to be one page.
json_count = len(list(directory.glob("*.json")))
with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Arxiv/NeurIPS/NeurIPS 2024/{category}/{category}_arxiv.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Title", "Submission Date"])
    for i in range(1, json_count + 1):
        # change json name with every category update
        with open(directory / f"NeurIPS2024_spotlight_page{i}.json", 'r') as file:
            all_content = json.load(file)

        for note in all_content['notes']:
            paper_title = note['content']['title']['value']
            # Pass the title through `params` so requests percent-encodes it; pasting it into
            # the URL by hand lets characters such as "&" or "#" cut the query short.
            search_url = "https://arxiv.org/search/"
            search_params = {
                "query": paper_title,
                "searchtype": "title",
                "order": "-announced_date_first",
            }
            response = requests.get(search_url, params=search_params)

            # error 400 (bad request): the paper is skipped and gets no row in the CSV
            if response.status_code == 400:
                continue

            # error 429 (too many requests in a short time): wait, then retry once
            if response.status_code == 429:
                print("Rate limit hit. Waiting...")
                time.sleep(10)
                response = requests.get(search_url, params=search_params)
            response.raise_for_status()

            # Parse only after the retry so the HTML comes from the final response,
            # not from the rate-limit page.
            soup = BeautifulSoup(response.text, "html.parser")

            # first <p class="is-size-7"> on the page is expected to hold the submission info
            submission_info = soup.find("p", class_="is-size-7")
            date = "Not Found"

            if submission_info:
                text = submission_info.get_text(" ", strip=True)
                # prefer the v1 submission date, fall back to the "originally announced" date
                date_match = (
                    re.search(r"v1 submitted ([^;]+);", text)
                    or re.search(r"originally announced ([^.]+)\.", text)
                )
                if date_match:
                    date = date_match.group(1).strip()

            writer.writerow([paper_title, date])

        time.sleep(1)  # pause between pages to reduce the chance of error 429

In [None]:
# Record the first arXiv visibility date of every NeurIPS 2023 paper in one decision category.
category = "reject"  # options: "accept(poster)", "accept(oral)", "accept(spotlight)", "reject"
directory = Path(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/NeurIPS/NeurIPS 2023/{category}")
# Pages are opened as page1..page{json_count}; every *.json in the folder is assumed to be one page.
json_count = len(list(directory.glob("*.json")))
with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Arxiv/NeurIPS/NeurIPS 2023/{category}/{category}_arxiv.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Title", "Submission Date"])
    for i in range(1, json_count + 1):
        # change json name with every category update
        with open(directory / f"NeurIPS2023_{category}_page{i}.json", 'r') as file:
            all_content = json.load(file)

        for note in all_content['notes']:
            paper_title = note['content']['title']['value']
            # Pass the title through `params` so requests percent-encodes it; pasting it into
            # the URL by hand lets characters such as "&" or "#" cut the query short.
            search_url = "https://arxiv.org/search/"
            search_params = {
                "query": paper_title,
                "searchtype": "title",
                "order": "-announced_date_first",
            }
            response = requests.get(search_url, params=search_params)

            # error 400 (bad request): the paper is skipped and gets no row in the CSV
            if response.status_code == 400:
                continue

            # error 429 (too many requests in a short time): wait, then retry once
            if response.status_code == 429:
                print("Rate limit hit. Waiting...")
                time.sleep(10)
                response = requests.get(search_url, params=search_params)
            response.raise_for_status()

            # Parse only after the retry so the HTML comes from the final response,
            # not from the rate-limit page.
            soup = BeautifulSoup(response.text, "html.parser")

            # first <p class="is-size-7"> on the page is expected to hold the submission info
            submission_info = soup.find("p", class_="is-size-7")
            date = "Not Found"

            if submission_info:
                text = submission_info.get_text(" ", strip=True)
                # prefer the v1 submission date, fall back to the "originally announced" date
                date_match = (
                    re.search(r"v1 submitted ([^;]+);", text)
                    or re.search(r"originally announced ([^.]+)\.", text)
                )
                if date_match:
                    date = date_match.group(1).strip()

            writer.writerow([paper_title, date])

        time.sleep(1)  # pause between pages to reduce the chance of error 429

In [None]:
# Record the first arXiv visibility date of every NeurIPS 2024 paper in one decision category.
category = "reject"  # options: "accept(poster)", "accept(oral)", "accept(spotlight)", "reject"
directory = Path(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/NeurIPS/NeurIPS 2024/{category}")
# Pages are opened as page1..page{json_count}; every *.json in the folder is assumed to be one page.
json_count = len(list(directory.glob("*.json")))
with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Arxiv/NeurIPS/NeurIPS 2024/{category}/{category}_arxiv.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Title", "Submission Date"])
    for i in range(1, json_count + 1):
        # change json name with every category update
        with open(directory / f"NeurIPS2024_{category}_page{i}.json", 'r') as file:
            all_content = json.load(file)

        for note in all_content['notes']:
            paper_title = note['content']['title']['value']
            # Pass the title through `params` so requests percent-encodes it; pasting it into
            # the URL by hand lets characters such as "&" or "#" cut the query short.
            search_url = "https://arxiv.org/search/"
            search_params = {
                "query": paper_title,
                "searchtype": "title",
                "order": "-announced_date_first",
            }
            response = requests.get(search_url, params=search_params)

            # error 400 (bad request): the paper is skipped and gets no row in the CSV
            if response.status_code == 400:
                continue

            # error 429 (too many requests in a short time): wait, then retry once
            if response.status_code == 429:
                print("Rate limit hit. Waiting...")
                time.sleep(10)
                response = requests.get(search_url, params=search_params)
            response.raise_for_status()

            # Parse only after the retry so the HTML comes from the final response,
            # not from the rate-limit page.
            soup = BeautifulSoup(response.text, "html.parser")

            # first <p class="is-size-7"> on the page is expected to hold the submission info
            submission_info = soup.find("p", class_="is-size-7")
            date = "Not Found"

            if submission_info:
                text = submission_info.get_text(" ", strip=True)
                # prefer the v1 submission date, fall back to the "originally announced" date
                date_match = (
                    re.search(r"v1 submitted ([^;]+);", text)
                    or re.search(r"originally announced ([^.]+)\.", text)
                )
                if date_match:
                    date = date_match.group(1).strip()

            writer.writerow([paper_title, date])

        time.sleep(1)  # pause between pages to reduce the chance of error 429

## EMNLP

In [None]:
# Conference year read by the following EMNLP cell to build its input and output paths.
year = 2023

In [None]:
# For every EMNLP CSV of the selected `year`, look up each title on arXiv and write
# "<input name>_arxiv.csv" with the first visibility date found ("Not Found" otherwise).
directory = Path(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/EMNLP/{year}")
output_dir = Path(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Arxiv/EMNLP/{year}")
csv_files = list(directory.glob("*.csv"))
for file in csv_files:
    df = pd.read_csv(file)
    titles = df["title"]

    output_csv = output_dir / f"{file.stem}_arxiv.csv"
    with open(output_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Title", "Submission Date"])

        for paper_title in titles:
            # A missing title is read by pandas as NaN (a float); record it instead of crashing.
            if not isinstance(paper_title, str):
                writer.writerow([paper_title, "Not Found"])
                continue

            # Pass the title through `params` so requests percent-encodes it; pasting it into
            # the URL by hand lets characters such as "&" or "#" cut the query short.
            search_url = "https://arxiv.org/search/"
            search_params = {
                "query": paper_title,
                "searchtype": "title",
                "order": "-announced_date_first",
            }
            response = requests.get(search_url, params=search_params)

            # error 429 (too many requests in a short time): wait, then retry once
            if response.status_code == 429:
                print("Rate limit hit. Waiting...")
                time.sleep(10)
                response = requests.get(search_url, params=search_params)

            date = "Not Found"
            submission_info = None
            # Only a successful response is parsed; any other status keeps "Not Found".
            if response.ok:
                soup = BeautifulSoup(response.text, "html.parser")
                # first <p class="is-size-7"> on the page is expected to hold the submission info
                submission_info = soup.find("p", class_="is-size-7")

            if submission_info:
                text = submission_info.get_text(" ", strip=True)
                # prefer the v1 submission date, fall back to the "originally announced" date
                date_match = (
                    re.search(r"v1 submitted ([^;]+);", text)
                    or re.search(r"originally announced ([^.]+)\.", text)
                )
                if date_match:
                    date = date_match.group(1).strip()

            writer.writerow([paper_title, date])

In [None]:
# Conference year read by the following EMNLP cell to build its input and output paths.
year = 2024

In [None]:
# For every EMNLP CSV of the selected `year`, look up each title on arXiv and write
# "<input name>_arxiv.csv" with the first visibility date found ("Not Found" otherwise).
directory = Path(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/EMNLP/{year}")
# The output folder does not depend on the input file, so it is built once up front.
output_dir = Path(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Arxiv/EMNLP/{year}")
csv_files = list(directory.glob("*.csv"))
for file in csv_files:
    df = pd.read_csv(file)
    titles = df["title"]
    output_csv = output_dir / f"{file.stem}_arxiv.csv"
    with open(output_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Title", "Submission Date"])

        for paper_title in titles:
            # A missing title is read by pandas as NaN (a float); record it instead of crashing.
            if not isinstance(paper_title, str):
                writer.writerow([paper_title, "Not Found"])
                continue

            # Pass the title through `params` so requests percent-encodes it; pasting it into
            # the URL by hand lets characters such as "&" or "#" cut the query short.
            search_url = "https://arxiv.org/search/"
            search_params = {
                "query": paper_title,
                "searchtype": "title",
                "order": "-announced_date_first",
            }
            response = requests.get(search_url, params=search_params)

            # error 429 (too many requests in a short time): wait, then retry once
            if response.status_code == 429:
                print("Rate limit hit. Waiting...")
                time.sleep(10)
                response = requests.get(search_url, params=search_params)

            date = "Not Found"
            submission_info = None
            # Only a successful response is parsed; any other status keeps "Not Found".
            if response.ok:
                soup = BeautifulSoup(response.text, "html.parser")
                # first <p class="is-size-7"> on the page is expected to hold the submission info
                submission_info = soup.find("p", class_="is-size-7")

            if submission_info:
                text = submission_info.get_text(" ", strip=True)
                # prefer the v1 submission date, fall back to the "originally announced" date
                date_match = (
                    re.search(r"v1 submitted ([^;]+);", text)
                    or re.search(r"originally announced ([^.]+)\.", text)
                )
                if date_match:
                    date = date_match.group(1).strip()

            writer.writerow([paper_title, date])

In [None]:

  # Look up every EMNLP 2025 title on arXiv and write the first visibility date found
  # ("Not Found" otherwise) to a CSV.
  df = pd.read_csv(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/EMNLP/2025/EMNLP2025.csv")
  titles = df["title"].to_list()
  with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Arxiv/EMNLP/2025/EMNLP2025arxivarxiv.csv", "w", newline="", encoding="utf-8") as f:
      writer = csv.writer(f)
      writer.writerow(["Title", "Submission Date"])

      for paper_title in titles:
          # A missing title is read by pandas as NaN (a float); record it instead of crashing.
          if not isinstance(paper_title, str):
              writer.writerow([paper_title, "Not Found"])
              continue

          # Pass the title through `params` so requests percent-encodes it; pasting it into
          # the URL by hand lets characters such as "&" or "#" cut the query short.
          search_url = "https://arxiv.org/search/"
          search_params = {
              "query": paper_title,
              "searchtype": "title",
              "order": "-announced_date_first",
          }
          response = requests.get(search_url, params=search_params)

          # error 429 (too many requests in a short time): wait, then retry once
          if response.status_code == 429:
              print("Rate limit hit. Waiting...")
              time.sleep(10)
              response = requests.get(search_url, params=search_params)

          date = "Not Found"
          submission_info = None
          # Only a successful response is parsed; any other status keeps "Not Found".
          if response.ok:
              soup = BeautifulSoup(response.text, "html.parser")
              # first <p class="is-size-7"> on the page is expected to hold the submission info
              submission_info = soup.find("p", class_="is-size-7")

          if submission_info:
              text = submission_info.get_text(" ", strip=True)
              # prefer the v1 submission date, fall back to the "originally announced" date
              date_match = (
                  re.search(r"v1 submitted ([^;]+);", text)
                  or re.search(r"originally announced ([^.]+)\.", text)
              )
              if date_match:
                  date = date_match.group(1).strip()

          writer.writerow([paper_title, date])

## COLING

In [None]:
# For every COLING 2025 CSV, look up each title on arXiv and write
# "<input name>_arxiv.csv" with the first visibility date found ("Not Found" otherwise).
year = 2025
directory = Path(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/COLING/{year}")
# The output folder does not depend on the input file, so it is built once up front.
output_dir = Path(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Arxiv/COLING/{year}")
csv_files = list(directory.glob("*.csv"))
for file in csv_files:
    df = pd.read_csv(file)
    titles = df["title"]
    output_csv = output_dir / f"{file.stem}_arxiv.csv"
    with open(output_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Title", "Submission Date"])

        for paper_title in titles:
            # A missing title is read by pandas as NaN (a float); record it instead of crashing.
            if not isinstance(paper_title, str):
                writer.writerow([paper_title, "Not Found"])
                continue

            # Pass the title through `params` so requests percent-encodes it; pasting it into
            # the URL by hand lets characters such as "&" or "#" cut the query short.
            search_url = "https://arxiv.org/search/"
            search_params = {
                "query": paper_title,
                "searchtype": "title",
                "order": "-announced_date_first",
            }
            response = requests.get(search_url, params=search_params)

            # error 429 (too many requests in a short time): wait, then retry once
            if response.status_code == 429:
                print("Rate limit hit. Waiting...")
                time.sleep(10)
                response = requests.get(search_url, params=search_params)

            date = "Not Found"
            submission_info = None
            # Only a successful response is parsed; any other status keeps "Not Found".
            if response.ok:
                soup = BeautifulSoup(response.text, "html.parser")
                # first <p class="is-size-7"> on the page is expected to hold the submission info
                submission_info = soup.find("p", class_="is-size-7")

            if submission_info:
                text = submission_info.get_text(" ", strip=True)
                # prefer the v1 submission date, fall back to the "originally announced" date
                date_match = (
                    re.search(r"v1 submitted ([^;]+);", text)
                    or re.search(r"originally announced ([^.]+)\.", text)
                )
                if date_match:
                    date = date_match.group(1).strip()

            writer.writerow([paper_title, date])

In [None]:
# For every COLING 2024 CSV, look up each title on arXiv and write
# "<input name>_arxiv.csv" with the first visibility date found ("Not Found" otherwise).
year = 2024
directory = Path(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/COLING/{year}")
# The output folder does not depend on the input file, so it is built once up front.
output_dir = Path(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Arxiv/COLING/{year}")
csv_files = list(directory.glob("*.csv"))
for file in csv_files:
    df = pd.read_csv(file)
    # NOTE: this cell reads a capitalised "Title" column, unlike the COLING 2025 cell ("title").
    titles = df["Title"]
    output_csv = output_dir / f"{file.stem}_arxiv.csv"
    with open(output_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Title", "Submission Date"])

        for paper_title in titles:
            # A missing title is read by pandas as NaN (a float); record it instead of crashing.
            if not isinstance(paper_title, str):
                writer.writerow([paper_title, "Not Found"])
                continue

            # Pass the title through `params` so requests percent-encodes it; pasting it into
            # the URL by hand lets characters such as "&" or "#" cut the query short.
            search_url = "https://arxiv.org/search/"
            search_params = {
                "query": paper_title,
                "searchtype": "title",
                "order": "-announced_date_first",
            }
            response = requests.get(search_url, params=search_params)

            # error 429 (too many requests in a short time): wait, then retry once
            if response.status_code == 429:
                print("Rate limit hit. Waiting...")
                time.sleep(10)
                response = requests.get(search_url, params=search_params)

            date = "Not Found"
            submission_info = None
            # Only a successful response is parsed; any other status keeps "Not Found".
            if response.ok:
                soup = BeautifulSoup(response.text, "html.parser")
                # first <p class="is-size-7"> on the page is expected to hold the submission info
                submission_info = soup.find("p", class_="is-size-7")

            if submission_info:
                text = submission_info.get_text(" ", strip=True)
                # prefer the v1 submission date, fall back to the "originally announced" date
                date_match = (
                    re.search(r"v1 submitted ([^;]+);", text)
                    or re.search(r"originally announced ([^.]+)\.", text)
                )
                if date_match:
                    date = date_match.group(1).strip()

            writer.writerow([paper_title, date])