# **EXTRACTION AND TRANSFORMATION OF CONFERENCE DATA**

This notebook is split into 2 sections.
- Data Extraction :
This section contains a list of methods to extract data from multiple data storage formats, in order to fetch the accepted-paper data of major conferences in AI/ML/NLP/CV.
- Data Transformation :
This section contains a list of methods to transform data from multiple data storage formats, to a uniform storage format, i.e., csv.

## IMPORT MODULES (always run)

In [None]:
# Mount Google Drive at /content/drive; the cells below read and write their
# datasets under that mount point.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import json
import csv
from pathlib import Path
import time
import re

# **DATA EXTRACTION**

For the following conferences, the data is extracted from the HTML tags of static websites using the BeautifulSoup library. The year-wise paper titles, paper abstracts, lists of authors and their affiliations (if available) are collected.

## COLING

In [None]:
# Download the accepted-papers page for `year` and parse it into `soup`,
# which the next cell walks row by row.
# NOTE(review): the bracketed list looks like the years this cell is re-run
# for — confirm that the URL pattern below exists for each of them.
year = 2024 #[2023, 2024, 2025]
url = f"https://lrec-coling-{year}.org/list-of-accepted-papers/"
response = requests.get(url)
response.raise_for_status()  # fail on an HTTP error status instead of parsing an error page
soup = BeautifulSoup(response.text, "html.parser")

In [None]:

# Build one {"Title", "Authors"} record per table row. Within a row, the cell
# whose data-cell-id is "B<digits>" supplies the authors and the cell whose
# data-cell-id is "C<digits>" supplies the title; rows missing either one
# (or where either text is empty) are skipped.
papers = []

rows = soup.find_all("tr")

for row in rows:
    authors = None
    title = None

    for cell in row.find_all('td'):
        cell_id = cell.get('data-cell-id', '')
        if re.match(r'^B\d+$', cell_id):
            authors = cell.get_text(strip=True)
            continue
        if re.match(r'^C\d+$', cell_id):
            title = cell.get_text(strip=True)

    if not (authors and title):
        continue
    papers.append({"Title": title, "Authors": authors})

In [None]:
# Persist the scraped records as CSV. The folder and file name follow `year`
# (set in the fetch cell) so that re-running the extraction for another year
# does not overwrite the 2024 export; for year == 2024 the path is unchanged.
coling_csv_path = Path(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/COLING/{year}/coling_{year}_papers.csv")
# Create the per-year folder on first use; to_csv does not create directories.
coling_csv_path.parent.mkdir(parents=True, exist_ok=True)
df = pd.DataFrame(papers)
df.to_csv(coling_csv_path, index=False, encoding="utf-8")

## KDD

In [None]:

def kdd_extraction(year, url):
  """Scrape one KDD research-track page into ``KDD{year}papers.csv`` on Drive.

  Each title is taken from an ``<h5>`` heading carrying the class string used
  below; the author/affiliation text is taken from the first ``<p>`` element
  that follows that heading in the document.

  Args:
    year: Conference year; only used to build the output file name.
    url: Address of the accepted-papers page to scrape.

  Raises:
    requests.HTTPError: If the page responds with an HTTP error status.
  """
  response = requests.get(url)
  response.raise_for_status()
  soup = BeautifulSoup(response.text, 'html.parser')
  with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/KDD/KDD{year}papers.csv", "w", newline="", encoding="utf-8") as csvfile:
      writer = csv.writer(csvfile)
      writer.writerow(["Title", "Authors_Institutes"])

      for h5 in soup.find_all('h5', class_ = "wp-block-heading has-text-align-left"):
        title = h5.get_text(strip = True)
        p_tag = h5.find_next("p")
        # find_next returns None when no <p> follows the heading; keep the
        # title with an empty author field instead of crashing mid-file.
        authors_institutes = p_tag.get_text(strip=True) if p_tag is not None else ""
        writer.writerow([title, authors_institutes])
def main():
  """Scrape the KDD research-track pages for 2023, 2024 and 2025."""
  # The 2023 page lives under kdd.org; the later editions use a per-year host.
  pages = {
    2023: 'https://kdd.org/kdd2023/research-track-papers/index.html',
    2024: 'https://kdd2024.kdd.org/research-track-papers/',
    2025: 'https://kdd2025.kdd.org/research-track-papers/',
  }
  for year, url in pages.items():
    kdd_extraction(year, url)

if __name__ == "__main__":
  main()

## ACL (different website layout for each year)

In [None]:
# ACL 2023: the code reads each paper from a <p> element, taking the title
# from its <strong> child and the authors from its <em> child.
year = 2023
url = f'https://{year}.aclweb.org/program/accepted_main_conference/#long-papers'
response = requests.get(url)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ACL/ACL{year}papers.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Title", "Authors"])

    for p in soup.find_all('p'):
      title_tag = p.find("strong")
      authors_tag = p.find('em')
      # find() returns None for paragraphs that lack one of the two tags;
      # those are not paper entries, so skip them instead of raising
      # AttributeError on the first ordinary paragraph of the page.
      if title_tag is None or authors_tag is None:
        continue
      title = title_tag.get_text(strip = True)
      authors = authors_tag.get_text(strip=True)
      writer.writerow([title, authors])

# ACL 2024: papers are read from <li> elements (title in <strong>, authors in
# <em>); a missing tag contributes an empty string for that column.
year = 2024
url = f'https://{year}.aclweb.org/program/main_conference_papers/'
response = requests.get(url)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ACL/ACL{year}papers.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Title", "Authors"])

    for li in soup.find_all('li'):

        title_tag = li.find("strong")
        authors_tag = li.find("em")

        title = title_tag.get_text(strip=True) if title_tag else ""
        authors = authors_tag.get_text(strip=True) if authors_tag else ""

        # An <li> with neither a title nor authors is not a paper entry;
        # writing it would leave a blank row in the CSV (the 2025 cell removes
        # such rows afterwards, this cell has no such clean-up step).
        if not title and not authors:
            continue

        writer.writerow([title, authors])

# ACL 2025: write one row per <li> (title from <strong>, authors from <em>,
# empty string when a tag is absent), then reload the CSV and drop the rows
# in which every column is empty.
year = 2025
url = f'https://{year}.aclweb.org/program/main_papers/'
response = requests.get(url)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ACL/ACL{year}papers.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Title", "Authors"])

    for li in soup.find_all('li'):
        row = []
        for tag_name in ("strong", "em"):
            found = li.find(tag_name)
            row.append(found.get_text(strip=True) if found else "")
        writer.writerow(row)

# Empty strings are read back as missing values, so rows without any text go away here.
df = pd.read_csv(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ACL/ACL{year}papers.csv")
df.dropna(how="all", inplace=True)
df.to_csv(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ACL/ACL{year}papers.csv", index=False)

## CVPR

The CVPR conference website is dynamic: the frontend renders data from a database that the backend serves as JSON.

In [None]:
def cvpr_extraction(year):
  """Download the CVPR orals/posters JSON for ``year`` and save its ``results`` entry.

  The value under the ``results`` key is written to ``CVPR{year}.json`` in the
  CVPR folder on Drive.

  Raises:
    requests.HTTPError: If the download responds with an HTTP error status.
  """
  source = f"https://cvpr.thecvf.com/static/virtual/data/cvpr-{year}-orals-posters.json"
  reply = requests.get(source)
  reply.raise_for_status()
  payload = reply.json()
  target = f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/CVPR/CVPR{year}.json"
  with open(target, 'w') as out:
    json.dump(payload['results'], out)
def main():
  """Run cvpr_extraction for 2023, 2024 and 2025 in turn."""
  for year in (2023, 2024, 2025):
    cvpr_extraction(year)

if __name__ == "__main__":
  main()

## AAAI

AAAI websites host their database of accepted papers in the form of PDFs, which ran into structuring issues when converted to CSV. This problem has been handled by using the data published by https://papercopilot.com instead.

In [None]:
def AAAI_extraction(year):
  """Download the papercopilot AAAI paper list for ``year`` and dump it to CSV.

  The JSON document is loaded into a DataFrame and written as-is (including
  the DataFrame index column) to ``AAAI{year}.csv`` on Drive.

  Raises:
    requests.HTTPError: If the download responds with an HTTP error status.
  """
  source = f"https://raw.githubusercontent.com/papercopilot/paperlists/main/aaai/aaai{year}.json"
  reply = requests.get(source)
  reply.raise_for_status()
  table = pd.DataFrame(reply.json())
  table.to_csv(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/AAAI/AAAI{year}.csv")

def AAAI_cleaning(year):
  """Reduce ``AAAI{year}.csv`` to four renamed columns, rewriting it in place.

  Keeps ``title``, ``abstract``, ``author`` and ``aff`` (in that order) and
  renames them to ``Title``, ``Abstract``, ``Authors`` and ``Institutions``.
  """
  csv_path = f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/AAAI/AAAI{year}.csv"
  # Insertion order of this mapping is also the column order of the output.
  new_names = {'title': 'Title', 'abstract': 'Abstract', 'author': 'Authors', 'aff': 'Institutions'}
  frame = pd.read_csv(csv_path)
  frame = frame[list(new_names)].rename(columns=new_names)
  frame.to_csv(csv_path, index=False, encoding="utf-8")

def main():
  """Download and then clean the AAAI paper list for 2023, 2024 and 2025."""
  for year in (2023, 2024, 2025):
    # Cleaning rewrites the file that the extraction step has just produced.
    AAAI_extraction(year)
    AAAI_cleaning(year)

if __name__ == "__main__":
  main()

# OpenReview Script

OpenReview does not serve its database as static HTML. The website renders its content dynamically with JavaScript from a publicly available database, which can be accessed through specific API endpoint URLs. Requesting these URLs returns the data as JSON. The same approach has been scripted in Python to load and extract the JSON.

The following conferences conduct their Review process on OpenReview
- ICLR
- NeurIPS
- ICML (starting from 2025)

The modularized code for ICLR is illustrated in this notebook; the same process has been repeated for ICML (2023-2025) and NeurIPS (2023-2024).

# ICLR

## ICLR 2023

### ICLR 2023 TOP 5%

In [None]:
def ICLR2023_top5_extraction(tabs):
  """Download the ICLR 2023 "notable top 5%" listing, one JSON file per page.

  Pages hold 25 notes each and are written to the current working directory
  as ``ICLR2023_top5_page{i}.json`` with ``i`` counting from 0.

  Args:
    tabs: Number of result pages to request.

  Raises:
    requests.HTTPError: If a request responds with an HTTP error status.
  """
  year = 2023
  for i in range(tabs):
    offset = 25*(i)
    url = f'https://api.openreview.net/notes?content.venue=ICLR+2023+notable+top+5%25&details=replyCount&offset={offset}&limit=25&invitation=ICLR.cc%2F2023%2FConference%2F-%2FBlind_Submission'
    response = requests.get(url)
    response.raise_for_status()
    data = response.json()
    # The page index is part of the file name: with one shared name every
    # iteration overwrote the previous page and only the last one survived.
    with open(f"ICLR{year}_top5_page{i}.json", 'w') as file:
        json.dump(data, file)

def main():
  """Fetch the ICLR 2023 top-5% listing."""
  # The page count is read off the OpenReview website by hand; re-check it before running.
  ICLR2023_top5_extraction(5)

if __name__ == "__main__":
  main()


### ICLR 2023 Top 25%

In [None]:
def ICLR2023_top25_extraction(tabs):
  """Download the ICLR 2023 "notable top 25%" listing, one JSON file per page.

  Requests ``tabs`` pages of 25 notes each and writes page ``n`` (counting
  from 0) to ``ICLR2023_top25_page{n}.json`` in the working directory.

  Raises:
    requests.HTTPError: If a request responds with an HTTP error status.
  """
  for page, offset in enumerate(range(0, 25 * tabs, 25)):
      url = f'https://api.openreview.net/notes?content.venue=ICLR+2023+notable+top+25%25&details=replyCount&offset={offset}&limit=25&invitation=ICLR.cc%2F2023%2FConference%2F-%2FBlind_Submission'
      reply = requests.get(url)
      reply.raise_for_status()
      payload = reply.json()
      with open(f"ICLR2023_top25_page{page}.json", 'w') as out:
          json.dump(payload, out)

def main():
  """Fetch the ICLR 2023 top-25% listing."""
  # The page count is read off the OpenReview website by hand; re-check it before running.
  ICLR2023_top25_extraction(12)

if __name__ == "__main__":
  main()

### ICLR 2023 Poster

In [None]:
# ICLR 2023 posters: pages 1..49 of 25 notes each, one JSON file per page.
for page, offset in enumerate(range(0, 49 * 25, 25), start=1):
    url = f'https://api.openreview.net/notes?content.venue=ICLR+2023+poster&details=replyCount&offset={offset}&limit=25&invitation=ICLR.cc%2F2023%2FConference%2F-%2FBlind_Submission'
    reply = requests.get(url)
    reply.raise_for_status()
    payload = reply.json()
    with open(f"ICLR 2023/Poster/ICLR2023_Posters_page{page}.json", 'w') as out:
        json.dump(payload, out)

### ICLR 2023 Submitted

In [None]:
def ICLR2023_submitted_extraction(tabs):
  """Download the "Submitted to ICLR 2023" listing, one JSON file per page.

  Pages are numbered from 1 and hold 25 notes each; page ``i`` is written to
  ``ICLR 2023/Submitted/ICLR2023_Submitted_page{i}.json``.

  Args:
    tabs: Number of result pages to request.

  Raises:
    requests.HTTPError: If a request responds with an HTTP error status.
  """
  # Count pages from 1 so that offset = 25*(i-1) starts at 0. Iterating
  # range(tabs) asked for offset -25 first and never reached the last page.
  for i in range(1, tabs + 1):
      offset = 25*(i-1)
      url = f'https://api.openreview.net/notes?content.venue=Submitted+to+ICLR+2023&details=replyCount&offset={offset}&limit=25&invitation=ICLR.cc%2F2023%2FConference%2F-%2FBlind_Submission'
      response = requests.get(url)
      response.raise_for_status()
      data = response.json()
      with open(f"ICLR 2023/Submitted/ICLR2023_Submitted_page{i}.json", 'w') as file:
          json.dump(data,file)

def main():
  """Fetch 90 pages of the "Submitted to ICLR 2023" listing."""
  ICLR2023_submitted_extraction(90)

if __name__ == "__main__":
  main()

### ICLR 2023 Desk Rejected/Withdrawn Submission

In [None]:
def ICLR2023_rejected_extraction(tabs):
  """Download the ICLR 2023 withdrawn-submission listing, one JSON file per page.

  Pages are numbered from 1 and hold 25 notes each; page ``i`` is written to
  ``ICLR 2023/Withdrawn_Rejected/ICLR2023_withdrawn_page{i}.json``.

  Args:
    tabs: Number of result pages to request.

  Raises:
    requests.HTTPError: If a request responds with an HTTP error status.
  """
  # Count pages from 1 so that offset = 25*(i-1) starts at 0. Iterating
  # range(tabs) asked for offset -25 first and never reached the last page.
  for i in range(1, tabs + 1):
      offset = 25*(i-1)
      url = f'https://api.openreview.net/notes?details=replyCount%2Cinvitation%2Coriginal&offset={offset}&limit=25&invitation=ICLR.cc%2F2023%2FConference%2F-%2FWithdrawn_Submission'
      response = requests.get(url)
      response.raise_for_status()
      data = response.json()
      with open(f"ICLR 2023/Withdrawn_Rejected/ICLR2023_withdrawn_page{i}.json", 'w') as file:
          json.dump(data,file)

def main():
  """Fetch 48 pages of the ICLR 2023 withdrawn-submission listing."""
  ICLR2023_rejected_extraction(48)

if __name__ == "__main__":
  main()

## ICLR 2024

### ICLR 2024 (accepted oral)

In [None]:
# ICLR 2024 orals: pages 1..4 of 25 notes each, one JSON file per page.
for page, offset in enumerate(range(0, 4 * 25, 25), start=1):
    url = f'https://api2.openreview.net/notes?content.venue=ICLR%202024%20oral&details=replyCount%2Cpresentation%2Cwritable&domain=ICLR.cc%2F2024%2FConference&limit=25&offset={offset}'
    reply = requests.get(url)
    reply.raise_for_status()
    payload = reply.json()
    with open(f"ICLR 2024/accepted(oral)/ICLR2024_oral_page{page}.json", 'w') as out:
        json.dump(payload, out)

### ICLR 2024 (accepted spotlight)

In [None]:
# ICLR 2024 spotlights: pages 1..15 of 25 notes each, one JSON file per page.
for page, offset in enumerate(range(0, 15 * 25, 25), start=1):
    url = f'https://api2.openreview.net/notes?content.venue=ICLR%202024%20spotlight&details=replyCount%2Cpresentation%2Cwritable&domain=ICLR.cc%2F2024%2FConference&limit=25&offset={offset}'
    reply = requests.get(url)
    reply.raise_for_status()
    payload = reply.json()
    with open(f"ICLR 2024/accepted(spotlight)/ICLR2024_spotlight_page{page}.json", 'w') as out:
        json.dump(payload, out)

### ICLR 2024 (accepted poster)

In [None]:
# ICLR 2024 posters: pages 1..73 of 25 notes each, one JSON file per page.
for page, offset in enumerate(range(0, 73 * 25, 25), start=1):
    url = f'https://api2.openreview.net/notes?content.venue=ICLR%202024%20poster&details=replyCount%2Cpresentation%2Cwritable&domain=ICLR.cc%2F2024%2FConference&limit=25&offset={offset}'
    reply = requests.get(url)
    reply.raise_for_status()
    payload = reply.json()
    with open(f"ICLR 2024/accepted(poster)/ICLR2024_poster_page{page}.json", 'w') as out:
        json.dump(payload, out)

### ICLR 2024 (reject)

In [None]:
# "Submitted to ICLR 2024" listing: pages 1..138 of 25 notes each, one JSON file per page.
for page, offset in enumerate(range(0, 138 * 25, 25), start=1):
    url = f'https://api2.openreview.net/notes?content.venue=Submitted%20to%20ICLR%202024&details=replyCount%2Cpresentation%2Cwritable&domain=ICLR.cc%2F2024%2FConference&limit=25&offset={offset}'
    reply = requests.get(url)
    reply.raise_for_status()
    payload = reply.json()
    with open(f"ICLR 2024/reject/ICLR2024_reject_page{page}.json", 'w') as out:
        json.dump(payload, out)

### ICLR 2024 (withdrawn)

In [None]:
# ICLR 2024 withdrawn submissions: pages 61..67 of 25 notes each.
# NOTE(review): this starts at page 61 rather than 1 — presumably the cell was
# last used to resume an interrupted download; confirm before a fresh run.
for page, offset in enumerate(range(60 * 25, 67 * 25, 25), start=61):
    url = f'https://api2.openreview.net/notes?content.venueid=ICLR.cc%2F2024%2FConference%2FWithdrawn_Submission&details=replyCount%2Cpresentation%2Cwritable&domain=ICLR.cc%2F2024%2FConference&limit=25&offset={offset}'
    reply = requests.get(url)
    reply.raise_for_status()
    payload = reply.json()
    with open(f"ICLR 2024/withdrawn/ICLR2024_withdrawn_page{page}.json", 'w') as out:
        json.dump(payload, out)

### ICLR 2024 (Desk Rejected Submissions)

In [None]:
# ICLR 2024 desk-rejected submissions: pages 1..3 of 25 notes each, one JSON file per page.
for page, offset in enumerate(range(0, 3 * 25, 25), start=1):
    url = f'https://api2.openreview.net/notes?content.venueid=ICLR.cc%2F2024%2FConference%2FDesk_Rejected_Submission&details=replyCount%2Cpresentation%2Cwritable&domain=ICLR.cc%2F2024%2FConference&limit=25&offset={offset}'
    reply = requests.get(url)
    reply.raise_for_status()
    payload = reply.json()
    with open(f"ICLR 2024/desk_rejected/ICLR2024_desk_rejected_page{page}.json", 'w') as out:
        json.dump(payload, out)

## ICLR 2025

### ICLR 2025 (accepted oral)

In [None]:
def ICLR2025_oral_extraction(tabs):
  """Download the ICLR 2025 oral listing, one JSON file per page.

  Pages are numbered from 1 and hold 25 notes each; page ``i`` is written to
  ``ICLR2025_oral_page{i}.json`` in the ICLR 2025 ``accepted(oral)`` folder on
  Drive.

  Args:
    tabs: Number of result pages to request.

  Raises:
    requests.HTTPError: If a request responds with an HTTP error status.
  """
  # Count pages from 1 so that offset = 25*(i-1) starts at 0. Iterating
  # range(tabs) asked for offset -25 first and never reached the last page.
  for i in range(1, tabs + 1):
      offset = 25*(i-1)
      url = f'https://api2.openreview.net/notes?content.venue=ICLR%202025%20Oral&details=replyCount%2Cpresentation%2Cwritable&domain=ICLR.cc%2F2025%2FConference&limit=25&offset={offset}'
      response = requests.get(url)
      response.raise_for_status()
      data = response.json()
      with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ICLR/ICLR 2025/accepted(oral)/ICLR2025_oral_page{i}.json", 'w') as file:
          json.dump(data,file)

def main():
  """Fetch 10 pages of the ICLR 2025 oral listing."""
  ICLR2025_oral_extraction(10)

if __name__ == "__main__":
  main()

### ICLR 2025 (accepted spotlight)

In [None]:
def ICLR2025_spotlight_extraction(tabs):
  """Download the ICLR 2025 spotlight listing, one JSON file per page.

  Requests ``tabs`` pages of 25 notes each and writes page ``n`` (counting
  from 0) to ``ICLR2025_spotlight_page{n}.json`` in the ICLR 2025
  ``accepted(spotlight)`` folder on Drive.

  Raises:
    requests.HTTPError: If a request responds with an HTTP error status.
  """
  for page, offset in enumerate(range(0, 25 * tabs, 25)):
      url = f'https://api2.openreview.net/notes?content.venue=ICLR%202025%20Spotlight&details=replyCount%2Cpresentation%2Cwritable&domain=ICLR.cc%2F2025%2FConference&limit=25&offset={offset}'
      reply = requests.get(url)
      reply.raise_for_status()
      payload = reply.json()
      with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ICLR/ICLR 2025/accepted(spotlight)/ICLR2025_spotlight_page{page}.json", 'w') as out:
          json.dump(payload, out)

def main():
  """Fetch 17 pages of the ICLR 2025 spotlight listing."""
  tabs = 17
  ICLR2025_spotlight_extraction(tabs)

# The guard previously had no body, which is a syntax error and meant the
# extraction never ran; it now calls main() like every other cell.
if __name__ == "__main__":
  main()

### ICLR 2025 (reject)

In [None]:
def ICLR2025_reject_extraction(tabs):
  """Download the "Submitted to ICLR 2025" listing, one JSON file per page.

  Requests ``tabs`` pages of 25 notes each and writes page ``n`` (counting
  from 0) to ``ICLR2025_reject_page{n}.json`` in the ICLR 2025 ``reject``
  folder on Drive.

  Raises:
    requests.HTTPError: If a request responds with an HTTP error status.
  """
  for page, offset in enumerate(range(0, 25 * tabs, 25)):
      url = f'https://api2.openreview.net/notes?content.venue=Submitted%20to%20ICLR%202025&details=replyCount%2Cpresentation%2Cwritable&domain=ICLR.cc%2F2025%2FConference&limit=25&offset={offset}'
      reply = requests.get(url)
      reply.raise_for_status()
      payload = reply.json()
      with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ICLR/ICLR 2025/reject/ICLR2025_reject_page{page}.json", 'w') as out:
          json.dump(payload, out)

def main():
  """Fetch 198 pages of the "Submitted to ICLR 2025" listing."""
  ICLR2025_reject_extraction(198)

if __name__ == "__main__":
  main()

### ICLR 2025 (withdrawn submissions)

In [None]:
def ICLR2025_withdrawn_extraction(tabs):
  """Download the ICLR 2025 withdrawn-submission listing, one JSON file per page.

  Requests ``tabs`` pages of 25 notes each and writes page ``n`` (counting
  from 0) to ``ICLR2025_withdrawn_page{n}.json`` in the ICLR 2025
  ``withdrawn`` folder on Drive.

  Raises:
    requests.HTTPError: If a request responds with an HTTP error status.
  """
  for page, offset in enumerate(range(0, 25 * tabs, 25)):
      url = f'https://api2.openreview.net/notes?content.venueid=ICLR.cc%2F2025%2FConference%2FWithdrawn_Submission&details=replyCount%2Cpresentation%2Cwritable&domain=ICLR.cc%2F2025%2FConference&limit=25&offset={offset}'
      reply = requests.get(url)
      reply.raise_for_status()
      payload = reply.json()
      with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ICLR/ICLR 2025/withdrawn/ICLR2025_withdrawn_page{page}.json", 'w') as out:
          json.dump(payload, out)

def main():
  """Fetch 121 pages of the ICLR 2025 withdrawn-submission listing."""
  ICLR2025_withdrawn_extraction(121)

if __name__ == "__main__":
  main()

# Reviews

Reviews have been collected for the conferences in 2023. The review data is fetched from the OpenReview website through API endpoint URLs in a similar way. The paper ids from the previously extracted JSON files are used to collect the review data of every paper.

## ICLR 2023

In [None]:
def ICLR_reviews(category):
  """Download the discussion forum of every ICLR 2023 paper in ``category``.

  Every ``*.json`` listing page found in the category folder is read; for each
  note in its ``notes`` list the forum is requested from the OpenReview API
  and stored as ``Reviews/{category}/Reviews{id}.json``.

  Args:
    category: Name of the sub-folder of ``ICLR 2023`` holding the listing pages.

  Raises:
    requests.HTTPError: If a forum request still fails after the retries.
  """
  directory = Path(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ICLR/ICLR 2023/{category}")
  # Iterate over the files that are actually present instead of rebuilding
  # their names: the extraction cells use a different name stem per category
  # (and some count pages from 0), so a constructed name does not match.
  for page_path in sorted(directory.glob("*.json")):
    with open(page_path, 'r') as page_file:
      content = json.load(page_file)
    for note in content['notes']:
      note_id = note['id']
      url = f"https://api.openreview.net/notes?forum={note_id}&trash=true&details=replyCount%2Cwritable%2Crevisions%2Coriginal%2Coverwriting%2Cinvitation%2Ctags&limit=1000&offset=0"
      response = requests.get(url)

      # HTTP 429 = too many requests in a short time: back off and retry a
      # bounded number of times before giving up.
      for _ in range(5):
        if response.status_code != 429:
          break
        print("Rate limit hit. Waiting...")
        time.sleep(10)
        response = requests.get(url)

      response.raise_for_status()
      data = response.json()
      with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ICLR/ICLR 2023/Reviews/{category}/Reviews{note_id}.json", 'w') as review_file:
        json.dump(data, review_file)
      time.sleep(1)  # pause between forums to stay under the rate limit

def main():
  """Collect the ICLR 2023 review forums for every paper category."""
  for category in ("Poster", "Submitted", "Withdrawn_Rejected", "top25", "top_5"):
    ICLR_reviews(category)

if __name__ == "__main__":
  main()

## NeurIPS 2023

In [None]:
def NeurIPS_reviews(category):
  """Download the discussion forum of every NeurIPS 2023 paper in ``category``.

  Every ``*.json`` listing page found in the category folder is read; for each
  note in its ``notes`` list the forum is requested from the OpenReview API
  and stored as ``Reviews/{category}/Reviews{id}.json``.

  Args:
    category: Name of the sub-folder of ``NeurIPS 2023`` holding the listing
      pages, e.g. "accept(oral)", "accept(poster)", "accept(spotlight)" or
      "reject".

  Raises:
    requests.HTTPError: If a forum request still fails after the retries.
  """
  # The argument is honoured as passed. It used to be overwritten with
  # "accept(oral)", so every call processed the oral papers again.
  directory = Path(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/NeurIPS/NeurIPS 2023/{category}")
  # Iterate over the files that are actually present; the hard-coded
  # "NeurIPS2023_oral_page{i}.json" name only fits the oral category.
  for page_path in sorted(directory.glob("*.json")):
    with open(page_path, 'r') as page_file:
      content = json.load(page_file)
    for note in content['notes']:
      note_id = note['id']
      url = f"https://api2.openreview.net/notes?count=true&details=writable%2Csignatures%2Cinvitation%2Cpresentation%2Ctags&domain=NeurIPS.cc%2F2023%2FConference&forum={note_id}&limit=1000&trash=true"
      response = requests.get(url)

      # HTTP 429 = too many requests in a short time: back off and retry a
      # bounded number of times before giving up.
      for _ in range(5):
        if response.status_code != 429:
          break
        print("Rate limit hit. Waiting...")
        time.sleep(10)
        response = requests.get(url)

      response.raise_for_status()
      data = response.json()
      with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/NeurIPS/NeurIPS 2023/Reviews/{category}/Reviews{note_id}.json", 'w') as review_file:
        json.dump(data, review_file)
      time.sleep(1)  # pause between forums to stay under the rate limit

def main():
  """Collect the NeurIPS 2023 review forums for every decision category."""
  for category in ("accept(oral)", "accept(poster)", "accept(spotlight)", "reject"):
    NeurIPS_reviews(category)

if __name__ == "__main__":
  main()

# **DATA TRANSFORMATION** (json to csv)

The extracted data is converted to uniform data storage format, i.e., csv.

## CVPR

In [None]:
def cvpr_transform(year):
  """Write the CVPR papers of ``year`` to ``CVPR{year}.csv`` on Drive.

  The orals/posters JSON is downloaded and one row is written per entry of
  its ``results`` list: the paper ``name``, the list of author ``fullname``
  values and the list of author ``institution`` values. Both lists are
  written in their Python list representation.

  Raises:
    requests.HTTPError: If the download responds with an HTTP error status.
  """
  url = f"https://cvpr.thecvf.com/static/virtual/data/cvpr-{year}-orals-posters.json"
  # Download before opening the output: opening with "w" truncates the file,
  # so a failed request used to leave a header-only CSV behind.
  response = requests.get(url)
  response.raise_for_status()
  papers = response.json()['results']
  with open (f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/CVPR/CVPR{year}.csv", "w", newline = "", encoding = "utf-8") as f:
      writer=csv.writer(f)
      writer.writerow(["Title","Authors","Institutes"])
      for paper in papers:
        authors = paper['authors']
        au = [author['fullname'] for author in authors]
        ins = [author['institution'] for author in authors]
        writer.writerow([paper['name'], au, ins])
def main():
  """Run cvpr_transform for 2023, 2024 and 2025 in turn."""
  for year in (2023, 2024, 2025):
    cvpr_transform(year)

if __name__ == "__main__":
  main()

## ICLR, ICML, NeurIPS (2023-2025)

In [None]:
def _openreview_value(field):
  """Return ``field['value']`` for a ``{'value': ...}`` wrapper, else ``field`` itself."""
  if isinstance(field, dict) and 'value' in field:
    return field['value']
  return field


def transformation(conference, year, category):
  """Convert the listing pages of one conference/year/category into a CSV.

  Every ``*.json`` page in the category folder is read. For each note the
  title, the author names and one institution per author id are written as a
  row of ``{category}.csv`` in the same folder. The two lists are written in
  their Python list representation. Institutions are scraped from the
  OpenReview profile page of each author id; "Not Found" is recorded when the
  page has no ``div`` with class ``institution``.

  Args:
    conference: Conference folder name, e.g. "ICLR".
    year: Conference year.
    category: Category sub-folder holding the JSON pages.

  Raises:
    requests.HTTPError: If a profile request fails (after one retry on 429).
  """
  directory = Path(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/{conference}/{conference} {year}/{category}")
  # Use the files that are present instead of a hard-coded "ICLR..." stem,
  # which cannot match the NeurIPS/ICML folders. Sorting by (length, name)
  # keeps page10 after page9 for files sharing one stem.
  page_files = sorted(directory.glob("*.json"), key=lambda path: (len(path.name), path.name))
  # newline="" lets the csv module control line endings itself.
  with open(f"/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/{conference}/{conference} {year}/{category}/{category}.csv", 'w', newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["Title", "Authors", "Institutes"])
    for page_file in page_files:
      with open(page_file, 'r') as json_file:
        all_content = json.load(json_file)
      for note in all_content['notes']:
        # Fields are either plain values or {'value': ...} wrappers; the
        # review-transformation cells of this notebook handle both shapes too.
        title = _openreview_value(note['content']['title'])
        au = list(_openreview_value(note['content']['authors']))
        ins = []
        for author_id in _openreview_value(note['content']['authorids']):
          url = f"https://openreview.net/profile?id={author_id}"
          response = requests.get(url)

          # HTTP 429 = too many requests in a short time. This check has to
          # come before raise_for_status(), which would raise on 429 first.
          if response.status_code == 429:
              print("Rate limit hit. Waiting...")
              time.sleep(10)  # wait 10 seconds before the retry
              response = requests.get(url)

          response.raise_for_status()
          soup = BeautifulSoup(response.text, 'html.parser')
          institution = soup.find('div', class_="institution")
          if institution:
            ins.append(institution.get_text(strip = True))
          else:
            ins.append("Not Found")
        writer.writerow([title, au, ins])

        time.sleep(1)  # pause between notes to stay under the rate limit

def main():
  """Run transformation for every conference/year/category combination."""
  conferences = ("ICLR", "NeurIPS", "ICML")
  years = (2023, 2024, 2025)
  # Adjust the category names to the folders of the conference being processed.
  categories = ("Poster", "Submitted", "Withdrawn_Rejected", "top25", "top_5")
  combinations = [
    (conference, year, category)
    for conference in conferences
    for year in years
    for category in categories
  ]
  for conference, year, category in combinations:
    transformation(conference, year, category)

if __name__ == "__main__":
  main()

## Review Data Transformation

### ICLR

In [None]:

def _iclr_leading_int(field):
  """Return the integer a score field starts with, or None if there is none.

  Accepts a plain string such as "8: accept", a ``{'value': ...}`` wrapper, a
  list (first element is used) or an integer.
  """
  if isinstance(field, dict):
    field = field.get('value')
  if isinstance(field, (list, tuple)):
    field = field[0] if field else None
  if field is None or isinstance(field, bool):
    return None
  if isinstance(field, int):
    return field
  # Read all leading digits: taking only the first character turned
  # "10: strong accept" into 1.
  match = re.match(r'\s*(\d+)', str(field))
  return int(match.group(1)) if match else None


def _iclr_plain_title(title_content, default):
  """Return the title from a plain string or a ``{'value': ...}`` wrapper, else ``default``."""
  if isinstance(title_content, dict) and 'value' in title_content:
    return title_content['value']
  if isinstance(title_content, str):
    return title_content
  return default


def _iclr_forum_title(notes):
  """Return the title of the submission note (``id == forum``), else of the first note."""
  for note in notes:
    if 'id' in note and 'forum' in note and note['id'] == note['forum'] \
        and 'content' in note and 'title' in note['content']:
      return _iclr_plain_title(note['content']['title'], "Unknown Title")
  if notes and 'content' in notes[0] and 'title' in notes[0]['content']:
    return _iclr_plain_title(notes[0]['content']['title'], "Unknown Title")
  return "Unknown Title"


def iclr_review_transform(category, reviews_root=None):
  """Summarise the ICLR 2023 review forums of ``category`` into one CSV.

  For every ``*.json`` forum file in ``{reviews_root}/{category}`` one row is
  written to ``A_{category}_Reviews.csv`` in the same folder:

  * Title - from the submission note, "Unknown Title" if none is found.
  * Rounds of Discussion - number of adjacent note pairs in which a note whose
    first signature contains "Reviewer" directly follows one whose first
    signature contains "Authors".
  * Average Rating - mean of the leading integers of the ``recommendation``
    fields (0 when there are none).
  * Average Confidence - mean of the leading integers of the ``confidence``
    fields (0 when there are none).

  Args:
    category: Sub-folder of the reviews folder to process.
    reviews_root: Folder holding the per-category review folders. Defaults to
      the ICLR 2023 ``Reviews`` folder on Drive.
  """
  if reviews_root is None:
    reviews_root = "/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/ICLR/ICLR 2023/Reviews"
  directory = Path(reviews_root) / category
  json_files = sorted(directory.glob("*.json"))
  with open(directory / f"A_{category}_Reviews.csv", 'w', newline='', encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["Title", "Rounds of Discussion", "Average Rating", "Average Confidence"])
    for json_file in json_files:
      with open(json_file, 'r', encoding="utf-8") as f_json:
        all_content = json.load(f_json)

      # A file without a usable 'notes' list yields an all-default row
      # instead of a KeyError.
      notes = all_content.get('notes') if isinstance(all_content, dict) else None
      notes = notes or []

      title = _iclr_forum_title(notes)

      rounds = 0
      for previous_note, current_note in zip(notes, notes[1:]):
        current_sig = current_note.get('signatures')
        previous_sig = previous_note.get('signatures')
        # Only index [0] when both signature lists exist and are non-empty.
        if current_sig and previous_sig \
            and "Reviewer" in current_sig[0] and "Authors" in previous_sig[0]:
          rounds += 1

      conf_values = []
      rating_values = []
      for note in notes:
        content = note.get('content') or {}
        confidence = _iclr_leading_int(content.get('confidence'))
        if confidence is not None:
          conf_values.append(confidence)
        rating = _iclr_leading_int(content.get('recommendation'))
        if rating is not None:
          rating_values.append(rating)

      avg_conf = sum(conf_values) / len(conf_values) if conf_values else 0
      avg_rating = sum(rating_values) / len(rating_values) if rating_values else 0

      writer.writerow([title, rounds, avg_rating, avg_conf])

def main():
  """Build the review summary CSV for every ICLR 2023 paper category."""
  for category in ("Poster", "Submitted", "Withdrawn_Rejected", "top25", "top_5"):
    iclr_review_transform(category)

if __name__ == "__main__":
  main()

### NeurIPS

In [None]:
def _neurips_leading_int(field):
  """Return the integer a score field starts with, or None if there is none.

  Accepts a ``{'value': ...}`` wrapper, a list (first element is used), a
  plain string such as "7: Accept" or an integer.
  """
  if isinstance(field, dict):
    field = field.get('value')
  if isinstance(field, (list, tuple)):
    field = field[0] if field else None
  if field is None or isinstance(field, bool):
    return None
  if isinstance(field, int):
    return field
  # Read all leading digits: taking only the first character turned a
  # score of "10: ..." into 1.
  match = re.match(r'\s*(\d+)', str(field))
  return int(match.group(1)) if match else None


def _neurips_plain_title(title_content, default):
  """Return the title from a plain string or a ``{'value': ...}`` wrapper, else ``default``."""
  if isinstance(title_content, dict) and 'value' in title_content:
    return title_content['value']
  if isinstance(title_content, str):
    return title_content
  return default


def _neurips_forum_title(notes):
  """Return the title of the submission note (``id == forum``), else of the first note."""
  for note in notes:
    if 'id' in note and 'forum' in note and note['id'] == note['forum'] \
        and 'content' in note and 'title' in note['content']:
      return _neurips_plain_title(note['content']['title'], "Unknown Title")
  if notes and 'content' in notes[0] and 'title' in notes[0]['content']:
    return _neurips_plain_title(notes[0]['content']['title'], "Unknown Title")
  return "Unknown Title"


def cvpr_review_transform(category, reviews_root=None):
  """Summarise the NeurIPS 2023 review forums of ``category`` into one CSV.

  Despite its name this function reads the NeurIPS 2023 review folders; the
  name is kept because the driver below calls it.

  For every ``*.json`` forum file in ``{reviews_root}/{category}`` one row is
  written to ``A_{category}_Reviews.csv`` in the same folder:

  * Title - from the submission note, "Unknown Title" if none is found.
  * Rounds of Discussion - number of adjacent note pairs in which a note whose
    first signature contains "Reviewer" directly follows one whose first
    signature contains "Authors".
  * Average Rating - mean of the leading integers of the ``rating`` fields
    (0 when there are none).
  * Average Confidence - mean of the leading integers of the ``confidence``
    fields (0 when there are none).

  Args:
    category: Sub-folder of the reviews folder to process.
    reviews_root: Folder holding the per-category review folders. Defaults to
      the NeurIPS 2023 ``Reviews`` folder on Drive.
  """
  if reviews_root is None:
    reviews_root = "/content/drive/MyDrive/Project Progress/Generated Dataset/Conferences/NeurIPS/NeurIPS 2023/Reviews"
  directory = Path(reviews_root) / category
  json_files = sorted(directory.glob("*.json"))
  with open(directory / f"A_{category}_Reviews.csv", 'w', newline='', encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["Title", "Rounds of Discussion", "Average Rating", "Average Confidence"])
    for json_file in json_files:
      with open(json_file, 'r', encoding="utf-8") as f_json:
        all_content = json.load(f_json)

      # A file without a usable 'notes' list yields an all-default row
      # instead of a KeyError.
      notes = all_content.get('notes') if isinstance(all_content, dict) else None
      notes = notes or []

      title = _neurips_forum_title(notes)

      rounds = 0
      for previous_note, current_note in zip(notes, notes[1:]):
        current_sig = current_note.get('signatures')
        previous_sig = previous_note.get('signatures')
        # Only index [0] when both signature lists exist and are non-empty.
        if current_sig and previous_sig \
            and "Reviewer" in current_sig[0] and "Authors" in previous_sig[0]:
          rounds += 1

      conf_values = []
      rating_values = []
      for note in notes:
        content = note.get('content') or {}
        confidence = _neurips_leading_int(content.get('confidence'))
        if confidence is not None:
          conf_values.append(confidence)
        rating = _neurips_leading_int(content.get('rating'))
        if rating is not None:
          rating_values.append(rating)

      avg_conf = sum(conf_values) / len(conf_values) if conf_values else 0
      avg_rating = sum(rating_values) / len(rating_values) if rating_values else 0

      writer.writerow([title, rounds, avg_rating, avg_conf])

def main():
  """Build the review summary CSV for every NeurIPS 2023 decision category."""
  for category in ("accept(oral)", "accept(poster)", "accept(spotlight)", "reject"):
    cvpr_review_transform(category)

if __name__ == "__main__":
  main()