**Loading Libraries**


In [None]:
# Load Libraries
from docling.document_converter import DocumentConverter
# Single docling converter instance, reused for every file converted later in this notebook
converter = DocumentConverter()

import os
import pandas as pd
import sqlite3
import datetime

# The database file lives under doc-db/; create that folder first so the
# connection below has somewhere to put the file.
db_dir = 'doc-db'
os.makedirs(db_dir, exist_ok=True)

# Notebook-wide SQLite connection used by all later cells.
doc_db = sqlite3.connect('doc-db/document-db.db')

In [None]:
# One-time setup: make sure the ARXIVDOCS table exists.
# Columns: auto-incrementing DOCID key, plus the document name, its extracted
# text, and the processing time, all stored as non-null TEXT.
create_table_sql = '''
CREATE TABLE  IF NOT EXISTS ARXIVDOCS (
    DOCID INTEGER PRIMARY KEY AUTOINCREMENT,
    DOCNAME TEXT NOT NULL,
    EXTRACTEDTEXT TEXT NOT NULL,
    PROCESSTIME TEXT NOT NULL
)
'''

# `cursor` is kept at notebook scope because later cells reuse it.
cursor = doc_db.cursor()
cursor.execute(create_table_sql)

def sql_update(DOCNAME, EXTRACTEDTEXT, PROCESSTIME):
    """Insert one processed document into ARXIVDOCS and commit it.

    Args:
        DOCNAME: Name (path) of the source document.
        EXTRACTEDTEXT: Text extracted from the document.
        PROCESSTIME: When the document was processed. May be a
            ``datetime.datetime`` or an already formatted string.

    Raises:
        sqlite3.Error: If the INSERT fails; the transaction is rolled back
            before the exception propagates.
    """
    # sqlite3's implicit datetime adapter is deprecated since Python 3.12.
    # Convert explicitly, using the same "YYYY-MM-DD HH:MM:SS.ffffff" layout
    # that adapter produced, so new rows match previously stored ones.
    if isinstance(PROCESSTIME, datetime.datetime):
        PROCESSTIME = PROCESSTIME.isoformat(" ")

    # The connection context manager commits on success and rolls back if the
    # statement raises, so a failed write never leaves a transaction open.
    with doc_db:
        doc_db.execute(
            '''
            INSERT INTO ARXIVDOCS (DOCNAME, EXTRACTEDTEXT, PROCESSTIME)
            VALUES (?, ?, ?)
            ''',
            (DOCNAME, EXTRACTEDTEXT, PROCESSTIME),
        )

In [None]:
# Collect the papers to process as "arxiv-papers/<name>" paths.
# os.listdir returns entries in arbitrary order and also lists sub-directories,
# so sort for a reproducible processing order and keep regular files only.
files = [
    "arxiv-papers/" + name
    for name in sorted(os.listdir("arxiv-papers"))
    if os.path.isfile("arxiv-papers/" + name)
]
files

In [None]:
# Convert each listed paper to Markdown and store it with a timestamp.
for file in files:
    print(file)
    try:
        result = converter.convert(file)
        markdown = result.document.export_to_markdown()
        processed_at = datetime.datetime.now()
        sql_update(file, markdown, processed_at)
    except Exception as e:
        # Best-effort batch: report the failure and move on to the next file.
        print(e)

In [None]:
# Remove duplicate documents: rows are grouped by DOCNAME and only the row
# with the smallest DOCID in each group (the earliest insert) is kept.
dedupe_sql = '''
DELETE FROM ARXIVDOCS
WHERE DOCID NOT IN (
    SELECT MIN(DOCID)
    FROM ARXIVDOCS
    GROUP BY DOCNAME
);
'''
cursor.execute(dedupe_sql)

# Persist the deletion.
doc_db.commit()



In [None]:

# Load the whole ARXIVDOCS table into a DataFrame for inspection.
cursor.execute("select * from ARXIVDOCS")
# The first item of each cursor.description entry is the column name.
columns = [column_info[0] for column_info in cursor.description]
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=columns)
df