In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import os
from pdf2image import convert_from_path
import pytesseract
import re
import sqlite3

In [12]:
spark = SparkSession.builder.appName("text extract from pdf").getOrCreate()

In [13]:
def extract_text_ocr(pdf_path) -> str:
    images = convert_from_path(pdf_path)
    text = "\n".join([pytesseract.image_to_string(img) for img in images])
    return text

In [14]:
def clean_text(text) -> str:
    text = re.sub(r'[^A-Za-z0-9.,!?\'\"\n ]+', ' ', text)
    text = re.sub(r'\n{3,}', '\n\n', text)  # Reduce excessive newlines but keep paragraph breaks
    text = re.sub(r' +', ' ', text).strip()  # Normalize spaces
    return text

In [15]:
manga_dir = "./data/stored_manga/pdfs"
manga_names = [d for d in os.listdir(manga_dir) if os.path.isdir(os.path.join(manga_dir, d))]

In [16]:
manga_names

['solo_leveling']

In [17]:
pdf_path = os.path.join(manga_dir, 'solo_leveling')
pdf_files = os.listdir(pdf_path)
pdf_files = [f for f in pdf_files if f.endswith('.pdf')]
sorted_files = sorted(pdf_files, key=lambda x: int(x.split('_')[1].split('.')[0]))[0:10]

In [18]:
sorted_files

['chapter_1.pdf',
 'chapter_2.pdf',
 'chapter_3.pdf',
 'chapter_4.pdf',
 'chapter_5.pdf',
 'chapter_6.pdf',
 'chapter_7.pdf',
 'chapter_8.pdf',
 'chapter_9.pdf',
 'chapter_10.pdf']

In [19]:
file_path = []

for pdf_file in sorted_files:
    file_path.append(os.path.join(pdf_path, pdf_file))

In [20]:
file_path

['./data/stored_manga/pdfs/solo_leveling/chapter_1.pdf',
 './data/stored_manga/pdfs/solo_leveling/chapter_2.pdf',
 './data/stored_manga/pdfs/solo_leveling/chapter_3.pdf',
 './data/stored_manga/pdfs/solo_leveling/chapter_4.pdf',
 './data/stored_manga/pdfs/solo_leveling/chapter_5.pdf',
 './data/stored_manga/pdfs/solo_leveling/chapter_6.pdf',
 './data/stored_manga/pdfs/solo_leveling/chapter_7.pdf',
 './data/stored_manga/pdfs/solo_leveling/chapter_8.pdf',
 './data/stored_manga/pdfs/solo_leveling/chapter_9.pdf',
 './data/stored_manga/pdfs/solo_leveling/chapter_10.pdf']

In [21]:
df = spark.createDataFrame([(path,) for path in file_path], ["file_path"])

In [22]:
df.show(5,truncate=False)

                                                                                

+----------------------------------------------------+
|file_path                                           |
+----------------------------------------------------+
|./data/stored_manga/pdfs/solo_leveling/chapter_1.pdf|
|./data/stored_manga/pdfs/solo_leveling/chapter_2.pdf|
|./data/stored_manga/pdfs/solo_leveling/chapter_3.pdf|
|./data/stored_manga/pdfs/solo_leveling/chapter_4.pdf|
|./data/stored_manga/pdfs/solo_leveling/chapter_5.pdf|
+----------------------------------------------------+
only showing top 5 rows



In [32]:
%%time
ocr_udf = udf(extract_text_ocr, StringType())
clean_udf = udf(clean_text, StringType())

df_with_text = df.withColumn("extracted_text", clean_udf(ocr_udf(df["file_path"])))

df_with_text.show(5)



+--------------------+--------------------+
|           file_path|      extracted_text|
+--------------------+--------------------+
|./data/stored_man...|MY NAME\nIS SUNG\...|
|./data/stored_man...|ALL CLEAR!\nLET S...|
|./data/stored_man...|Merakiscans.com\n...|
|./data/stored_man...|Merakiscans.com\n...|
|./data/stored_man...|wa\na es at 7 AS\...|
+--------------------+--------------------+
only showing top 5 rows

CPU times: user 18.5 ms, sys: 33.7 ms, total: 52.2 ms
Wall time: 2min 2s


                                                                                

In [33]:
%%time
df = df_with_text.collect()



CPU times: user 23.6 ms, sys: 156 ms, total: 180 ms
Wall time: 2min 30s


                                                                                

In [34]:
# Connect to database
conn = sqlite3.connect("manga_data.db")
cursor = conn.cursor()

cursor.execute("PRAGMA table_info(manga)")
columns = [col[1] for col in cursor.fetchall()]
if "extracted_text" not in columns:
    cursor.execute("ALTER TABLE manga ADD COLUMN extracted_text TEXT;")

In [35]:
columns

['id',
 'manga_name',
 'chapter',
 'chapter_url',
 'image_url',
 'created_at',
 'extracted_text']

In [42]:
for row in df:
    file_path = row['file_path']
    extracted_text = row['extracted_text']
    
    manga_name = file_path.split('/')[-2].replace("_"," ") 
    print(manga_name)
    exit(0)
    chapter = int(file_path.split('/')[-1].split('_')[1].split('.')[0])  
    # Update the database
    cursor.execute("""
        UPDATE manga
        SET extracted_text = ?
        WHERE manga_name = ? AND chapter = ?;
    """, (extracted_text, manga_name, chapter))

conn.commit()


solo leveling
solo leveling
solo leveling
solo leveling
solo leveling
solo leveling
solo leveling
solo leveling
solo leveling
solo leveling
