In [21]:
import os
import pandas as pd
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup

In [23]:
def extract_text_from_epub(file_path):
    """Return the plain text of every document item in an EPUB file.

    Parameters
    ----------
    file_path : str
        Path of the EPUB file; passed unchanged to ``epub.read_epub``.

    Returns
    -------
    str
        The text of each item whose type is ``ebooklib.ITEM_DOCUMENT``, in the
        order yielded by ``book.get_items()``, joined by single spaces.
        An empty string when the book has no document items.
    """
    book = epub.read_epub(file_path)

    fragments = []
    for item in book.get_items():
        if item.get_type() != ebooklib.ITEM_DOCUMENT:
            continue  # only document items contribute text
        soup = BeautifulSoup(item.get_body_content(), 'html.parser')
        fragments.append(soup.get_text(separator=' '))

    # Join with a space: get_text() only separates text nodes *inside* one
    # document, so plain concatenation could fuse the last word of a document
    # with the first word of the next and distort downstream word counts.
    return ' '.join(fragments)

def get_category(file_path, base_directory):
    """Return the first path component of ``file_path`` relative to ``base_directory``.

    Parameters
    ----------
    file_path : str
        Path of the file to classify.
    base_directory : str
        Directory the relative path is computed against.

    Returns
    -------
    str
        The first component of the relative path when it has more than one
        component; ``"Unknown"`` when the relative path is a single component
        (the file sits directly in ``base_directory``).
    """
    top_level, *deeper = os.path.relpath(file_path, base_directory).split(os.sep)
    return top_level if deeper else "Unknown"

def process_epubs(directory):
    """Walk ``directory`` and build one DataFrame row per EPUB file found.

    Parameters
    ----------
    directory : str
        Root folder to search recursively with ``os.walk``.

    Returns
    -------
    pandas.DataFrame
        Columns ``title`` (file name without the ``.epub`` suffix), ``text``
        (result of ``extract_text_from_epub``), ``word_count`` (number of
        whitespace-separated tokens in ``text``) and ``category`` (result of
        ``get_category``). Rows follow sorted traversal order. The four
        columns are present even when no EPUB is found.
    """
    epub_suffix = ".epub"
    columns = ["title", "text", "word_count", "category"]

    records = []
    for root, dirs, files in os.walk(directory):
        # Sorting dirs in place steers os.walk's descent, and sorting files
        # fixes the order inside each folder, so row order no longer depends
        # on the filesystem.
        dirs.sort()
        for file in sorted(files):
            # Case-insensitive match so files named '*.EPUB' are not skipped.
            if not file.lower().endswith(epub_suffix):
                continue
            file_path = os.path.join(root, file)
            text = extract_text_from_epub(file_path)
            records.append({
                "title": file[:-len(epub_suffix)],
                "text": text,
                "word_count": len(text.split()),
                "category": get_category(file_path, directory),
            })

    # Explicit columns keep the schema stable when `records` is empty.
    return pd.DataFrame(records, columns=columns)

In [24]:
# Path to the EPUB folder inside 'dataset'
dataset_directory = 'dataset/epub'

# Process the EPUBs and build the DataFrame
df = process_epubs(dataset_directory)

# Display the DataFrame
df

  for root_file in tree.findall('//xmlns:rootfile[@media-type]', namespaces={'xmlns': NAMESPACES['CONTAINERNS']}):


Unnamed: 0,title,text,word_count,category
0,Astronomaycienciageneralcoleccindetrabajoscien...,\n \n \n \n \n\n \n \n \n\n \n \n Índice \n ...,187635,Ciencias
1,Elmecanismodelareaccinqumicadiscursoledoenlaso...,\n \n \n \n \n\n \n \n \n\n \n \n Índice \n ...,9797,Ciencias
2,ElmundodelosvegetalesTextoimpresofenmenosymist...,\n \n \n \n \n\n \n \n \n\n \n \n Índice \n ...,48946,Ciencias
3,Hongoscomestiblesyvenenosos,\n \n \n \n \n\n \n \n \n\n \n \n Índice \n ...,27725,Ciencias
4,LateoriadarwinianaylaCreacinllamadaindependiente,\n \n \n \n \n\n \n \n \n\n \n \n Índice \n ...,83198,Ciencias
5,Zoologapintorescadragoneshidrasbasiliscosyserp...,\n \n \n \n \n\n \n \n \n\n \n \n Índice \n ...,39767,Ciencias
6,ApuntesdeHistoriaAmericanaTextoimpreso,\n \n \n \n \n\n \n \n \n\n \n \n Índice \n ...,105691,Historia
7,CompendiodehistoriadeAmricaTextoimpreso,\n \n \n \n \n\n \n \n \n\n \n \n Índice \n ...,371901,Historia
8,CompendiodelaHistoriadeEspaaTextoimpreso,\n \n \n \n \n\n \n \n \n\n \n \n Índice \n ...,47853,Historia
9,EpisodioshistricosdeAmricadescubrimientoconqui...,\n \n \n \n \n\n \n \n \n\n \n \n Índice \n ...,41814,Historia


In [19]:
# Add a 'source' column holding the constant "epub" on every row (modifies df in place),
# then display the updated frame.
df['source']="epub"
df

Unnamed: 0,title,text,word_count,category,source
0,Astronomaycienciageneralcoleccindetrabajoscien...,\n \n \n \n \n\n \n \n \n\n \n \n Índice \n ...,187635,Ciencias,epub
1,Elmecanismodelareaccinqumicadiscursoledoenlaso...,\n \n \n \n \n\n \n \n \n\n \n \n Índice \n ...,9797,Ciencias,epub
2,ElmundodelosvegetalesTextoimpresofenmenosymist...,\n \n \n \n \n\n \n \n \n\n \n \n Índice \n ...,48946,Ciencias,epub
3,Hongoscomestiblesyvenenosos,\n \n \n \n \n\n \n \n \n\n \n \n Índice \n ...,27725,Ciencias,epub
4,LateoriadarwinianaylaCreacinllamadaindependiente,\n \n \n \n \n\n \n \n \n\n \n \n Índice \n ...,83198,Ciencias,epub
5,Zoologapintorescadragoneshidrasbasiliscosyserp...,\n \n \n \n \n\n \n \n \n\n \n \n Índice \n ...,39767,Ciencias,epub
6,ApuntesdeHistoriaAmericanaTextoimpreso,\n \n \n \n \n\n \n \n \n\n \n \n Índice \n ...,105691,Historia,epub
7,CompendiodehistoriadeAmricaTextoimpreso,\n \n \n \n \n\n \n \n \n\n \n \n Índice \n ...,371901,Historia,epub
8,CompendiodelaHistoriadeEspaaTextoimpreso,\n \n \n \n \n\n \n \n \n\n \n \n Índice \n ...,47853,Historia,epub
9,EpisodioshistricosdeAmricadescubrimientoconqui...,\n \n \n \n \n\n \n \n \n\n \n \n Índice \n ...,41814,Historia,epub


In [20]:
# Save the DataFrame to CSV; index=False leaves the row index out of the file.
# NOTE(review): the original comment called these texts "filtered", but no
# filtering step is visible in this part of the notebook — confirm.
df.to_csv('dataset/df_epub.csv', index=False)