# In [None]:
import time
import re
from urllib.request import urlopen
from urllib.parse import urlparse
from pathlib import Path
import tempfile
import logging
import ssl

# ---- HTML parsing ----
from bs4 import BeautifulSoup

# ---- PDF parsing ----
from pdfminer.high_level import extract_text
from pdfminer.pdfpage import PDFPage
import requests
# Quiet down pdfminer warnings (e.g., gray stroke color issues)
logging.getLogger("pdfminer").setLevel(logging.ERROR)

# ---- NLP & Visualization ----
import nltk
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as vis

# Make sure the NLTK tokenizer models and stopword corpus are usable; otherwise
# switch to the dependency-free fallbacks defined below (e.g., offline / SSL issues).
def _nltk_download(package: str) -> bool:
    """Best-effort download of one NLTK package; return True only on success."""
    try:
        # nltk.download() reports most failures by returning False, not by raising.
        return bool(nltk.download(package, quiet=True))
    except Exception:
        return False


def _nltk_tokenizers_usable() -> bool:
    """Return True if NLTK's sentence and word tokenizers run without missing data."""
    try:
        # Exercising the tokenizers is version-agnostic: it succeeds only if
        # whichever model this NLTK release loads is actually installed.
        sent_tokenize("One. Two.")
        word_tokenize("one two")
    except (LookupError, OSError):
        return False
    return True


def _nltk_stopwords_usable() -> bool:
    """Return True if the NLTK stopwords corpus can be located."""
    try:
        nltk.data.find("corpora/stopwords")
    except LookupError:
        return False
    return True


def _prepare_nltk() -> bool:
    """Ensure the NLTK resources used by this script; return True if all are usable."""
    tokenizers_ok = _nltk_tokenizers_usable()
    if not tokenizers_ok:
        # Older NLTK releases load "punkt"; newer ones load "punkt_tab".
        for package in ("punkt", "punkt_tab"):
            if _nltk_download(package) and _nltk_tokenizers_usable():
                tokenizers_ok = True
                break

    stopwords_ok = _nltk_stopwords_usable() or (
        _nltk_download("stopwords") and _nltk_stopwords_usable()
    )
    return tokenizers_ok and stopwords_ok


USE_NLTK = _prepare_nltk()

# The fallbacks are defined unconditionally: code that discovers at runtime that
# NLTK data is unusable can switch to them even when USE_NLTK started out True.

# Minimal built-in stopword set as a fallback
FALLBACK_STOPWORDS = {
    "the","and","to","of","in","a","is","it","that","for","on","as","with","this","by","an","are","be",
    "or","from","at","was","but","not","have","has","had","were","which","their","its","they","we","you",
    "your","our","can","will","would","should","could","about","into","over","than","so","no","yes","if",
    "when","while","what","who","whom","where","why","how","all","any","each","few","more","most","other",
    "some","such","only","own","same","both","very","s","t","just","don","now"
}

# Simple regex tokenizers as fallback (no NLTK corpora required)
_SENTENCE_BREAK = re.compile(r"(?<=[.!?])\s+")
_WORD_PATTERN = re.compile(r"[A-Za-z']+")


def _simple_sent_tokenize(text: str):
    """Split text into sentences at whitespace that follows '.', '!' or '?'."""
    return [s.strip() for s in _SENTENCE_BREAK.split(text) if s.strip()]


def _simple_word_tokenize(text: str):
    """Return runs of ASCII letters and apostrophes found in text."""
    return _WORD_PATTERN.findall(text)

# ---------------------------- USER CONFIG ----------------------------
# Address of the e-book to analyse; a path ending in ".pdf" selects the PDF
# loader, anything else is fetched and parsed as HTML.
ebook_url: str = "https://clubphysical.co.nz/wp-content/uploads/2025/06/How-to-build-your-body-E-Book-updated.pdf.pdf"

# Feature switches for the two visualizations run by the driver code.
RUN_FREQ_VIS: bool = True    # show frequency distribution plot
MAKE_CLOUDS: bool = True     # generate a wordcloud visualization

# Minimum number of seconds between successive word clouds.
CLOUD_INTERVAL: float = 10.0
# --------------------------------------------------------------------

class EBOOK_web_scraper:
    """Download an e-book (PDF or HTML), tokenize its text and visualize word usage.

    Intended call order: ``accessData`` -> ``Sentence_tokenizer`` /
    ``Word_tokenizer`` -> ``getRid_meaningless_xters`` ->
    ``Word_Distr_visualizer`` / ``decode_message``.
    """

    def __init__(self):
        # Whitespace-normalized text of the loaded document.
        self.cleaner_Data = ""
        # Lower-cased sentences produced by Sentence_tokenizer().
        self._sentences = []
        # Raw tokens produced by Word_tokenizer().
        self.all_words = []
        # Lower-cased alphabetic tokens with stopwords removed.
        self.meaningful_words = []
        # time.time() stamp of the last rendered word cloud (0.0 = never).
        self._last_cloud_time = 0.0
        self._is_pdf = False
        # Path of the downloaded PDF while it is being processed, else None.
        self._tmp_pdf_path = None
        self._use_nltk = USE_NLTK
        if self._use_nltk:
            try:
                self._stopwords = set(stopwords.words("english"))
            except Exception:
                # Corpus unusable: switch this instance to the regex fallbacks.
                self._use_nltk = False
                self._stopwords = FALLBACK_STOPWORDS
        else:
            self._stopwords = FALLBACK_STOPWORDS

    def _is_pdf_url(self, url: str) -> bool:
        """Return True if the URL's path (ignoring query/fragment) ends in ``.pdf``."""
        return urlparse(url).path.lower().endswith(".pdf")

    def _discard_tmp_pdf(self) -> None:
        """Delete the downloaded temp PDF, if any, and forget its path."""
        if self._tmp_pdf_path is None:
            return
        try:
            self._tmp_pdf_path.unlink(missing_ok=True)
        except OSError as e:
            # Keep the path so a later cleanup attempt can retry.
            print(f"[WARN] could not remove temp PDF {self._tmp_pdf_path}: {e}")
        else:
            self._tmp_pdf_path = None

    def _extract_pdf_text(self, pdf_path) -> str:
        """Return the text of the PDF at ``pdf_path``: pdfminer first, pypdf as fallback."""
        try:
            text = extract_text(str(pdf_path)) or ""
        except Exception as e:
            print(f"[WARN] pdfminer failed: {e}")
            text = ""
        if text.strip():
            return text

        # pdfminer produced nothing usable; try pypdf if it is installed.
        try:
            from pypdf import PdfReader
            reader = PdfReader(str(pdf_path))
            pages = []
            for page in reader.pages:
                try:
                    pages.append(page.extract_text() or "")
                except Exception:
                    # One unreadable page should not discard the others.
                    pages.append("")
            return " ".join(pages)
        except Exception as e:
            print(f"[WARN] pypdf fallback failed: {e}")
            return text

    def _load_pdf_text(self, url: str) -> str:
        """Download the PDF at ``url`` and return its whitespace-normalized text."""
        response = requests.get(url, timeout=60)
        # Fail loudly on 4xx/5xx instead of parsing an error page as a PDF.
        response.raise_for_status()

        # A previous call may have left a file behind if its cleanup failed.
        self._discard_tmp_pdf()
        try:
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
                # Record the path first so cleanup also covers a failed write.
                self._tmp_pdf_path = Path(tmp.name)
                tmp.write(response.content)
            text = self._extract_pdf_text(self._tmp_pdf_path)
        finally:
            # The file is only needed during extraction; never leave it behind.
            self._discard_tmp_pdf()

        return re.sub(r"\s+", " ", text.strip())

    def accessData(self, url: str) -> None:
        """
        Load the ebook from URL and store its cleaned text in self.cleaner_Data.

        - If the URL path ends in ``.pdf``: download the file, extract the text of
          the whole document (pdfminer, then pypdf if that yields nothing) and
          collapse all whitespace runs to single spaces.
        - Otherwise: fetch the page as HTML, strip tags, and normalize whitespace.

        Prints the character count and short previews of the text.

        Raises:
            requests.HTTPError: if the PDF download returns an HTTP error status.
            Other network errors from ``requests`` / ``urllib`` propagate unchanged.
        """
        self._is_pdf = self._is_pdf_url(url)

        if self._is_pdf:
            # ---- PDF path ----
            joined = self._load_pdf_text(url)
            print('Character Length = ', len(joined))
            print('\n DISPLAY SOME TEXT: \n', joined[:100])
            print('\n Cleaner Text: ', joined[:200])
            self.cleaner_Data = joined
        else:
            # ---- HTML path ----
            # Close the connection deterministically and bound the wait time.
            with urlopen(url, timeout=60) as response:
                html_xters = response.read()
            html_Data = html_xters.decode("utf-8", "ignore")
            print('Character Length = ', len(html_Data))
            print('\n DISPLAY SOME TEXT: \n', html_Data[:100])

            cleaner_Data = BeautifulSoup(html_Data, features="lxml").get_text()
            cleaner_Data = ' '.join(cleaner_Data.split())

            print('\n Cleaner Text: ', cleaner_Data[:200])
            self.cleaner_Data = cleaner_Data

    def Sentence_tokenizer(self) -> None:
        """
        Tokenize self.cleaner_Data into sentences, lowercase them, and remove
        stray newlines. Prints the count and the first ten sentences; the result
        is stored in self._sentences.
        """
        if self._use_nltk:
            raw_sentences = sent_tokenize(self.cleaner_Data)
        else:
            raw_sentences = _simple_sent_tokenize(self.cleaner_Data)
        sentences = [s.replace('\n', '').lower() for s in raw_sentences]
        print('\n Number of Sentences = ', len(sentences))
        print('\n Some Sentences: \n', sentences[:10])
        self._sentences = sentences

    def Word_tokenizer(self) -> None:
        """
        Tokenize self.cleaner_Data into words, print some samples, and store the
        tokens in self.all_words.
        """
        if self._use_nltk:
            all_words = word_tokenize(self.cleaner_Data)
        else:
            all_words = _simple_word_tokenize(self.cleaner_Data)
        print('\n Number of Words = ', len(all_words))
        print('\n Some Words: \n', all_words[:10])
        self.all_words = all_words

    def getRid_meaningless_xters(self) -> None:
        """
        Keep only alphabetic tokens from self.all_words, lowercase them, and drop
        stopwords. Prints the first 20 survivors and stores the list in
        self.meaningful_words.
        """
        stop_set = self._stopwords
        lowered = (wd.lower() for wd in self.all_words if wd.isalpha())
        kept = [wd for wd in lowered if wd not in stop_set]

        print('\n', kept[:20])
        self.meaningful_words = kept

    def Word_Distr_visualizer(self) -> None:
        """
        Print the 20 most common meaningful words and plot them with NLTK's FreqDist.
        """
        word_freq = FreqDist(self.meaningful_words)
        print('\n', word_freq.most_common(20))
        word_freq.plot(20)

    def decode_message(self) -> None:
        """
        Create a word cloud from the meaningful words.

        Does nothing when there are no meaningful words, or when the previous
        cloud was rendered less than CLOUD_INTERVAL seconds ago.
        """
        # Check for content first so an empty call does not consume the
        # throttle window and suppress the next real request.
        if not self.meaningful_words:
            return

        now = time.time()
        if now - self._last_cloud_time < CLOUD_INTERVAL:
            # respect interval; silently skip if too soon
            return
        self._last_cloud_time = now

        decision_keywords = WordCloud().generate(" ".join(self.meaningful_words))
        vis.figure(figsize=(14, 14))
        vis.imshow(decision_keywords)
        vis.axis("off")

# ---------------------------- Drive the class ----------------------------

# Run the full pipeline on the configured e-book: load, tokenize, filter, plot.
handle = EBOOK_web_scraper()

try:
    print('\n SCRAPING EBOOK')
    handle.accessData(ebook_url)

    # Tokenize Sentences
    print('\n UNCLEAN TOKENIZED SENTENCES')
    handle.Sentence_tokenizer()

    # Tokenize Words
    print('\n UNCLEAN TOKENIZED WORDS')
    handle.Word_tokenizer()

    # Get Rid of Meaningless Words and Visualize
    print('\n MEANINGFUL WORDS')
    handle.getRid_meaningless_xters()

    # Frequency of Words and Visualization
    if RUN_FREQ_VIS:
        print('\n FREQUENCY: MOST COMMON WORDS AND VISUALIZATION')
        handle.Word_Distr_visualizer()

    # Visualize and Decode Messages (WordCloud)
    if MAKE_CLOUDS:
        print('\n VISUAL DECODING FOR QUALITATIVE SENTIMENT')
        handle.decode_message()
finally:
    # Clean up any temp PDF if created, even when one of the steps above raised.
    tmp_pdf = getattr(handle, "_tmp_pdf_path", None)
    if tmp_pdf:
        try:
            Path(tmp_pdf).unlink(missing_ok=True)
        except OSError as exc:
            # Best-effort cleanup: report the leftover file instead of hiding it.
            print(f"[WARN] could not remove temp PDF {tmp_pdf}: {exc}")