In [12]:
import wikipedia
import requests
import pandas as pd
from bs4 import BeautifulSoup
from io import StringIO


class Wikipedia:
    """
    Create with either search_query or url.
    If only search_query is provided, find_url() runs automatically.
    Attributes:
      - search_query: str | None
      - url: str | None
      - valid: bool
      - title: str | None (resolved page title when found)
    """

    def __init__(self, search_query: str | None = None, url: str | None = None, *, language: str = "en"):
        self.search_query = search_query
        self.url = url
        self.valid = False
        self.title = None
        wikipedia.set_lang(language)

        if not self.url and self.search_query:
            self.find_url()
        elif self.url:
            self.check_url()

    def find_url(self) -> None:
        """Resolve self.search_query to a canonical Wikipedia page URL, setting url/title/valid."""
        q = (self.search_query or "").strip()
        if not q:
            self.url = None
            self.title = None
            self.valid = False
            return

        try:
            results = wikipedia.search(q) or []
        except Exception:
            results = []

        candidates = []
        if results:
            candidates.append(("search_top", results[0]))
        candidates.append(("autosuggest_query", q))

        for source, title in candidates:
            try:
                page = wikipedia.page(title, auto_suggest=(source == "autosuggest_query"), redirect=True)
                self.url = page.url
                self.title = page.title
                self.valid = True
                return
            except wikipedia.DisambiguationError as e:
                if e.options:
                    try:
                        page = wikipedia.page(e.options[0], auto_suggest=False, redirect=True)
                        self.url = page.url
                        self.title = page.title
                        self.valid = True
                        return
                    except Exception:
                        pass
            except wikipedia.PageError:
                continue
            except Exception:
                continue

        self.url = None
        self.title = None
        self.valid = False

    def check_url(self) -> None:
        """Validate that self.url is a reachable Wikipedia page."""
        if not self.url or not self.url.startswith("https://en.wikipedia.org/wiki/"):
            self.valid = False
            return

        try:
            resp = requests.head(self.url, allow_redirects=True, timeout=5)
            if resp.status_code == 200 and "wikipedia.org" in resp.url:
                # Try to get the title from the Wikipedia library
                try:
                    page_title = self.url.split("/wiki/")[-1].replace("_", " ")
                    page = wikipedia.page(page_title, auto_suggest=False, redirect=True)
                    self.title = page.title
                except Exception:
                    self.title = None
                self.valid = True
            else:
                self.valid = False
        except Exception:
            self.valid = False

    def scrape(self) -> str | None:
        """Scrape the content of the Wikipedia page."""
        if not self.valid or not self.url:
            return None

        try:
            data = requests.get(self.url, timeout=10)
            return data.text
        except Exception:
            return None
        
    def scrapeTable(self) -> list[pd.DataFrame] | None:
        """Scrape tables from the given Wikipedia (or any) page and return as list of DataFrames."""
        if not self.valid or not self.url:
            return None

        try:
            response = requests.get(self.url, timeout=10)
            response.raise_for_status()  # Ensure we catch HTTP errors
            soup = BeautifulSoup(response.text, "lxml")
            for sup in soup.find_all("sup"):
                sup.decompose()
            html_io = StringIO(str(soup))
            tables = pd.read_html(html_io)
            tables_dict = {}
            for table in tables:
                # Get the column names as strings
                header_key = ":".join(str(col) for col in table.columns)
                tables_dict[header_key] = table
            return tables_dict if tables_dict else None
        except Exception:
            return None

    def __repr__(self) -> str:
        return f"Wikipedia(search_query={self.search_query!r}, url={self.url!r}, valid={self.valid}, title={self.title!r})"


In [None]:
# Resolve a page from a free-text search query (find_url() runs in the constructor).
w1 = Wikipedia(search_query="Highest grossing films")
print(w1)

# Validate an existing page given directly by URL (check_url() runs in the constructor).
w2 = Wikipedia(url="https://en.wikipedia.org/wiki/List_of_highest-grossing_films")
print(w2)

# Well-formed Wikipedia URL for a page that does not exist: expected to print valid=False.
w3 = Wikipedia(url="https://en.wikipedia.org/wiki/ThisPageDoesNotExist12345")
print(w3)


Wikipedia(search_query='Highest grossing films', url='https://en.wikipedia.org/wiki/List_of_highest-grossing_films', valid=True, title='List of highest-grossing films')
Wikipedia(search_query=None, url='https://en.wikipedia.org/wiki/List_of_highest-grossing_films', valid=True, title='List of highest-grossing films')
Wikipedia(search_query=None, url='https://en.wikipedia.org/wiki/ThisPageDoesNotExist12345', valid=False, title=None)


In [5]:
# Validate from search query
w1 = Wikipedia(search_query="list of highest grossing films from wikipedia")
print(w1)

Wikipedia(search_query='list of highest grossing films from wikipedia', url='https://en.wikipedia.org/wiki/List_of_highest-grossing_films', valid=True, title='List of highest-grossing films')


In [13]:
test = w1.scrapeTable()

  tables = pd.read_html(response.text)


In [14]:
print(test)

{'Rank:Peak:Title:Worldwide gross:Year:Ref':     Rank  Peak                                          Title  \
0      1     1                                         Avatar   
1      2     1                              Avengers: Endgame   
2      3     3                       Avatar: The Way of Water   
3      4     1                                        Titanic   
4      5     5                                       Ne Zha 2   
5      6     3                   Star Wars: The Force Awakens   
6      7     4                         Avengers: Infinity War   
7      8     6                        Spider-Man: No Way Home   
8      9     8                                   Inside Out 2   
9     10     3                                 Jurassic World   
10    11     7                                  The Lion King   
11    12     3                                   The Avengers   
12    13     4                                      Furious 7   
13    14    11                              T

In [22]:
def list_table_headers(tables: list[pd.DataFrame]) -> list[list[str]]:
    """Return a list of headers for each DataFrame in the list."""
    return [df.columns.tolist() for df in tables]

# Example usage (scrapeTable() returns a dict of DataFrames keyed by header, or None on failure):
# tables = w1.scrapeTable() or {}
# headers = list_table_headers(list(tables.values()))
# print(headers)


In [24]:
headers = list_table_headers(test)

In [25]:
print(headers)

[['Rank', 'Peak', 'Title', 'Worldwide gross', 'Year', 'Ref'], ['Rank', 'Title', 'Worldwide gross (2024 $)', 'Year'], ['Year', 'Title', 'Worldwide gross', 'Budget', 'Ref'], ['Established', 'Title', 'Record-setting gross', 'Ref'], [0, 1, 2, 3, 4, 5], ['Rank', 'Series', 'Total worldwide gross', 'No. of films', 'Average of films', 'Highest-grossing film'], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1

In [26]:
# scrapeTable() returns a dict keyed by colon-joined headers, so select the table by key, not position.
test["Rank:Series:Total worldwide gross:No. of films:Average of films:Highest-grossing film"]

Unnamed: 0,Rank,Series,Total worldwide gross,No. of films,Average of films,Highest-grossing film


In [None]:
import pyarrow.dataset as ds

# Open the Parquet files under the S3 prefix as a single dataset, with
# Hive-style partitioning of the directory layout.
# NOTE(review): this cell is unrelated to the Wikipedia scraping in the rest of
# the visible notebook and `df` is not used by any later visible cell — confirm it belongs here.
dataset = ds.dataset(
    "s3://indian-high-court-judgments/metadata/parquet",
    format="parquet",
    partitioning="hive",
)
# Read the whole dataset into one in-memory Arrow table; replace_schema_metadata()
# is called with no arguments — presumably to drop the schema's key/value metadata (TODO confirm).
table = dataset.to_table().replace_schema_metadata()
df = table.to_pandas()

In [3]:
import pathlib

In [25]:
response = requests.get("https://en.wikipedia.org/wiki/List_of_highest-grossing_films", timeout=10)
response.raise_for_status()  # Ensure we catch HTTP errors


In [29]:
# Parse the fetched page with the lxml parser.
soup = BeautifulSoup(response.text, "lxml")
# Remove every <sup> element so its text does not leak into table cell values.
for sup in soup.find_all("sup"):
    sup.decompose()
# Remove elements whose style attribute is exactly "display:none".
# NOTE(review): the attribute selector is an exact match, so variants such as
# "display: none" or styles with additional declarations are not removed.
for elem in soup.select('[style="display:none"]'):
    elem.decompose()

In [30]:
# Rewrite every <td> so its content is plain text with "$" and "," removed
# (presumably so monetary values parse as plain numbers).
# NOTE(review): this runs on all cells, so commas in non-numeric text are removed too.
for td in soup.find_all("td"):
    if td.string:  # .string is set: replace the cell content with the stripped text
        td.string = td.string.replace("$", "").replace(",", "")
    else:  # no single string: flatten all nested text, then replace the cell content
        td_text = td.get_text()
        td.clear()
        td.append(td_text.replace("$", "").replace(",", ""))

In [31]:
print(str(soup))

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of highest-grossing films - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-en

In [32]:
# Re-parse the cleaned HTML and index every table by its colon-joined column names.
html_io = StringIO(str(soup))
tables = pd.read_html(html_io)
tables_dict = {}
for idx, table in enumerate(tables):
    # Get the column names as strings
    header_key = ":".join(str(col) for col in table.columns)
    # Several tables on a page can share the same header; suffix later ones with
    # their position so they are kept instead of overwriting the first.
    key = header_key if header_key not in tables_dict else f"{header_key}:{idx}"
    tables_dict[key] = table

In [33]:
print(tables_dict)

{'Rank:Peak:Title:Worldwide gross:Year:Ref':     Rank  Peak                                          Title  \
0      1     1                                         Avatar   
1      2     1                              Avengers: Endgame   
2      3     3                       Avatar: The Way of Water   
3      4     1                                        Titanic   
4      5     5                                       Ne Zha 2   
5      6     3                   Star Wars: The Force Awakens   
6      7     4                         Avengers: Infinity War   
7      8     6                        Spider-Man: No Way Home   
8      9     8                                   Inside Out 2   
9     10     3                                 Jurassic World   
10    11     7                                  The Lion King   
11    12     3                                   The Avengers   
12    13     4                                      Furious 7   
13    14    11                              T

In [52]:
import wikipedia
import requests
import pandas as pd
from io import StringIO
from bs4 import BeautifulSoup
import re
import numpy as np
from typing import Any

class Wikipedia:
    """
    Create with either search_query or url.
    If only search_query is provided, find_url() runs automatically.
    Attributes:
      - search_query: str | None
      - url: str | None
      - valid: bool
      - title: str | None (resolved page title when found)
    """

    # ---------------------------
    # Regex helpers (reused)
    # ---------------------------
    _NUM_RE = re.compile(r"-?\d[\d,\.]*")
    _MONEY_RE = re.compile(r"(\d[\d,\.]*)\s*(million|billion)?", re.I)

    def __init__(self, search_query: str | None = None, url: str | None = None, *, language: str = "en"):
        self.search_query = search_query
        self.url = url
        self.valid = False
        self.title = None
        wikipedia.set_lang(language)

        if not self.url and self.search_query:
            self.find_url()
        elif self.url:
            self.check_url()

    # ---------------------------
    # URL discovery/validation
    # ---------------------------
    def find_url(self) -> None:
        """Resolve self.search_query to a canonical Wikipedia page URL, setting url/title/valid."""
        q = (self.search_query or "").strip()
        if not q:
            self.url = None
            self.title = None
            self.valid = False
            return

        try:
            results = wikipedia.search(q) or []
        except Exception:
            results = []

        candidates = []
        if results:
            candidates.append(("search_top", results[0]))
        candidates.append(("autosuggest_query", q))

        for source, title in candidates:
            try:
                page = wikipedia.page(title, auto_suggest=(source == "autosuggest_query"), redirect=True)
                self.url = page.url
                self.title = page.title
                self.valid = True
                return
            except wikipedia.DisambiguationError as e:
                if e.options:
                    try:
                        page = wikipedia.page(e.options[0], auto_suggest=False, redirect=True)
                        self.url = page.url
                        self.title = page.title
                        self.valid = True
                        return
                    except Exception:
                        pass
            except wikipedia.PageError:
                continue
            except Exception:
                continue

        self.url = None
        self.title = None
        self.valid = False

    def check_url(self) -> None:
        """Validate that self.url is a reachable Wikipedia page."""
        if not self.url or not self.url.startswith("https://en.wikipedia.org/wiki/"):
            self.valid = False
            return

        try:
            resp = requests.head(self.url, allow_redirects=True, timeout=5)
            if resp.status_code == 200 and "wikipedia.org" in resp.url:
                try:
                    page_title = self.url.split("/wiki/")[-1].replace("_", " ")
                    page = wikipedia.page(page_title, auto_suggest=False, redirect=True)
                    self.title = page.title
                except Exception:
                    self.title = None
                self.valid = True
            else:
                self.valid = False
        except Exception:
            self.valid = False

    # ---------------------------
    # Fetch & HTML cleanup
    # ---------------------------
    def _fetch_html(self) -> str | None:
        if not self.valid or not self.url:
            return None
        try:
            resp = requests.get(self.url, timeout=10)
            resp.raise_for_status()
            return resp.text
        except Exception:
            return None

    def _clean_html_for_tables(self, html: str) -> str:
        """Remove <sup> citations and elements with style='display:none'."""
        soup = BeautifulSoup(html, "lxml")

        # Remove citations/footnotes
        for sup in soup.find_all("sup"):
            sup.decompose()

        # Remove hidden nodes
        for elem in soup.find_all(style=lambda s: s and "display:none" in s.replace(" ", "").lower()):
            elem.decompose()

        return str(soup)

    # ---------------------------
    # Public scraping methods
    # ---------------------------
    def scrape(self) -> str | None:
        """Return raw HTML of the Wikipedia page."""
        return self._fetch_html()

    def scrapeTable(self) -> dict[str, pd.DataFrame] | None:
        """
        Scrape tables from the page, clean HTML, and return a dict:
        key = colon-joined header names,
        value = DataFrame where any cell containing multiple $-prefixed values
                is replaced by the maximum $ amount (retaining $ symbol).
        """
        if not self.valid or not self.url:
            return None

        import re
        from io import StringIO

        # Regex for matching $ amounts like $2,212,300,000 or $1.5 billion
        money_re = re.compile(r"\$[\d,]+(?:\.\d+)?")

        def max_dollar_value(cell):
            """Return max $ string if multiple present, else original."""
            import pandas as pd
            if pd.isna(cell):
                return cell
            s = str(cell)
            if "$" not in s:
                return cell
            matches = money_re.findall(s)
            if not matches:
                return cell
            # Convert to numeric for comparison, but keep original strings
            numeric_vals = [float(m.replace("$", "").replace(",", "")) for m in matches]
            max_val = max(numeric_vals)
            # Return the original string corresponding to the max numeric value
            for m in matches:
                if float(m.replace("$", "").replace(",", "")) == max_val:
                    return m
            return cell

        try:
            # Fetch and clean HTML
            response = requests.get(self.url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "lxml")

            # Remove <sup> tags
            for sup in soup.find_all("sup"):
                sup.decompose()
            # Remove hidden elements
            for elem in soup.find_all(style=lambda st: st and "display:none" in st.replace(" ", "").lower()):
                elem.decompose()

            # Parse tables
            tables = pd.read_html(StringIO(str(soup)))
            if not tables:
                return None

            tables_dict: dict[str, pd.DataFrame] = {}

            for idx, table in enumerate(tables):
                df = table.copy()
                df.columns = [str(c).strip() for c in df.columns]

                # Skip if all headers are numeric
                if all(pd.to_numeric(df.columns, errors="coerce").notna()):
                    continue

                # Clean only cells containing $
                for col in df.columns:
                    if df[col].dtype == "object":
                        df[col] = df[col].apply(max_dollar_value)

                # Create unique key from headers
                header_key = ":".join(df.columns)
                key = header_key if header_key not in tables_dict else f"{header_key}:{idx}"
                tables_dict[key] = df

            return tables_dict or None

        except Exception:
            return None

    # ---------------------------
    # Cleaning helpers (generic)
    # ---------------------------
    @staticmethod
    def _to_float_safe(s: str) -> float | None:
        try:
            return float(s)
        except Exception:
            return None

    @classmethod
    def numeric_max(cls, cell: Any) -> float | None:
        """
        Extract all numeric tokens from a messy cell (ranges, parentheses, +, commas)
        and return the MAX as float. No units handling.
        """
        if pd.isna(cell):
            return None
        s = str(cell).replace(",", "")
        nums = []
        for m in cls._NUM_RE.findall(s):
            v = cls._to_float_safe(m)
            if v is not None:
                nums.append(v)
        return float(max(nums)) if nums else None

    @classmethod
    def money_max_usd(cls, cell: Any) -> float | None:
        """
        Extract all amounts (supports 'million'/'billion' units) and return MAX in USD.
        Examples:
          '$50,000,000–100,000,000'   -> 100_000_000
          '$20,000,000+ ($5,200,000)' -> 20_000_000
          '$1.75 billion'             -> 1_750_000_000
        """
        if pd.isna(cell):
            return None
        s = (str(cell)
             .replace("$", "")
             .replace(",", "")
             .replace("\u2013", "-")
             .replace("–", "-")
             .replace("\u2212", "-"))
        vals: list[float] = []
        for num, unit in cls._MONEY_RE.findall(s):
            v = cls._to_float_safe(num)
            if v is None:
                continue
            unit = (unit or "").lower()
            if unit == "million":
                v *= 1_000_000
            elif unit == "billion":
                v *= 1_000_000_000
            vals.append(v)
        return float(max(vals)) if vals else None

    # ---------------------------
    # DataFrame-wide cleaners
    # ---------------------------
    def clean_tables_with_max(
        self,
        tables: dict[str, pd.DataFrame],
        *,
        money_cols: list[str] | None = None,
        generic_cols: list[str] | None = None,
        suffix: str = "_max",
    ) -> dict[str, pd.DataFrame]:
        """
        For each DataFrame in `tables`, add new columns with cleaned MAX values.
          - money_cols: parse via money_max_usd (million/billion aware)
          - generic_cols: parse via numeric_max
        If generic_cols is None, apply numeric_max to all object columns NOT in money_cols.
        """
        cleaned: dict[str, pd.DataFrame] = {}

        for key, df in tables.items():
            out = df.copy()
            # normalize headers once more
            out.columns = [str(c).strip() for c in out.columns]

            # money columns (case-insensitive match)
            if money_cols:
                money_map = {c for c in out.columns for m in money_cols if c.lower() == m.lower()}
            else:
                money_map = set()

            for c in money_map:
                out[c + suffix] = out[c].apply(self.money_max_usd)

            # generic columns default
            if generic_cols is None:
                candidates = [c for c in out.columns if out[c].dtype == "object" and c not in money_map]
            else:
                candidates = [c for c in out.columns for g in generic_cols if c.lower() == g.lower()]

            for c in candidates:
                out[c + suffix] = out[c].apply(self.numeric_max)

            cleaned[key] = out

        return cleaned

    def __repr__(self) -> str:
        return f"Wikipedia(search_query={self.search_query!r}, url={self.url!r}, valid={self.valid}, title={self.title!r})"

In [53]:
w = Wikipedia(search_query="List of highest-grossing films")
print(w.url, w.title, w.valid)

https://en.wikipedia.org/wiki/List_of_highest-grossing_films List of highest-grossing films True


In [54]:
tables = w.scrapeTable()

In [55]:
tables

{'Rank:Peak:Title:Worldwide gross:Year:Ref':     Rank  Peak                                          Title Worldwide gross  \
 0      1     1                                         Avatar  $2,923,706,026   
 1      2     1                              Avengers: Endgame  $2,797,501,328   
 2      3     3                       Avatar: The Way of Water  $2,320,250,281   
 3      4     1                                        Titanic  $2,257,844,554   
 4      5     5                                       Ne Zha 2  $2,212,300,000   
 5      6     3                   Star Wars: The Force Awakens  $2,068,223,624   
 6      7     4                         Avengers: Infinity War  $2,048,359,754   
 7      8     6                        Spider-Man: No Way Home  $1,922,598,800   
 8      9     8                                   Inside Out 2  $1,698,863,816   
 9     10     3                                 Jurassic World  $1,671,537,444   
 10    11     7                                  The L

In [38]:
cleaned = w.clean_tables_with_max(
    tables,
    money_cols=["Worldwide gross", "Budget"],   # case-insensitive
    # generic_cols=None -> auto-apply numeric_max to other text cols
)

In [40]:
k0 = next(iter(cleaned.keys()))
print(cleaned[k0])

    Rank  Peak                                          Title Worldwide gross  \
0      1     1                                         Avatar  $2,923,706,026   
1      2     1                              Avengers: Endgame  $2,797,501,328   
2      3     3                       Avatar: The Way of Water  $2,320,250,281   
3      4     1                                        Titanic  $2,257,844,554   
4      5     5                                       Ne Zha 2  $2,212,300,000   
5      6     3                   Star Wars: The Force Awakens  $2,068,223,624   
6      7     4                         Avengers: Infinity War  $2,048,359,754   
7      8     6                        Spider-Man: No Way Home  $1,922,598,800   
8      9     8                                   Inside Out 2  $1,698,863,816   
9     10     3                                 Jurassic World  $1,671,537,444   
10    11     7                                  The Lion King  $1,656,943,394   
11    12     3              