In [18]:
from io import StringIO
from urllib.parse import unquote, urlsplit

import pandas as pd
import requests
import wikipedia


class Wikipedia:
    """
    Create with either search_query or url.
    If only search_query is provided, find_url() runs automatically.
    Attributes:
      - search_query: str | None
      - url: str | None
      - valid: bool
      - title: str | None (resolved page title when found)
    """

    def __init__(self, search_query: str | None = None, url: str | None = None, *, language: str = "en"):
        self.search_query = search_query
        self.url = url
        self.valid = False
        self.title = None
        wikipedia.set_lang(language)

        if not self.url and self.search_query:
            self.find_url()
        elif self.url:
            self.check_url()

    def find_url(self) -> None:
        """Resolve self.search_query to a canonical Wikipedia page URL, setting url/title/valid."""
        q = (self.search_query or "").strip()
        if not q:
            self.url = None
            self.title = None
            self.valid = False
            return

        try:
            results = wikipedia.search(q) or []
        except Exception:
            results = []

        candidates = []
        if results:
            candidates.append(("search_top", results[0]))
        candidates.append(("autosuggest_query", q))

        for source, title in candidates:
            try:
                page = wikipedia.page(title, auto_suggest=(source == "autosuggest_query"), redirect=True)
                self.url = page.url
                self.title = page.title
                self.valid = True
                return
            except wikipedia.DisambiguationError as e:
                if e.options:
                    try:
                        page = wikipedia.page(e.options[0], auto_suggest=False, redirect=True)
                        self.url = page.url
                        self.title = page.title
                        self.valid = True
                        return
                    except Exception:
                        pass
            except wikipedia.PageError:
                continue
            except Exception:
                continue

        self.url = None
        self.title = None
        self.valid = False

    def check_url(self) -> None:
        """Validate that self.url is a reachable Wikipedia page."""
        if not self.url or not self.url.startswith("https://en.wikipedia.org/wiki/"):
            self.valid = False
            return

        try:
            resp = requests.head(self.url, allow_redirects=True, timeout=5)
            if resp.status_code == 200 and "wikipedia.org" in resp.url:
                # Try to get the title from the Wikipedia library
                try:
                    page_title = self.url.split("/wiki/")[-1].replace("_", " ")
                    page = wikipedia.page(page_title, auto_suggest=False, redirect=True)
                    self.title = page.title
                except Exception:
                    self.title = None
                self.valid = True
            else:
                self.valid = False
        except Exception:
            self.valid = False

    def scrape(self) -> str | None:
        """Scrape the content of the Wikipedia page."""
        if not self.valid or not self.url:
            return None

        try:
            data = requests.get(self.url, timeout=10)
            return data.text
        except Exception:
            return None
        
    def scrapeTable(self) -> list[pd.DataFrame] | None:
        """Scrape tables from the given Wikipedia (or any) page and return as list of DataFrames."""
        if not self.valid or not self.url:
            return None

        try:
            response = requests.get(self.url, timeout=10)
            response.raise_for_status()  # Ensure we catch HTTP errors
            tables = pd.read_html(response.text)
            return tables if tables else None
        except Exception:
            return None

    def __repr__(self) -> str:
        return f"Wikipedia(search_query={self.search_query!r}, url={self.url!r}, valid={self.valid}, title={self.title!r})"


In [None]:
# Smoke-test the construction paths of Wikipedia; each print shows the
# resulting search_query / url / valid / title through __repr__.

# 1) Search query only: the constructor calls find_url() to resolve it to a page.
w1 = Wikipedia(search_query="Highest grossing films")
print(w1)

# 2) Direct URL: the constructor calls check_url() to validate it.
w2 = Wikipedia(url="https://en.wikipedia.org/wiki/List_of_highest-grossing_films")
print(w2)

# 3) Well-formed Wikipedia URL for a page that presumably does not exist:
#    check_url() is expected to leave valid=False and title=None.
w3 = Wikipedia(url="https://en.wikipedia.org/wiki/ThisPageDoesNotExist12345")
print(w3)


Wikipedia(search_query='Highest grossing films', url='https://en.wikipedia.org/wiki/List_of_highest-grossing_films', valid=True, title='List of highest-grossing films')
Wikipedia(search_query=None, url='https://en.wikipedia.org/wiki/List_of_highest-grossing_films', valid=True, title='List of highest-grossing films')
Wikipedia(search_query=None, url='https://en.wikipedia.org/wiki/ThisPageDoesNotExist12345', valid=False, title=None)


In [19]:
# Resolve a longer, natural-language style query. find_url() tries the top
# search hit first and falls back to the raw query with auto-suggest.
# Note: this rebinds w1, which the later scraping cell uses.
w1 = Wikipedia(search_query="list of highest grossing films from wikipedia")
print(w1)

Wikipedia(search_query='list of highest grossing films from wikipedia', url='https://en.wikipedia.org/wiki/List_of_highest-grossing_films', valid=True, title='List of highest-grossing films')


In [20]:
# Fetch the HTML tables of the resolved page. scrapeTable() returns a list of
# DataFrames, or None if the page is not valid or fetching/parsing fails.
test = w1.scrapeTable()

  tables = pd.read_html(response.text)


In [21]:
# Dump the scraped tables as plain text.
# NOTE(review): this prints every DataFrame in the list in full; previewing a
# single table (e.g. test[0].head()) would keep the output readable.
print(test)

[    Rank  Peak                                          Title  \
0      1     1                                         Avatar   
1      2     1                              Avengers: Endgame   
2      3     3                       Avatar: The Way of Water   
3      4     1                                        Titanic   
4      5     5                                       Ne Zha 2   
5      6     3                   Star Wars: The Force Awakens   
6      7     4                         Avengers: Infinity War   
7      8     6                        Spider-Man: No Way Home   
8      9     8                                   Inside Out 2   
9     10     3                                 Jurassic World   
10    11     7                                  The Lion King   
11    12     3                                   The Avengers   
12    13     4                                      Furious 7   
13    14    11                              Top Gun: Maverick   
14    15    10          

In [22]:
def list_table_headers(tables: list[pd.DataFrame]) -> list[list[str]]:
    """Return a list of headers for each DataFrame in the list."""
    return [df.columns.tolist() for df in tables]

# Example usage:
# tables = Wikipedia(search_query="...").scrapeTable()  # list of DataFrames, or None on failure
# headers = list_table_headers(tables or [])
# print(headers)


In [24]:
# Collect the column labels of each scraped table, in the same positions as `test`.
headers = list_table_headers(test)

In [25]:
# Print the header lists. Position i here corresponds to test[i], which is how
# a table of interest can be located by its column names.
print(headers)

[['Rank', 'Peak', 'Title', 'Worldwide gross', 'Year', 'Ref'], ['Rank', 'Title', 'Worldwide gross (2024 $)', 'Year'], ['Year', 'Title', 'Worldwide gross', 'Budget', 'Ref'], ['Established', 'Title', 'Record-setting gross', 'Ref'], [0, 1, 2, 3, 4, 5], ['Rank', 'Series', 'Total worldwide gross', 'No. of films', 'Average of films', 'Highest-grossing film'], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1

In [26]:
# Display the table at position 5, whose headers in the listing above are
# Rank / Series / Total worldwide gross / No. of films / Average of films / Highest-grossing film.
# NOTE(review): a positional index presumably breaks if the page's table order
# changes; consider selecting the table by its column names instead.
test[5]

Unnamed: 0,Rank,Series,Total worldwide gross,No. of films,Average of films,Highest-grossing film


In [None]:
# Load Parquet metadata from the s3://indian-high-court-judgments bucket into pandas.
# NOTE(review): this cell is unrelated to the Wikipedia scraping above and
# imports pyarrow mid-notebook; consider hoisting the import to the top cell.
import pyarrow.dataset as ds

# Open the bucket prefix as a hive-partitioned Parquet dataset
# (presumably needs S3 read access to be configured; confirm).
dataset = ds.dataset(
    "s3://indian-high-court-judgments/metadata/parquet",
    format="parquet",
    partitioning="hive",
)
# to_table() materializes the dataset as an in-memory Arrow table, and
# replace_schema_metadata() with no arguments drops the schema's key-value metadata.
# NOTE(review): no filter or column selection is applied, so this reads everything; confirm it fits in memory.
table = dataset.to_table().replace_schema_metadata()
# Convert the Arrow table to a pandas DataFrame.
df = table.to_pandas()