# NOTE:
Attempting to collect tables from Wikipedia using pandas.read_html() resulted in a 403 Forbidden error.

This happens because some websites block automated requests made without a browser-like user agent.

To overcome this, I created this class to fetch and parse tables manually,

allowing controlled requests and improved compatibility with different websites.

In [None]:
from bs4 import BeautifulSoup 
import requests
import pandas as pd


def Symbols_df_maker():
    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"

    headers = {"User-Agent": "AliWikiBot/1.0 (https://github.com/ali)"}

    page = requests.get(url, headers=headers)

    soup = BeautifulSoup(page.text, "html")
    table = soup.find_all("table")[0]
    word_titles = table.find_all("th")
    word_titles_list = [title.text.strip() for title in word_titles]
    df = pd.DataFrame(columns=word_titles_list)
    row_datas = table.find_all("tr")
    for data in row_datas[1:]:
        row_data = [d.text.strip() for d in data if d.text.strip() != ""]
        # print(row_data)
        lenght = len(df)
        df.loc[lenght] = row_data
    df.to_csv("Symbols.csv", index=False)

bs4 stands for Beautiful Soup 4 — it’s a Python library used to parse HTML and XML documents.

Its main purpose: extract data from web pages in a structured way (like tables, links, text, etc.).

In [None]:
from bs4 import BeautifulSoup 
import requests
import pandas as pd

class TabelCollector:
    
    def __inti__(self , URL):
        self.URL = URL
        self.tables = []
        
    def fetch (self):
        headers = {"User-Agent": "AliWikiBot/1.0 (https://github.com/ali)"}
        page = requests.get(self.URL, headers=headers)
        soup = BeautifulSoup(page.text, "html")
        self.tables = soup.find_all("table")
        return(self.tables)
        
TabelCollector.fetch("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies")

In [None]:
from bs4 import BeautifulSoup 
import requests
import pandas as pd

class TableCollector:
    
    # Corrected: Changed __inti__ to __init__
    def __init__(self, URL):
        self.URL = URL
        self.tables = []
        
    def fetch(self):
        headers = {"User-Agent": "AliWikiBot/1.0 (https://github.com/ali)"} #??
        
        try:
            page = requests.get(self.URL, headers=headers, timeout=20) # Added a timeout for robustness
            page.raise_for_status() # Raise an exception for bad status codes (4xx or 5xx)
            soup = BeautifulSoup(page.text, "html.parser") # Explicitly set parser
            self.tables = soup.find_all("table")
            #counter = 0
            #while soup.find_all("table")[0]:
            table = soup.find_all("table")[0]
            word_titles = table.find_all("th")
            word_titles_list = [title.text.strip() for title in word_titles]
            df = pd.DataFrame(columns=word_titles_list)
            row_datas = table.find_all("tr")
            for data in row_datas[1:]:
                row_data = [d.text.strip() for d in data if d.text.strip() != ""]
                # print(row_data)
                lenght = len(df)
                df.loc[lenght] = row_data
            self.tables = df
                #counter += 1
            return self.tables
        except requests.exceptions.RequestException as e: #??
            print(f"An error occurred during the request: {e}")
            self.tables = []
            return [] # Return an empty list on failure
    
    def select(self):
        if self.tables == []:
            print(f"No tables have been fetched!")
        else:
            print("Fetched table(s) will be showen along a number!")
            print("\n Enter the number of the table that you want to work on it")
            counter = 0
            for table in self.tables:
                temp = pd.DataFrame(table)
                counter += 1
                print(f"table {counter}:")
                temp
                
# The correct way to instantiate the class and call the instance method:
URL_TO_SCRAPE = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"

# 1. Instantiate the class (create an object)
collector = TableCollector(URL_TO_SCRAPE)

# 2. Call the fetch method on the object instance
found_tables = collector.fetch()

# Output the result
print(f"✅ Successfully found {len(found_tables)} table(s) on the page.")

# Example: Print a snippet of the first table's HTML
if found_tables:
    print("\nSnippet of the first table found:")
    # print(found_tables[0].prettify()[:500] + '...') # uncomment to see the HTML
    
    # Bonus: You can use pandas to easily read a table into a DataFrame
    try:
        dataframes = pd.read_html(str(found_tables[0]))
        if dataframes:
            print("\nFirst table converted to a Pandas DataFrame (first 5 rows):")
            print(dataframes[0].head(10))
    except ValueError as ve:
        print(f"\nCould not parse the first table with pandas: {ve}")

        
collector.select()

# there is an important problem

if there is a table whose title contains indexed title the first row can't be fill

In [None]:
from bs4 import BeautifulSoup 
import requests
import pandas as pd

class TableCollector:
    
    # Corrected: Changed __inti__ to __init__
    def __init__(self, URL):
        self.URL = URL
        self.tables = []
        
    def fetch(self):
        headers = {"User-Agent": "AliWikiBot/1.0 (https://github.com/ali)"} #??
        
        try:
            page = requests.get(self.URL, headers=headers, timeout=20) # Added a timeout for robustness
            page.raise_for_status() # Raise an exception for bad status codes (4xx or 5xx)
            soup = BeautifulSoup(page.text, "html.parser") # Explicitly set parser
            self.tables = soup.find_all("table")
            counter = 0
            while True:
                try:
                    soup.find_all("table")[counter]
                except:
                    break
                table = soup.find_all("table")[counter]
                word_titles = table.find_all("th")
                word_titles_list = [title.text.strip() for title in word_titles]
                df = pd.DataFrame(columns=word_titles_list)
                print(df)
                row_datas = table.find_all("tr")
                for data in row_datas[1:]:
                    row_data = [d.text.strip() for d in data if d.text.strip() != ""]
                    print(row_data)
                    lenght = len(df)
                    df.loc[lenght] = row_data
                print(df)
                counter += 1
            return self.tables
        except requests.exceptions.RequestException as e: #??
            print(f"An error occurred during the request: {e}")
            self.tables = []
            return [] # Return an empty list on failure
        
# The correct way to instantiate the class and call the instance method:
URL_TO_SCRAPE = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
        
# 1. Instantiate the class (create an object)
collector = TableCollector(URL_TO_SCRAPE)

# 2. Call the fetch method on the object instance
found_tables = collector.fetch()



Empty DataFrame
Columns: [Symbol, Security, GICS Sector, GICS Sub-Industry, Headquarters Location, Date added, CIK, Founded]
Index: []
['MMM', '3M', 'Industrials', 'Industrial Conglomerates', 'Saint Paul, Minnesota', '1957-03-04', '0000066740', '1902']
['AOS', 'A. O. Smith', 'Industrials', 'Building Products', 'Milwaukee, Wisconsin', '2017-07-26', '0000091142', '1916']
['ABT', 'Abbott Laboratories', 'Health Care', 'Health Care Equipment', 'North Chicago, Illinois', '1957-03-04', '0000001800', '1888']
['ABBV', 'AbbVie', 'Health Care', 'Biotechnology', 'North Chicago, Illinois', '2012-12-31', '0001551152', '2013 (1888)']
['ACN', 'Accenture', 'Information Technology', 'IT Consulting & Other Services', 'Dublin, Ireland', '2011-07-06', '0001467373', '1989']
['ADBE', 'Adobe Inc.', 'Information Technology', 'Application Software', 'San Jose, California', '1997-05-05', '0000796343', '1982']
['AMD', 'Advanced Micro Devices', 'Information Technology', 'Semiconductors', 'Santa Clara, California',

ValueError: cannot set a row with mismatched columns

In [17]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

class TableCollector:
    def __init__(self, URL):
        self.URL = URL
        self.tables = []

    def fetch(self):
        headers = {"User-Agent": "AliWikiBot/1.0 (https://github.com/ali)"}
        
        try:
            page = requests.get(self.URL, headers=headers, timeout=20)
            page.raise_for_status()
            soup = BeautifulSoup(page.text, "html.parser")
            self.tables = soup.find_all("table")

            for counter, table in enumerate(self.tables, start=1):
                word_titles = table.find_all("th")
                word_titles_list = [title.text.strip() for title in word_titles]
                df = pd.DataFrame(columns=word_titles_list)

                row_datas = table.find_all("tr")
                for data in row_datas[1:]:
                    row_data = [d.text.strip() for d in data if d.text.strip() != ""]
                    df.loc[len(df)] = row_data

                print(f"\n--- Table {counter} ---")
                print(df)

            return self.tables

        except requests.exceptions.RequestException as e:
            print(f"An error occurred during the request: {e}")
            self.tables = []
            return []

# 1. Instantiate the class (create an object)
collector = TableCollector(URL_TO_SCRAPE)

# 2. Call the fetch method on the object instance
found_tables = collector.fetch()


--- Table 1 ---
    Symbol             Security             GICS Sector  \
0      MMM                   3M             Industrials   
1      AOS          A. O. Smith             Industrials   
2      ABT  Abbott Laboratories             Health Care   
3     ABBV               AbbVie             Health Care   
4      ACN            Accenture  Information Technology   
..     ...                  ...                     ...   
498    XYL           Xylem Inc.             Industrials   
499    YUM          Yum! Brands  Consumer Discretionary   
500   ZBRA   Zebra Technologies  Information Technology   
501    ZBH        Zimmer Biomet             Health Care   
502    ZTS               Zoetis             Health Care   

                                GICS Sub-Industry    Headquarters Location  \
0                        Industrial Conglomerates    Saint Paul, Minnesota   
1                               Building Products     Milwaukee, Wisconsin   
2                           Health Care 

ValueError: cannot set a row with mismatched columns