# 1. Creating a BeautifulSoup Object

In [1]:
import requests
from bs4 import BeautifulSoup

# Fetch the webpage content
url = "https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_life_expectancy"
response = requests.get(url)

# Parse the content with BeautifulSoup
soup = BeautifulSoup(response.text, "html.parser")
print("BeautifulSoup object created successfully.")


BeautifulSoup object created successfully.


# 2. Navigating and Searching

## 2.1 find()

Finds the first occurrence of a tag, such as the first <table>

In [4]:
# Find the first table on the page
first_table = soup.find("table")
print(first_table.prettify())  # Prettified HTML of the first table



<table class="wikitable sortable static-row-numbers sort-under" style="text-align:right;">
 <tbody>
  <tr class="static-row-header" style="text-align:center;vertical-align:bottom;">
   <th rowspan="2" style="vertical-align:middle;">
    <a href="/wiki/List_of_states_and_territories_of_the_United_States#States" title="List of states and territories of the United States">
     state
    </a>
   </th>
   <th colspan="4">
    2019
   </th>
   <th rowspan="2" style="vertical-align:middle;border-left-width:2px;">
    <style data-mw-deduplicate="TemplateStyles:r1038841319">
     .mw-parser-output .tooltip-dotted{border-bottom:1px dotted;cursor:help}
    </style>
    <span class="rt-commentedText tooltip tooltip-dotted" title="Change in overall life expectancy between 2019 and 2020">
     2019
     <br/>
     →2020
    </span>
   </th>
   <th colspan="4" style="border-left-width:2px;">
    2020
   </th>
  </tr>
  <tr class="static-row-header" style="text-align:center;vertical-align:bottom;">
 

## 2.2. find_all()

Finds all tables on the page.

In [5]:
# Find all tables
tables = soup.find_all("table")
print(f"Number of tables found: {len(tables)}")


Number of tables found: 19


## 2.3. find_parent()

Finds the parent of a specific tag, such as a table.

In [6]:
# Find the parent of the first table
parent = first_table.find_parent()
print(parent.name)  # Output: div


meta


## 2.4. find_next_sibling() and find_previous_sibling()

Finds adjacent elements.

In [7]:
# Find the next sibling after the first table
next_sibling = first_table.find_next_sibling()
print(next_sibling.name)  # Outputs the next tag type

# Find the previous sibling
prev_sibling = first_table.find_previous_sibling()
print(prev_sibling)


div
<style data-mw-deduplicate="TemplateStyles:r1245584064">@media screen{html.client-js .mw-parser-output .sort-under.sortable.wikitable th.headerSort,html.client-js .mw-parser-output .sort-under-right.sortable.wikitable th.headerSort,html.client-js .mw-parser-output .sort-under-center.sortable.wikitable th.headerSort{padding-right:0.4em}html.client-js .mw-parser-output .sort-under.sortable:not(.wikitable) th.headerSort,html.client-js .mw-parser-output .sort-under-right.sortable:not(.wikitable) th.headerSort,html.client-js .mw-parser-output .sort-under-center.sortable:not(.wikitable) th.headerSort{padding-right:1px}html.client-js body.skin-minerva .mw-parser-output .sort-under.sortable.wikitable th.headerSort,html.client-js body.skin-minerva .mw-parser-output .sort-under-right.sortable.wikitable th.headerSort,html.client-js body.skin-minerva .mw-parser-output .sort-under-center.sortable.wikitable th.headerSort{padding-right:0.2em}html.client-js body.skin-timeless .mw-parser-output .so

# 3. CSS Selectors

## 3.1. select()

Use CSS selectors to fetch elements, such as all tables with the class wikitable.



In [8]:
# Select tables with a specific class
tables_with_class = soup.select("table.wikitable")
print(f"Number of 'wikitable' tables: {len(tables_with_class)}")


Number of 'wikitable' tables: 5


## 3.2. select_one()

Fetch the first matching element using a CSS selector.

In [10]:
# Fetch the first 'wikitable'
first_wikitable = soup.select_one("table.wikitable")
print(first_wikitable.prettify())


<table class="wikitable sortable static-row-numbers sort-under" style="text-align:right;">
 <tbody>
  <tr class="static-row-header" style="text-align:center;vertical-align:bottom;">
   <th rowspan="2" style="vertical-align:middle;">
    <a href="/wiki/List_of_states_and_territories_of_the_United_States#States" title="List of states and territories of the United States">
     state
    </a>
   </th>
   <th colspan="4">
    2019
   </th>
   <th rowspan="2" style="vertical-align:middle;border-left-width:2px;">
    <style data-mw-deduplicate="TemplateStyles:r1038841319">
     .mw-parser-output .tooltip-dotted{border-bottom:1px dotted;cursor:help}
    </style>
    <span class="rt-commentedText tooltip tooltip-dotted" title="Change in overall life expectancy between 2019 and 2020">
     2019
     <br/>
     →2020
    </span>
   </th>
   <th colspan="4" style="border-left-width:2px;">
    2020
   </th>
  </tr>
  <tr class="static-row-header" style="text-align:center;vertical-align:bottom;">
 

# 4. Accessing Attributes and Text

## 4.1. .text

Extract all text within a tag, such as the text of the first table.

In [11]:
# Extract all text from the first table
print(first_table.text.strip())


state

2019

2019→2020

2020


overall

males

females

F Δ M

overall

males

females

F Δ M


US on average
78.8
76.3
81.4
3.1
−1.8
77.0
76.2
79.9
3.7


California
80.9
78.4
83.3
4.9
−1.9
79.0
76.2
82.0
5.8


Hawaii
80.9
78.0
83.9
5.9
−0.2
80.7
77.6
83.8
6.2


New York
80.7
78.2
83.1
4.9
−3.0
77.7
74.8
80.7
5.9


Minnesota
80.4
78.3
82.6
4.3
−1.3
79.1
76.8
81.4
4.6


Massachusetts
80.4
77.9
82.8
4.9
−1.4
79.0
76.4
81.5
5.1


Connecticut
80.3
77.7
82.8
5.1
−1.9
78.4
75.6
81.3
5.7


New Jersey
80.1
77.6
82.5
4.9
−2.6
77.5
74.6
80.5
5.9


Washington (state)
80.0
77.9
82.1
4.2
−0.8
79.2
76.9
81.6
4.7


Colorado
80.0
77.8
82.2
4.4
−1.7
78.3
75.8
80.9
5.1


Vermont
79.8
77.2
82.3
5.1
−1.0
78.8
76.1
81.4
5.3


Utah
79.7
78.0
81.5
3.5
−1.1
78.6
76.7
80.6
3.9


Oregon
79.6
77.3
81.9
4.6
−0.8
78.8
76.3
81.3
5.0


Idaho
79.5
77.5
81.5
4.0
−1.1
78.4
76.1
80.8
4.7


Rhode Island
79.5
77.0
81.8
4.8
−1.3
78.2
75.5
80.9
5.4


New Hampshire
79.4
77.1
81.6
4.5
−0.4
79.0
76.5
81.5
5.0


Wisconsin
79.3


## 4.2. .get()

Retrieve a specific attribute, such as the class of a table.

In [12]:
# Get the class attribute of the first table
print(first_table.get("class"))


['wikitable', 'sortable', 'static-row-numbers', 'sort-under']


## 4.3. attrs

Access all attributes of a tag.

In [13]:
# Print all attributes of the first table
print(first_table.attrs)


{'class': ['wikitable', 'sortable', 'static-row-numbers', 'sort-under'], 'style': 'text-align:right;'}


# Extract data of first table

In [14]:
# Locate the first 'wikitable'
table = soup.find("table", {"class": "wikitable"})

# Extract headers
headers = [header.text.strip() for header in table.find_all("th")]

# Extract rows
rows = []
for row in table.find_all("tr")[1:]:
    cells = row.find_all("td")
    rows.append([cell.text.strip() for cell in cells])

# Convert to a DataFrame
import pandas as pd
df = pd.DataFrame(rows, columns=headers)
print(df)


ValueError: 12 columns passed, passed data had 10 columns

In [15]:
# Locate the first 'wikitable'
table = soup.find("table", {"class": "wikitable"})

# Extract headers
headers = [header.text.strip() for header in table.find_all("th")]
print(f"Number of headers: {len(headers)}")
print("Headers:", headers)

# Extract rows and adjust mismatched lengths
rows = []
for row in table.find_all("tr")[1:]:  # Skip the header row
    cells = row.find_all(["td", "th"])
    row_data = [cell.text.strip() for cell in cells]

    # Adjust row length to match headers
    if len(row_data) < len(headers):
        row_data.extend([""] * (len(headers) - len(row_data)))  # Pad with empty strings
    elif len(row_data) > len(headers):
        row_data = row_data[:len(headers)]  # Trim excess cells

    rows.append(row_data)

# Convert to a DataFrame
import pandas as pd
df = pd.DataFrame(rows, columns=headers)

# Display the DataFrame
print(df)

# Optionally save to a CSV file
df.to_csv("us_states_and_territories_life_expectancy.csv", index=False)
print("Table data saved to 'us_states_and_territories_life_expectancy.csv'.")


Number of headers: 12
Headers: ['state', '2019', '2019→2020', '2020', 'overall', 'males', 'females', 'F Δ M', 'overall', 'males', 'females', 'F Δ M']
                 state   2019 2019→2020   2020  overall  males  females  \
0              overall  males   females  F Δ M  overall  males  females   
1        US on average   78.8      76.3   81.4      3.1   −1.8     77.0   
2           California   80.9      78.4   83.3      4.9   −1.9     79.0   
3               Hawaii   80.9      78.0   83.9      5.9   −0.2     80.7   
4             New York   80.7      78.2   83.1      4.9   −3.0     77.7   
5            Minnesota   80.4      78.3   82.6      4.3   −1.3     79.1   
6        Massachusetts   80.4      77.9   82.8      4.9   −1.4     79.0   
7          Connecticut   80.3      77.7   82.8      5.1   −1.9     78.4   
8           New Jersey   80.1      77.6   82.5      4.9   −2.6     77.5   
9   Washington (state)   80.0      77.9   82.1      4.2   −0.8     79.2   
10            Colorado   

# Extract data of second table

In [16]:
# Locate all 'wikitable' tables on the page
tables = soup.find_all("table", {"class": "wikitable"})

# Check if there are at least two tables
if len(tables) < 2:
    print("Less than two tables found on the page.")
else:
    # Select the second table
    second_table = tables[1]
    print("Second table found successfully.")

    # Extract headers from the second table
    headers = [header.text.strip() for header in second_table.find_all("th")]
    print(f"Number of headers: {len(headers)}")
    print("Headers:", headers)

    # Extract rows and adjust mismatched lengths
    rows = []
    for row in second_table.find_all("tr")[1:]:  # Skip the header row
        cells = row.find_all(["td", "th"])
        row_data = [cell.text.strip() for cell in cells]

        # Adjust row length to match headers
        if len(row_data) < len(headers):
            row_data.extend([""] * (len(headers) - len(row_data)))  # Pad with empty strings
        elif len(row_data) > len(headers):
            row_data = row_data[:len(headers)]  # Trim excess cells

        rows.append(row_data)

    # Convert to a DataFrame
    import pandas as pd
    df = pd.DataFrame(rows, columns=headers)

    # Display the DataFrame
    print(df)

    # Optionally save to a CSV file
    df.to_csv("second_table_data.csv", index=False)
    print("Second table data saved to 'second_table_data.csv'.")


Second table found successfully.
Number of headers: 5
Headers: ['Rank', 'State/Territory', 'Life Expectancy 2019[9]', 'Male', 'Female']
   Rank               State/Territory Life Expectancy 2019[9]      Male  \
0    1.                        Hawaii                    81.6      78.6   
1    2.                    California                    81.2      78.7   
2    3.                      New York                    81.2      78.6   
3    5.                     Minnesota                    80.6      78.4   
4    6.                   Connecticut                    80.6      78.0   
5    4.                 Massachusetts                    80.7      78.1   
6     8                      Colorado                    80.2      78.0   
7    7.                    New Jersey                    80.5      78.0   
8    9.                    Washington                    80.2      78.1   
9   15.                       Florida                    79.6      76.9   
10  11.                          Utah  