In [1]:
!pip3 install beautifulsoup4
!pip3 install requests



In [2]:
import sys

import requests
from bs4 import BeautifulSoup
import re
import unicodedata
import pandas as pd

In [7]:
def date_time(table_cells):
    """Return the first two text fragments of a cell, whitespace-stripped.

    Args:
        table_cells: Object exposing a ``.strings`` iterable of text
            fragments (a BeautifulSoup ``<td>`` tag at this notebook's
            call site).

    Returns:
        A list holding at most two stripped strings; the caller treats them
        as the date and the time.
    """
    cleaned = []
    for fragment in table_cells.strings:
        cleaned.append(fragment.strip())
    return cleaned[:2]

def booster_version(table_cells):
    """Build the booster-version text from a cell's text fragments.

    Keeps the fragments at even positions (0, 2, 4, ...), discards the last
    of those, and concatenates what remains.

    Args:
        table_cells: Object exposing a ``.strings`` iterable of text
            fragments (a BeautifulSoup ``<td>`` tag at this notebook's
            call site).

    Returns:
        The concatenated string; empty when fewer than two even-position
        fragments exist.
    """
    fragments = list(table_cells.strings)
    even_fragments = fragments[::2]
    return ''.join(even_fragments[:-1])

def landing_status(table_cells):
    """Return the first text fragment of a cell, unmodified.

    Args:
        table_cells: Object exposing a ``.strings`` iterable of text
            fragments (a BeautifulSoup ``<td>`` tag at this notebook's
            call site).

    Returns:
        The first fragment exactly as found (no stripping).

    Raises:
        IndexError: If the cell yields no text fragments.
    """
    fragments = list(table_cells.strings)
    return fragments[0]


def get_mass(table_cells):
    """Extract the leading "... kg" portion of a payload-mass cell.

    The cell text is NFKD-normalised (which, for example, turns a
    non-breaking space into a plain space) and stripped before searching.

    Args:
        table_cells: Object exposing a ``.text`` string (a BeautifulSoup
            ``<td>`` tag at this notebook's call site).

    Returns:
        The text up to and including the first "kg" (e.g. ``"4,700 kg"``),
        or the integer ``0`` when the cell is empty or contains no "kg".
    """
    mass = unicodedata.normalize("NFKD", table_cells.text).strip()
    kg_index = mass.find("kg")
    if kg_index == -1:
        # No unit found (empty cell, "Unknown", "Classified", ...). Slicing
        # with find()'s -1 would return just the first character, so report
        # "no mass" the same way an empty cell is reported.
        return 0
    return mass[:kg_index + 2]


def extract_column_from_header(row):
    """Derive a column name from a table header element.

    The first ``br``, ``a`` and ``sup`` element reachable as an attribute of
    ``row`` is detached (in that order) before the remaining contents are
    joined with single spaces.

    Args:
        row: Header element exposing ``.br``, ``.a``, ``.sup`` and
            ``.contents`` (a BeautifulSoup ``<th>`` tag at this notebook's
            call site). It is modified in place by the extractions.

    Returns:
        The stripped name (possibly an empty string), or ``None`` when the
        stripped text consists only of digits.
    """
    for tag_name in ("br", "a", "sup"):
        child = getattr(row, tag_name)
        if child:
            child.extract()

    # NOTE(review): ' '.join needs every remaining item in row.contents to be
    # a string; a leftover nested element would make it raise TypeError.
    name = ' '.join(row.contents).strip()

    # Purely numeric headers are not column names.
    if name.isdigit():
        return None
    return name


In [4]:
# Wikipedia page to scrape. The `oldid` query parameter requests one fixed
# revision of the article, so the scraped tables do not change as the live
# page is edited.
static_url = "https://en.wikipedia.org/w/index.php?title=List_of_Falcon_9_and_Falcon_Heavy_launches&oldid=1027686922"

In [5]:
# Download the page at static_url. The timeout keeps this cell from hanging
# indefinitely on a stalled connection, and raise_for_status() stops the
# notebook on an HTTP error instead of letting an error page be parsed.
response = requests.get(static_url, timeout=30)
response.raise_for_status()

In [6]:
# Parse the downloaded HTML text into a BeautifulSoup tree using the
# 'html.parser' backend.
soup = BeautifulSoup(response.text, 'html.parser')

In [8]:
# Show the page's <title> element to confirm which page was parsed.
print(soup.title)

<title>List of Falcon 9 and Falcon Heavy launches - Wikipedia</title>


In [9]:
# Collect every <table> whose class attribute is
# "wikitable plainrowheaders collapsible" (the class the launch tables carry).
html_tables = soup.find_all('table', "wikitable plainrowheaders collapsible")  

In [10]:
# Select the third matching table (index 2) and print its raw HTML to inspect
# its structure. NOTE: despite the variable name, this is html_tables[2], not
# the first table in the list.
first_launch_table = html_tables[2]
print(first_launch_table)

<table class="wikitable plainrowheaders collapsible" style="width: 100%;">
<tbody><tr>
<th scope="col">Flight No.
</th>
<th scope="col">Date and<br/>time (<a href="/wiki/Coordinated_Universal_Time" title="Coordinated Universal Time">UTC</a>)
</th>
<th scope="col"><a href="/wiki/List_of_Falcon_9_first-stage_boosters" title="List of Falcon 9 first-stage boosters">Version,<br/>Booster</a><sup class="reference" id="cite_ref-booster_11-2"><a href="#cite_note-booster-11">[b]</a></sup>
</th>
<th scope="col">Launch site
</th>
<th scope="col">Payload<sup class="reference" id="cite_ref-Dragon_12-2"><a href="#cite_note-Dragon-12">[c]</a></sup>
</th>
<th scope="col">Payload mass
</th>
<th scope="col">Orbit
</th>
<th scope="col">Customer
</th>
<th scope="col">Launch<br/>outcome
</th>
<th scope="col"><a href="/wiki/Falcon_9_first-stage_landing_tests" title="Falcon 9 first-stage landing tests">Booster<br/>landing</a>
</th></tr>
<tr>
<th rowspan="2" scope="row" style="text-align:center;">14
</th>
<td>

In [11]:
# Every <th> in the selected table, including any non-column header cells.
header_th_elements = first_launch_table.find_all('th')

# Keep only the headers that yield a non-empty name; the helper returns None
# for purely numeric headers, and those are dropped here as well.
column_names = [
    name
    for name in map(extract_column_from_header, header_th_elements)
    if name is not None and len(name) > 0
]


In [12]:
# Show the extracted header names; later cells refer to these exact strings.
print(column_names)

['Flight No.', 'Date and time ( )', 'Launch site', 'Payload', 'Payload mass', 'Orbit', 'Customer', 'Launch outcome']


In [13]:
# Start with one key per scraped header name (each value is None for now).
launch_dict = dict.fromkeys(column_names)

# Remove an irrelevant column
del launch_dict['Date and time ( )']

# Give each remaining scraped column, followed by the newly added columns,
# its own fresh empty list.
launch_dict.update(
    (column, [])
    for column in (
        'Flight No.',
        'Launch site',
        'Payload',
        'Payload mass',
        'Orbit',
        'Customer',
        'Launch outcome',
        # Added some new columns
        'Version Booster',
        'Booster landing',
        'Date',
        'Time',
    )
)

In [14]:
# Create a list to store one dictionary per launch
launch_records = []


def _anchor_text(cell, default=""):
    """Return the string of the first <a> in ``cell``, or ``default`` if the cell has no link."""
    anchor = cell.a
    return anchor.string if anchor else default


# Walk every launch table on the page
for table in soup.find_all('table', "wikitable plainrowheaders collapsible"):
    for table_row in table.find_all("tr"):
        # A launch row is one whose first heading cell holds a flight number;
        # every other row (column headers, notes, ...) is skipped.
        header = table_row.th
        if not (header and header.string):
            continue
        flight_number = header.string.strip()
        if not flight_number.isdigit():
            continue

        # Data cells of the launch row, in column order
        cells = table_row.find_all('td')

        # First cell carries both the date and the time
        date_parts = date_time(cells[0])

        # Booster version: fall back to the cell's link text when the
        # fragment-based extraction yields nothing.
        bv = booster_version(cells[1])
        if not bv:
            bv = _anchor_text(cells[1])

        # Key order here determines the DataFrame's column order.
        # Link-based fields all use _anchor_text so that a cell without a
        # link yields "" (as the customer column already did) instead of
        # raising AttributeError.
        launch_records.append({
            'Flight No.': flight_number,
            'Date': date_parts[0].strip(','),
            'Time': date_parts[1],
            'Version Booster': bv,
            'Launch site': _anchor_text(cells[2]),
            'Payload': _anchor_text(cells[3]),
            'Payload mass': get_mass(cells[4]),
            'Orbit': _anchor_text(cells[5]),
            'Customer': _anchor_text(cells[6]),
            'Launch outcome': list(cells[7].strings)[0],
            'Booster landing': landing_status(cells[8]),
        })

# Create a DataFrame from the list of dictionaries
df = pd.DataFrame(launch_records)

# Print the DataFrame
print(df)


    Flight No.             Date   Time Version Booster Launch site  \
0            1      4 June 2010  18:45  F9 v1.0B0003.1       CCAFS   
1            2  8 December 2010  15:43  F9 v1.0B0004.1       CCAFS   
2            3      22 May 2012  07:44  F9 v1.0B0005.1       CCAFS   
3            4   8 October 2012  00:35  F9 v1.0B0006.1       CCAFS   
4            5     1 March 2013  15:10  F9 v1.0B0007.1       CCAFS   
..         ...              ...    ...             ...         ...   
116        117       9 May 2021  06:42   F9 B5B1051.10       CCSFS   
117        118      15 May 2021  22:56    F9 B5B1058.8         KSC   
118        119      26 May 2021  18:59    F9 B5B1063.2       CCSFS   
119        120      3 June 2021  17:29    F9 B5B1067.1         KSC   
120        121      6 June 2021  04:26           F9 B5       CCSFS   

                                  Payload Payload mass Orbit   Customer  \
0    Dragon Spacecraft Qualification Unit            0   LEO     SpaceX   
1        

In [None]:
# Save the scraped launches as CSV (relative path, so it lands in the current
# working directory); index=False leaves the DataFrame's row index out of the file.
df.to_csv('spacex_web_scraped.csv', index=False)