## Data Collection - Web Scraping

#### Importing Required Packages and Defining Auxiliary functions

In [None]:
# Scrape the Falcon 9 launch records from an HTML table with BeautifulSoup and then convert them to a Pandas dataframe

#!pip3 install beautifulsoup4
#!pip3 install requests

In [None]:
import sys
import requests
from bs4 import BeautifulSoup
import re
import unicodedata
import pandas as pd

In [None]:
# Helper functions for processing the scraped HTML table

def date_time(table_cells):
  '''
  Return the first two text fragments of an HTML table cell, each stripped of
  surrounding whitespace. The caller reads them as the date and the time.
  Input: an object whose `strings` attribute yields the text fragments of the cell
  Output: a list with at most two strings
  '''
  fragments = []
  for fragment in table_cells.strings:
    fragments.append(fragment.strip())
  return fragments[:2]

def booster_version(table_cells):
  '''
  Build the booster version text from an HTML table cell.
  Every second text fragment of the cell is kept (positions 0, 2, 4, ...), the
  last kept fragment is dropped, and the remaining ones are concatenated.
  Input: an object whose `strings` attribute yields the text fragments of the cell
  Output: the concatenated string, which is empty when fewer than two fragments are kept
  '''
  fragments = list(table_cells.strings)
  kept = fragments[::2]
  return ''.join(kept[:-1])

def landing_status(table_cells):
  '''
  Return the first text fragment of an HTML table cell, unmodified.
  The caller stores it as the booster landing status.
  Input: an object whose `strings` attribute yields the text fragments of the cell
  Raises: IndexError when the cell has no text fragments at all
  '''
  fragments = list(table_cells.strings)
  return fragments[0]

def get_mass(table_cells):
  '''
  Return the payload mass text of an HTML table cell, cut off right after the 'kg' unit.
  The cell text is NFKD-normalized first, so compatibility characters such as
  non-breaking spaces become their plain equivalents.
  Input: an object whose `text` attribute holds the text of the cell
  Output: the text up to and including the first 'kg', or 0 when the cell is
          empty or does not contain 'kg' at all
  '''
  mass = unicodedata.normalize('NFKD', table_cells.text).strip()
  unit_position = mass.find('kg')

  # find() returns -1 when 'kg' is missing; slicing with that value would keep only
  # the first character of the text, so such cells are treated like empty ones.
  if unit_position == -1:
    return 0
  return mass[0: unit_position + 2]

def extract_column_from_header(row):
  '''
  Return the column name held by an HTML table header cell.
  The first <br>, <a> and <sup> element found in the cell (looked up in that
  order) is detached from the cell before its remaining contents are joined
  with single spaces, so the passed-in element is modified.
  Input: the element of a table header cell
  Output: the stripped column name (possibly an empty string), or None when
          the name consists of digits only
  '''
  for tag_name in ('br', 'a', 'sup'):
    nested_tag = getattr(row, tag_name)
    if nested_tag:
      nested_tag.extract()

  column_name = ' '.join(row.contents)

  # Purely numeric headers are not column names
  if column_name.strip().isdigit():
    return None
  return column_name.strip()

#### Task 1: Request the Falcon 9 Launch Wiki Page from its URL

In [None]:
# To keep the lab tasks consistent, scrape the data from a fixed snapshot of the Wiki page
# (the revision pinned by the `oldid` parameter, dated 9th June 2021) rather than the live page.
static_url = 'https://en.wikipedia.org/w/index.php?title=List_of_Falcon_9_and_Falcon_Heavy_launches&oldid=1027686922'

In [None]:
# Perform an HTTP GET request to get the Falcon 9 HTML page as an HTTP response.
# The timeout keeps this cell from hanging indefinitely on a stalled connection.
response = requests.get(static_url, timeout=30)

# Stop here on an HTTP error status (4xx/5xx) instead of silently parsing an error page
response.raise_for_status()

# Create a BeautifulSoup object from the HTML response
soup = BeautifulSoup(response.text, 'html.parser')

# Print the page title to verify the BeautifulSoup object was created properly
print(soup.title)

<title>List of Falcon 9 and Falcon Heavy launches - Wikipedia</title>


#### Task 2: Extract All Column/Variable Names from the HTML Header Table

In [None]:
# Collect every <table> element of the wiki page
html_tables = soup.find_all('table')
#html_tables[0]   # Uncomment this line to verify that the list is created properly

In [None]:
# Keep the third table of the page (index 2) for the header extraction below
first_launch_table = html_tables[2]
# Uncomment the next line to inspect the raw HTML of that table
# print(first_launch_table)

In [None]:
# Display the table header elements (<th>) that carry the column names
first_launch_table.find_all('th')

In [None]:
# Turn every <th> element of the table into a column name and keep only the
# non-empty results (numeric headers come back as None, blank ones as '').
first_launch_table_headers = first_launch_table.find_all('th')
candidate_names = [extract_column_from_header(header_cell) for header_cell in first_launch_table_headers]
column_names = [name for name in candidate_names if name]

In [None]:
# Show the column names that were extracted from the table headers
print(column_names)

['Flight No.', 'Date and time ( )', 'Launch site', 'Payload', 'Payload mass', 'Orbit', 'Customer', 'Launch outcome']


#### Task 3: Create a Dataframe by Parsing the Launch HTML Tables

In [None]:
# Build the column store: every key is a column name and every value is the
# list of cell values collected for that column.
# Later, this dictionary will be converted into a Pandas dataframe.

launch_dict = {column_name: [] for column_name in column_names}

# Remove an irrelevant column (raises KeyError if the header is missing)
launch_dict.pop('Date and time ( )')

# Extra columns that are not taken from the header names
for new_column in ('Version Booster', 'Booster Landing', 'Date', 'Time'):
  launch_dict[new_column] = []

launch_dict

{'Flight No.': [],
 'Launch site': [],
 'Payload': [],
 'Payload mass': [],
 'Orbit': [],
 'Customer': [],
 'Launch outcome': [],
 'Version Booster': [],
 'Booster Landing': [],
 'Date': [],
 'Time': []}

In [None]:
# Fill launch_dict with the launch records found in the table rows.
# HTML tables on Wiki pages are likely to contain unexpected annotations and other kinds of noise - those are dealt with below as well.

# Empty every column first so that re-running this cell does not append the same records twice
for column_values in launch_dict.values():
  column_values.clear()

extracted_row = 0

# Extract each table
for table_number, table in enumerate(soup.find_all('table', 'wikitable plainrowheaders collapsible')):

  # Get table row
  for rows in table.find_all('tr'):

    # A row counts as a launch record only if its first heading cell holds a launch number.
    # The flag is reset for every row so that a heading without a plain string
    # (rows.th.string is None) cannot inherit the flag and flight number of an earlier row.
    flag = False
    if rows.th and rows.th.string:
      flight_number = rows.th.string.strip()
      flag = flight_number.isdigit()

    # Get table element
    row = rows.find_all('td')

    # If it is a number, save cells in a dictionary
    if flag:
      extracted_row += 1

      # Flight number value
      launch_dict['Flight No.'].append(flight_number)
      datatimelist = date_time(row[0])

      # Date value (the date fragment may carry a comma at either end)
      date = datatimelist[0].strip(',')
      launch_dict['Date'].append(date)

      # Time value
      time = datatimelist[1]
      launch_dict['Time'].append(time)

      # Booster version
      bv = booster_version(row[1])
      if not(bv):
        # Fall back to the link text of the booster cell itself (row[1]);
        # row[2] is the launch site cell and must not be used here.
        bv = row[1].a.string
      launch_dict['Version Booster'].append(bv)

      # Launch site
      launch_site = row[2].a.string
      launch_dict['Launch site'].append(launch_site)

      # Payload
      payload = row[3].a.string
      launch_dict['Payload'].append(payload)

      # Payload mass
      payload_mass = get_mass(row[4])
      launch_dict['Payload mass'].append(payload_mass)

      # Orbit
      orbit = row[5].a.string
      launch_dict['Orbit'].append(orbit)

      # Customer
      # There is a value in table 8 under the Customer column which does not have an anchor tag,
      # so the plain cell text is used when no anchor is present.
      customer_link = row[6].a
      if customer_link is not None:
        customer = customer_link.string
      else:
        customer = row[6].string.strip()
      launch_dict['Customer'].append(customer)

      # Launch outcome
      # NOTE: the first text fragment is stored as-is and may end with a newline
      launch_outcome = list(row[7].strings)[0]
      launch_dict['Launch outcome'].append(launch_outcome)

      # Booster landing
      booster_landing = landing_status(row[8])
      launch_dict['Booster Landing'].append(booster_landing)

In [None]:
# Convert launch_dict to a dataframe. Each list is wrapped in a Series first,
# so columns of different lengths are aligned on the index instead of raising an error.

launch_columns = {}
for column_name, column_values in launch_dict.items():
  launch_columns[column_name] = pd.Series(column_values)

df = pd.DataFrame(launch_columns)
df.head()

Unnamed: 0,Flight No.,Launch site,Payload,Payload mass,Orbit,Customer,Launch outcome,Version Booster,Booster Landing,Date,Time
0,1,CCAFS,Dragon Spacecraft Qualification Unit,0,LEO,SpaceX,Success\n,F9 v1.0B0003.1,Failure,4 June 2010,18:45
1,2,CCAFS,Dragon,0,LEO,NASA,Success,F9 v1.0B0004.1,Failure,8 December 2010,15:43
2,3,CCAFS,Dragon,525 kg,LEO,NASA,Success,F9 v1.0B0005.1,No attempt\n,22 May 2012,07:44
3,4,CCAFS,SpaceX CRS-1,"4,700 kg",LEO,NASA,Success\n,F9 v1.0B0006.1,No attempt,8 October 2012,00:35
4,5,CCAFS,SpaceX CRS-2,"4,877 kg",LEO,NASA,Success\n,F9 v1.0B0007.1,No attempt\n,1 March 2013,15:10
