## Data Collection - API

#### Importing Libraries and Defining Auxiliary Functions

In [None]:
# Requests allows you to make HTTP requests which will be used to get data from an API
import requests

import pandas as pd
import numpy as np

# Datetime is a library that allows you to represent dates.
import datetime

# Setting this option will allow you to print all the columns of a dataframe
pd.set_option('display.max_columns', None)
# Setting this option will allow you to print all of the data in a feature
pd.set_option('display.max_colwidth', None)

Defining a series of helper functions that will allow you to use the API to extract information using identification numbers in the launch data.

In [None]:
# To get the booster name from the rocket column

# Takes the dataset and uses the rocket column to call an API and append the data to the list
def getBoosterVersion(data):
  """Append the booster (rocket) name of every launch to the global ``BoosterVersion`` list.

  Args:
    data: Table-like object whose ``'rocket'`` entry iterates over one SpaceX
      rocket ID per launch.

  Side effects:
    Appends exactly one item per row to ``BoosterVersion``: the ``name`` field
    returned by the rockets endpoint, or ``None`` when the row has no rocket ID.
  """
  names_by_id = {}  # the same rocket ID can appear on many rows; fetch each one only once
  for x in data['rocket']:
    if not x:
      # Keep the list aligned row-for-row with the other per-launch lists
      # instead of silently skipping the row.
      BoosterVersion.append(None)
      continue
    rocket_id = str(x)
    if rocket_id not in names_by_id:
      response = requests.get('https://api.spacexdata.com/v4/rockets/' + rocket_id).json()
      names_by_id[rocket_id] = response['name']
    BoosterVersion.append(names_by_id[rocket_id])

In [None]:
# To get the name of the launch site being used, the longitude and the latitude from the launchpad column

# Takes the dataset and uses the launchpad column to call an API and append the data to the lists
def getLaunchSite(data):
  """Append the launch site name, longitude and latitude of every launch to the global lists.

  Args:
    data: Table-like object whose ``'launchpad'`` entry iterates over one SpaceX
      launchpad ID per launch.

  Side effects:
    Appends one item per row to ``Longitude``, ``Latitude`` and ``LaunchSite``, taken
    from the ``longitude``, ``latitude`` and ``name`` fields of the launchpads endpoint.
  """
  pads_by_id = {}  # the same launchpad ID can appear on many rows; fetch each one only once
  for x in data['launchpad']:
    pad_id = str(x)
    if pad_id not in pads_by_id:
      pads_by_id[pad_id] = requests.get('https://api.spacexdata.com/v4/launchpads/' + pad_id).json()
    response = pads_by_id[pad_id]
    Longitude.append(response['longitude'])
    Latitude.append(response['latitude'])
    LaunchSite.append(response['name'])

In [None]:
# To get the mass of the payload and the orbit that it is going to from the payload column

# Takes the dataset and uses the payloads column to call an API and appends the data to the lists
def getPayloadData(data):
  """Append the payload mass and target orbit of every launch to the global lists.

  Args:
    data: Table-like object whose ``'payloads'`` entry iterates over one SpaceX
      payload ID per launch.

  Side effects:
    Appends exactly one item per row to ``PayloadMass`` and ``Orbit``: the ``mass_kg``
    and ``orbit`` fields returned by the payloads endpoint, or ``None`` for both when
    the row has no payload ID.
  """
  for load in data['payloads']:
    if not load:
      # Keep both lists aligned row-for-row with the other per-launch lists
      # instead of silently skipping the row.
      PayloadMass.append(None)
      Orbit.append(None)
      continue
    response = requests.get('https://api.spacexdata.com/v4/payloads/' + str(load)).json()
    PayloadMass.append(response['mass_kg'])
    Orbit.append(response['orbit'])

In [None]:
# To get the outcome of the landing, the type of the landing, number of flights with that core, whether gridfins were used,
# whether the core is reused, whether legs were used, the landing pad used, the block of the core which is a number used to
# separate versions of a core, the number of times this specific core has been reused, and the serial of the core from the cores column

# Takes the dataset and uses the cores column to call an API and append the data to the lists
def getCoreData(data):
  """Append landing and core details of every launch to the global result lists.

  Args:
    data: Table-like object whose ``'cores'`` entry iterates over one core record
      (a dict) per launch. Each record is read for the keys ``core``,
      ``landing_success``, ``landing_type``, ``flight``, ``gridfins``, ``reused``,
      ``legs`` and ``landpad``.

  Side effects:
    Appends one item per record to ``Block``, ``ReusedCount``, ``Serial``, ``Outcome``,
    ``Flights``, ``GridFins``, ``Reused``, ``Legs`` and ``LandingPad``. The first three
    come from the cores endpoint and are ``None`` when the record has no core ID.
  """
  for core in data['cores']:
    if core['core'] is not None:
      response = requests.get('https://api.spacexdata.com/v4/cores/' + core['core']).json()
      Block.append(response['block'])
      ReusedCount.append(response['reuse_count'])
      Serial.append(response['serial'])
    else:
      # No core ID to look up: append placeholders so every list keeps one entry per launch.
      Block.append(None)
      ReusedCount.append(None)
      Serial.append(None)
    # Outcome combines success flag and landing type, e.g. 'True ASDS' or 'None None'.
    Outcome.append(str(core['landing_success']) + ' ' + str(core['landing_type']))
    Flights.append(core['flight'])
    GridFins.append(core['gridfins'])
    Reused.append(core['reused'])
    Legs.append(core['legs'])
    LandingPad.append(core['landpad'])

#### Task 1: Requesting Rocket Launch Data from SpaceX API

In [None]:
spacex_url = 'https://api.spacexdata.com/v4/launches/past'
response = requests.get(spacex_url)

In [None]:
# Check the content of the response
# print(response.content)

Request and Parse the SpaceX Launch Data using the GET Request

In [None]:
# To make the requested JSON results more consistent, use the following static response object for this project
static_json_url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DS0321EN-SkillsNetwork/datasets/API_call_spacex_api.json'

# Actually request the static snapshot: without this call the URL above is never used and
# every later cell that reads `response` would still be working on the live API result.
response = requests.get(static_json_url)

In [None]:
response.status_code    # A 200 status response code means that the request was successful

200

In [None]:
# Turning the response content into a pandas dataframe

data = pd.json_normalize(response.json())
data.head(1)

Unnamed: 0,static_fire_date_utc,static_fire_date_unix,net,window,rocket,success,failures,details,crew,ships,capsules,payloads,launchpad,flight_number,name,date_utc,date_unix,date_local,date_precision,upcoming,cores,auto_update,tbd,launch_library_id,id,fairings.reused,fairings.recovery_attempt,fairings.recovered,fairings.ships,links.patch.small,links.patch.large,links.reddit.campaign,links.reddit.launch,links.reddit.media,links.reddit.recovery,links.flickr.small,links.flickr.original,links.presskit,links.webcast,links.youtube_id,links.article,links.wikipedia,fairings
0,2006-03-17T00:00:00.000Z,1142554000.0,False,0.0,5e9d0d95eda69955f709d1eb,False,"[{'time': 33, 'altitude': None, 'reason': 'merlin engine failure'}]",Engine failure at 33 seconds and loss of vehicle,[],[],[],[5eb0e4b5b6c3bb0006eeb1e1],5e9e4502f5090995de566f86,1,FalconSat,2006-03-24T22:30:00.000Z,1143239400,2006-03-25T10:30:00+12:00,hour,False,"[{'core': '5e9e289df35918033d3b2623', 'flight': 1, 'gridfins': False, 'legs': False, 'reused': False, 'landing_attempt': False, 'landing_success': None, 'landing_type': None, 'landpad': None}]",True,False,,5eb87cd9ffd86e000604b32a,False,False,False,[],https://images2.imgbox.com/94/f2/NN6Ph45r_o.png,https://images2.imgbox.com/5b/02/QcxHUb5V_o.png,,,,,[],[],,https://www.youtube.com/watch?v=0a_00nJ_Y88,0a_00nJ_Y88,https://www.space.com/2196-spacex-inaugural-falcon-1-rocket-lost-launch.html,https://en.wikipedia.org/wiki/DemoSat,


In [None]:
# Will use the API again to get information about the launches using the IDs given for each launch.
# Specific columns to be used: rocket, payloads, launchpad and cores

# Taking only the subset of data that will be used for further analysis
data = data[['rocket', 'payloads', 'launchpad', 'cores', 'flight_number', 'date_utc']]

# Removing the rows with multiple cores (Falcon rockets with 2 extra rocket boosters)
# and the rows that have multiple payloads in a single rocket.
data = data[data['cores'].map(len) == 1]
# Take an explicit copy after filtering: the column assignments below then modify this frame
# itself rather than a slice of the original, which avoids pandas' SettingWithCopyWarning.
data = data[data['payloads'].map(len) == 1].copy()

# Since payloads and cores are lists of size 1, you will also have to extract the single value in the list and replace the feature.
data['cores'] = data['cores'].map(lambda x:x[0])
data['payloads'] = data['payloads'].map(lambda x:x[0])

# Converting the date_utc to datetime data type and then keeping the date and dropping the time
data['date'] = pd.to_datetime(data['date_utc']).dt.date

# Restricting the dates of the launches using the date
data = data[data['date'] <= datetime.date(2020, 11, 13)]

In [None]:
# Global accumulator lists filled by the helper functions defined above.

# Filled by getBoosterVersion
BoosterVersion = list()

# Filled by getPayloadData
PayloadMass = list()
Orbit = list()

# Filled by getLaunchSite
LaunchSite = list()
Longitude = list()
Latitude = list()

# Filled by getCoreData
Outcome = list()
Flights = list()
GridFins = list()
Reused = list()
Legs = list()
LandingPad = list()
Block = list()
ReusedCount = list()
Serial = list()

Using the auxiliary functions created above to get additional data from the API.

* From the <code>rocket</code> we would like to learn the booster name

* From the <code>payload</code> we would like to learn the mass of the payload and the orbit that it is going to

* From the <code>launchpad</code> we would like to know the name of the launch site being used, the longitude, and the latitude.

* From <code>cores</code> we would like to learn the outcome of the landing, the type of the landing, number of flights with that core, whether gridfins were used, whether the core is reused, whether legs were used, the landing pad used, the block of the core which is a number used to separate versions of cores, the number of times this specific core has been reused, and the serial of the core.

The data from these requests will be stored in lists above and will be used to create a new dataframe.

In [None]:
# The helpers only ever append, so empty the accumulator lists first; otherwise
# re-running this cell would add a second copy of every launch to each list.
for accumulator in (BoosterVersion, PayloadMass, Orbit, LaunchSite, Outcome, Flights, GridFins,
                    Reused, Legs, LandingPad, Block, ReusedCount, Serial, Longitude, Latitude):
  accumulator.clear()

getBoosterVersion(data)
getLaunchSite(data)
getPayloadData(data)
getCoreData(data)

In [None]:
# Assemble the final dataframe: two columns come straight from `data`,
# the remaining ones from the lists filled by the helper functions.

launch_dict = dict(
    FlightNumber=list(data['flight_number']),
    Date=list(data['date']),
    BoosterVersion=BoosterVersion,
    PayloadMass=PayloadMass,
    Orbit=Orbit,
    LaunchSite=LaunchSite,
    Outcome=Outcome,
    Flights=Flights,
    GridFins=GridFins,
    Reused=Reused,
    Legs=Legs,
    LandingPad=LandingPad,
    Block=Block,
    ReusedCount=ReusedCount,
    Serial=Serial,
    Longitude=Longitude,
    Latitude=Latitude,
)

launch_df = pd.DataFrame(data=launch_dict)
# Summary statistics for every column, numeric or not
launch_df.describe(include='all')

Unnamed: 0,FlightNumber,Date,BoosterVersion,PayloadMass,Orbit,LaunchSite,Outcome,Flights,GridFins,Reused,Legs,LandingPad,Block,ReusedCount,Serial,Longitude,Latitude
count,94.0,94,94,88.0,94,94,94,94.0,94,94,94,64,90.0,94.0,94,94.0,94.0
unique,,94,2,,11,4,8,,2,2,2,5,,,57,,
top,,2006-03-24,Falcon 9,,GTO,CCSFS SLC 40,True ASDS,,True,False,True,5e9e3032383ecb6bb234e7ca,,,B1049,,
freq,,1,90,,27,55,41,,70,57,71,35,,,6,,
mean,54.202128,,,5919.165341,,,,1.755319,,,,,3.5,3.053191,,-75.553302,28.581782
std,30.589048,,,4909.689575,,,,1.197544,,,,,1.595288,4.153938,,53.39188,4.639981
min,1.0,,,20.0,,,,1.0,,,,,1.0,0.0,,-120.610829,9.047721
25%,28.25,,,2406.25,,,,1.0,,,,,2.0,0.0,,-80.603956,28.561857
50%,52.5,,,4414.0,,,,1.0,,,,,4.0,1.0,,-80.577366,28.561857
75%,81.5,,,9543.75,,,,2.0,,,,,5.0,4.0,,-80.577366,28.608058


#### Task 2: Filter the dataframe to only include `Falcon 9` launches

In [None]:
# Filtering the dataframe generated above to only keep the rows corresponding to Falcon 9.
# Take an explicit copy: later cells modify data_falcon9, and writing into a filtered
# slice of launch_df is what triggers pandas' SettingWithCopyWarning.
data_falcon9 = launch_df[launch_df['BoosterVersion'] == 'Falcon 9'].copy()

In [None]:
# Renumber the flights 1..N now that only Falcon 9 launches remain. assign() returns a new
# frame, so nothing is written into a filtered slice of launch_df (which pandas warns about).
data_falcon9 = data_falcon9.assign(FlightNumber=list(range(1, data_falcon9.shape[0] + 1)))
data_falcon9

Unnamed: 0,FlightNumber,Date,BoosterVersion,PayloadMass,Orbit,LaunchSite,Outcome,Flights,GridFins,Reused,Legs,LandingPad,Block,ReusedCount,Serial,Longitude,Latitude
4,1,2010-06-04,Falcon 9,,LEO,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B0003,-80.577366,28.561857
5,2,2012-05-22,Falcon 9,525.0,LEO,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B0005,-80.577366,28.561857
6,3,2013-03-01,Falcon 9,677.0,ISS,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B0007,-80.577366,28.561857
7,4,2013-09-29,Falcon 9,500.0,PO,VAFB SLC 4E,False Ocean,1,False,False,False,,1.0,0,B1003,-120.610829,34.632093
8,5,2013-12-03,Falcon 9,3170.0,GTO,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B1004,-80.577366,28.561857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,86,2020-09-03,Falcon 9,15600.0,VLEO,KSC LC 39A,True ASDS,2,True,True,True,5e9e3032383ecb6bb234e7ca,5.0,12,B1060,-80.603956,28.608058
90,87,2020-10-06,Falcon 9,15600.0,VLEO,KSC LC 39A,True ASDS,3,True,True,True,5e9e3032383ecb6bb234e7ca,5.0,13,B1058,-80.603956,28.608058
91,88,2020-10-18,Falcon 9,15600.0,VLEO,KSC LC 39A,True ASDS,6,True,True,True,5e9e3032383ecb6bb234e7ca,5.0,12,B1051,-80.603956,28.608058
92,89,2020-10-24,Falcon 9,15600.0,VLEO,CCSFS SLC 40,True ASDS,3,True,True,True,5e9e3033383ecbb9e534e7cc,5.0,12,B1060,-80.577366,28.561857


#### Task 3: Dealing with Missing Values

In [None]:
# Checking for null values in each column
data_falcon9.isnull().sum()

FlightNumber       0
Date               0
BoosterVersion     0
PayloadMass        5
Orbit              0
LaunchSite         0
Outcome            0
Flights            0
GridFins           0
Reused             0
Legs               0
LandingPad        26
Block              0
ReusedCount        0
Serial             0
Longitude          0
Latitude           0
dtype: int64

In [None]:
# Replacing the null values in the PayloadMass column with the average of the column.
# The filled column is assigned back to the frame: calling replace(..., inplace=True) on a
# selected column acts on a temporary object, which raises SettingWithCopyWarning and leaves
# the frame unchanged when pandas copy-on-write is active. fillna also avoids the np.NaN
# alias, which was removed in NumPy 2.0.
payload_mass_mean = data_falcon9['PayloadMass'].mean()
data_falcon9 = data_falcon9.assign(PayloadMass=data_falcon9['PayloadMass'].fillna(payload_mass_mean))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_falcon9['PayloadMass'].replace(np.NaN, data_falcon9['PayloadMass'].mean(), inplace = True)


In [None]:
# Rechecking if all the null values have been replaced
data_falcon9.isnull().sum()

FlightNumber       0
Date               0
BoosterVersion     0
PayloadMass        0
Orbit              0
LaunchSite         0
Outcome            0
Flights            0
GridFins           0
Reused             0
Legs               0
LandingPad        26
Block              0
ReusedCount        0
Serial             0
Longitude          0
Latitude           0
dtype: int64

In [None]:
# Exporting the dataset to be used for the next section.
data_falcon9.to_csv('dataset_part1.csv', index = False)