# Data Collection via API 

**Content of Notebook:**
- Collect/Request data from the SpaceX API
- Clean the requested data

#### Import Libraries

In [17]:
import requests
# requests make HTTP requests to get data from an API
import pandas as pd
import numpy as np
import datetime

### Request SpaceX Launch Data with GET Request

Request rocket launch data from SpaceX API:

In [21]:
spacex_url="https://api.spacexdata.com/v4/launches/past"
response = requests.get(spacex_url)

Can check the content of the request, but it contains a large amount of data:

In [24]:
# print(response.content)

In [26]:
response.status_code  # Request was successfull with the 200 status response code

200

In [29]:
print(response.request.headers)  # 

{'User-Agent': 'python-requests/2.31.0', 'Accept-Encoding': 'gzip, deflate, br, zstd', 'Accept': '*/*', 'Connection': 'keep-alive'}


Turn response content (json result) into a Pandas dataframe using <code>.json_normalize()</code>

In [95]:
data = pd.json_normalize(response.json())  
data.shape

(187, 43)

In [97]:
data.columns # names of 43 columns

Index(['static_fire_date_utc', 'static_fire_date_unix', 'net', 'window',
       'rocket', 'success', 'failures', 'details', 'crew', 'ships', 'capsules',
       'payloads', 'launchpad', 'flight_number', 'name', 'date_utc',
       'date_unix', 'date_local', 'date_precision', 'upcoming', 'cores',
       'auto_update', 'tbd', 'launch_library_id', 'id', 'fairings.reused',
       'fairings.recovery_attempt', 'fairings.recovered', 'fairings.ships',
       'links.patch.small', 'links.patch.large', 'links.reddit.campaign',
       'links.reddit.launch', 'links.reddit.media', 'links.reddit.recovery',
       'links.flickr.small', 'links.flickr.original', 'links.presskit',
       'links.webcast', 'links.youtube_id', 'links.article', 'links.wikipedia',
       'fairings'],
      dtype='object')

Keep only what we need:

In [99]:
# Subset the dataframe keeping only the features we want, along with the flight number and date_utc.
data = data[['rocket', 'payloads', 'launchpad', 'cores', 'flight_number', 'date_utc']]

# Remove rows with multiple cores because those are falcon rockets with 2 extra rocket boosters and rows that have multiple payloads in a single rocket.
data = data[data['cores'].map(len)==1]
data = data[data['payloads'].map(len)==1]

# Since payloads and cores are lists of size 1, further extract the single value in the list and replace the feature.
data['cores'] = data['cores'].map(lambda x : x[0])
data['payloads'] = data['payloads'].map(lambda x : x[0])

# Convert the date_utc to a datetime datatype and then extracting the date leaving the time
data['date'] = pd.to_datetime(data['date_utc']).dt.date

# Restrict the dates of the launches
data = data[data['date'] <= datetime.date(2020, 11, 13)]

In [101]:
data.shape

(94, 7)

### Extract Further Information with ID numbers

In [106]:
data.head()

Unnamed: 0,rocket,payloads,launchpad,cores,flight_number,date_utc,date
0,5e9d0d95eda69955f709d1eb,5eb0e4b5b6c3bb0006eeb1e1,5e9e4502f5090995de566f86,"{'core': '5e9e289df35918033d3b2623', 'flight':...",1,2006-03-24T22:30:00.000Z,2006-03-24
1,5e9d0d95eda69955f709d1eb,5eb0e4b6b6c3bb0006eeb1e2,5e9e4502f5090995de566f86,"{'core': '5e9e289ef35918416a3b2624', 'flight':...",2,2007-03-21T01:10:00.000Z,2007-03-21
3,5e9d0d95eda69955f709d1eb,5eb0e4b7b6c3bb0006eeb1e5,5e9e4502f5090995de566f86,"{'core': '5e9e289ef3591855dc3b2626', 'flight':...",4,2008-09-28T23:15:00.000Z,2008-09-28
4,5e9d0d95eda69955f709d1eb,5eb0e4b7b6c3bb0006eeb1e6,5e9e4502f5090995de566f86,"{'core': '5e9e289ef359184f103b2627', 'flight':...",5,2009-07-13T03:35:00.000Z,2009-07-13
5,5e9d0d95eda69973a809d1ec,5eb0e4b7b6c3bb0006eeb1e7,5e9e4501f509094ba4566f84,"{'core': '5e9e289ef359185f2b3b2628', 'flight':...",6,2010-06-04T18:45:00.000Z,2010-06-04


A lot of the data are IDs. For example the rocket column has no information about the rocket just an identification number.  
Now use the API again to get information about the launches using the IDs for each launch.  
Will do this for four columns: <code>rocket</code>, <code>payloads</code>, <code>launchpad</code>, and <code>cores</code>.


#### Define helper functions that will use API to extract information using ID numbers in previous launch data.

From the <code>rocket</code> column --- extract booster version/name.

In [112]:
# Takes the dataset and uses the rocket column to call the API and append the data to the list
def getBoosterVersion(data):
    for x in data['rocket']:
       if x:
        response = requests.get("https://api.spacexdata.com/v4/rockets/"+str(x)).json()
        BoosterVersion.append(response['name'])

From the <code>launchpad</code> --- extract name of the launch site being used, the logitude, and the latitude.

In [115]:
# Takes the dataset and uses the launchpad column to call the API and append the data to the list
def getLaunchSite(data):
    for x in data['launchpad']:
       if x:
         response = requests.get("https://api.spacexdata.com/v4/launchpads/"+str(x)).json()
         Longitude.append(response['longitude'])
         Latitude.append(response['latitude'])
         LaunchSite.append(response['name'])

From the <code>payload</code> --- extract the mass of the payload and the destination orbit.

In [118]:
# Takes the dataset and uses the payloads column to call the API and append the data to the lists
def getPayloadData(data):
    for load in data['payloads']:
       if load:
        response = requests.get("https://api.spacexdata.com/v4/payloads/"+load).json()
        PayloadMass.append(response['mass_kg'])
        Orbit.append(response['orbit'])

From <code>cores</code> --- extract outcome of the landing, the type of the landing, number of flights with that core, whether gridfins were used, wheter the core is reused, wheter legs were used, the landing pad used, the block of the core which is a number used to seperate version of cores, the number of times this specific core has been reused, and the serial of the core.

In [121]:
# Takes the dataset and uses the cores column to call the API and append the data to the lists
def getCoreData(data):
    for core in data['cores']:
            if core['core'] != None:
                response = requests.get("https://api.spacexdata.com/v4/cores/"+core['core']).json()
                Block.append(response['block'])
                ReusedCount.append(response['reuse_count'])
                Serial.append(response['serial'])
            else:
                Block.append(None)
                ReusedCount.append(None)
                Serial.append(None)
            Outcome.append(str(core['landing_success'])+' '+str(core['landing_type']))
            Flights.append(core['flight'])
            GridFins.append(core['gridfins'])
            Reused.append(core['reused'])
            Legs.append(core['legs'])
            LandingPad.append(core['landpad'])

As an example, with API URL https://api.spacexdata.com/v4/payloads/5eb0e4b5b6c3bb0006eeb1e1  
You can see 'mass_kg' and 'orbit' info for this launch (launch id 5eb0e4b5b6c3bb0006eeb1e1, contained in previous version of data under 'payload')  
With the getPayLaodData function, thes two variables get extracted and appended into the next version of data under 'PayLoadMass', 'Orbit'.  
"  
<code> PayloadMass.append(response['mass_kg']) </code>  
<code> Orbit.append(response['orbit']) </code>  
"

#### Extracting Additional Inforation in Place of (/with) ID numbers in Previous Data

In [123]:
#Global variables 
BoosterVersion = []
PayloadMass = []
Orbit = []
LaunchSite = []
Outcome = []
Flights = []
GridFins = []
Reused = []
Legs = []
LandingPad = []
Block = []
ReusedCount = []
Serial = []
Longitude = []
Latitude = []

In [125]:
data.shape

(94, 7)

In [127]:
# Call getBoosterVersion function to get booster version
getBoosterVersion(data)

# Call getLaunchSite function to get launch site, longitude, latitude
getLaunchSite(data)

# Call getPayloadData function to get mass and orbit
getPayloadData(data)

# Call getCoreData function to get outcome, landing type, number of flights, fridfins use, core reuse, legs use, landing pad use, core version, number of core use, serial of core.
getCoreData(data)

In [130]:
len(BoosterVersion), len(LandingPad)

(94, 94)

In [132]:
BoosterVersion[0:5], LandingPad[0:5]

(['Falcon 1', 'Falcon 1', 'Falcon 1', 'Falcon 1', 'Falcon 9'],
 [None, None, None, None, None])

Now combine the columns into a dictionary.

In [135]:
launch_dict = {'FlightNumber': list(data['flight_number']),
'Date': list(data['date']),
'BoosterVersion':BoosterVersion,
'PayloadMass':PayloadMass,
'Orbit':Orbit,
'LaunchSite':LaunchSite,
'Outcome':Outcome,
'Flights':Flights,
'GridFins':GridFins,
'Reused':Reused,
'Legs':Legs,
'LandingPad':LandingPad,
'Block':Block,
'ReusedCount':ReusedCount,
'Serial':Serial,
'Longitude': Longitude,
'Latitude': Latitude}


Now can create a Pandas data frame from the dictionary.

In [138]:
launchdata = pd.DataFrame.from_dict(launch_dict)  

In [140]:
launchdata.shape

(94, 17)

In [142]:
launchdata.head()

Unnamed: 0,FlightNumber,Date,BoosterVersion,PayloadMass,Orbit,LaunchSite,Outcome,Flights,GridFins,Reused,Legs,LandingPad,Block,ReusedCount,Serial,Longitude,Latitude
0,1,2006-03-24,Falcon 1,20.0,LEO,Kwajalein Atoll,None None,1,False,False,False,,,0,Merlin1A,167.743129,9.047721
1,2,2007-03-21,Falcon 1,,LEO,Kwajalein Atoll,None None,1,False,False,False,,,0,Merlin2A,167.743129,9.047721
2,4,2008-09-28,Falcon 1,165.0,LEO,Kwajalein Atoll,None None,1,False,False,False,,,0,Merlin2C,167.743129,9.047721
3,5,2009-07-13,Falcon 1,200.0,LEO,Kwajalein Atoll,None None,1,False,False,False,,,0,Merlin3C,167.743129,9.047721
4,6,2010-06-04,Falcon 9,,LEO,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B0003,-80.577366,28.561857


In [145]:
launchdata.describe()  

Unnamed: 0,FlightNumber,PayloadMass,Flights,Block,ReusedCount,Longitude,Latitude
count,94.0,88.0,94.0,90.0,94.0,94.0,94.0
mean,54.202128,5919.165341,1.755319,3.5,3.053191,-75.553302,28.581782
std,30.589048,4909.689575,1.197544,1.595288,4.153938,53.39188,4.639981
min,1.0,20.0,1.0,1.0,0.0,-120.610829,9.047721
25%,28.25,2406.25,1.0,2.0,0.0,-80.603956,28.561857
50%,52.5,4414.0,1.0,4.0,1.0,-80.577366,28.561857
75%,81.5,9543.75,2.0,5.0,4.0,-80.577366,28.608058
max,106.0,15600.0,6.0,5.0,13.0,167.743129,34.632093


### Data Cleaning

#### Filter the dataframe to only include `Falcon 9` launches

Filter the dataframe using the <code>BoosterVersion</code> column to only keep the Falcon 9 launches (removing Falcon 1 launches). 

In [161]:
data_falcon9 = launchdata[launchdata['BoosterVersion']!='Falcon 1']  

In [163]:
data_falcon9.shape

(90, 17)

Now that we have removed some values we should reset the FlgihtNumber column

In [168]:
data_falcon9.loc[:,'FlightNumber'] = list(range(1, data_falcon9.shape[0]+1))
data_falcon9.head()

Unnamed: 0,FlightNumber,Date,BoosterVersion,PayloadMass,Orbit,LaunchSite,Outcome,Flights,GridFins,Reused,Legs,LandingPad,Block,ReusedCount,Serial,Longitude,Latitude
4,1,2010-06-04,Falcon 9,,LEO,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B0003,-80.577366,28.561857
5,2,2012-05-22,Falcon 9,525.0,LEO,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B0005,-80.577366,28.561857
6,3,2013-03-01,Falcon 9,677.0,ISS,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B0007,-80.577366,28.561857
7,4,2013-09-29,Falcon 9,500.0,PO,VAFB SLC 4E,False Ocean,1,False,False,False,,1.0,0,B1003,-120.610829,34.632093
8,5,2013-12-03,Falcon 9,3170.0,GTO,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B1004,-80.577366,28.561857


#### Missing Data

In [173]:
data_falcon9.isnull().sum()

FlightNumber       0
Date               0
BoosterVersion     0
PayloadMass        5
Orbit              0
LaunchSite         0
Outcome            0
Flights            0
GridFins           0
Reused             0
Legs               0
LandingPad        26
Block              0
ReusedCount        0
Serial             0
Longitude          0
Latitude           0
dtype: int64

The <code>LandingPad</code> column has 'None' values when landing pads were not used.   
So only need to deal with missing in <code>PayLoadMass</code>: replace the np.nan values with its mean value

In [183]:
# Calculate the mean value of PayloadMass column
avg_plm = data_falcon9["PayloadMass"].astype("float").mean(axis=0)   
data_falcon9["PayloadMass"].replace(np.nan, avg_plm, inplace=True)  

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_falcon9["PayloadMass"].replace(np.nan, avg_plm, inplace=True)


In [185]:
data_falcon9.isnull().sum()

FlightNumber       0
Date               0
BoosterVersion     0
PayloadMass        0
Orbit              0
LaunchSite         0
Outcome            0
Flights            0
GridFins           0
Reused             0
Legs               0
LandingPad        26
Block              0
ReusedCount        0
Serial             0
Longitude          0
Latitude           0
dtype: int64

#### Cleaning complete. Export Data to csv:

In [None]:
# data_falcon9.to_csv('sw_dataset_part_1.csv', index=False)