### Import Required Libraries and Set Up Environment Variables

In [106]:
# Dependencies
import requests
import time
import os
import pandas as pd
import json
import os
from dotenv import load_dotenv
from datetime import datetime

## Load the NASA_API_KEY from the env file
load_dotenv()
NASA_API_KEY = os.getenv("NASA_API_KEY")

### CME Data

In [136]:
# Dedicated function for making get requests
def fetch_data(url: str, params: dict = {}, jdumps: bool = False):
    """
    Fetch data from from url, params are optional,
    setting jdumps to True will return print the 
    first result w/ an indent of 4
    """
    print("Attempting to connect.")

    if params != {}:
        print('using parameters')
        res = requests.get(url, params)

    print('no request parameters')
    res = requests.get(url)

    if res.status_code != 200:
        print(f"Connection Error!! Code: {res.status_code}")

    if jdumps is True:
        print("Returning first result:")
        print(json.dumps(res.json(), indent=4))

    print("Successfully connected")
    return res.json()

In [108]:
# Set the base URL to NASA's DONKI API:
base_url = "https://api.nasa.gov/DONKI/"

# Set the specifier for CMEs:
CME = "CME"

# Search for CMEs published between a begin and end date
startDate = "2023-05-01"
endDate = "2024-05-01"

# Build URL for CME
query_url = (
    f"{base_url}{CME}?startDate={startDate}&endDate={endDate}&api_key={NASA_API_KEY}"
)
print(query_url)

https://api.nasa.gov/DONKI/CME?startDate=2023-05-01&endDate=2024-05-01&api_key=17LT6HvZHgUZnEPmhuAUME9kRtVzSKhrLU9sqRKc


In [109]:
# Convert the response variable to json and store it as a variable named cme_json
cme_json = fetch_data(query_url, {}, True)

Attempting to connect.
no request parameters
Returning json.dumps()
[
    {
        "activityID": "2023-05-01T01:48:00-CME-001",
        "catalog": "M2M_CATALOG",
        "startTime": "2023-05-01T01:48Z",
        "instruments": [
            {
                "displayName": "SOHO: LASCO/C2"
            },
            {
                "displayName": "SOHO: LASCO/C3"
            },
            {
                "displayName": "STEREO A: SECCHI/COR2"
            }
        ],
        "sourceLocation": "",
        "activeRegionNum": null,
        "note": "Visible in the E of SOHO LASCO C2/C3 and STEREO A COR2. The source may be on or just beyond the NE limb as seen from SDO. The source signature can be seen as opening/rising field lines around N15 on the east limb starting around 2023-05-01T01:00Z in SDO AIA 171/193. Source not clearly visible in STEREO A EUV imagery due to a data gap.",
        "submissionTime": "2023-05-01T17:36Z",
        "versionId": 1,
        "link": "https://webtool

In [110]:
# Convert cme_json to a Pandas DataFrame
df = pd.DataFrame(cme_json)
df.info()

# Keep only the columns: activityID, startTime, linkedEvents
df = df[["activityID", "startTime", "linkedEvents"]]
df.dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1228 entries, 0 to 1227
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   activityID       1228 non-null   object 
 1   catalog          1228 non-null   object 
 2   startTime        1228 non-null   object 
 3   instruments      1228 non-null   object 
 4   sourceLocation   1228 non-null   object 
 5   activeRegionNum  301 non-null    float64
 6   note             1228 non-null   object 
 7   submissionTime   1228 non-null   object 
 8   versionId        1228 non-null   int64  
 9   link             1228 non-null   object 
 10  cmeAnalyses      1228 non-null   object 
 11  linkedEvents     265 non-null    object 
dtypes: float64(1), int64(1), object(10)
memory usage: 115.3+ KB


activityID      object
startTime       object
linkedEvents    object
dtype: object

In [111]:
# Notice that the linkedEvents column allows us to identify the corresponding GST
# Remove rows with missing 'linkedEvents' since we won't be able to assign these to GSTs
df = df.dropna(how='any')
df.isna().sum()
# df.isnull().sum()


activityID      0
startTime       0
linkedEvents    0
dtype: int64

In [112]:
# Notice that the linkedEvents sometimes contains multiple events per row
# Write a nested for loop that iterates first over each row in the cme DataFrame (using the index)
# and then iterates over the values in 'linkedEvents'
# and adds the elements individually to a list of dictionaries where each row is one element

# Initialize an empty list to store the expanded rows
expanded_rows = []

# Iterate over each index in the DataFrame
for i in range(len(df)):
  
    # Iterate over each dictionary in the list
    for e in df.iloc[i]["linkedEvents"]:

        # Append a new dictionary to the expanded_rows list for each dictionary item and corresponding 'activityID' and 'startTime' value
        expanded_rows.append(e)

In [113]:
# Create a new DataFrame from the expanded rows
events_df = pd.DataFrame(expanded_rows)
events_df.sample(n=5)

Unnamed: 0,activityID
179,2023-09-01T04:49:00-SEP-001
472,2024-03-24T14:10:00-IPS-001
502,2024-04-21T12:11:00-FLR-001
278,2023-12-01T00:11:00-IPS-001
21,2023-05-09T11:59:00-FLR-001


In [114]:
# Create a function called extract_activityID_from_dict that takes a dict as input such as in linkedEvents
# and verify below that it works as expected using one row from linkedEvents as an example
# Be sure to use a try and except block to handle errors
def extract_activityID_from_dict(events: list):
    """
      Extracts and returns the activityId from
      linkedEvents dictionary.
      
      @params(event: [])
      
      @returns(activityID: [] | read_errors: [])
    """
    ids = []
    for e in range(len(events)):
        try:
            ids.append(events[e]["activityID"])
        except KeyError:
            ids.append("ID ERROR!")
    return ids

In [115]:
df.isnull().sum()

activityID      0
startTime       0
linkedEvents    0
dtype: int64

In [116]:
# Apply this function to each row in the 'linkedEvents' column (you can use apply() and a lambda function)
# and create a new column called 'GST_ActivityID' using loc indexer:

df["GST_ActivityID"] = df.linkedEvents.apply(
    lambda x: extract_activityID_from_dict(x)
)

df.sample(n=5)

Unnamed: 0,activityID,startTime,linkedEvents,GST_ActivityID
381,2023-08-25T11:48:00-CME-001,2023-08-25T11:48Z,[{'activityID': '2023-08-25T11:11:00-FLR-001'}],[2023-08-25T11:11:00-FLR-001]
1054,2024-03-17T03:36:00-CME-001,2024-03-17T03:36Z,[{'activityID': '2024-03-21T01:50:00-IPS-001'}],[2024-03-21T01:50:00-IPS-001]
624,2023-11-15T12:36:00-CME-001,2023-11-15T12:36Z,[{'activityID': '2023-11-15T12:05:00-FLR-001'}],[2023-11-15T12:05:00-FLR-001]
493,2023-09-30T04:44:00-CME-001,2023-09-30T04:44Z,[{'activityID': '2023-09-30T03:57:00-FLR-001'}],[2023-09-30T03:57:00-FLR-001]
266,2023-07-18T20:12:00-CME-001,2023-07-18T20:12Z,[{'activityID': '2023-07-18T19:32:00-FLR-001'}...,"[2023-07-18T19:32:00-FLR-001, 2023-07-18T20:00..."


In [117]:
# Remove rows with missing GST_ActivityID, since we can't assign them to GSTs:
df.isnull().sum()

activityID        0
startTime         0
linkedEvents      0
GST_ActivityID    0
dtype: int64

In [118]:
# print out the datatype of each column in this DataFrame:
df.dtypes

activityID        object
startTime         object
linkedEvents      object
GST_ActivityID    object
dtype: object

In [119]:
'''
  NOTE: This cell is difficult to control.
  I am going to wrap this in a function so
  it can be used more surgically

  I am coping the original df because it will
  allow the user to either store the copy as
  a new variable or save the cdf to the 
  original df. Making the choice of how 
  they want to use the result a little
  easier.
'''
def cleanUp(df):
    # Copy the original df (clean dataframe).
    cdf = df.copy()
    
    # Convert the 'GST_ActivityID' column to string format 
    cdf.GST_ActivityID = cdf.GST_ActivityID.astype('string')

    if 'startTime' in df.columns:
      # Convert startTime to datetime format  
      cdf.startTime = pd.to_datetime(df.startTime)

      # Rename startTime to startTime_CME and activityID to cmeID
      cdf = cdf.rename(columns={'startTime':'startTime_CME', 'activityID':'cmeID'})

    # Drop linkedEvents
    cdf = cdf.drop('linkedEvents', axis=1)

    # Verify that all steps were executed correctly
    cdf.info()

    # Return cdf so it can be saved as a variable
    return cdf

clean_df = cleanUp(df)

<class 'pandas.core.frame.DataFrame'>
Index: 265 entries, 1 to 1227
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   cmeID           265 non-null    object             
 1   startTime_CME   265 non-null    datetime64[ns, UTC]
 2   GST_ActivityID  265 non-null    string             
dtypes: datetime64[ns, UTC](1), object(1), string(1)
memory usage: 8.3+ KB


In [120]:
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 265 entries, 1 to 1227
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   cmeID           265 non-null    object             
 1   startTime_CME   265 non-null    datetime64[ns, UTC]
 2   GST_ActivityID  265 non-null    string             
dtypes: datetime64[ns, UTC](1), object(1), string(1)
memory usage: 8.3+ KB


In [121]:
# We are only interested in CMEs related to GSTs so keep only rows where the GST_ActivityID column contains 'GST'
# use the method 'contains()' from the str library.

# save all strings that contain 'GST'
gst_events = clean_df[clean_df.GST_ActivityID.str.contains('GST')]

# loop through the events and print each event
for event in gst_events.GST_ActivityID:
  print(event)

['2023-08-05T02:10:00-IPS-001', '2023-08-05T03:00:00-GST-001', '2023-08-05T04:57:00-IPS-001', '2023-08-09T15:20:00-RBE-001', '2023-08-11T13:05:00-RBE-001']
['2023-09-14T21:17:00-FLR-001', '2023-09-18T17:54:00-MPC-001', '2023-09-19T00:00:00-GST-001', '2023-09-19T14:55:00-RBE-001']
['2023-09-18T09:04:00-IPS-001', '2023-09-18T12:58:00-IPS-001', '2023-09-18T17:54:00-MPC-001', '2023-09-19T00:00:00-GST-001', '2023-09-19T14:55:00-RBE-001']
['2023-11-05T03:38:00-IPS-001', '2023-11-05T08:10:00-IPS-001', '2023-11-05T09:00:00-GST-001', '2023-11-05T10:34:00-MPC-001', '2023-11-07T15:35:00-RBE-001']
['2023-11-03T04:40:00-FLR-001', '2023-11-05T09:00:00-GST-001', '2023-11-05T11:45:00-IPS-001', '2023-11-05T12:35:00-IPS-001', '2023-11-05T14:52:00-MPC-001', '2023-11-07T15:35:00-RBE-001']
['2023-11-22T18:58:00-FLR-001', '2023-11-25T04:30:00-IPS-001', '2023-11-25T07:59:00-IPS-001', '2023-11-25T09:28:00-MPC-001', '2023-11-25T18:00:00-GST-001']
['2023-11-28T19:07:00-FLR-001', '2023-11-28T19:35:00-FLR-001', '

### GST Data

In [122]:
# Set the base URL to NASA's DONKI API:
base_url = "https://api.nasa.gov/DONKI/"

# Set the specifier for Geomagnetic Storms (GST):
GST = "GST"

# Search for GSTs between a begin and end date
startDate = "2013-05-01"
endDate   = "2024-05-01"

# Build URL for GST
gst_query_url = (
    f"{base_url}{GST}?startDate={startDate}&endDate={endDate}&api_key={NASA_API_KEY}"
)

In [123]:
# Make a "GET" request for the GST URL and store it in a variable named gst_response
gst_response = fetch_data(gst_query_url)

Attempting to connect.
no request parameters
Successfully connected


In [124]:
# Convert the response variable to json and store it as a variable named gst_json
# Preview the first result in JSON format
# Use json.dumps with argument indent=4 to format data
gst_json = fetch_data(gst_query_url, {}, True)

Attempting to connect.
no request parameters
Returning json.dumps()
[
    {
        "gstID": "2013-06-01T01:00:00-GST-001",
        "startTime": "2013-06-01T01:00Z",
        "allKpIndex": [
            {
                "observedTime": "2013-06-01T01:00Z",
                "kpIndex": 6.0,
                "source": "NOAA"
            }
        ],
        "link": "https://webtools.ccmc.gsfc.nasa.gov/DONKI/view/GST/326/-1",
        "linkedEvents": [
            {
                "activityID": "2013-05-31T15:45:00-HSS-001"
            }
        ],
        "submissionTime": "2013-07-15T19:26Z",
        "versionId": 1
    },
    {
        "gstID": "2013-06-07T03:00:00-GST-001",
        "startTime": "2013-06-07T03:00Z",
        "allKpIndex": [
            {
                "observedTime": "2013-06-07T03:00Z",
                "kpIndex": 6.0,
                "source": "NOAA"
            }
        ],
        "link": "https://webtools.ccmc.gsfc.nasa.gov/DONKI/view/GST/330/-1",
        "linkedEvent

In [125]:
# Convert gst_json to a Pandas DataFrame  

# Keep only the columns: activityID, startTime, linkedEvents


In [126]:
# Notice that the linkedEvents column allows us to identify the corresponding CME
# Remove rows with missing 'linkedEvents' since we won't be able to assign these to CME


In [127]:
# Notice that the linkedEvents sometimes contains multiple events per row
# Use the explode method to ensure that each row is one element. Ensure to reset the index and drop missing values.


In [128]:
# Apply the extract_activityID_from_dict function to each row in the 'linkedEvents' column (you can use apply() and a lambda function)
# and create a new column called 'CME_ActivityID' using loc indexer:

# Remove rows with missing CME_ActivityID, since we can't assign them to CMEs:


In [129]:
# Convert the 'CME_ActivityID' column to string format 

# Convert the 'gstID' column to string format 

# Convert startTime to datetime format  

# Rename startTime to startTime_GST 

# Drop linkedEvents

# Verify that all steps were executed correctly


In [130]:
# We are only interested in GSTs related to CMEs so keep only rows where the CME_ActivityID column contains 'CME'
# use the method 'contains()' from the str library.  


### Merge both datatsets

In [131]:
# Now merge both datasets using 'gstID' and 'CME_ActivityID' for gst and 'GST_ActivityID' and 'cmeID' for cme. Use the 'left_on' and 'right_on' specifiers.


In [132]:
# Verify that the new DataFrame has the same number of rows as cme and gst


### Computing the time it takes for a CME to cause a GST

In [133]:
# Compute the time diff between startTime_GST and startTime_CME by creating a new column called `timeDiff`.


In [134]:
# Use describe() to compute the mean and median time 
# that it takes for a CME to cause a GST. 


### Exporting data in csv format

In [135]:
# Export data to CSV without the index
