### Import Required Libraries and Set Up Environment Variables

In [247]:
# Dependencies
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json
import os
from datetime import datetime
## Load the NASA_API_KEY from the env file
load_dotenv()
NASA_API_KEY = os.getenv('NASA_API_KEY')

In [203]:
NASA_API_KEY

'4HphRoXt1ifct0a6hVm59Jvi0Lm7LkQnZbS8xidZ'

### CME Data

In [204]:
# Set the base URL to NASA's DONKI API:
base_url = "https://api.nasa.gov/DONKI/"

# Set the specifier for CMEs:
cme = "CME"


# Search for CMEs published between a begin and end date
startDate = "2013-05-01"
endDate   = "2024-05-01"
#startDate = "2024-09-01"
#endDate   = "2024-09-22"

# Build URL for CME:
url = base_url + cme + "?start_date=" + startDate + "&end_date=" + endDate + "&api_key=" + NASA_API_KEY
print(url)
#query_url = f"{base_url}{cme}?startDate={startDate}&endDate={endDate}&api_key={NASA_API_KEY}"
#print(query_url)


https://api.nasa.gov/DONKI/CME?start_date=2013-05-01&end_date=2024-05-01&api_key=4HphRoXt1ifct0a6hVm59Jvi0Lm7LkQnZbS8xidZ


In [205]:
# Make a "GET" request for the CME URL and store it in a variable named cme_response
cme_response = requests.get(url)

In [206]:
# Convert the response variable to json and store it as a variable named cme_json
cme_json = cme_response.json()


In [207]:
# Preview the first result in JSON format
print(json.dumps(cme_json[0], indent=4))

# Create a list to store the CME data
cme_data = []
for cme in cme_json:
    cme_data.append({
        "activityID": cme["activityID"],
        "startTime": cme["startTime"],
        "sourceLocation": cme["sourceLocation"],
        "activeRegionNum": cme["activeRegionNum"],
        "link": cme["link"]
    })
    # Print the CME data
    print(cme_data)



{
    "activityID": "2024-08-26T12:12:00-CME-001",
    "catalog": "M2M_CATALOG",
    "startTime": "2024-08-26T12:12Z",
    "instruments": [
        {
            "displayName": "SOHO: LASCO/C2"
        },
        {
            "displayName": "SOHO: LASCO/C3"
        },
        {
            "displayName": "STEREO A: SECCHI/COR2"
        }
    ],
    "sourceLocation": "",
    "activeRegionNum": null,
    "note": "Streamer blowout CME to the S/SE in SOHO LASCO C2, C3, and STEREO A COR2 imagery. The start time of the CME could be as early as ~10Z, as that's when the shape first starts appearing in difference imagery, however it doesn't quite start progressing out in the field of view until roughly 12:12Z. The source appears to be moving/opening field lines visible in GOES SUVI 284 in the SE quadrant of the Earth-facing disk with notable changes in the corona's structure over the course of several hours. Most notably, these changes are visible starting around 14:00Z. Brightening is visible

In [208]:
# Convert cme_json to a Pandas DataFrame
df = pd.DataFrame(cme_json)
cme_df = pd.DataFrame(cme_json)
print(cme_df)

# Keep only the columns: activityID, startTime, linkedEvents
cme_df = cme_df[["activityID", "startTime", "linkedEvents"]]
print(cme_df)


                     activityID      catalog          startTime  \
0   2024-08-26T12:12:00-CME-001  M2M_CATALOG  2024-08-26T12:12Z   
1   2024-08-28T04:12:00-CME-001  M2M_CATALOG  2024-08-28T04:12Z   
2   2024-08-28T11:48:00-CME-001  M2M_CATALOG  2024-08-28T11:48Z   
3   2024-08-28T12:48:00-CME-001  M2M_CATALOG  2024-08-28T12:48Z   
4   2024-08-28T18:15:00-CME-001  M2M_CATALOG  2024-08-28T18:15Z   
..                          ...          ...                ...   
93  2024-09-22T21:36:00-CME-001  M2M_CATALOG  2024-09-22T21:36Z   
94  2024-09-23T02:00:00-CME-001  M2M_CATALOG  2024-09-23T02:00Z   
95  2024-09-23T09:12:00-CME-001  M2M_CATALOG  2024-09-23T09:12Z   
96  2024-09-23T20:48:00-CME-001  M2M_CATALOG  2024-09-23T20:48Z   
97  2024-09-24T04:12:00-CME-001  M2M_CATALOG  2024-09-24T04:12Z   

                                          instruments sourceLocation  \
0   [{'displayName': 'SOHO: LASCO/C2'}, {'displayN...                  
1   [{'displayName': 'SOHO: LASCO/C2'}, {'displayN.

In [209]:
# Notice that the linkedEvents column allows us to identify the corresponding GST
# Remove rows with missing 'linkedEvents' since we won't be able to assign these to GSTs
cme_df = cme_df.dropna(subset=["linkedEvents"])
print(cme_df)


                     activityID          startTime  \
9   2024-08-30T12:53:00-CME-001  2024-08-30T12:53Z   
15  2024-09-01T03:24:00-CME-001  2024-09-01T03:24Z   
16  2024-09-01T12:23:00-CME-001  2024-09-01T12:23Z   
20  2024-09-02T14:00:00-CME-001  2024-09-02T14:00Z   
22  2024-09-03T08:12:00-CME-001  2024-09-03T08:12Z   
30  2024-09-05T07:24:00-CME-001  2024-09-05T07:24Z   
37  2024-09-07T07:36:00-CME-001  2024-09-07T07:36Z   
38  2024-09-07T08:00:00-CME-001  2024-09-07T08:00Z   
42  2024-09-08T01:36:00-CME-001  2024-09-08T01:36Z   
44  2024-09-09T01:25:00-CME-001  2024-09-09T01:25Z   
45  2024-09-09T05:23:00-CME-001  2024-09-09T05:23Z   
48  2024-09-10T00:23:00-CME-001  2024-09-10T00:23Z   
53  2024-09-11T02:23:00-CME-001  2024-09-11T02:23Z   
55  2024-09-11T16:48:00-CME-001  2024-09-11T16:48Z   
56  2024-09-11T19:48:00-CME-001  2024-09-11T19:48Z   
58  2024-09-12T19:48:00-CME-001  2024-09-12T19:48Z   
60  2024-09-13T02:00:00-CME-001  2024-09-13T02:00Z   
62  2024-09-13T10:12:00-CME-

In [210]:
# Notice that the linkedEvents sometimes contains multiple events per row
# Write a nested for loop that iterates first over each row in the cme DataFrame (using the index)
# and then iterates over the values in 'linkedEvents' 
# and adds the elements individually to a list of dictionaries where each row is one element
# Initialize an empty list called expanded_rows to store the expanded rows
expanded_rows = []
# Iterate over each index in the DataFrame
for i in cme_df.index:
    extract_activityID_from_dict = cme_df.loc[i, 'activityID']
    extract_startTime_from_dict = cme_df.loc[i, 'startTime']
    # Iterate over each dictionary in the 'linkedEvents' list
    for event in cme_df.loc[i, 'linkedEvents']:
        # Append a new dictionary to the expanded_rows list for each dictionary item and corresponding 'activityID' and 'startTime' value
        expanded_rows.append({
            'activityID': extract_activityID_from_dict,
            'startTime': extract_startTime_from_dict,
            'linkedEvents': event
        })
# Print the expanded_rows list
print(expanded_rows)
         
# Create a new DataFrame from the expanded row
expanded_df = pd.DataFrame(expanded_rows)
print(expanded_df)

# Initialize an empty list to store the expanded rows
expanded_rows = []
      
# Create a new DataFrame from the expanded rows
expanded_df = pd.DataFrame(expanded_rows)
print(expanded_df)


[{'activityID': '2024-08-30T12:53:00-CME-001', 'startTime': '2024-08-30T12:53Z', 'linkedEvents': {'activityID': '2024-08-30T12:13:00-FLR-001'}}, {'activityID': '2024-09-01T03:24:00-CME-001', 'startTime': '2024-09-01T03:24Z', 'linkedEvents': {'activityID': '2024-09-04T09:40:00-IPS-001'}}, {'activityID': '2024-09-01T03:24:00-CME-001', 'startTime': '2024-09-01T03:24Z', 'linkedEvents': {'activityID': '2024-09-04T13:17:00-IPS-001'}}, {'activityID': '2024-09-01T12:23:00-CME-001', 'startTime': '2024-09-01T12:23Z', 'linkedEvents': {'activityID': '2024-09-01T11:45:00-FLR-001'}}, {'activityID': '2024-09-01T12:23:00-CME-001', 'startTime': '2024-09-01T12:23Z', 'linkedEvents': {'activityID': '2024-09-01T23:14:00-SEP-001'}}, {'activityID': '2024-09-02T14:00:00-CME-001', 'startTime': '2024-09-02T14:00Z', 'linkedEvents': {'activityID': '2024-09-02T13:34:00-FLR-001'}}, {'activityID': '2024-09-03T08:12:00-CME-001', 'startTime': '2024-09-03T08:12Z', 'linkedEvents': {'activityID': '2024-09-03T08:39:00-SEP

In [211]:
# Create a function called extract_activityID_from_dict that takes a dict as input such as in linkedEvents
# and verify below that it works as expected using one row from linkedEvents as an example
# Be sure to use a try and except block to handle errors
def extract_activityID_from_dict(dict):
        try:
                return dict["activityID"]
        except:
                return None

        # Log the error or print it for debugging
        print(f"Error in row {dict}")


In [212]:
# Apply this function to each row in the 'linkedEvents' column (you can use apply() and a lambda function)
# and create a new column called 'GST_ActivityID' using loc indexer:
expanded_df['GST_ActivityID'] = df['linkedEvents'].apply(lambda x: extract_activityID_from_dict(x))

print(expanded_df)


   GST_ActivityID
0            None
1            None
2            None
3            None
4            None
..            ...
93           None
94           None
95           None
96           None
97           None

[98 rows x 1 columns]


In [213]:
# Remove rows with missing GST_ActivityID, since we can't assign them to GSTs:
expanded_df = expanded_df.dropna(subset=["GST_ActivityID"])
print(expanded_df)


Empty DataFrame
Columns: [GST_ActivityID]
Index: []


In [214]:
# print out the datatype of each column in this DataFrame:
print(expanded_df.dtypes)


GST_ActivityID    object
dtype: object


In [215]:
# Convert the 'GST_ActivityID' column to string format
expanded_df['GST_ActivityID'] = expanded_df['GST_ActivityID'].astype(str)
print(expanded_df.dtypes)

# Convert startTime to datetime format
#expanded_df['startTime'] = pd.to_datetime(expanded_df['startTime'])
#print(expanded_df.dtypes)

# Rename startTime to startTime_CME and activityID to cmeID
expanded_df = expanded_df.rename(columns={"startTime": "startTime_CME", "activityID": "cmeID"})
print(expanded_df)

# Drop linkedEvents
#expanded_df = expanded_df.drop(columns=["linkedEvent"])
#print(expanded_df)

# Verify that all steps were executed correctly
print(expanded_df.dtypes)


GST_ActivityID    object
dtype: object
Empty DataFrame
Columns: [GST_ActivityID]
Index: []
GST_ActivityID    object
dtype: object


In [216]:
# We are only interested in CMEs related to GSTs so keep only rows where the GST_ActivityID column contains 'GST'
# use the method 'contains()' from the str library.
expanded_df = expanded_df[expanded_df['GST_ActivityID'].str.contains('GST')]
print(expanded_df.head())


Empty DataFrame
Columns: [GST_ActivityID]
Index: []


### GST Data

In [217]:
# Set the base URL to NASA's DONKI API:
base_url = "https://api.nasa.gov/DONKI/"

# Set the specifier for Geomagnetic Storms (GST):
GST = "GST"

# Search for GSTs between a begin and end date
startDate = "2013-05-01"
endDate   = "2024-05-01"

# Build URL for GST
url = base_url + GST + "?startDate=" + startDate + "&endDate=" + endDate + "&api_key=" + NASA_API_KEY
print(url)


https://api.nasa.gov/DONKI/GST?startDate=2013-05-01&endDate=2024-05-01&api_key=4HphRoXt1ifct0a6hVm59Jvi0Lm7LkQnZbS8xidZ


In [218]:
# Make a "GET" request for the GST URL and store it in a variable named gst_response
gst_response = requests.get(url)


In [219]:
# Convert the response variable to json and store it as a variable named gst_json
gst_json = gst_response.json()

# Preview the first result in JSON format
print(json.dumps(gst_json[0], indent=4))
# Use json.dumps with argument indent=4 to format data
print(json.dumps(gst_json, indent=4))


{
    "gstID": "2013-06-01T01:00:00-GST-001",
    "startTime": "2013-06-01T01:00Z",
    "allKpIndex": [
        {
            "observedTime": "2013-06-01T01:00Z",
            "kpIndex": 6.0,
            "source": "NOAA"
        }
    ],
    "link": "https://webtools.ccmc.gsfc.nasa.gov/DONKI/view/GST/326/-1",
    "linkedEvents": [
        {
            "activityID": "2013-05-31T15:45:00-HSS-001"
        }
    ],
    "submissionTime": "2013-07-15T19:26Z",
    "versionId": 1
}
[
    {
        "gstID": "2013-06-01T01:00:00-GST-001",
        "startTime": "2013-06-01T01:00Z",
        "allKpIndex": [
            {
                "observedTime": "2013-06-01T01:00Z",
                "kpIndex": 6.0,
                "source": "NOAA"
            }
        ],
        "link": "https://webtools.ccmc.gsfc.nasa.gov/DONKI/view/GST/326/-1",
        "linkedEvents": [
            {
                "activityID": "2013-05-31T15:45:00-HSS-001"
            }
        ],
        "submissionTime": "2013-07-15T19

In [222]:
# Convert gst_json to a Pandas DataFrame 
gst_df = pd.DataFrame(gst_json)
print(gst_df)

# Keep only the columns: activityID, startTime, linkedEvents
gst_df = gst_df[["gstID", "startTime", "linkedEvents"]]
print(gst_df)


                           gstID          startTime  \
0    2013-06-01T01:00:00-GST-001  2013-06-01T01:00Z   
1    2013-06-07T03:00:00-GST-001  2013-06-07T03:00Z   
2    2013-06-29T03:00:00-GST-001  2013-06-29T03:00Z   
3    2013-10-02T03:00:00-GST-001  2013-10-02T03:00Z   
4    2013-12-08T00:00:00-GST-001  2013-12-08T00:00Z   
..                           ...                ...   
112  2023-12-18T06:00:00-GST-001  2023-12-18T06:00Z   
113  2024-03-03T18:00:00-GST-001  2024-03-03T18:00Z   
114  2024-03-23T21:00:00-GST-001  2024-03-23T21:00Z   
115  2024-03-24T12:00:00-GST-001  2024-03-24T12:00Z   
116  2024-04-19T18:00:00-GST-001  2024-04-19T18:00Z   

                                            allKpIndex  \
0    [{'observedTime': '2013-06-01T01:00Z', 'kpInde...   
1    [{'observedTime': '2013-06-07T03:00Z', 'kpInde...   
2    [{'observedTime': '2013-06-29T03:00Z', 'kpInde...   
3    [{'observedTime': '2013-10-02T06:00Z', 'kpInde...   
4    [{'observedTime': '2013-12-08T03:00Z', 'kpIn

In [223]:
# Notice that the linkedEvents column allows us to identify the corresponding CME
# Remove rows with missing 'linkedEvents' since we won't be able to assign these to CME
gst_df = gst_df.dropna(subset=["linkedEvents"])
print(gst_df)


                           gstID          startTime  \
0    2013-06-01T01:00:00-GST-001  2013-06-01T01:00Z   
1    2013-06-07T03:00:00-GST-001  2013-06-07T03:00Z   
3    2013-10-02T03:00:00-GST-001  2013-10-02T03:00Z   
4    2013-12-08T00:00:00-GST-001  2013-12-08T00:00Z   
5    2014-02-19T03:00:00-GST-001  2014-02-19T03:00Z   
..                           ...                ...   
112  2023-12-18T06:00:00-GST-001  2023-12-18T06:00Z   
113  2024-03-03T18:00:00-GST-001  2024-03-03T18:00Z   
114  2024-03-23T21:00:00-GST-001  2024-03-23T21:00Z   
115  2024-03-24T12:00:00-GST-001  2024-03-24T12:00Z   
116  2024-04-19T18:00:00-GST-001  2024-04-19T18:00Z   

                                          linkedEvents  
0      [{'activityID': '2013-05-31T15:45:00-HSS-001'}]  
1      [{'activityID': '2013-06-02T20:24:00-CME-001'}]  
3    [{'activityID': '2013-09-29T22:40:00-CME-001'}...  
4    [{'activityID': '2013-12-04T23:12:00-CME-001'}...  
5    [{'activityID': '2014-02-16T14:15:00-CME-001'}...

In [224]:
# Notice that the linkedEvents sometimes contains multiple events per row
# Use the explode method to ensure that each row is one element. Ensure to reset the index and drop missing values.
gst_df = gst_df.explode("linkedEvents").reset_index(drop=True)
print(gst_df)


                           gstID          startTime  \
0    2013-06-01T01:00:00-GST-001  2013-06-01T01:00Z   
1    2013-06-07T03:00:00-GST-001  2013-06-07T03:00Z   
2    2013-10-02T03:00:00-GST-001  2013-10-02T03:00Z   
3    2013-10-02T03:00:00-GST-001  2013-10-02T03:00Z   
4    2013-10-02T03:00:00-GST-001  2013-10-02T03:00Z   
..                           ...                ...   
200  2024-03-24T12:00:00-GST-001  2024-03-24T12:00Z   
201  2024-03-24T12:00:00-GST-001  2024-03-24T12:00Z   
202  2024-03-24T12:00:00-GST-001  2024-03-24T12:00Z   
203  2024-04-19T18:00:00-GST-001  2024-04-19T18:00Z   
204  2024-04-19T18:00:00-GST-001  2024-04-19T18:00Z   

                                      linkedEvents  
0    {'activityID': '2013-05-31T15:45:00-HSS-001'}  
1    {'activityID': '2013-06-02T20:24:00-CME-001'}  
2    {'activityID': '2013-09-29T22:40:00-CME-001'}  
3    {'activityID': '2013-10-02T01:54:00-IPS-001'}  
4    {'activityID': '2013-10-02T02:47:00-MPC-001'}  
..                   

In [225]:
# Apply the extract_activityID_from_dict function to each row in the 'linkedEvents' column (you can use apply() and a lambda function)
df['linkedEvents'] = df['linkedEvents'].apply(lambda x: extract_activityID_from_dict(x))
# and create a new column called 'CME_ActivityID' using loc indexer:
gst_df['CME_ActivityID'] = df['linkedEvents']
print(gst_df)

# Remove rows with missing CME_ActivityID, since we can't assign them to CMEs:
gst_df = gst_df.dropna(subset=["CME_ActivityID"])
print(gst_df)


                           gstID          startTime  \
0    2013-06-01T01:00:00-GST-001  2013-06-01T01:00Z   
1    2013-06-07T03:00:00-GST-001  2013-06-07T03:00Z   
2    2013-10-02T03:00:00-GST-001  2013-10-02T03:00Z   
3    2013-10-02T03:00:00-GST-001  2013-10-02T03:00Z   
4    2013-10-02T03:00:00-GST-001  2013-10-02T03:00Z   
..                           ...                ...   
200  2024-03-24T12:00:00-GST-001  2024-03-24T12:00Z   
201  2024-03-24T12:00:00-GST-001  2024-03-24T12:00Z   
202  2024-03-24T12:00:00-GST-001  2024-03-24T12:00Z   
203  2024-04-19T18:00:00-GST-001  2024-04-19T18:00Z   
204  2024-04-19T18:00:00-GST-001  2024-04-19T18:00Z   

                                      linkedEvents CME_ActivityID  
0    {'activityID': '2013-05-31T15:45:00-HSS-001'}           None  
1    {'activityID': '2013-06-02T20:24:00-CME-001'}           None  
2    {'activityID': '2013-09-29T22:40:00-CME-001'}           None  
3    {'activityID': '2013-10-02T01:54:00-IPS-001'}           None  

In [226]:
# Convert the 'CME_ActivityID' column to string format 
gst_df['CME_ActivityID'] = gst_df['CME_ActivityID'].astype(str)
print(gst_df.dtypes)

# Convert the 'gstID' column to string format
gst_df['gstID'] = gst_df['gstID'].astype(str)
print(gst_df.dtypes)

# Convert startTime to datetime format 
gst_df['startTime'] = pd.to_datetime(gst_df['startTime'])

# Rename startTime to startTime_GST 
gst_df = gst_df.rename(columns={"startTime": "startTime_GST"})
print(gst_df)

# Drop linkedEvents
gst_df = gst_df.drop(columns=["linkedEvents"])
print(gst_df)

# Verify that all steps were executed correctly
print(gst_df.dtypes)


gstID             object
startTime         object
linkedEvents      object
CME_ActivityID    object
dtype: object
gstID             object
startTime         object
linkedEvents      object
CME_ActivityID    object
dtype: object
Empty DataFrame
Columns: [gstID, startTime_GST, linkedEvents, CME_ActivityID]
Index: []
Empty DataFrame
Columns: [gstID, startTime_GST, CME_ActivityID]
Index: []
gstID                     object
startTime_GST     datetime64[ns]
CME_ActivityID            object
dtype: object


In [239]:
# We are only interested in GSTs related to CMEs so keep only rows where the CME_ActivityID column contains 'CME'
# use the method 'contains()' from the str library.  
gst_df = gst_df[gst_df['CME_ActivityID'].str.contains('CME')]
print(gst_df.head())


Empty DataFrame
Columns: [gstID, startTime_GST, CME_ActivityID]
Index: []


### Merge both datatsets

In [240]:
# Now merge both datasets using 'gstID' and 'CME_ActivityID' for gst and 'GST_ActivityID' and 'cmeID' for cme. Use the 'left_on' and 'right_on' specifiers.
merged_df = pd.merge(gst_df, expanded_df, left_on='CME_ActivityID', right_on='GST_ActivityID')
print(merged_df.head())


Empty DataFrame
Columns: [gstID, startTime_GST, CME_ActivityID, GST_ActivityID]
Index: []


In [241]:
# Verify that the new DataFrame has the same number of rows as cme and gst
print(len(merged_df))


0


### Computing the time it takes for a CME to cause a GST

In [None]:
# Compute the time diff between startTime_GST and startTime_CME by creating a new column called `timeDiff`.
merged_df['timeDiff'] = merged_df['startTime_GST'] - merged_df['startTime_CME']
print(merged_df.head())


In [None]:
# Use describe() to compute the mean and median time 
# that it takes for a CME to cause a GST.
mean_time = df['time'].describe()['mean']
median_time = df['time'].describe()['50%']
print(mean_time)
# that it takes for a CME to cause a GST.
mean_time = merged_df['timeDiff'].describe()['mean']
median_time = merged_df['timeDiff'].describe()['50%']
print(mean_time)


### Exporting data in csv format

In [245]:
# Export data to CSV without the index
merged_df.to_csv("merged_data.csv", index=False)
print("Data exported to CSV!")


Data exported to CSV!
