### Import Required Libraries and Set Up Environment Variables

In [None]:
# Dependencies
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json
import os
from datetime import datetime
# Pull the variables defined in the env file into the process environment,
# then read the NASA API key from it (the result is None when the key is unset).
load_dotenv()
NASA_API_KEY = os.environ.get('NASA_API_KEY')

### CME Data

In [None]:
# Root endpoint of NASA's DONKI API
base_url = "https://api.nasa.gov/DONKI/"

# Resource name appended to the base URL for CME queries
CME = "CME"

# Date window of the search (inclusive begin / end dates, YYYY-MM-DD)
startDate = "2013-05-01"
endDate   = "2024-05-01"

# Assemble the query string first, then attach it to the CME endpoint
query_string = "&".join([
    f"startDate={startDate}",
    f"endDate={endDate}",
    f"api_key={NASA_API_KEY}",
])
url = f"{base_url}{CME}?{query_string}"


In [None]:
# Make a "GET" request for the CME URL and store it in a variable named cme_response.
# The timeout stops the cell from hanging indefinitely if the server never answers,
# and raise_for_status() turns an HTTP error response into an exception right here
# instead of letting a non-JSON error body fail confusingly in a later cell.
cme_response = requests.get(url, timeout=60)
cme_response.raise_for_status()

In [None]:
# Parse the body of cme_response as JSON and keep the result in cme_json.
# The code below indexes it with [0] and passes it to pd.json_normalize,
# i.e. it is treated as a list of records.
cme_json = cme_response.json()
# Bare expression: as the last statement of a notebook cell it echoes the
# whole parsed payload as the cell output.
cme_json

In [None]:
# Preview the first result in JSON format
# Use json.dumps with argument indent=4 to format data
# (indent=4 matches the instruction above and the GST preview further down).

print(json.dumps(cme_json[0], indent=4))


In [None]:
# Flatten the CME records into a DataFrame and keep only the three fields
# that the rest of the analysis uses.
cme_columns = ["activityID", "startTime", "linkedEvents"]
cme_df = pd.json_normalize(cme_json)[cme_columns]

cme_df.head()


In [None]:
# linkedEvents is the field that ties a CME to its GST, so a row where it is
# missing cannot be matched to any GST and is discarded.
required_columns = ["linkedEvents"]
cme_df = cme_df.dropna(subset=required_columns)

cme_df.head(10)

In [None]:
# Notice that the linkedEvents sometimes contains multiple events per row
# Write a nested for loop that iterates first over each row in the cme DataFrame (using the index)
# and then iterates over the values in 'linkedEvents' 
# and adds the elements individually to a list of dictionaries where each row is one element 

# Initialize an empty list to store the expanded rows


# Iterate over each index in the DataFrame

    # Iterate over each dictionary in the list
    
        # Append a new dictionary to the expanded_rows list for each dictionary item and corresponding 'activityID' and 'startTime' value
      
# Create a new DataFrame from the expanded rows

# Flatten linkedEvents: collect one record per (CME, linked event) pair.
expanded_rows = []

# name=None makes itertuples yield plain tuples of
# (index, activityID, startTime, linkedEvents) for each CME row.
cme_records = cme_df[['activityID', 'startTime', 'linkedEvents']].itertuples(index=True, name=None)

for index, activityID, startTime, linkedEvents in cme_records:
    # linkedEvents is a list of dictionaries; each one becomes its own record
    # that carries the parent CME's activityID and startTime.
    for event in linkedEvents:
        expanded_rows.append(dict(
            activityID=activityID,
            startTime=startTime,
            linkedEventID=event['activityID'],
        ))

# Turn the collected records into a DataFrame
expanded_df = pd.DataFrame(expanded_rows)

# Show the first 15 expanded rows
expanded_df.head(15)


In [None]:
# Create a function called extract_activityID_from_dict that takes a dict as input such as in linkedEvents
# and verify below that it works as expected using one row from linkedEvents as an example
# Be sure to use a try and except block to handle errors

        # Log the error or print it for debugging


# Function to extract 'activityID' from a dictionary
def extract_activityID_from_dict(event_dict):
    """Return the value stored under 'activityID' in *event_dict*.

    Never raises: when the key is absent, or the lookup fails for any other
    reason (for example the argument is not a dictionary), a message is
    printed and None is returned instead.
    """
    try:
        return event_dict['activityID']
    except KeyError as e:
        # The argument is a mapping but has no 'activityID' entry
        print(f"KeyError: The key 'activityID' was not found in the dictionary. Error: {e}")
    except Exception as e:
        # Anything else, e.g. the argument does not support key lookup at all
        print(f"An error occurred: {e}")
    # Shared fall-through for both failure paths
    return None

# Test the function on the linked events of one example row.
# The sample is read directly from cme_df (its last row) so that this cell does
# not depend on a loop variable left behind by an earlier cell.
sample_linked_events = cme_df['linkedEvents'].iloc[-1]
for event in sample_linked_events:
    activity_id = extract_activityID_from_dict(event)
    print(f"Extracted activityID: {activity_id}")


In [None]:
# Apply extract_activityID_from_dict to each row in the 'linkedEvents' column
# and create a new column called 'GST_ActivityID' using the loc indexer.
#
# linkedEvents holds a *list* of event dictionaries per CME. Looking only at the
# first entry would lose every GST that is not listed first (the later filter on
# 'GST' would then discard the row), so each linked event first gets its own row.
cme_df = (
    cme_df
    .explode('linkedEvents')
    .dropna(subset=['linkedEvents'])  # an empty list explodes to NaN
    .reset_index(drop=True)           # explode repeats index labels
)

# Each cell of linkedEvents is now a single dictionary
cme_df.loc[:, 'GST_ActivityID'] = cme_df['linkedEvents'].apply(
    lambda event: extract_activityID_from_dict(event)
)

cme_df[['activityID', 'startTime', 'linkedEvents', 'GST_ActivityID']].head()

In [None]:
# A row without a GST_ActivityID cannot be assigned to a GST, so drop it.
cme_df_cleaned = cme_df.dropna(subset=['GST_ActivityID'])

# Preview the first few rows of the cleaned DataFrame
preview_columns = ['activityID', 'startTime', 'linkedEvents', 'GST_ActivityID']
cme_df_cleaned[preview_columns].head()

In [None]:
# Print the dtype pandas currently holds for each column of cme_df_cleaned,
# as a check before the type conversions that follow.

print(cme_df_cleaned.dtypes)



In [None]:
# List the column labels currently present in cme_df_cleaned.
print(cme_df_cleaned.columns)


In [None]:
# Convert 'GST_ActivityID' to string and 'startTime' to datetime (values that
# cannot be parsed become NaT), rename startTime -> startTime_CME and
# activityID -> cmeID, and drop linkedEvents.
#
# The steps are written as one chain that returns a new DataFrame instead of
# mutating cme_df_cleaned in place: that frame was produced by dropna() on
# cme_df, and in-place edits on such a derived frame can raise
# SettingWithCopyWarning.
cme_df_cleaned = (
    cme_df_cleaned
    .assign(
        GST_ActivityID=lambda df: df['GST_ActivityID'].astype(str),
        startTime=lambda df: pd.to_datetime(df['startTime'], errors='coerce'),
    )
    .rename(columns={'startTime': 'startTime_CME', 'activityID': 'cmeID'})
    .drop(columns=['linkedEvents'])
)

# Verify that all steps were executed correctly
cme_df_cleaned.head()


In [None]:
# Only CMEs linked to a GST are of interest: keep the rows whose GST_ActivityID
# contains the substring 'GST' (na=False makes missing values count as no match).
is_gst_link = cme_df_cleaned['GST_ActivityID'].str.contains('GST', na=False)
cme_df_filtered = cme_df_cleaned[is_gst_link]

# Verify the filtered DataFrame
cme_df_filtered.head()
# print(cme_df_filtered.shape)   


### GST Data

In [None]:
# Root endpoint of NASA's DONKI API
base_url = "https://api.nasa.gov/DONKI/"

# Resource name appended to the base URL for Geomagnetic Storm (GST) queries
GST = "GST"

# Date window of the search (inclusive begin / end dates, YYYY-MM-DD)
startDate = "2013-05-01"
endDate   = "2024-05-01"

# Assemble the query string first, then attach it to the GST endpoint
query_string = "&".join([
    f"startDate={startDate}",
    f"endDate={endDate}",
    f"api_key={NASA_API_KEY}",
])
url = f"{base_url}{GST}?{query_string}"


In [None]:
# Make a "GET" request for the GST URL and store it in a variable named gst_response.
# The timeout stops the cell from hanging indefinitely if the server never answers,
# and raise_for_status() turns an HTTP error response into an exception right here
# instead of letting a non-JSON error body fail confusingly in a later cell.
gst_response = requests.get(url, timeout=60)
gst_response.raise_for_status()


In [None]:
# Parse the body of gst_response as JSON and keep it in gst_json
gst_json = gst_response.json()

# Pretty-print the first record (4-space indentation) to inspect its structure
first_gst_record = gst_json[0]
print(json.dumps(first_gst_record, indent=4))

In [None]:
# Convert gst_json to a Pandas DataFrame
gst_df = pd.json_normalize(gst_json)

# Keep only the columns: gstID, startTime, linkedEvents.
# The storm's own identifier is 'gstID', which the later conversion and merge
# steps refer to. It must not be replaced by the activityID of the first linked
# event: that value identifies the linked event, not the storm, and it is
# extracted separately into CME_ActivityID further down.
gst_df = gst_df[["gstID", "startTime", "linkedEvents"]]

gst_df.head()


In [None]:
# Notice that the linkedEvents column allows us to identify the corresponding CME
# Remove rows with missing 'linkedEvents' since we won't be able to assign these to CME


In [None]:
# Notice that the linkedEvents sometimes contains multiple events per row
# Use the explode method to ensure that each row is one element. Ensure to reset the index and drop missing values.


In [None]:
# Apply the extract_activityID_from_dict function to each row in the 'linkedEvents' column (you can use apply() and a lambda function)
# and create a new column called 'CME_ActivityID' using loc indexer:

# Remove rows with missing CME_ActivityID, since we can't assign them to CMEs:


In [None]:
# Convert the 'CME_ActivityID' column to string format 

# Convert the 'gstID' column to string format 

# Convert startTime to datetime format  

# Rename startTime to startTime_GST 

# Drop linkedEvents

# Verify that all steps were executed correctly


In [None]:
# We are only interested in GSTs related to CMEs so keep only rows where the CME_ActivityID column contains 'CME'
# use the method 'contains()' from the str library.  


### Merge both datasets

In [None]:
# Now merge both datasets using 'gstID' and 'CME_ActivityID' for gst and 'GST_ActivityID' and 'cmeID' for cme. Use the 'left_on' and 'right_on' specifiers.


In [None]:
# Verify that the new DataFrame has the same number of rows as cme and gst


### Computing the time it takes for a CME to cause a GST

In [None]:
# Compute the time diff between startTime_GST and startTime_CME by creating a new column called `timeDiff`.


In [None]:
# Use describe() to compute the mean and median time 
# that it takes for a CME to cause a GST. 


### Exporting data in csv format

In [None]:
# Export data to CSV without the index
