In [2]:
import requests
import pandas  as pd
from datetime import datetime

In [23]:
API_key = "L7JRob1GBsDhgZD45Cy4hCbuYD0NwSGqvt8FHi0z"
start_date = "2024-01-01"
end_date = "2024-12-31"

url = f"https://api.nasa.gov/DONKI/FLR?startDate={start_date}&endDate={end_date}&api_key={API_key}"

# Fetch data
response = requests.get(url)
data = response.json()

# Preview first flare
print(data[0])

{'flrID': '2024-01-01T08:33:00-FLR-001', 'catalog': 'M2M_CATALOG', 'instruments': [{'displayName': 'GOES-P: EXIS 1.0-8.0'}], 'beginTime': '2024-01-01T08:33Z', 'peakTime': '2024-01-01T08:54Z', 'endTime': '2024-01-01T09:04Z', 'classType': 'M2.3', 'sourceLocation': 'N03E70', 'activeRegionNum': 13536, 'note': '', 'submissionTime': '2024-01-01T13:28Z', 'versionId': 1, 'link': 'https://webtools.ccmc.gsfc.nasa.gov/DONKI/view/FLR/28428/-1', 'linkedEvents': None}


In [39]:
flare_records = []

for flare in data:
    flare_records.append({
        "flare_id": flare.get("flrID"),
        "begin": flare.get("beginTime"),
        "peak": flare.get("peakTime"),
        "end": flare.get("endTime"),
        "class_type": flare.get("classType"),
        "location": flare.get("sourceLocation"),
        "region": flare.get("activeRegionNum"),
        "instrument": flare["instruments"][0]["displayName"] if flare.get("instruments") else None
    })

df = pd.DataFrame(flare_records)
print(df.head())

                      flare_id              begin               peak  \
0  2024-01-01T08:33:00-FLR-001  2024-01-01T08:33Z  2024-01-01T08:54Z   
1  2024-01-01T11:54:00-FLR-001  2024-01-01T11:54Z  2024-01-01T12:25Z   
2  2024-01-02T18:02:00-FLR-001  2024-01-02T18:02Z  2024-01-02T18:30Z   
3  2024-01-04T01:08:00-FLR-001  2024-01-04T01:08Z  2024-01-04T01:16Z   
4  2024-01-04T01:22:00-FLR-001  2024-01-04T01:22Z  2024-01-04T01:55Z   

                 end class_type location   region            instrument  
0  2024-01-01T09:04Z       M2.3   N03E70  13536.0  GOES-P: EXIS 1.0-8.0  
1  2024-01-01T12:35Z       M4.7   N03E68  13536.0  GOES-P: EXIS 1.0-8.0  
2  2024-01-02T18:56Z       M1.1   N05E59  13536.0  GOES-P: EXIS 1.0-8.0  
3  2024-01-04T01:22Z       M1.1   N02E38  13536.0  GOES-P: EXIS 1.0-8.0  
4  2024-01-04T02:12Z       M3.8   N04E38  13536.0  GOES-P: EXIS 1.0-8.0  


In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1128 entries, 0 to 1127
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   flare_id    1128 non-null   object 
 1   begin       1128 non-null   object 
 2   peak        1128 non-null   object 
 3   end         1128 non-null   object 
 4   class_type  1128 non-null   object 
 5   location    1128 non-null   object 
 6   region      1102 non-null   float64
 7   instrument  1128 non-null   object 
dtypes: float64(1), object(7)
memory usage: 70.6+ KB


In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1128 entries, 0 to 1127
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   flare_id    1128 non-null   object 
 1   begin       1128 non-null   object 
 2   peak        1128 non-null   object 
 3   end         1128 non-null   object 
 4   class_type  1128 non-null   object 
 5   location    1128 non-null   object 
 6   region      1102 non-null   float64
 7   instrument  1128 non-null   object 
dtypes: float64(1), object(7)
memory usage: 70.6+ KB


In [45]:
# New column : flare_Class (B, C,M,X)
df['flare_class'] =df['class_type'].str[0]

### Calculate Flare Duration

In [59]:
import numpy as np


def preprocess_flare_data(df):
    # Class Label
    df['class_label'] = df['class_type'].str[0] #'M2.3'-> 'M'

    # Duration in minutes
    df['begin'] =pd.to_datetime(df['begin'])
    df['end'] = pd.to_datetime(df['end'])
    df['duration_min'] = (df['end'] - df['begin']).dt.total_seconds() /60


    #Location Split
    df['north_south'] = df['location'].str.extract(r'([NS])')
    df['east_west'] = df['location'].str.extract(r'([EW])')

    #Fill missing region values with -1
    df['region'].fillna(-1, inplace =True)

    #Drop unsused columns
    df_model =df[[
        'duration_min', 'region', 'instrument',
        'north_south', 'east_west', 'class_label'
    ]]

    return df_model
    

In [71]:
df_model =preprocess_flare_data(df)
df_model.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['region'].fillna(-1, inplace =True)


Unnamed: 0,duration_min,region,instrument,north_south,east_west,class_label
0,31.0,13536.0,GOES-P: EXIS 1.0-8.0,N,E,M
1,41.0,13536.0,GOES-P: EXIS 1.0-8.0,N,E,M
2,54.0,13536.0,GOES-P: EXIS 1.0-8.0,N,E,M
3,14.0,13536.0,GOES-P: EXIS 1.0-8.0,N,E,M
4,50.0,13536.0,GOES-P: EXIS 1.0-8.0,N,E,M


In [73]:
from sklearn.utils import resample

df_m =df_model[df_model['class_label']=='M']
df_c =df_model[df_model['class_label']=='C']
df_x =df_model[df_model['class_label']=='X']

df_c_upsampled =resample (df_c, replace =True, n_samples =100, random_state=42)
df_x_upsampled =resample (df_x, replace =True, n_samples =100, random_state=42)
df_m_downsampled =resample (df_m, replace =True, n_samples =100, random_state=42)

#Combine balanced Dataframe
df_balanced = pd.concat([df_m_downsampled, df_c_upsampled, df_x_upsampled])

#use df_balanced for training
X = df_balanced.drop("class_label", axis =1)
y = df_balanced["class_label"]

### Model Training and Evaluatioon

In [78]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


#Step 1: Define feature columns 
X =df_balanced.drop("class_label", axis =1)
y =df_balanced['class_label']

#Step 2: Identify categorical columns
categorical =['instrument' ,'north_south','east_west']
numerical = ['duration_min', 'region']

#Step 3: One-hot encode categoricals
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown = 'ignore'), categorical)], remainder ='passthrough')

#Step 4: Define pipeline with classifier
pipeline =Pipeline([
    ("preprocess", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators =100, random_state=42))
])

#Step 5: Train-Test Split
X_train, X_test, y_train, y_test= train_test_split(X,y, stratify =y, test_size =0.2, random_state =42)

#Step 6:Train
pipeline.fit(X_train, y_train)

#Step 7: Predict & Evaluate
y_pred =pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           C       0.73      0.80      0.76        20
           M       0.64      0.35      0.45        20
           X       0.67      0.90      0.77        20

    accuracy                           0.68        60
   macro avg       0.68      0.68      0.66        60
weighted avg       0.68      0.68      0.66        60



In [80]:
import joblib

#save pipeline
joblib.dump(pipeline ,"solar_flare_classifier.pkl")
print("Model Saved!")

Model Saved!
