In [1]:
# data processing
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta

# plotting
import seaborn as sns
import matplotlib.pyplot as plt

# regression / matching
import statsmodels.formula.api as smf

## Load the data

In [2]:
# Folder holding the Venus Express inputs (also used for the CME file later on).
data_path = "DATA/VENUS_EXPRESS/"
vex_edac_file = "VEX_NDMW0D0A_2023_01_11_12_55_55.900.txt"

# The export is tab-separated.
vex_df = pd.read_csv(data_path + vex_edac_file, sep='\t')

## Adding EDAC data
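We turn the cumulative EDAC counter into the number of new counts per sample (the hourly aggregation comes later), and set the negative jumps caused by the counter resetting to 0 back to zero: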

In [3]:
# DATE_TIME is left as text here; it is parsed once, right before the hourly grouping.

# NDMW0D0A is treated as a running counter: differencing consecutive samples
# gives the number of new EDAC counts carried by each sample.
edac_increments = vex_df['NDMW0D0A'].diff()

# The first sample has no predecessor, so its increment is 0. Back-filling it
# would copy the second sample's increment and count that increment twice
# (and would leave a NaN, hence a crash in astype, on a single-row file).
edac_increments = edac_increments.fillna(0).astype(int)

# A negative step means the counter was reset to 0: count it as "no new event".
vex_df['EDAC'] = edac_increments.clip(lower=0)
vex_df.drop('NDMW0D0A', axis=1, inplace=True)

Since the magneto data is grouped by hour, we aggregate the EDAC data by hour too:

In [4]:
# Parse the timestamps, then bucket the samples into one-hour bins.
vex_df['DATE_TIME'] = pd.to_datetime(vex_df['DATE_TIME'])
hourly_bins = pd.Grouper(key='DATE_TIME', freq='H')
hourly_grouped = vex_df.groupby(hourly_bins)


In [5]:
# sanity check: print the samples that belong to the first hourly bucket
first_hour = next(iter(hourly_grouped.groups))
first_group = hourly_grouped.get_group(first_hour)
print(first_group)

                 DATE_TIME  EDAC
0  2005-11-09 00:09:04.575     0
1  2005-11-09 00:10:08.575     0
2  2005-11-09 00:11:12.576     0
3  2005-11-09 00:12:16.576     0
4  2005-11-09 00:13:20.577     0
5  2005-11-09 00:14:24.577     0
6  2005-11-09 00:15:28.577     0
7  2005-11-09 00:16:32.578     0
8  2005-11-09 00:17:36.578     0
9  2005-11-09 00:18:40.579     0
10 2005-11-09 00:19:44.579     0
11 2005-11-09 00:20:48.579     0
12 2005-11-09 00:21:52.580     0
13 2005-11-09 00:22:56.580     0
14 2005-11-09 00:24:00.581     0
15 2005-11-09 00:25:04.581     0
16 2005-11-09 00:26:08.581     0
17 2005-11-09 00:27:12.582     0
18 2005-11-09 00:28:16.582     0
19 2005-11-09 00:29:20.583     0
20 2005-11-09 00:30:24.583     0
21 2005-11-09 00:31:28.583     0
22 2005-11-09 00:32:32.584     0
23 2005-11-09 00:33:36.584     0
24 2005-11-09 00:34:40.585     0
25 2005-11-09 00:35:44.585     0
26 2005-11-09 00:36:48.585     0
27 2005-11-09 00:37:52.586     0
28 2005-11-09 00:38:56.586     0
29 2005-11

In [6]:
# Then we aggregate the groups: one row per hour, with the summed EDAC counts.
# DATE_TIME is floored (not rounded) to the hour: rounding sends every sample
# taken after half past to the *next* hour, so an hour whose first sample is
# late (e.g. after a data gap) would be labelled with the wrong timestamp.
# Hours without any sample get NaT here and are removed by the dropna below.
hourly_vex_df = hourly_grouped.agg({'DATE_TIME': lambda x: x.dt.floor('H').min(),
                                    'EDAC': 'sum'})

In [7]:
# Remove every row that holds a missing value, then confirm none is left.
hourly_vex_df.dropna(inplace=True)

has_nan = hourly_vex_df.isnull().values.any()
print(has_nan)

False


In [8]:
# sanity check: preview the first rows of the hourly EDAC frame
hourly_vex_df.head()

Unnamed: 0_level_0,DATE_TIME,EDAC
DATE_TIME,Unnamed: 1_level_1,Unnamed: 2_level_1
2005-11-09 00:00:00,2005-11-09 00:00:00,0
2005-11-09 01:00:00,2005-11-09 01:00:00,0
2005-11-09 02:00:00,2005-11-09 02:00:00,0
2005-11-09 03:00:00,2005-11-09 03:00:00,0
2005-11-09 05:00:00,2005-11-09 05:00:00,1


## Adding CME data
Now let's add the CME events:

In [9]:
vex_cme_df = pd.read_csv(data_path + "VEX_CME_date_time.csv")

# Every hour starts out as "no CME"; the loop below flags the CME windows.
hourly_vex_df['cme'] = 0

for _, cme_event in vex_cme_df.iterrows():
    # Event bounds are rounded to the nearest hour to line up with the hourly index.
    window_start = pd.to_datetime(cme_event['start_time']).round('H')
    window_end = pd.to_datetime(cme_event['end_time']).round('H')

    # Flag every hour inside the window (both bounds included).
    in_window = (hourly_vex_df.index >= window_start) & (hourly_vex_df.index <= window_end)
    hourly_vex_df.loc[in_window, 'cme'] = 1

In [10]:
# sanity check: the preview should now show the new 'cme' column
hourly_vex_df.head()

Unnamed: 0_level_0,DATE_TIME,EDAC,cme
DATE_TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2005-11-09 00:00:00,2005-11-09 00:00:00,0,0
2005-11-09 01:00:00,2005-11-09 01:00:00,0,0
2005-11-09 02:00:00,2005-11-09 02:00:00,0,0
2005-11-09 03:00:00,2005-11-09 03:00:00,0,0
2005-11-09 05:00:00,2005-11-09 05:00:00,1,0


##### Parenthesis : class imbalance
Let's have a look at the proportion of CME events: we see that only 1.8% of our data corresponds to a CME.

Since we have this huge class imbalance, we can expect the models we will train to be biased towards predicting no CMEs.
We will have to implement various methods to mitigate this class imbalance.

In [11]:
# Share of hourly rows flagged as CME, to quantify the class imbalance.
total_count = len(hourly_vex_df)
cme_count = int((hourly_vex_df['cme'] == 1).sum())
percentage = cme_count / total_count * 100

print(f"CME count: {cme_count}")
print(f"Total count: {total_count}")
print(f"Percentage of CME events: {percentage:.2f}%")


CME count: 1402
Total count: 77852
Percentage of CME events: 1.80%


End of parenthesis, back to our dataframe:

In [12]:
# sanity check: preview the labeled hourly frame before it is written to disk
hourly_vex_df.head()


Unnamed: 0_level_0,DATE_TIME,EDAC,cme
DATE_TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2005-11-09 00:00:00,2005-11-09 00:00:00,0,0
2005-11-09 01:00:00,2005-11-09 01:00:00,0,0
2005-11-09 02:00:00,2005-11-09 02:00:00,0,0
2005-11-09 03:00:00,2005-11-09 03:00:00,0,0
2005-11-09 05:00:00,2005-11-09 05:00:00,1,0


We save this data in a first csv that we will use to train our models (predict CME events based on EDAC):

In [13]:
# Folder that receives the training CSVs. The index is not written because
# the frame also carries the timestamp in its DATE_TIME column.
train_file_path = "DATA/training_data/"
edac_csv_name = 'VEX_edac_labeled.csv'
hourly_vex_df.to_csv(train_file_path + edac_csv_name, index=False)

## Adding magneto data
Now let's add the magneto data to a second dataframe. It lets us assess our predictor: we will see how large the performance gap is between predicting CME events from EDAC data alone and predicting them from both EDAC and magneto data:

In [45]:
# Read the magneto data from its CSV export
magneto_path = "DATA/VEX_MAGNETO/VEX-V-Y-MAG-4.csv"
magneto_df = pd.read_csv(filepath_or_buffer=magneto_path)

In [46]:
# Re-key the magneto frame on its date column, renamed to DATE_TIME so that
# it carries the same index name as our EDAC dataframe.
magneto_df = magneto_df.rename(columns={'date': 'DATE_TIME'}).set_index('DATE_TIME')

# The index must hold real datetimes (not strings) to line up with the
# EDAC dataframe's index when the two are merged.
magneto_df.index = pd.to_datetime(magneto_df.index)

In [51]:
# sanity check: both frames must expose the same index dtype before the merge
magneto_index_dtype = magneto_df.index.dtype
hourly_index_dtype = hourly_vex_df.index.dtype

print("magneto_df index type:", magneto_index_dtype)
print("hourly_vex_df index type:", hourly_index_dtype)

magneto_df index type: datetime64[ns]
hourly_vex_df index type: datetime64[ns]


In [48]:
# preview the magneto data, now indexed by DATE_TIME
magneto_df.head()

Unnamed: 0_level_0,BX,BY,BZ,BT,XSC,YSC,ZSC,RSC
DATE_TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2006-04-24 00:00:00,901.170761,887.078621,886.592311,901.582144,-3076.407309,-2872.475157,-67213.437339,67351.778022
2006-04-24 01:00:00,122.866044,107.284161,109.65445,123.584061,-56.789406,-2147.243992,-70446.888658,70485.459712
2006-04-24 02:00:00,230.88153,218.765052,216.810378,234.327848,2966.294489,-1402.796963,-72831.017578,72910.16416
2006-04-24 03:00:00,451.576201,437.057761,440.66227,455.704559,5959.518042,-649.828442,-74422.30215,74668.263456
2006-04-24 04:00:00,560.491882,547.190941,554.806417,566.054039,8894.553522,103.003431,-75259.615454,75788.153991


We merge this data with our first dataframe of EDACs and CMEs.

We will have many NaNs because the magneto data covers a shorter time span than the EDAC data : we will remove those.

In [49]:
# Left join on the datetime index: every hourly EDAC row is kept, and the
# magneto columns are NaN wherever no magneto row has the same timestamp.
merged_df = pd.merge(hourly_vex_df, magneto_df, how='left', left_index=True, right_index=True)

In [50]:
# Discard every row with at least one missing value, then preview the result.
merged_df.dropna(axis=0, how='any', inplace=True)
merged_df.head()

Unnamed: 0_level_0,DATE_TIME,EDAC,cme,BX,BY,BZ,BT,XSC,YSC,ZSC,RSC
DATE_TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2006-04-24 00:00:00,2006-04-24 00:00:00,0,0,901.170761,887.078621,886.592311,901.582144,-3076.407309,-2872.475157,-67213.437339,67351.778022
2006-04-24 01:00:00,2006-04-24 01:00:00,0,0,122.866044,107.284161,109.65445,123.584061,-56.789406,-2147.243992,-70446.888658,70485.459712
2006-04-24 02:00:00,2006-04-24 02:00:00,0,0,230.88153,218.765052,216.810378,234.327848,2966.294489,-1402.796963,-72831.017578,72910.16416
2006-04-24 03:00:00,2006-04-24 03:00:00,0,0,451.576201,437.057761,440.66227,455.704559,5959.518042,-649.828442,-74422.30215,74668.263456
2006-04-24 04:00:00,2006-04-24 04:00:00,0,0,560.491882,547.190941,554.806417,566.054039,8894.553522,103.003431,-75259.615454,75788.153991


We save this data in a second csv that we will use to train the same models.

This will allow us to assess the performance of our predictor: we will see how large the performance gap is between predicting CME events from EDAC data alone and predicting them from both EDAC and magneto data.

In [53]:
# Second training set (EDAC + magneto), written without the index like the first one.
edac_mag_csv_name = 'VEX_edac_mag_labeled.csv'
merged_df.to_csv(train_file_path + edac_mag_csv_name, index=False)