In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw feature table from its CSV file into a DataFrame
original_data = pd.read_csv(
    filepath_or_buffer="../cme_and_electron/new_data/SEP10MeV_Features.csv"
)

| Feature/Target Variable        | Preprocessing Step                                                                 |
|-------------------------------|------------------------------------------------------------------------------------|
| peak_intensity                 | Take the natural log, then divide by the max of the natural log                    |
| CMEs_over_1000_past_9_hrs      | Divide by max (2)                                                                  |
| CMEs_past_9_hours              | Divide by max (6)                                                                  |
| V log V                        | Take the natural log, then divide by the max of the natural log                     |
| longitude                      | Normalize to range $[-1, 1]$ by dividing by 180                                    |
| MPA                            | Normalize to range $[0, 1]$ by dividing by 360                                     |
| latitude                       | Normalize to range $[-1, 1]$ by dividing by 90                                     |
| Acceleration (Accel)           | Divide by max                                                                      |
| Linear Speed (donki_speed)     | Divide by max                                                                      |
| Richardson's Equation          | Take the natural log, then divide by the absolute value of the min of the natural log |
| 2nd Order Speed Final          | Divide by max                                                                      |
| 2nd Order Speed at 20 Solar Radii | Divide by max                                                              |
| Max Speed Past Day             | Divide by max                                                                      |
| CMEs in the Past Month         | Divide by max                                                                      |
| Daily Sunspot Count            | Divide by max                                                                      |
| Half Width (donki_ha)          | Divide by max                                                                      |
| CPA (Central_PA)               | Divide by max                                                                      |
| Diffusive Shock (V^V^2_replacement) | Take the natural log, then divide by the absolute value of the min of the natural log |
| Halo                           | No transformation (categorical)                                                    |
| Type II Visualization Area     | If zero, leave as zero. Else, take the natural log, then divide by the max of the natural log |

In [13]:
# # Define the updated preprocessing functions based on the new table
# def updated_preprocess_data(df):
#     new_data = pd.DataFrame()
# 
#     # Apply transformations as specified in the updated table
#     log_peak_intensity_max = np.log(df['peak_intensity']).max()
#     new_data['log_peak_intensity_norm'] = np.log(df['peak_intensity']) / log_peak_intensity_max
# 
#     new_data['CMEs_over_1000_past_9_hrs_norm'] = df['CMEs_over_1000_past_9_hrs'] / 2
#     new_data['CMEs_past_9_hours_norm'] = df['CMEs_past_9_hours'] / 6
# 
#     log_v_log_v_max = np.log(df['V log V']).max()
#     new_data['log_V_log_V_norm'] = np.log(df['V log V']) / log_v_log_v_max
# 
#     new_data['longitude_norm'] = df['longitude'] / 180
#     new_data['MPA_norm'] = df['MPA'] / 360
#     new_data['latitude_norm'] = df['latitude'] / 90
#     new_data['Accel_norm'] = df['Accel'] / df['Accel'].max()
#     new_data['donki_speed_norm'] = df['donki_speed'] / df['donki_speed'].max()
# 
#     # Take the natural log of the 'V^V^2_replacement' (Diffusive Shock) and 'richardson_formula_1.0_c' (Richardson's Equation)
#     df['log_diffusive_shock'] = np.log(df['V^V^2_replacement'])
#     df['log_richardson_formula'] = np.log(df['richardson_formula_1.0_c'])
# 
#     # Find the absolute value of the minimum of these logged features
#     abs_min_log_diffusive_shock = np.abs(df['log_diffusive_shock'].min())
#     abs_min_log_richardson_formula = np.abs(df['log_richardson_formula'].min())
# 
#     # Divide by the absolute value of the min
#     new_data['log_diffusive_shock_norm'] = df['log_diffusive_shock'] / abs_min_log_diffusive_shock
#     new_data['log_richardson_formula_norm'] = df['log_richardson_formula'] / abs_min_log_richardson_formula
# 
#     new_data['2nd_order_speed_final_norm'] = df['2nd_order_speed_final'] / df['2nd_order_speed_final'].max()
#     new_data['2nd_order_speed_20R_norm'] = df['2nd_order_speed_20R'] / df['2nd_order_speed_20R'].max()
#     new_data['Max_speed_past_day_norm'] = df['Max_speed_past_day'] / df['Max_speed_past_day'].max()
#     new_data['CMEs_past_month_norm'] = df['CMEs_past_month'] / df['CMEs_past_month'].max()
#     new_data['sunspots_norm'] = df['sunspots'] / df['sunspots'].max()
#     new_data['donki_ha_norm'] = df['donki_ha'] / df['donki_ha'].max()
#     new_data['Central_PA_norm'] = df['Central_PA'] / df['Central_PA'].max()
# 
#     new_data['HALO'] = df['HALO']
# 
#     log_type_2_area_max = np.log(df[df['Type_2_Area'] > 0]['Type_2_Area']).max()
#     new_data['log_Type_2_Area_norm'] = df['Type_2_Area'].apply(
#         lambda x: 0 if x == 0 else np.log(x + 1) / log_type_2_area_max)
# 
#     return new_data

| Feature/Target Variable            | Preprocessing Step                                                                                                  |
|------------------------------------|---------------------------------------------------------------------------------------------------------------------|
| peak_intensity                     | Take the natural log                                                                                                |
| CMEs_over_1000_past_9_hrs          | Map to 0-1 using min-max normalization                                                                               |
| CMEs_past_9_hours                  | Map to 0-1 using min-max normalization                                                                               |
| V log V                            | Map to 0-1 using min-max normalization                                                                               |
| longitude                          | Map to 0-1 using min-max normalization                                                                               |
| MPA                                | Map to 0-1 using min-max normalization                                                                               |
| latitude                           | Map to 0-1 using min-max normalization                                                                               |
| Acceleration (Accel)               | Map to 0-1 using min-max normalization                                                                               |
| Linear Speed (donki_speed)         | Map to 0-1 using min-max normalization                                                                               |
| Richardson's Equation              | Negate the (negative) value, take the natural log, map to 0-1 using min-max normalization based on natural log       |
| 2nd Order Speed Final              | Map to 0-1 using min-max normalization                                                                               |
| 2nd Order Speed at 20 Solar Radii  | Map to 0-1 using min-max normalization                                                                               |
| Max Speed Past Day                 | Map to 0-1 using min-max normalization                                                                               |
| CMEs in the Past Month             | Map to 0-1 using min-max normalization                                                                               |
| Daily Sunspot Count                | Map to 0-1 using min-max normalization                                                                               |
| Half Width (donki_ha)              | Map to 0-1 using min-max normalization                                                                               |
| CPA (Central_PA)                   | Map to 0-1 using min-max normalization                                                                               |
| Diffusive Shock (V^V^2_replacement) | Take the natural log, map to 0-1 using min-max normalization based on natural log                                    |
| Halo                               | No transformation (categorical)                                                                                      |
| Type II Visualization Area         | If zero, replace with 1 so that its natural log is 0. Take the natural log, then map to 0-1 using min-max normalization based on the natural log values |


In [3]:
def updated_preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply the preprocessing steps of the scheme table to the given dataframe.

    The input dataframe is not modified. The output contains, in this order:
    ``log_peak_intensity`` (natural log only, not rescaled), one ``<name>_norm``
    column per min-max normalized raw feature, the three log-then-min-max
    columns (``log_richardson_value_norm``, ``log_diffusive_shock_norm``,
    ``log_Type2_Viz_Area_norm``) and the untouched ``Halo`` column.

    :param df (pd.DataFrame): Original dataframe with raw features.
    :return pd.DataFrame: Dataframe with features processed according to the scheme table.
    :raises KeyError: If any of the required columns is missing from ``df``.
    """

    def min_max_norm(values: pd.Series) -> pd.Series:
        """Rescale ``values`` linearly so that its min maps to 0 and its max to 1."""
        lowest = values.min()
        shifted = values - lowest
        span = values.max() - lowest
        if span == 0:
            # A constant column has no spread; dividing by the zero range would
            # fill it with NaN, so map every entry to 0.0 instead.
            return shifted.astype(float)
        return shifted / span

    # Raw features that are only min-max normalized (output column: "<name>_norm")
    min_max_features = [
        'VlogV',
        'CME_DONKI_speed',
        '2nd_order_speed_final',
        '2nd_order_speed_20R',
        'CMEs_with_speed_over_1000_in_past_9hours',
        'max_CME_speed_in_past_day',
        'CMEs_in_past_month',
        'CME_DONKI_longitude',
        'CME_CDAW_MPA',
        'daily_sunspots',
        'DONKI_half_width',
        'CME_DONKI_latitude',
        'Accelaration',
        'CPA',
        'CMEs_in_past_9hours',
    ]

    # The target keeps its natural-log scale; it is not squeezed into [0, 1].
    preprocessed_data = {'log_peak_intensity': np.log(df['peak_intensity'])}

    for feature in min_max_features:
        preprocessed_data[f"{feature}_norm"] = min_max_norm(df[feature])

    # Log-transformed intermediates are kept local: only their normalized
    # versions belong in the output.
    # half_richardson_value is negated before the log, so its raw values are
    # expected to be negative.
    log_half_richardson_value = np.log(-df['half_richardson_value'])
    log_diffusive_shock = np.log(df['diffusive_shock'])
    # Zero areas are replaced by 1 so that their log is 0 rather than -inf.
    type2_viz_area = df['Type2_Viz_Area']
    log_type2_viz_area = np.log(type2_viz_area.where(type2_viz_area != 0, 1))

    preprocessed_data['log_richardson_value_norm'] = min_max_norm(log_half_richardson_value)
    preprocessed_data['log_diffusive_shock_norm'] = min_max_norm(log_diffusive_shock)
    preprocessed_data['log_Type2_Viz_Area_norm'] = min_max_norm(log_type2_viz_area)

    # No transformation for 'Halo'
    preprocessed_data['Halo'] = df['Halo']

    return pd.DataFrame(preprocessed_data)


In [4]:
# Run the preprocessing function on the raw data and preview the first rows
updated_preprocessed_data = updated_preprocess_data(df=original_data)
updated_preprocessed_data.head()

Unnamed: 0,log_peak_intensity,VlogV_norm,CME_DONKI_speed_norm,2nd_order_speed_final_norm,2nd_order_speed_20R_norm,CMEs_with_speed_over_1000_in_past_9hours_norm,max_CME_speed_in_past_day_norm,CMEs_in_past_month_norm,CME_DONKI_longitude_norm,CME_CDAW_MPA_norm,daily_sunspots_norm,DONKI_half_width_norm,CME_DONKI_latitude_norm,Accelaration_norm,CPA_norm,CMEs_in_past_9hours_norm,log_richardson_value_norm,log_diffusive_shock_norm,log_Type2_Viz_Area_norm,Halo
0,-1.609438,0.167678,0.20438,0.212945,0.177307,0.0,0.194085,0.0,0.522222,0.475,0.120603,0.241379,0.533708,0.251235,1.0,0.0,0.687284,0.48571,0.0,1
1,-1.609438,0.077094,0.160584,0.155663,0.116685,0.0,0.149723,0.0,0.825,0.819444,0.125628,0.287356,0.589888,0.25859,0.816667,0.0,0.789503,0.443832,0.0,0
2,-1.609438,0.146279,0.186131,0.164725,0.146996,0.0,0.175601,0.012987,0.277778,0.266667,0.090452,0.172414,0.421348,0.245561,0.3,0.0,0.946667,0.469356,0.0,0
3,-1.609438,0.060073,0.206204,0.176052,0.119903,0.0,0.195933,0.025974,0.597222,0.655556,0.065327,0.206897,0.47191,0.261427,0.663889,0.0,0.257511,0.487249,0.0,0
4,-1.609438,0.02088,0.255474,0.032362,0.0,0.0,0.245841,0.012987,0.438889,0.261111,0.120603,0.494253,0.533708,0.236524,0.25,0.0,0.79741,0.84053,0.0,0


In [5]:
# Per-column range check: reduce every preprocessed column to its min and max,
# then transpose so each column of the data becomes one row of the summary.
# NOTE(review): recent pandas versions may warn about NumPy reducers passed to
# aggregate; switching to the strings 'min'/'max' would rename the row labels.
updated_min_max_values = updated_preprocessed_data.aggregate([np.min, np.max]).transpose()
updated_min_max_values


Unnamed: 0,amin,amax
log_peak_intensity,-1.609438,8.732079
VlogV_norm,0.0,1.0
CME_DONKI_speed_norm,0.0,1.0
2nd_order_speed_final_norm,0.0,1.0
2nd_order_speed_20R_norm,0.0,1.0
CMEs_with_speed_over_1000_in_past_9hours_norm,0.0,1.0
max_CME_speed_in_past_day_norm,0.0,1.0
CMEs_in_past_month_norm,0.0,1.0
CME_DONKI_longitude_norm,0.0,1.0
CME_CDAW_MPA_norm,0.0,1.0


In [6]:
# Define the function to save the DataFrame to a CSV file
def save_dataframe_to_csv(df, file_path):
    """
    Write a DataFrame to the given location as CSV, without the index column.

    Parameters:
        df (pd.DataFrame): The table to write out.
        file_path (str): Destination handed straight to ``DataFrame.to_csv``.
    """
    df.to_csv(path_or_buf=file_path, index=False)


# Destination CSV for the preprocessed dataset
file_path_to_save = '../cme_and_electron/cme_josias_10MeV.csv'

# Write the preprocessed features to that destination
save_dataframe_to_csv(df=updated_preprocessed_data, file_path=file_path_to_save)

In [7]:
# max_peak_intensity = 6198.6
# log_peak_intensity_max = np.log(max_peak_intensity)
# log_peak_intensity_max

8.732078739083455

In [9]:
# def reverse_log_peak_intensity_norm(log_peak_intensity_norm, log_peak_intensity_max=8.732078739083455, in_log=False):
#     """
#     Reverse the normalization of log_peak_intensity_norm to obtain either peak_intensity or log_peak_intensity.
#     
#     Parameters:
#     - log_peak_intensity_norm (float or np.ndarray): The normalized log of peak intensity to be reversed.
#     - log_peak_intensity_max (float): The maximum value of the log of the original peak intensity.
#     - in_log (bool): If True, returns the log of the peak intensity. Otherwise, returns the peak intensity itself.
#     
#     Returns:
#     - float or np.ndarray: The reversed peak intensity or its log, depending on the value of in_log.
#     """
#     # Reverse normalization to get log_peak_intensity
#     log_peak_intensity = log_peak_intensity_norm * log_peak_intensity_max
# 
#     if in_log:
#         return log_peak_intensity
#     else:
#         # Exponentiate to get back to peak_intensity
#         return np.exp(log_peak_intensity)

In [10]:
# print(reverse_log_peak_intensity_norm(0.034652126))

1.3533528319798735
