Investigation of the differences (residuals) between the GFOC and SWMA datasets for features that should be identical in both.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the GOFC data more efficiently
GFOC_dir = "/home/dschwarz/Documents/MT/Dataset_MSc/GFOC_RDCDFI.csv"
GFOC_data = pd.read_csv(GFOC_dir, low_memory=True)

# Load the SWMA data more efficiently
SWMA_dir = "/home/dschwarz/Documents/MT/Dataset_MSc/SWMA_RDAWFI.csv"
SWMA_data = pd.read_csv(SWMA_dir, low_memory=True)

In [None]:
import matplotlib.dates as mdates

# =========================== Input ===================================
# Row interval to analyse; it is applied as the slice [start:end] to both datasets.
# end = None keeps every row through the final one, whereas an end of -1
# would silently drop the last sample of each dataset.
start, end = 0, None

# Spacing of the major ticks on the time axis: 'monthly', 'daily', or 'hourly'
tick_interval = 'monthly'
tick_step = 2  # One major tick every tick_step units (e.g., every month = 1, every 2 months = 2, etc.)

# Residuals
# features to check
# features = ['SymD (Omni)', 'SymH (Omni)', 'AsyD (Omni)', 'AsyH (Omni)', 'Percent Interpolated', 'Timeshift (seconds)', 'Time between observations (seconds)', '|avg B|', 'Flow Speed (km/s', 'Proton density (n/cc)', 'Temperature (K)', 'Alpha/Proton Ratio', 'Flow pressure (nPa)', 'Electric Field (Mv/m)', 'Plasma beta', 'Alfven mach number', 'Magnetosonic mach number', 'Vx Velocity (km/s)', 'Vy Velocity (km/s)', 'Vz Velocity (km/s)', 'Bx GSE', 'By GSE', 'Bz GSE', 'By GSM', 'Bz GSM', 'Spacecraft Position x', 'Spacecraft Position y', 'Spacecraft Position z', 'Bow Shock Nose Position x', 'Bow Shock Nose Position y', 'Bow Shock Nose Position z', 'RMS SD B scalar (nT)', 'RMS SD B vector (nT)', 'RMS Timeshift (seconds)', '# fine scale IMF points', '# fine scale Plasma points', 'Approximate Distance to SEL (Re)', 'F10.7 (LASP)', 'Kp (LASP)', 'ap (LASP)', 'Dst (nT) (LASP)']
# fname
# fname = ['SymD (Omni)', 'SymH (Omni)', 'AsyD (Omni)', 'AsyH (Omni)', 'Percent Interpolated', 'Timeshift (seconds)', 'Time between observations (seconds)', '|avg B|', 'Flow Speed (kms', 'Proton density (ncc)', 'Temperature (K)', 'AlphaProton Ratio', 'Flow pressure (nPa)', 'Electric Field (Mvm)', 'Plasma beta', 'Alfven mach number', 'Magnetosonic mach number', 'Vx Velocity (kms)', 'Vy Velocity (kms)', 'Vz Velocity (kms)', 'Bx GSE', 'By GSE', 'Bz GSE', 'By GSM', 'Bz GSM', 'Spacecraft Position x', 'Spacecraft Position y', 'Spacecraft Position z', 'Bow Shock Nose Position x', 'Bow Shock Nose Position y', 'Bow Shock Nose Position z', 'RMS SD B scalar (nT)', 'RMS SD B vector (nT)', 'RMS Timeshift (seconds)', 'No fine scale IMF points', 'No fine scale Plasma points', 'Approximate Distance to SEL (Re)', 'F10_7 (LASP)', 'Kp (LASP)', 'ap (LASP)', 'Dst (nT) (LASP)']

# Maneuvers: flag columns to compare between the two datasets
features = [
    'is_maneuver_day',
    'is_missing_day',
    'is_outlier_maneuver_day',
    'is_maneuver_day_extended',
    'is_missing_day_extended',
    'is_man_or_missing',
    'is_maneuver_unresolved',
    'is_maneuver_unresolved_10m_decay',
    'is_maneuver_period_generic',
]
# File-name stem for each feature, index-aligned with `features`.
# These column names are used unchanged, so an independent copy is enough.
fname = list(features)
# =====================================================================

# Helper function for tick formatting
def format_ticks(ax, tick_interval, tick_step):
    """Configure the major x-axis date ticks of ``ax`` and rotate their labels.

    Parameters
    ----------
    ax
        Object exposing ``xaxis.set_major_locator``, ``xaxis.set_major_formatter``
        and ``tick_params`` (presumably a matplotlib Axes).
    tick_interval
        ``'monthly'``, ``'daily'`` or ``'hourly'``. Any other value leaves the
        locator and formatter untouched; only the label rotation is applied.
    tick_step
        Passed as ``interval`` to the selected locator.
    """
    major_locator = None
    label_format = None

    # Pick the locator and label format that belong to the requested spacing.
    if tick_interval == 'monthly':
        major_locator = mdates.MonthLocator(interval=tick_step)
        label_format = '%Y-%b'
    elif tick_interval == 'daily':
        major_locator = mdates.DayLocator(interval=tick_step)
        label_format = '%Y-%m-%d'
    elif tick_interval == 'hourly':
        major_locator = mdates.HourLocator(interval=tick_step)
        label_format = '%Y-%m-%d %H:%M'

    # Apply them only when one of the known spacings matched.
    if major_locator is not None:
        ax.xaxis.set_major_locator(major_locator)
        ax.xaxis.set_major_formatter(mdates.DateFormatter(label_format))

    # The label rotation is applied in every case.
    ax.tick_params(axis='x', rotation=60)

# Time conversion: parse the timestamps of the selected interval.
# Slicing keeps the original row labels, so the time series share their
# labels with the feature columns sliced the same way below.
GFOC_time = pd.to_datetime(GFOC_data['time'][start:end], format='%Y-%m-%d %H:%M:%S')
SWMA_time = pd.to_datetime(SWMA_data['time'][start:end], format='%Y-%m-%d %H:%M:%S')

# Only rows whose GFOC timestamp is exactly midnight are reported. The mask
# does not depend on the feature, so it is built once instead of per iteration.
at_midnight = GFOC_time.dt.time == pd.Timestamp("00:00:00").time()

for i, feature in enumerate(features):
    # Check if the feature exists in both datasets
    if feature not in GFOC_data.columns or feature not in SWMA_data.columns:
        print(f"Feature '{feature}' not found in both datasets.")
        continue

    # Residuals: GFOC - SWMA (the subtraction aligns the two series on their row labels)
    try:
        residuals = GFOC_data[feature][start:end].astype(float) - SWMA_data[feature][start:end].astype(float)
    except ValueError as e:
        print(f"Error calculating residuals for feature '{feature}': {e}")
        continue

    # Record times and feature values where residuals are not zero
    # non_zero_residuals = residuals[(residuals != 0) & (~residuals.isna())]
    non_zero_residuals = residuals[(residuals != 0) & (~residuals.isna()) & at_midnight]
    if non_zero_residuals.empty:
        continue

    # The index of non_zero_residuals holds row LABELS, so select with .loc.
    # Positional .iloc only matches while start == 0 on a default index; for
    # any other start it picks the wrong rows of the sliced time series.
    rows = non_zero_residuals.index
    result = pd.DataFrame({
        'Time_GFOC': GFOC_time.loc[rows],
        'Time_SWMA': SWMA_time.loc[rows],
        f'GFOC_{feature}': GFOC_data[feature].loc[rows],
        f'SWMA_{feature}': SWMA_data[feature].loc[rows],
        f'Residual_{feature}': non_zero_residuals
    })
    # File names are prefixed with i + 96 (numbering offset chosen by the author).
    # result.to_csv(f'/home/dschwarz/Documents/MT/Dataset_MSc/Residuals/{str(i+96)}_{fname[i]}_res.csv', index=True)
    result.to_csv(f'/home/dschwarz/Documents/MT/Dataset_MSc/Manuevers/{str(i+96)}_{fname[i]}_res.csv', index=True)


Feature 'is_outlier_maneuver_day' not found in both datasets.
