In [51]:
%run -i ./utilities/constants.ipynb

In [52]:
import warnings
warnings.filterwarnings('ignore')

In [53]:
import pandas as pd
import os
import json

In [1]:
txt = False

In [55]:
if txt:
    path_kp = os.path.join(DATA_DIR, "kp_data.txt")
    path_sn = os.path.join(DATA_DIR, "sn_data.txt")
else:
    path_sn = os.path.join(DATA_DIR, "SN_d_tot_V2.0.csv") # data described: https://www.sidc.be/SILSO/newdataset
    path_kp = os.path.join(DATA_DIR, "kp_index.json") # data described: https://kp.gfz-potsdam.de/en/data


In [56]:

if txt:
    data_kp = pd.read_csv(path_kp)
    data_kp["datetime"] = pd.to_datetime(data_kp["datetime"])
    data_kp["datetime"] = data_kp["datetime"].dt.floor("d")
    data_kp = data_kp.groupby(data_kp['datetime'].dt.date)['Kp'].max().reset_index()
    data_kp["datetime"] = pd.to_datetime(data_kp["datetime"])
else:
    with open(path_kp, 'r') as f:
        data_kp = json.load(f)
    # convert to dataframe
    data_kp = pd.DataFrame.from_dict(data_kp)
    # convert datetime
    data_kp["datetime"] = pd.to_datetime(data_kp["datetime"])
    # normalize time resolution to one day inteval
    data_kp = data_kp.groupby(data_kp['datetime'].dt.date)['Kp'].max().reset_index()
    # convert Timestamp
    data_kp["datetime"] = pd.to_datetime(data_kp["datetime"])

# Transform: kp values to binary
threshold = 5
print(f"data under threshold: {len(data_kp[data_kp["Kp"] <= threshold])}, data over threshold: {len(data_kp[data_kp["Kp"] >= threshold])}")
data_kp["Kp"] = (data_kp["Kp"] >= threshold).astype("int")

data under threshold: 4393, data over threshold: 1003


In [57]:
if txt:
    data_sn = pd.read_csv(path_sn)
    data_sn = data_sn.rename(columns={"date": "datetime", "SN":"solar_spots_per_day"})
    data_sn["datetime"] = pd.to_datetime(data_sn["datetime"])
else:
    column_names = ["year", "month", "day", "decimal_year", "SNvalue" , "SNerror", "Nb_observations"]
    data_sn = pd.read_csv(path_sn, sep=";", names=column_names, index_col=False)
    data_sn = data_sn.rename(columns={"Nb_observations":"solar_spots_per_day"})
    # extract and convert datetime
    data_sn['datetime'] = pd.to_datetime(data_sn[['year', 'month', 'day']]) + pd.to_timedelta(data_sn['decimal_year'], unit='D')
    data_sn = data_sn.drop(columns=["year", "month", "day", "decimal_year"])
    # filter data to max time resolution for start and end time
    data_sn["datetime"] = data_sn["datetime"].dt.floor("d")

start_date = data_kp["datetime"].iloc[0]
end_date = data_kp["datetime"].iloc[-1]
data_sn = data_sn[(data_sn["datetime"] >= start_date) & (data_sn["datetime"] <= end_date)].reset_index(drop=True)

In [59]:
data_sn["datetime"]

0      2010-05-21
1      2010-05-22
2      2010-05-23
3      2010-05-24
4      2010-05-25
          ...    
5110   2024-05-17
5111   2024-05-18
5112   2024-05-19
5113   2024-05-20
5114   2024-05-21
Name: datetime, Length: 5115, dtype: datetime64[ns]

In [28]:
def drop_mismatched_dates(df1, df2):
    """
    Drop rows with mismatched dates between two dataframes.
    """
    # Convert date columns to datetime if they're not already in datetime format
    if not pd.api.types.is_datetime64_any_dtype(df1["datetime"]):
        df1["datetime"] = pd.to_datetime(df1["datetime"])
    if not pd.api.types.is_datetime64_any_dtype(df2["datetime"]):
        df2["datetime"] = pd.to_datetime(df2["datetime"])

    # Find dates that appear in one dataframe but not in the other
    dates_in_df1_only = df1[~df1["datetime"].isin(df2["datetime"])]
    dates_in_df2_only = df2[~df2["datetime"].isin(df1["datetime"])]

    # Drop mismatched rows from both dataframes
    df1_clean = df1.drop(dates_in_df1_only.index)
    df2_clean = df2.drop(dates_in_df2_only.index)
    
    return df1_clean, df2_clean

# remove mismatched data
data_kp, data_sn = drop_mismatched_dates(data_kp, data_sn)

In [60]:
# ensure no NaN values
data_kp.isna().any()

datetime    False
Kp          False
dtype: bool

In [61]:
data_sn.isna().any()

datetime               False
solar_spots_per_day    False
dtype: bool

In [62]:
# merge on datetime
df = pd.merge(data_kp, data_sn, on='datetime', how='inner')