In [None]:
import pandas as pd
import numpy as np
from datetime import timedelta

# ----------------------------
# 0. Configure input & output paths
# ----------------------------
# Reference CSV to read; point this at your own file
input_csv_path  = "./../dataset/solar_system_positions_with_velocity.csv"
# Location and file name for the generated result
output_csv_path = "./../dataset/synthetic_test.csv"

# ----------------------------
# 1. Load the original data
# ----------------------------
# Parse 'date' as datetimes, order the rows chronologically and
# renumber the index so it follows that order.
df = (
    pd.read_csv(input_csv_path, parse_dates=['date'])
      .sort_values('date')
      .reset_index(drop=True)
)

# ----------------------------
# 2. Statistics (mean & covariance) per 'name'
# ----------------------------
# stats maps each body name to a (mean_vector, covariance_matrix) pair computed
# over the six position/velocity columns listed in numeric_cols.
numeric_cols = ['x_au','y_au','z_au','vx_au_per_day','vy_au_per_day','vz_au_per_day']
stats = {}
n_features = len(numeric_cols)
for name, grp in df.groupby('name')[numeric_cols]:
    mean_vec = grp.mean().values
    if len(grp) > 1:
        cov_mat = np.cov(grp.values, rowvar=False)
    else:
        # np.cov normalises by (N - 1); with a single observation that yields
        # an all-NaN matrix which the jitter below cannot repair. Fall back to
        # zero spread so the body is still usable for sampling.
        cov_mat = np.zeros((n_features, n_features))
    cov_mat += np.eye(n_features) * 1e-10  # small diagonal jitter so the matrix is not singular
    stats[name] = (mean_vec, cov_mat)

# ----------------------------
# 3. Lookup table: 'name' → 'naif_id'
# ----------------------------
# Keep one row per distinct (name, naif_id) pair, then turn it into a dict
# keyed by name.
unique_pairs = df[['name', 'naif_id']].drop_duplicates()
name_id_map = unique_pairs.set_index('name')['naif_id'].to_dict()

# ----------------------------
# 4. Fungsi simulasi Monte Carlo
# ----------------------------
def generate_simulation(df_ref,
                        stats, name_id_map,
                        past_days=0, future_days=0,
                        freq='D'):
    """
    Extend a reference table with randomly sampled rows before and after it.

    For every name in ``stats`` one six-component sample
    (x, y, z, vx, vy, vz) is drawn from a multivariate normal distribution
    for each simulated date, using NumPy's global random state. The synthetic
    rows are appended to ``df_ref`` and the result is sorted by name, then date.

    Parameters
    ----------
    df_ref : DataFrame
        Reference data with a 'date' column. The rows do not need to be
        sorted; the earliest and latest dates are looked up explicitly.
    stats : dict
        Maps name -> (mean_vector, covariance_matrix).
    name_id_map : dict
        Maps name -> naif_id; must contain every key of ``stats``.
    past_days, future_days : int or str
        Number of periods to simulate before the earliest / after the latest
        reference date. ``0``, ``'0'``, ``'No'`` or ``None`` skips that side.
    freq : str
        Frequency passed to ``pd.date_range`` (e.g. 'D', 'W', 'M').

    Returns
    -------
    DataFrame
        ``df_ref`` plus the synthetic rows, sorted by ['name', 'date'] with a
        fresh integer index.
    """
    # Use min/max rather than the first/last row so an unsorted df_ref does
    # not produce simulated dates that overlap the reference period.
    first_date = df_ref['date'].min()
    last_date = df_ref['date'].max()

    def make_dates(n_days, end=None, start=None):
        # Build one side of the simulated calendar, or nothing when skipped.
        if str(n_days).lower() in ('0', 'no', 'none'):
            return pd.DatetimeIndex([])
        n = int(n_days)
        if end is not None:
            # Backwards: the range ends the day before the first reference date.
            return pd.date_range(end=end - timedelta(days=1),
                                 periods=n, freq=freq)
        # Forwards: the range starts the day after the last reference date.
        return pd.date_range(start=start + timedelta(days=1),
                             periods=n, freq=freq)

    past_idx = make_dates(past_days, end=first_date)
    future_idx = make_dates(future_days, start=last_date)
    sim_dates = past_idx.union(future_idx)

    sort_keys = ['name', 'date']
    if len(sim_dates) == 0:
        # Nothing to simulate: avoid concatenating an empty frame.
        return df_ref.sort_values(sort_keys).reset_index(drop=True)

    value_cols = ['x_au', 'y_au', 'z_au',
                  'vx_au_per_day', 'vy_au_per_day', 'vz_au_per_day']
    frames = [df_ref]
    for name, (mean_vec, cov_mat) in stats.items():
        # One batched draw per body instead of one call per (body, date).
        samples = np.random.multivariate_normal(mean_vec, cov_mat,
                                                size=len(sim_dates))
        body = pd.DataFrame(samples, columns=value_cols)
        body.insert(0, 'date', sim_dates)
        body.insert(1, 'name', name)
        body.insert(2, 'naif_id', name_id_map[name])
        frames.append(body)

    df_all = pd.concat(frames, ignore_index=True)
    return df_all.sort_values(sort_keys).reset_index(drop=True)

# ----------------------------
# 5. Eksekusi & Simpan
# ----------------------------
if __name__ == '__main__':
    # Simulation horizon on each side of the reference data; adjust as needed.
    past_days_to_simulate   = 3650  # e.g. about 10 years back
    future_days_to_simulate = 3650  # e.g. about 10 years ahead

    df_out = generate_simulation(
        df, stats, name_id_map,
        past_days=past_days_to_simulate,
        future_days=future_days_to_simulate,
        freq='D',
    )

    # Write the combined (reference + synthetic) table to the output CSV.
    df_out.to_csv(output_csv_path, index=False)

    # Report the covered date span and where the file was written.
    earliest = df_out['date'].min().date()
    latest = df_out['date'].max().date()
    print("Selesai!")
    print("Rentang tanggal:", earliest, "sampai", latest)
    print("File tersimpan di:", output_csv_path)


Selesai!
Rentang tanggal: 2010-01-03 sampai 2034-12-29
File tersimpan di: ./../dataset/synthetic_test.csv
