In [1]:
import os
import numpy as np
import pandas as pd
from glob import glob
from config import CONFIG


In [2]:

def format_date_column(df, date_column="Date"):
    """Index a frame by a parsed date column and add Year/Month/Day columns.

    The values in ``date_column`` are parsed with ``pd.to_datetime``; entries
    that cannot be parsed become ``NaT`` and their rows are dropped. The
    parsed column then becomes the index, and integer ``Year``, ``Month`` and
    ``Day`` columns are derived from that index.

    The caller's DataFrame is not modified: all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``date_column``.
    date_column : str, default "Date"
        Name of the column holding the dates.

    Returns
    -------
    pandas.DataFrame
        New frame indexed by the parsed dates, with ``Year``, ``Month`` and
        ``Day`` columns added.

    Raises
    ------
    ValueError
        If ``date_column`` is not a column of ``df``.
    """
    if date_column not in df.columns:
        raise ValueError(f"'{date_column}' column not found.")

    # Work on a copy so the parsed dates are never written back into the
    # DataFrame the caller handed in.
    dated = df.copy()
    dated[date_column] = pd.to_datetime(dated[date_column], errors="coerce")
    dated = dated.dropna(subset=[date_column]).set_index(date_column)

    dated["Year"] = dated.index.year
    dated["Month"] = dated.index.month
    dated["Day"] = dated.index.day

    return dated


In [16]:
def compute_log_returns(df, value_column="Close", date_column="Start"):
    """Compute log returns of ``value_column`` in chronological order.

    The return for a row is ``log(value) - log(previous value)``, where
    "previous" means the preceding row after sorting by date. Rows are sorted
    by date first so the result does not depend on whether the input file is
    stored oldest-first or newest-first.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``date_column`` and ``value_column``.
    value_column : str, default "Close"
        Column whose log returns are computed.
    date_column : str, default "Start"
        Column holding the dates.

    Returns
    -------
    pandas.DataFrame
        Columns ``date_column``, ``Year``, ``Month``, ``Day`` and ``Value``
        (the log return), indexed by date. Rows without a defined return are
        dropped: the earliest row, and any row where the current or previous
        value is missing or not strictly positive.

    Raises
    ------
    ValueError
        If ``value_column`` is missing (raised here) or ``date_column`` is
        missing (raised by ``format_date_column``).
    """
    if value_column not in df.columns:
        raise ValueError(f"Column '{value_column}' not found in DataFrame.")

    # Hand the helper a copy so the caller's frame is never modified here.
    dated = format_date_column(df.copy(), date_column=date_column)

    # A log return compares each value with the one before it in time, so
    # enforce ascending dates. A stable sort keeps same-date rows in file order.
    dated = dated.sort_index(kind="stable")

    # log() of zero or a negative number gives -inf/NaN, and inf survives
    # dropna(); mask such values to NaN so the affected rows are dropped.
    prices = dated[value_column]
    prices = prices.where(prices > 0)
    log_return = np.log(prices).diff()

    result = pd.DataFrame({
        date_column: dated.index,
        "Year": dated["Year"],
        "Month": dated["Month"],
        "Day": dated["Day"],
        "Value": log_return,
    }).dropna()

    return result

def zscore_normalize(df, date_column="Start"):
    """Z-score every numeric column: ``(x - mean) / std`` with ``ddof=0``.

    The frame is first passed through ``format_date_column``, so the derived
    ``Year``, ``Month`` and ``Day`` columns are numeric and are standardized
    along with the rest.
    NOTE(review): confirm the date parts are meant to be scaled too;
    ``compute_log_returns`` keeps them raw.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``date_column``.
    date_column : str, default "Start"
        Column holding the dates.

    Returns
    -------
    pandas.DataFrame
        ``date_column`` and any non-numeric columns, followed by the
        standardized numeric columns, on a fresh 0..n-1 index. Rows that
        contain any missing value are dropped.
    """
    dated = format_date_column(df.copy(), date_column=date_column)
    numeric = dated.select_dtypes(include=[np.number])
    non_numeric = dated.drop(columns=numeric.columns, errors="ignore")

    # A constant column has zero standard deviation; dividing by 1 instead
    # maps it to 0.0 rather than NaN, which would make dropna() drop every row.
    spread = numeric.std(ddof=0).replace(0, 1)
    normalized = (numeric - numeric.mean()) / spread

    # Both halves must share the same 0..n-1 index before the column-wise
    # concat. Joining a RangeIndex frame with a date-indexed one matches no
    # labels, so every row would contain NaN and be dropped.
    return pd.concat(
        [non_numeric.reset_index(), normalized.reset_index(drop=True)],
        axis=1,
    ).dropna()

def minmax_normalize(df, feature_range=(0, 1), date_column="Start"):
    """Rescale every numeric column linearly into ``feature_range``.

    Each numeric column is mapped with
    ``low + (x - min) / (max - min) * (high - low)``. The frame is first
    passed through ``format_date_column``, so the derived ``Year``, ``Month``
    and ``Day`` columns are numeric and are rescaled along with the rest.
    NOTE(review): confirm the date parts are meant to be scaled too;
    ``compute_log_returns`` keeps them raw.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``date_column``.
    feature_range : tuple of (low, high), default (0, 1)
        Target range for the rescaled values.
    date_column : str, default "Start"
        Column holding the dates.

    Returns
    -------
    pandas.DataFrame
        ``date_column`` and any non-numeric columns, followed by the rescaled
        numeric columns, on a fresh 0..n-1 index. Rows that contain any
        missing value are dropped.
    """
    dated = format_date_column(df.copy(), date_column=date_column)
    numeric = dated.select_dtypes(include=[np.number])
    non_numeric = dated.drop(columns=numeric.columns, errors="ignore")

    low, high = feature_range[0], feature_range[1]
    col_min = numeric.min()

    # A constant column has max == min; dividing by 1 instead maps it to
    # `low` rather than NaN, which would make dropna() drop every row.
    col_span = (numeric.max() - col_min).replace(0, 1)
    normalized = low + (numeric - col_min) / col_span * (high - low)

    # Both halves must share the same 0..n-1 index before the column-wise
    # concat. Joining a RangeIndex frame with a date-indexed one matches no
    # labels, so every row would contain NaN and be dropped.
    return pd.concat(
        [non_numeric.reset_index(), normalized.reset_index(drop=True)],
        axis=1,
    ).dropna()


In [4]:
def split_and_save(df, base_filename, split_ratio=0.9):
    """Split a frame by row position and write both parts to CSV.

    The first ``int(len(df) * split_ratio)`` rows go to
    ``training data/<base_filename>_train.csv`` and the remaining rows to
    ``testing data/<base_filename>_test.csv``. Both folders are created if
    they do not exist, the index is not written, and a one-line summary per
    file is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; row order determines which rows land in which part.
    base_filename : str
        Stem used for the two output file names.
    split_ratio : float, default 0.9
        Fraction of rows assigned to the training part.
    """
    train_dir, test_dir = "training data", "testing data"

    cutoff = int(len(df) * split_ratio)
    train_df, test_df = df.iloc[:cutoff], df.iloc[cutoff:]

    for folder in (train_dir, test_dir):
        os.makedirs(folder, exist_ok=True)

    train_df.to_csv(os.path.join(train_dir, f"{base_filename}_train.csv"), index=False)
    test_df.to_csv(os.path.join(test_dir, f"{base_filename}_test.csv"), index=False)

    print(f"✅ {base_filename} — Train: {len(train_df)} → training data/")
    print(f"✅ {base_filename} — Test: {len(test_df)} → testing data/")


In [5]:
def normalize_and_split(csv_path, transformer_fn, date_column="Date", split_ratio=0.9):
    """Read one CSV, transform it, and save a train/test split of the result.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    transformer_fn : callable
        Called with the loaded DataFrame as its only argument; its return
        value is what gets split and saved.
    date_column : str, default "Date"
        Accepted for signature compatibility but not used here: it is not
        passed on to ``transformer_fn``.
    split_ratio : float, default 0.9
        Forwarded to ``split_and_save``.
    """
    transformed = transformer_fn(pd.read_csv(csv_path))

    # Output files are named after the input file, minus its extension.
    stem, _extension = os.path.splitext(os.path.basename(csv_path))
    split_and_save(transformed, stem, split_ratio)


In [6]:
def process_all_csvs(root_dir, transformer_fn, date_column="Date", split_ratio=0.9):
    """Run ``normalize_and_split`` on every ``*.csv`` found under ``root_dir``.

    The search is recursive. Files are processed one at a time; if one file
    raises, the error is printed and processing continues with the next file.

    Parameters
    ----------
    root_dir : str
        Folder to search for CSV files.
    transformer_fn : callable
        Forwarded to ``normalize_and_split``.
    date_column : str, default "Date"
        Forwarded to ``normalize_and_split``.
    split_ratio : float, default 0.9
        Forwarded to ``normalize_and_split``.
    """
    pattern = os.path.join(root_dir, "**", "*.csv")
    csv_files = glob(pattern, recursive=True)
    print(f"🔍 Found {len(csv_files)} CSV file(s) in {root_dir}")

    shared_kwargs = {
        "transformer_fn": transformer_fn,
        "date_column": date_column,
        "split_ratio": split_ratio,
    }
    for csv_path in csv_files:
        try:
            normalize_and_split(csv_path, **shared_kwargs)
        except Exception as e:
            # Best-effort batch run: report the failure and keep going.
            print(f"❌ Failed to process {csv_path}: {e}")


#### Option 1: Normalize a single file
```
normalize_and_split(
    csv_path="Data Files/bitcoin.csv",
    transformer_fn=compute_log_returns
)
```
#### Option 2: Normalize all files in a folder
```
process_all_csvs(
    root_dir="Data Files",
    transformer_fn=compute_log_returns
)
```

In [17]:
# Resolve the input relative to the working directory instead of an absolute,
# user-specific path, so the cell runs on any machine.
# NOTE(review): assumes the notebook is run from the project folder that
# contains "Data Files" — confirm.
normalize_and_split(
    os.path.join("Data Files", "bitcoin_2010-07-29_2025-04-25.csv"),
    transformer_fn=compute_log_returns,
    date_column="Start",
    split_ratio=0.9,
)

✅ bitcoin_2010-07-29_2025-04-25 — Train: 4844 → training data/
✅ bitcoin_2010-07-29_2025-04-25 — Test: 539 → testing data/
