In [1]:
import os, math, pickle
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics import mean_squared_error, mean_absolute_error

#import tensorflow as tf
#from tensorflow.keras import layers, models, callbacks, optimizers
import warnings
warnings.filterwarnings("ignore")

  from scipy.sparse import issparse


In [2]:
pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)

In [3]:
# Load training data
df = pickle.load(open(r'C:\CB\q3\train.pkl', 'rb')).copy()

In [4]:
df['date'] = pd.to_datetime(df['date'], unit='D', origin='2024-01-01')

In [5]:
#df["date"] = pd.to_datetime(df["date"])
df = df.sort_values(["code", "date"]).reset_index(drop=True)

In [6]:
# Encode series id (fast and compact)
df["code_id"] = df["code"].astype("category").cat.codes.astype(np.int32)

In [7]:
# Identify base feature columns
base_feats = [c for c in df.columns if c.startswith("f_")]
print("Rows:", len(df), " | Series:", df["code"].nunique(), " | Base features:", len(base_feats))
df.head()

Rows: 2431658  | Series: 4925  | Base features: 20


Unnamed: 0,code,date,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,y,code_id
0,s_0,2024-05-01,0.3947,-0.0635,7.5625,0.0,0.048135,0.582772,0.0,1.952015,0.0,0.933833,0.0,0.0,1.452481,0.0,0.0,0.0,0.130791,7e-05,-0.00933,0.0,-0.046793,0
1,s_0,2024-05-02,0.302114,-0.03962,7.5625,0.0,0.003498,0.571028,0.0,1.7189,0.0,0.921778,0.0,0.0,1.322955,0.0,0.0,0.007457,0.132136,3.7e-05,-0.000959,0.0,-0.001007,0
2,s_0,2024-05-03,0.210442,-0.021979,7.5625,0.0,-0.035704,0.566825,0.0,1.529921,0.0,0.915979,0.0,0.0,1.203602,0.0,0.0,0.017134,0.121744,2.6e-05,0.02407,0.0,0.05723,0
3,s_0,2024-05-04,0.091796,-0.014817,7.5625,0.0,-0.073599,0.562066,0.0,1.483651,0.0,0.919647,0.0,0.0,1.197761,0.0,0.0,0.030256,0.115516,-6e-06,0.030045,0.0,-0.018251,0
4,s_0,2024-05-05,-6.5e-05,-0.010519,9.651191,0.0,-0.105692,0.548534,0.0,1.482967,0.0,0.921448,0.0,0.0,1.22491,-0.017885,-0.032768,0.040731,0.103038,-3.9e-05,0.023343,0.0,-0.038166,0


In [8]:
df["t_idx"] = df.groupby("code").cumcount().astype(np.int32)

In [123]:
for l in [1,2,3,7]:
    df[f"y_lag_{l}"]=df.groupby("code")["y"].shift(l)
for w in [3,7]:
    df[f"y_roll_mean_{w}"]=df.groupby("code")["y"].shift(1).rolling(w).mean()
    df[f"y_roll_std_{w}"]=df.groupby("code")["y"].shift(1).rolling(w).std()

PERIODS = [7, 30, 90, 365]
for p in PERIODS:
    df[f"sin_{p}"] = np.sin(2*np.pi*df["t_idx"]/p).astype(np.float32)
    df[f"cos_{p}"] = np.cos(2*np.pi*df["t_idx"]/p).astype(np.float32)

df["dow"] = df["date"].dt.dayofweek.astype(np.int16)
df["dom"] = df["date"].dt.day.astype(np.int16)
df["month"] = df["date"].dt.month.astype(np.int16)
df["weekofyear"] = df["date"].dt.isocalendar().week.astype(np.int16)
df["year"] = df["date"].dt.year.astype(np.int16)

df.dropna()
#extra = ["code_id","t_idx"] +         [f"sin_{p}" for p in PERIODS] + [f"cos_{p}" for p in PERIODS]
target_col="y"
feature_cols=[c for c in df.columns if c not in ["code","date",target_col]]

#feature_cols = base_feats + extra
#target_col = "y"

print("Total feature cols:", len(feature_cols))
df[feature_cols + [target_col]].head()


Total feature cols: 43


Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,code_id,t_idx,y_lag_1,y_lag_2,y_lag_3,y_lag_7,y_roll_mean_3,y_roll_std_3,y_roll_mean_7,y_roll_std_7,sin_7,cos_7,sin_30,cos_30,sin_90,cos_90,sin_365,cos_365,dow,dom,month,weekofyear,year,y
0,0.3947,-0.0635,7.5625,0.0,0.048135,0.582772,0.0,1.952015,0.0,0.933833,0.0,0.0,1.452481,0.0,0.0,0.0,0.130791,7e-05,-0.00933,0.0,0,0,,,,,,,,,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,2,1,5,18,2024,-0.046793
1,0.302114,-0.03962,7.5625,0.0,0.003498,0.571028,0.0,1.7189,0.0,0.921778,0.0,0.0,1.322955,0.0,0.0,0.007457,0.132136,3.7e-05,-0.000959,0.0,0,1,-0.046793,,,,,,,,0.781832,0.62349,0.207912,0.978148,0.069756,0.997564,0.017213,0.999852,3,2,5,18,2024,-0.001007
2,0.210442,-0.021979,7.5625,0.0,-0.035704,0.566825,0.0,1.529921,0.0,0.915979,0.0,0.0,1.203602,0.0,0.0,0.017134,0.121744,2.6e-05,0.02407,0.0,0,2,-0.001007,-0.046793,,,,,,,0.974928,-0.222521,0.406737,0.913545,0.139173,0.990268,0.034422,0.999407,4,3,5,18,2024,0.05723
3,0.091796,-0.014817,7.5625,0.0,-0.073599,0.562066,0.0,1.483651,0.0,0.919647,0.0,0.0,1.197761,0.0,0.0,0.030256,0.115516,-6e-06,0.030045,0.0,0,3,0.05723,-0.001007,-0.046793,,0.003143,0.052135,,,0.433884,-0.900969,0.587785,0.809017,0.207912,0.978148,0.05162,0.998667,5,4,5,18,2024,-0.018251
4,-6.5e-05,-0.010519,9.651191,0.0,-0.105692,0.548534,0.0,1.482967,0.0,0.921448,0.0,0.0,1.22491,-0.017885,-0.032768,0.040731,0.103038,-3.9e-05,0.023343,0.0,0,4,-0.018251,0.05723,-0.001007,,0.012657,0.039552,,,-0.433884,-0.900969,0.743145,0.669131,0.275637,0.961262,0.068802,0.99763,6,5,5,18,2024,-0.038166


In [128]:
df_t = df.copy()

In [129]:
df.dropna(inplace=True)

In [130]:
df.isnull().sum()

code             0
date             0
f_0              0
f_1              0
f_2              0
f_3              0
f_4              0
f_5              0
f_6              0
f_7              0
f_8              0
f_9              0
f_10             0
f_11             0
f_12             0
f_13             0
f_14             0
f_15             0
f_16             0
f_17             0
f_18             0
f_19             0
y                0
code_id          0
t_idx            0
y_lag_1          0
y_lag_2          0
y_lag_3          0
y_lag_7          0
y_roll_mean_3    0
y_roll_std_3     0
y_roll_mean_7    0
y_roll_std_7     0
sin_7            0
cos_7            0
sin_30           0
cos_30           0
sin_90           0
cos_90           0
sin_365          0
cos_365          0
dow              0
dom              0
month            0
weekofyear       0
year             0
dtype: int64

In [131]:
# -----------------------------
# Date-based split (time-aware)
# -----------------------------
unique_dates = np.array(sorted(df["date"].unique()))
cut = int(math.floor(len(unique_dates) * 0.8))
cut_date = unique_dates[cut]

train_df = df[df["date"] < cut_date].copy()
valid_df  = df[df["date"] >= cut_date].copy()

print("Cut date:", pd.to_datetime(cut_date).date())
print("Train rows:", len(train_df), "Validation rows:", len(valid_df))

Cut date: 2025-03-17
Train rows: 1877406 Validation rows: 519790


## 5) Feature Selection with Mutual Information

In [132]:
# -----------------------------
# 5) Mutual Information Feature Selection (row-level)
# -----------------------------
X_row = train_df[feature_cols].to_numpy(dtype=np.float32)
y_row = train_df[target_col].to_numpy(dtype=np.float32)

imp_mi = SimpleImputer(strategy="median")
X_row_imp = imp_mi.fit_transform(X_row)

rng = np.random.default_rng(42)
n_sample = min(20000, X_row_imp.shape[0])
idx = rng.choice(X_row_imp.shape[0], size=n_sample, replace=False)

mi = mutual_info_regression(X_row_imp[idx], y_row[idx], random_state=42)
mi_series = pd.Series(mi, index=feature_cols).sort_values(ascending=False)

TOP_K = min(35, len(feature_cols))
selected_features = mi_series.head(TOP_K).index.tolist()
mi_series.head(20)

y_lag_1          0.528938
y_roll_mean_3    0.221922
y_lag_2          0.181559
y_roll_std_7     0.115701
t_idx            0.113193
y_roll_std_3     0.111945
weekofyear       0.078735
y_roll_mean_7    0.070293
sin_365          0.063143
cos_365          0.046158
y_lag_3          0.040837
dom              0.031382
cos_90           0.026320
month            0.022556
sin_90           0.022504
y_lag_7          0.017638
cos_30           0.011952
f_15             0.010469
f_5              0.009342
f_0              0.009058
dtype: float64

In [133]:
y_train = y_row.copy()

In [134]:
y_train.shape

(1877406,)

In [135]:
y_valid = valid_df[target_col].to_numpy(dtype=np.float32)

In [136]:
X_train_row = train_df[selected_features].to_numpy(dtype=np.float32)
X_valid_row  = valid_df[selected_features].to_numpy(dtype=np.float32)

imp = SimpleImputer(strategy="median")
scaler = StandardScaler()

X_train_imp = imp.fit_transform(X_train_row)
X_valid_imp  = imp.transform(X_valid_row)

X_train_sc = scaler.fit_transform(X_train_imp)
X_valid_sc  = scaler.transform(X_valid_imp)

PCA_N = min(20, X_train_sc.shape[1])
pca = PCA(n_components=PCA_N, random_state=42)
Z_train = pca.fit_transform(X_train_sc).astype(np.float32)
Z_valid  = pca.transform(X_valid_sc).astype(np.float32)


K = 8
kmeans = MiniBatchKMeans(n_clusters=K, random_state=42, batch_size=4096, n_init="auto")
kmeans.fit(Z_train)

D_train = kmeans.transform(Z_train).astype(np.float32)  # distances to centers
D_valid  = kmeans.transform(Z_valid).astype(np.float32)

F_train = np.hstack([Z_train, D_train]).astype(np.float32)
F_valid  = np.hstack([Z_valid,  D_valid]).astype(np.float32)

print("Row feature matrix shapes:", F_train.shape, F_valid.shape)

Row feature matrix shapes: (1877406, 28) (519790, 28)


In [137]:
print(F_train.shape)
print(F_valid.shape)

(1877406, 28)
(519790, 28)


In [138]:
print(y_train.shape)
print(y_valid.shape)

(1877406,)
(519790,)


In [139]:
y_tr = y_train.reshape(-1, 1)
y_vl = y_valid.reshape(-1, 1)

In [155]:
df_test = pickle.load(open(r'C:\CB\q3\test.pkl', 'rb')).copy()

In [156]:
df_test.head()

Unnamed: 0,code,date,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,y
2244974,s_4780,550,-0.704179,0.072523,1.990935,0.820878,-0.006093,0.485911,0.040129,0.942753,2.119506,0.93424,0.08147,1.129677,0.949033,-0.058095,0.016745,0.002501,0.080117,-1.9e-05,-0.003881,-0.030448,-0.106038
2031851,s_2097,550,-1.475341,-0.060034,1.737305,0.675257,0.101021,0.2738,0.003186,0.834724,2.751153,0.97695,0.003297,1.146666,0.872422,-0.089398,0.02201,-0.04887,0.016602,-8e-06,0.040727,-0.068891,-0.066604
2433999,s_2157,550,-1.577786,0.074787,1.659307,0.696049,0.076902,0.304876,-0.016647,0.80408,3.097071,0.981946,0.015689,1.223102,0.803156,-0.053105,-0.016663,-0.046717,-0.015627,-5.2e-05,0.026963,0.002812,-0.011905
2402607,s_4105,550,-1.003856,0.033389,1.360022,0.586838,-0.026944,0.450235,0.01949,0.848843,2.781379,0.958969,-0.001349,1.153103,0.940588,-0.052266,-0.01517,0.044854,-0.016701,-1.3e-05,0.038767,-0.031813,0.071741
1952904,s_2387,550,-1.260005,-0.068659,2.414646,0.838523,-0.094169,0.335265,0.001841,0.893063,2.62578,0.92879,0.057546,1.016474,0.897439,-0.092461,0.021719,0.010151,0.006536,-2.8e-05,-0.060134,0.083079,-0.004084


In [157]:
df_test['date'] = pd.to_datetime(df_test['date'], unit='D', origin='2024-01-01')

In [158]:
df_test = df_test.sort_values(["code", "date"]).reset_index(drop=True)

In [159]:
df_test["code_id"] = df_test["code"].astype("category").cat.codes.astype(np.int32)

In [160]:
df_test["t_idx"] = df_test.groupby("code").cumcount().astype(np.int32)

In [162]:
for l in [1,2,3,7]:
    df_test[f"y_lag_{l}"]=df_test.groupby("code")["y"].shift(l)
for w in [3,7]:
    df_test[f"y_roll_mean_{w}"]=df_test.groupby("code")["y"].shift(1).rolling(w).mean()
    df_test[f"y_roll_std_{w}"]=df_test.groupby("code")["y"].shift(1).rolling(w).std()

PERIODS = [7, 30, 90, 365]
for p in PERIODS:
    df_test[f"sin_{p}"] = np.sin(2*np.pi*df_test["t_idx"]/p).astype(np.float32)
    df_test[f"cos_{p}"] = np.cos(2*np.pi*df_test["t_idx"]/p).astype(np.float32)

df_test["dow"] = df_test["date"].dt.dayofweek.astype(np.int16)
df_test["dom"] = df_test["date"].dt.day.astype(np.int16)
df_test["month"] = df_test["date"].dt.month.astype(np.int16)
df_test["weekofyear"] = df_test["date"].dt.isocalendar().week.astype(np.int16)
df_test["year"] = df_test["date"].dt.year.astype(np.int16)

X_test = df_test[selected_features].to_numpy(dtype=np.float32)
y_test = df_test[target_col].to_numpy(dtype=np.float32)
X_test_imp = imp.fit_transform(X_test)
X_test_sc = scaler.transform(X_test_imp)

Z_test  = pca.transform(X_test_sc).astype(np.float32)
D_test =  kmeans.transform(Z_test).astype(np.float32)

F_test = np.hstack([Z_test, D_test]).astype(np.float32)

In [151]:
F_test.shape

(833916, 28)

In [150]:
y_test.shape

(833916,)

In [72]:
print("\n" + "=" * 80)
print("ACTIVATION FUNCTIONS")
print("=" * 80)

class ActivationFunctions:
    """
    Activation functions and their derivatives for DFNN
    """
    
    @staticmethod
    def relu(Z):
        """
        ReLU activation: f(z) = max(0, z)
        
        Args:
            Z: Pre-activation values (any shape)
        Returns:
            Activated values (same shape as Z)
        """
        return np.maximum(0, Z)
    
    @staticmethod
    def relu_derivative(Z):
        """
        ReLU derivative: f'(z) = 1 if z > 0, else 0
        
        Args:
            Z: Pre-activation values (any shape)
        Returns:
            Derivative values (same shape as Z)
        """
        return (Z > 0).astype(float)
    
    @staticmethod
    def identity(Z):
        """
        Identity activation: f(z) = z
        Used for regression output layer
        
        Args:
            Z: Pre-activation values (any shape)
        Returns:
            Same as input
        """
        return Z
    
    @staticmethod
    def identity_derivative(Z):
        """
        Identity derivative: f'(z) = 1
        
        Args:
            Z: Pre-activation values (any shape)
        Returns:
            Ones with same shape as Z
        """
        return np.ones_like(Z)

print("\n✓ Activation functions defined:")
print("  - ReLU for hidden layers: f(z) = max(0, z)")
print("  - Identity for output layer: f(z) = z")


ACTIVATION FUNCTIONS

✓ Activation functions defined:
  - ReLU for hidden layers: f(z) = max(0, z)
  - Identity for output layer: f(z) = z


In [73]:
# ============================================================================
# STEP 4: DEEP FEED-FORWARD NEURAL NETWORK CLASS
# ============================================================================

print("\n" + "=" * 80)
print("DFNN CLASS IMPLEMENTATION")
print("=" * 80)

class DFNN:
    """
    Deep Feed-Forward Neural Network for Regression
    Implements mini-batch SGD with backpropagation
    """
    
    def __init__(self, architecture, learning_rate=0.001, batch_size=64, 
                 epochs=50, patience=10, verbose=True):
        """
        Initialize DFNN
        
        Args:
            architecture: List of layer sizes [input_dim, hidden1, hidden2, ..., output_dim]
            learning_rate: Learning rate η for SGD
            batch_size: Mini-batch size B
            epochs: Number of training epochs
            patience: Early stopping patience
            verbose: Print training progress
        """
        self.architecture = architecture
        self.num_layers = len(architecture) - 1  # Exclude input layer
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.epochs = epochs
        self.patience = patience
        self.verbose = verbose
        
        # Initialize parameters
        self.parameters = {}
        self.initialize_parameters()
        
        # Training history
        self.history = {
            'train_loss': [],
            'val_loss': []
        }
        
        # Activation functions
        self.activation = ActivationFunctions()
        
        if self.verbose:
            print(f"\n✓ DFNN initialized with architecture: {architecture}")
            print(f"  Learning rate: {learning_rate}")
            print(f"  Batch size: {batch_size}")
            print(f"  Epochs: {epochs}")
            print(f"  Early stopping patience: {patience}")
    
    def initialize_parameters(self):
        """
        Initialize weights and biases using He initialization
        
        He initialization: W ~ N(0, sqrt(2/n_in))
        Good for ReLU activations
        """
        for l in range(1, self.num_layers + 1):
            n_in = self.architecture[l - 1]
            n_out = self.architecture[l]
            
            # He initialization for weights
            self.parameters[f'W{l}'] = np.random.randn(n_in, n_out) * np.sqrt(2.0 / n_in)
            
            # Zero initialization for biases
            self.parameters[f'b{l}'] = np.zeros((1, n_out))
        
        if self.verbose:
            total_params = sum(W.size + self.parameters[f'b{l}'].size 
                             for l, W in enumerate([self.parameters[f'W{i}'] 
                             for i in range(1, self.num_layers + 1)], 1))
            print(f"\n✓ Parameters initialized (He initialization)")
            print(f"  Total parameters: {total_params:,}")
    
    def forward_propagation(self, X):
        """
        Forward propagation through the network
        
        For each layer l:
            Z^(l) = H^(l-1) @ W^(l) + b^(l)
            H^(l) = σ^(l)(Z^(l))
        
        Args:
            X: Input data (batch_size, input_dim)
        
        Returns:
            cache: Dictionary containing all Z and H values for backprop
        """
        cache = {}
        H = X
        cache['H0'] = X
        
        # Forward through all layers
        for l in range(1, self.num_layers + 1):
            W = self.parameters[f'W{l}']
            b = self.parameters[f'b{l}']
            
            # Linear transformation
            Z = H @ W + b
            cache[f'Z{l}'] = Z
            
            # Apply activation
            if l < self.num_layers:
                # Hidden layers: ReLU
                H = self.activation.relu(Z)
            else:
                # Output layer: Identity (for regression)
                H = self.activation.identity(Z)
            
            cache[f'H{l}'] = H
        
        return cache
    
    def compute_loss(self, Y_true, Y_pred):
        """
        Compute Mean Squared Error loss
        
        J = (1/2N) * ||Y_pred - Y_true||²
        
        Args:
            Y_true: True labels (batch_size, output_dim)
            Y_pred: Predicted labels (batch_size, output_dim)
        
        Returns:
            loss: Scalar loss value
        """
        batch_size = Y_true.shape[0]
        loss = (1.0 / (2.0 * batch_size)) * np.sum((Y_pred - Y_true) ** 2)
        return loss
    
    def backward_propagation(self, cache, Y_true):
        """
        Backward propagation (backpropagation) to compute gradients
        
        Output layer: δ^(L) = (1/B) * (Ŷ - Y)
        Hidden layers: δ^(l) = (δ^(l+1) @ W^(l+1)ᵀ) ⊙ σ'(Z^(l))
        
        Gradients:
            ∂J/∂W^(l) = (1/B) * H^(l-1)ᵀ @ δ^(l)
            ∂J/∂b^(l) = (1/B) * sum(δ^(l), axis=0)
        
        Args:
            cache: Forward propagation cache
            Y_true: True labels (batch_size, output_dim)
        
        Returns:
            gradients: Dictionary containing all gradients
        """
        gradients = {}
        batch_size = Y_true.shape[0]
        L = self.num_layers
        
        # Output layer gradient (for MSE + Identity)
        Y_pred = cache[f'H{L}']
        delta = (1.0 / batch_size) * (Y_pred - Y_true)
        
        # Backpropagate through all layers
        for l in range(L, 0, -1):
            # Compute parameter gradients
            H_prev = cache[f'H{l-1}']
            gradients[f'dW{l}'] = H_prev.T @ delta
            gradients[f'db{l}'] = np.sum(delta, axis=0, keepdims=True)
            
            # Propagate error to previous layer (if not input layer)
            if l > 1:
                W = self.parameters[f'W{l}']
                Z_prev = cache[f'Z{l-1}']
                
                # δ^(l-1) = (δ^(l) @ W^(l)ᵀ) ⊙ σ'(Z^(l-1))
                delta = (delta @ W.T) * self.activation.relu_derivative(Z_prev)
        
        return gradients
    
    def update_parameters(self, gradients):
        """
        Update parameters using SGD
        
        W^(l) ← W^(l) - η * ∂J/∂W^(l)
        b^(l) ← b^(l) - η * ∂J/∂b^(l)
        
        Args:
            gradients: Dictionary containing all gradients
        """
        for l in range(1, self.num_layers + 1):
            self.parameters[f'W{l}'] -= self.learning_rate * gradients[f'dW{l}']
            self.parameters[f'b{l}'] -= self.learning_rate * gradients[f'db{l}']
    
    def train(self, X_train, y_train, X_val, y_val):
        """
        Train the network using mini-batch SGD
        
        Args:
            X_train: Training features (N_train, input_dim)
            y_train: Training labels (N_train, output_dim)
            X_val: Validation features (N_val, input_dim)
            y_val: Validation labels (N_val, output_dim)
        """
        N_train = X_train.shape[0]
        num_batches = int(np.ceil(N_train / self.batch_size))
        
        best_val_loss = float('inf')
        patience_counter = 0
        
        print("\n" + "=" * 80)
        print("TRAINING DFNN")
        print("=" * 80)
        
        for epoch in range(self.epochs):
            # Shuffle training data
            indices = np.random.permutation(N_train)
            X_train_shuffled = X_train[indices]
            y_train_shuffled = y_train[indices]
            
            epoch_loss = 0.0
            
            # Mini-batch training
            for batch_idx in range(num_batches):
                # Get mini-batch
                start_idx = batch_idx * self.batch_size
                end_idx = min(start_idx + self.batch_size, N_train)
                
                X_batch = X_train_shuffled[start_idx:end_idx]
                y_batch = y_train_shuffled[start_idx:end_idx]
                
                # Forward propagation
                cache = self.forward_propagation(X_batch)
                
                # Compute loss
                Y_pred = cache[f'H{self.num_layers}']
                batch_loss = self.compute_loss(y_batch, Y_pred)
                epoch_loss += batch_loss * (end_idx - start_idx)
                
                # Backward propagation
                gradients = self.backward_propagation(cache, y_batch)
                
                # Update parameters
                self.update_parameters(gradients)
            
            # Average epoch loss
            epoch_loss /= N_train
            
            # Validation loss
            cache_val = self.forward_propagation(X_val)
            Y_val_pred = cache_val[f'H{self.num_layers}']
            val_loss = self.compute_loss(y_val, Y_val_pred)
            
            # Save history
            self.history['train_loss'].append(epoch_loss)
            self.history['val_loss'].append(val_loss)
            
            # Print progress
            if self.verbose and (epoch + 1) % 10 == 0:
                print(f"Epoch {epoch+1:3d}/{self.epochs} | "
                      f"Train Loss: {epoch_loss:.6f} | "
                      f"Val Loss: {val_loss:.6f}")
            
            # Early stopping
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
                # Save best parameters
                self.best_parameters = {k: v.copy() for k, v in self.parameters.items()}
            else:
                patience_counter += 1
            
            if patience_counter >= self.patience:
                print(f"\n✓ Early stopping at epoch {epoch+1}")
                print(f"  Best validation loss: {best_val_loss:.6f}")
                # Restore best parameters
                self.parameters = self.best_parameters
                break
        
        print("\n✓ Training completed!")
    
    def predict(self, X):
        """
        Make predictions on new data
        
        Args:
            X: Input features (N, input_dim)
        
        Returns:
            predictions: Predicted values (N, output_dim)
        """
        cache = self.forward_propagation(X)
        predictions = cache[f'H{self.num_layers}']
        return predictions
    
    def evaluate(self, X, y, dataset_name="Test"):
        """
        Evaluate model performance
        
        Args:
            X: Features
            y: True labels
            dataset_name: Name for printing
        
        Returns:
            metrics: Dictionary of evaluation metrics
        """
        predictions = self.predict(X)
        
        # MSE
        mse = np.mean((predictions - y) ** 2)
        
        # RMSE
        rmse = np.sqrt(mse)
        
        # R² score
        ss_res = np.sum((y - predictions) ** 2)
        ss_tot = np.sum((y - np.mean(y)) ** 2)
        r2 = 1 - (ss_res / ss_tot)
        
        # MAE
        mae = np.mean(np.abs(predictions - y))
        
        metrics = {
            'mse': mse,
            'rmse': rmse,
            'r2': r2,
            'mae': mae
        }
        
        print(f"\n{dataset_name} Set Performance:")
        print(f"  MSE:  {mse:.4f}")
        print(f"  RMSE: {rmse:.4f}")
        print(f"  MAE:  {mae:.4f}")
        print(f"  R²:   {r2:.4f}")
        
        return metrics

print("\n✓ DFNN class implemented successfully!")


DFNN CLASS IMPLEMENTATION

✓ DFNN class implemented successfully!


In [29]:
y_valid.shape[0]

525308

In [140]:
# ============================================================================
# STEP 5: TRAIN THE MODEL
# ============================================================================

print("\n" + "=" * 80)
print("MODEL TRAINING")
print("=" * 80)

# Define architecture
input_dim = F_train.shape[1]
architecture = [input_dim, 64, 32, 16, 1]

print(f"\nNetwork Architecture: {architecture}")
print(f"  Input layer: {architecture[0]} features")
print(f"  Hidden layer 1: {architecture[1]} neurons (ReLU)")
print(f"  Hidden layer 2: {architecture[2]} neurons (ReLU)")
print(f"  Hidden layer 3: {architecture[3]} neurons (ReLU)")
print(f"  Output layer: {architecture[4]} neuron (Identity)")

# Initialize model
model = DFNN(
    architecture=architecture,
    learning_rate=0.005,
    batch_size=64,
    epochs=1000,
    patience=20,
    verbose=True
)

# Train model
model.train(F_train, y_tr, F_valid, y_vl)


MODEL TRAINING

Network Architecture: [28, 64, 32, 16, 1]
  Input layer: 28 features
  Hidden layer 1: 64 neurons (ReLU)
  Hidden layer 2: 32 neurons (ReLU)
  Hidden layer 3: 16 neurons (ReLU)
  Output layer: 1 neuron (Identity)

✓ Parameters initialized (He initialization)
  Total parameters: 4,481

✓ DFNN initialized with architecture: [28, 64, 32, 16, 1]
  Learning rate: 0.005
  Batch size: 64
  Epochs: 1000
  Early stopping patience: 20

TRAINING DFNN
Epoch  10/1000 | Train Loss: 0.001332 | Val Loss: 0.000772
Epoch  20/1000 | Train Loss: 0.001312 | Val Loss: 0.000773
Epoch  30/1000 | Train Loss: 0.001297 | Val Loss: 0.000758
Epoch  40/1000 | Train Loss: 0.001285 | Val Loss: 0.000753
Epoch  50/1000 | Train Loss: 0.001275 | Val Loss: 0.000748
Epoch  60/1000 | Train Loss: 0.001267 | Val Loss: 0.000751
Epoch  70/1000 | Train Loss: 0.001260 | Val Loss: 0.000743
Epoch  80/1000 | Train Loss: 0.001254 | Val Loss: 0.000748
Epoch  90/1000 | Train Loss: 0.001249 | Val Loss: 0.000740
Epoch 10

In [57]:
print("\n" + "=" * 80)
print("MODEL EVALUATION")
print("=" * 80)

# Evaluate on all datasets (on scaled targets)
print("\n--- Performance on Scaled Targets ---")
train_metrics = model.evaluate(F_train, y_tr, "Training")
val_metrics = model.evaluate(F_valid, y_vl, "Validation")
#test_metrics = model.evaluate(F_test, y_test_scaled, "Test")

# Make predictions and inverse transform for actual scale
print("\n--- Performance on Original Scale ---")
y_train_pred_scaled = model.predict(F_train)
y_val_pred_scaled = model.predict(F_valid)
#y_test_pred_scaled = model.predict(X_test_scaled)

# Inverse transform to original scale
#y_train_pred = scaler.inverse_transform(y_train_pred_scaled)
#y_val_pred = scaler.inverse_transform(y_val_pred_scaled)
#y_test_pred = y_scaler.inverse_transform(y_test_pred_scaled)
y_train_pred = y_train_pred_scaled
y_val_pred = y_val_pred_scaled
# Evaluate on original scale
def evaluate_original_scale(y_true, y_pred, dataset_name):
    mse = np.mean((y_pred - y_true) ** 2)
    rmse = np.sqrt(mse)
    mae = np.mean(np.abs(y_pred - y_true))
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    r2 = 1 - (ss_res / ss_tot)
    
    print(f"\n{dataset_name} Set (Original Scale):")
    print(f"  MSE:  {mse:.2f}")
    print(f"  RMSE: {rmse:.2f} bikes/hour")
    print(f"  MAE:  {mae:.2f} bikes/hour")
    print(f"  R²:   {r2:.4f}")
    
    return {'mse': mse, 'rmse': rmse, 'mae': mae, 'r2': r2}

train_metrics_orig = evaluate_original_scale(y_train, y_train_pred, "Training")
val_metrics_orig = evaluate_original_scale(y_valid, y_val_pred, "Validation")
#test_metrics_orig = evaluate_original_scale(y_test, y_test_pred, "Test")


MODEL EVALUATION

--- Performance on Scaled Targets ---

Training Set Performance:
  MSE:  0.0026
  RMSE: 0.0513
  MAE:  0.0362
  R²:   0.3358

Validation Set Performance:
  MSE:  0.0015
  RMSE: 0.0392
  MAE:  0.0273
  R²:   0.3269

--- Performance on Original Scale ---


MemoryError: Unable to allocate 26.4 TiB for an array with shape (1906350, 1906350) and data type float64

In [94]:
print("\n" + "=" * 80)
print("MODEL EVALUATION")
print("=" * 80)

# Evaluate on all datasets (on scaled targets)
print("\n--- Performance on Scaled Targets ---")
train_metrics = model.evaluate(F_train, y_tr, "Training")
val_metrics = model.evaluate(F_valid, y_vl, "Validation")
#test_metrics = model.evaluate(F_test, y_test_scaled, "Test")

# Make predictions and inverse transform for actual scale
print("\n--- Performance on Original Scale ---")
y_train_pred_scaled = model.predict(F_train)
y_val_pred_scaled = model.predict(F_valid)
#y_test_pred_scaled = model.predict(X_test_scaled)

# Inverse transform to original scale
#y_train_pred = scaler.inverse_transform(y_train_pred_scaled)
#y_val_pred = scaler.inverse_transform(y_val_pred_scaled)
#y_test_pred = y_scaler.inverse_transform(y_test_pred_scaled)
y_train_pred = y_train_pred_scaled
y_val_pred = y_val_pred_scaled
# Evaluate on original scale
def evaluate_original_scale(y_true, y_pred, dataset_name):
    mse = np.mean((y_pred - y_true) ** 2)
    rmse = np.sqrt(mse)
    mae = np.mean(np.abs(y_pred - y_true))
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    r2 = 1 - (ss_res / ss_tot)
    
    print(f"\n{dataset_name} Set (Original Scale):")
    print(f"  MSE:  {mse:.2f}")
    print(f"  RMSE: {rmse:.2f} bikes/hour")
    print(f"  MAE:  {mae:.2f} bikes/hour")
    print(f"  R²:   {r2:.4f}")
    
    return {'mse': mse, 'rmse': rmse, 'mae': mae, 'r2': r2}

train_metrics_orig = evaluate_original_scale(y_train, y_train_pred, "Training")
val_metrics_orig = evaluate_original_scale(y_valid, y_val_pred, "Validation")
#test_metrics_orig = evaluate_original_scale(y_test, y_test_pred, "Test")


MODEL EVALUATION

--- Performance on Scaled Targets ---

Training Set Performance:
  MSE:  0.0026
  RMSE: 0.0506
  MAE:  0.0358
  R²:   0.3524

Validation Set Performance:
  MSE:  0.0015
  RMSE: 0.0389
  MAE:  0.0270
  R²:   0.3381

--- Performance on Original Scale ---


MemoryError: Unable to allocate 26.4 TiB for an array with shape (1906350, 1906350) and data type float64

In [164]:
y_test_pred = model.predict(F_test)
#test_metrics = evaluate_original_scale(y_test, y_test_pred, "Testing Metrics")

#y_train_pred = y_train_pred_scaled
#y_val_pred = y_val_pred_scaled


In [168]:
#test_metrics = evaluate_original_scale(y_test, y_test_pred, "Testing Metrics")
mse = mean_squared_error(y_test, y_test_pred)
print(mse)

0.0016068799194225297


In [166]:
print(y_test_pred)

[[-0.00273964]
 [ 0.00811288]
 [ 0.01705129]
 ...
 [ 0.00526618]
 [ 0.00444512]
 [ 0.00639858]]


In [170]:
df_test['y_pred'] = y_test_pred

In [97]:
y_test_pred.shape

(869269, 1)

In [1]:
#y_test_pred = scaler.inverse_transform(y_test_pred_scaled)

In [190]:
df1 = pickle.load(open(r'C:\CB\q3\train.pkl', 'rb')).copy()

In [191]:
df1['date'] = pd.to_datetime(df1['date'], unit='D', origin='2024-01-01')

In [192]:
min_date = df1['date'].min()

In [193]:
min_date

Timestamp('2024-01-01 00:00:00')

In [199]:
time_d = df_test1['date'] - min_date


In [200]:
time_d

0        550 days
1        551 days
2        552 days
3        553 days
4        554 days
           ...   
869264   722 days
869265   723 days
869266   724 days
869267   725 days
869268   726 days
Name: date, Length: 869269, dtype: timedelta64[ns]

In [198]:
df_test1=df_test.copy()

In [201]:
days_numeric = time_d / np.timedelta64(1, 'D')

In [202]:
days_numeric

0         550.0
1         551.0
2         552.0
3         553.0
4         554.0
          ...  
869264    722.0
869265    723.0
869266    724.0
869267    725.0
869268    726.0
Name: date, Length: 869269, dtype: float64

In [205]:
df_test1['date'] = pd.to_numeric(days_numeric, downcast='integer')

In [206]:
df_test1.head()

Unnamed: 0,code,date,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,y,code_id,t_idx,y_lag_1,y_lag_2,y_lag_3,y_lag_7,y_roll_mean_3,y_roll_std_3,y_roll_mean_7,y_roll_std_7,sin_7,cos_7,sin_30,cos_30,sin_90,cos_90,sin_365,cos_365,dow,dom,month,weekofyear,year,y_pred,date1
0,s_0,550,-1.369722,0.039572,1.779249,0.614737,-0.156733,0.283154,0.028497,0.578046,3.074321,0.980909,0.017531,0.993053,0.722489,-0.18344,0.003627,-0.034966,-0.014534,-5.8e-05,0.05935,-0.003217,0.090807,0,0,,,,,,,,,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,4,4,7,27,2025,-0.00274,550
1,s_0,551,-1.382292,0.046213,1.788899,0.588849,-0.113548,0.281461,0.028017,0.556442,3.054377,0.980679,0.006043,1.012733,0.697357,-0.177849,0.007046,-0.046088,-0.01379,-6.7e-05,0.035671,0.004659,0.075164,0,1,0.090807,,,,,,,,0.781832,0.62349,0.207912,0.978148,0.069756,0.997564,0.017213,0.999852,5,5,7,27,2025,0.008113,551
2,s_0,552,-1.400969,0.040787,1.945576,0.591494,-0.072045,0.28301,0.038164,0.582346,3.12178,0.979803,0.003479,1.19446,0.721132,-0.178476,0.009262,-0.038439,-0.011187,-6.4e-05,0.029482,-0.010477,0.033599,0,2,0.075164,0.090807,,,,,,,0.974928,-0.222521,0.406737,0.913545,0.139173,0.990268,0.034422,0.999407,6,6,7,27,2025,0.017051,552
3,s_0,553,-1.374383,0.045059,1.81576,0.600794,-0.066371,0.306641,0.038751,0.678451,3.087187,0.97299,0.005802,1.206392,0.775389,-0.177323,0.006861,-0.058894,-0.011347,-5.5e-05,0.027638,-0.011462,0.020968,0,3,0.033599,0.075164,0.090807,,0.066523,0.029567,,,0.433884,-0.900969,0.587785,0.809017,0.207912,0.978148,0.05162,0.998667,0,7,7,28,2025,0.032811,553
4,s_0,554,-1.382681,0.038729,1.834345,0.60196,-0.119939,0.273871,0.026979,0.726376,3.08161,0.971104,0.001889,1.209588,0.803587,-0.183263,0.003685,-0.045593,-0.006697,-5.4e-05,0.029528,-0.012795,-0.006153,0,4,0.020968,0.033599,0.075164,,0.043244,0.028356,,,-0.433884,-0.900969,0.743145,0.669131,0.275637,0.961262,0.068802,0.99763,1,8,7,28,2025,0.020006,554


In [207]:
out = df_test1[['code','date','y_pred']].copy()

In [208]:
out.head()

Unnamed: 0,code,date,y_pred
0,s_0,550,-0.00274
1,s_0,551,0.008113
2,s_0,552,0.017051
3,s_0,553,0.032811
4,s_0,554,0.020006


In [210]:
out1 = out1.sort_values(["code", "date"]).reset_index(drop=True)
#out["date"] = pd.to_datetime(out["date"]).dt.strftime("%Y-%m-%d")
os.makedirs("outputs", exist_ok=True)
out_path = 'C:\CB\q3\DFNN_model_out.csv'
out.to_csv(out_path, index=False)
print("Saved:", out_path)

Saved: C:\CB\q3\DFNN_model_out.csv


In [121]:
#PF_test=pd.todataframe(F_test)
y_test

array([0.09080688, 0.07516383, 0.03359878, ..., 0.00580495, 0.01965836,
       0.00575009], shape=(869269,), dtype=float32)

In [211]:
#import matplotlib.pyplot as plt
#plt.figure(figsize=(20,16))
#plt.plot(out['y'], out['y_pred'])
#plt.show