In [79]:
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder, StandardScaler
# from sklearn.model_selection import train_test_split
# import joblib
# from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd
import seaborn as sns
# import openpyxl
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import OneHotEncoder, StandardScaler
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.ensemble import RandomForestRegressor
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

In [80]:
class DecisionTreeNode:
    """One node of a binary regression tree.

    A freshly created node has every field set to None. The tree builder
    fills in either the split rule plus both children (internal node) or
    only ``value`` (leaf node).
    """

    def __init__(self):
        # Split rule of an internal node: column index and cut-off value.
        self.feature_index = self.threshold = None
        # Child nodes (left, right); both stay None on a leaf.
        self.left = self.right = None
        # Prediction value; stays None on an internal node.
        self.value = None

In [81]:
def mse(y):
    """Mean squared deviation of ``y`` from its own mean.

    This is the error of predicting the mean of ``y`` for every element,
    i.e. the population variance of ``y``.
    """
    centered = y - np.mean(y)
    return np.mean(np.square(centered))

def mse_pred_gt(y_true, y_pred):
    """Mean squared error between ground-truth values and predictions."""
    residuals = y_true - y_pred
    return np.mean(np.square(residuals))
    
def split_data(X, y, feature_index, threshold):
    """Partition (X, y) by comparing one feature column against a threshold.

    Rows whose value in column ``feature_index`` is <= ``threshold`` form the
    left part; all remaining rows form the right part.

    Returns:
        tuple: (X_left, X_right, y_left, y_right)
    """
    goes_left = X[:, feature_index] <= threshold
    X_left, y_left = X[goes_left], y[goes_left]
    X_right, y_right = X[~goes_left], y[~goes_left]
    return X_left, X_right, y_left, y_right

def best_split(X, y):
    """Find the feature and threshold whose split minimizes the weighted child MSE.

    Every distinct value of every feature column is a candidate threshold.
    Rows with ``feature <= threshold`` go left, the rest go right, and the
    candidate is scored as
    ``(n_left * mse(y_left) + n_right * mse(y_right)) / n``.

    Only ``y`` is partitioned while scoring: the score never looks at the
    rows of ``X``, so copying both halves of ``X`` for every candidate (as a
    call to ``split_data`` would) is pure overhead.

    Parameters:
        X (numpy.ndarray): 2-D feature matrix, one row per sample.
        y (numpy.ndarray): 1-D target array aligned with the rows of ``X``.

    Returns:
        tuple: ``(feature_index, threshold)`` of the first candidate with the
        lowest score, or ``(None, None)`` when no candidate leaves both
        children non-empty.
    """
    best_feature, best_threshold, best_mse = None, None, float('inf')
    n_samples = len(y)
    for feature_index in range(X.shape[1]):
        column = X[:, feature_index]
        # np.unique returns the values sorted. The last one can never give a
        # valid split: the maximum sends every comparable row left, and a NaN
        # sends no row left. Skipping it does not change the result.
        for threshold in np.unique(column)[:-1]:
            left_mask = column <= threshold
            y_left, y_right = y[left_mask], y[~left_mask]
            if len(y_left) == 0 or len(y_right) == 0:
                continue
            mse_split = (len(y_left) * mse(y_left) + len(y_right) * mse(y_right)) / n_samples
            # Strict "<" keeps the first best candidate on ties.
            if mse_split < best_mse:
                best_feature, best_threshold, best_mse = feature_index, threshold, mse_split
    return best_feature, best_threshold

In [82]:
def build_tree(X, y, depth=0, max_depth=5):
    """Recursively grow a regression tree on (X, y) and return its root node.

    A node becomes a leaf predicting ``np.mean(y)`` when the depth limit is
    reached, when all targets are identical, or when ``best_split`` finds no
    usable split. Otherwise the node stores the chosen split and both
    children are grown one level deeper.
    """
    node = DecisionTreeNode()

    # Search for a split only below the depth limit and with mixed targets.
    split_feature = None
    if depth != max_depth and len(np.unique(y)) != 1:
        split_feature, split_threshold = best_split(X, y)

    if split_feature is None:
        # Leaf: predict the mean target of the samples that reached this node.
        node.value = np.mean(y)
        return node

    node.feature_index, node.threshold = split_feature, split_threshold
    X_left, X_right, y_left, y_right = split_data(X, y, split_feature, split_threshold)
    child_depth = depth + 1
    node.left = build_tree(X_left, y_left, child_depth, max_depth)
    node.right = build_tree(X_right, y_right, child_depth, max_depth)
    return node

In [83]:
def predict_tree(node, X):
    """Predict with a single decision tree.

    ``X`` is either one sample (1-D array) or a batch of samples (one per
    row). A batch is predicted row by row and returned as a NumPy array.
    """
    if X.ndim != 1:
        # Batch: predict every row on its own and stack the results.
        return np.array([predict_tree(node, row) for row in X])

    # Single sample: walk down from the given node until a leaf is reached.
    current = node
    while current.value is None:
        if X[current.feature_index] <= current.threshold:
            current = current.left
        else:
            current = current.right
    return current.value


In [84]:
class RandomForest:
    """Bagged ensemble of regression trees grown with ``build_tree``.

    Every tree is trained on its own bootstrap sample (rows drawn with
    replacement) and the forest prediction is the unweighted mean of the
    per-tree predictions.

    Parameters:
        n_trees (int): Number of trees to grow.
        max_depth (int): Depth limit forwarded to ``build_tree``.
        max_features: Intended number of random features per split.
            NOTE(review): the value is stored but never forwarded to the
            tree builder, so every split currently considers all features.
        sample_fraction (float): Fraction of the training rows drawn for each
            tree. The default 0.02 is the value that used to be hard-coded in
            ``fit``; at least one row is always drawn.
        random_state (int or None): Seed for the bootstrap draws. ``None``
            keeps using NumPy's global random state.
    """

    # Class-level fallbacks: instances restored from a pickle written before
    # these options existed have no such instance attributes.
    sample_fraction = 0.02
    random_state = None

    def __init__(self, n_trees=10, max_depth=5, max_features=None,
                 sample_fraction=0.02, random_state=None):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.max_features = max_features  # Stored only; see the class docstring
        self.sample_fraction = sample_fraction
        self.random_state = random_state
        self.trees = []

    def fit(self, X, y):
        """Train the random forest from scratch on (X, y).

        Trees from an earlier ``fit`` call are discarded, so refitting never
        mixes old and new trees.

        Raises:
            ValueError: If ``X`` is empty or ``X`` and ``y`` differ in length.
        """
        n_rows = len(X)
        if n_rows == 0:
            raise ValueError("Cannot fit RandomForest on an empty dataset.")
        if len(y) != n_rows:
            raise ValueError(
                f"X and y have inconsistent lengths: {n_rows} != {len(y)}."
            )

        # The np.random module and a RandomState instance share the `choice` API.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        # Never draw an empty sample: a tree built on zero rows would predict NaN.
        sample_size = max(1, int(n_rows * self.sample_fraction))

        trees = []
        for _ in range(self.n_trees):
            # Bootstrap sample: sample row indices with replacement.
            indices = rng.choice(n_rows, sample_size, replace=True)
            trees.append(build_tree(X[indices], y[indices], max_depth=self.max_depth))
        # Swap in the new trees only after all of them were built successfully.
        self.trees = trees

    def predict(self, X):
        """Predict by averaging the predictions of all trees.

        Raises:
            ValueError: If the forest holds no trees (``fit`` was not called).
        """
        if not self.trees:
            raise ValueError("RandomForest has no trees; call fit() before predict().")
        tree_preds = np.array([predict_tree(tree, X) for tree in self.trees])
        return np.mean(tree_preds, axis=0)


In [85]:
# Load the pre-processed train / validation / test splits.
import os
data_folder_fp = "data_folder"
raw_data_fp = os.path.join(data_folder_fp, "numpy_data.pt")
# The file is expected to hold a 6-tuple in exactly this order.
# SECURITY: weights_only=False lets torch.load unpickle arbitrary Python
# objects, so only load files that come from a trusted source.
(X_train, X_val, X_test, y_train, y_val, y_test) = torch.load(raw_data_fp, weights_only=False)

In [86]:
# Sanity check of the training split: array shapes, then container types.
print(X_train.shape, y_train.shape)
print(type(X_train))
print(type(y_train))

(58271, 671) (58271,)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [87]:
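# X_train and y_train are already NumPy arrays (see the type check above), so the
# pandas -> NumPy conversion below is disabled; re-enable it only if the data is
# loaded as a DataFrame / Series instead.
# X_train = X_train.to_numpy()  # Convert pandas DataFrame to NumPy array
# y_train = y_train.to_numpy()  # Convert pandas Series to NumPy array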

In [88]:
# Untrained forest with 10 trees and depth limit 5.
# NOTE(review): the later visible cells do not appear to use this instance
# (train_rt builds its own models) — confirm before relying on it.
rf = RandomForest(n_trees=10, max_depth=5)

In [89]:
import matplotlib.pyplot as plt
def visualize_heatmap(performance, n_trees_list, max_depth_list, image_folder, title="Validation MSE", filename="rf_performance_heatmap.jpg"):
    """Draw a hyperparameter grid as an annotated heatmap, save it and show it.

    Parameters:
        performance: 2-D grid of scores; rows follow ``max_depth_list`` and
            columns follow ``n_trees_list``.
        n_trees_list: Tree counts used as column labels.
        max_depth_list: Depth limits used as row labels.
        image_folder: Folder the image is written to.
        title: Metric name shown in the plot title and on the colour bar.
        filename: File name of the saved image inside ``image_folder``.
    """
    row_labels = [f"max_depth={d}" for d in max_depth_list]
    column_labels = [f"n_trees={t}" for t in n_trees_list]
    performance_df = pd.DataFrame(performance, index=row_labels, columns=column_labels)

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(
        performance_df,
        annot=True,
        fmt=".4f",
        cmap="viridis",
        cbar_kws={"label": title},
        ax=ax,
    )
    ax.set_title(f"Random Forest Hyperparameter Tuning: {title}")
    ax.set_xlabel("Number of Trees")
    ax.set_ylabel("Maximum Depth")

    # Write the figure to disk before displaying it.
    fig.savefig(os.path.join(image_folder, filename))
    plt.show()

In [92]:
def validate_rf(rf, X_val, y_val):
    """Return the mean squared error of ``rf``'s predictions on (X_val, y_val)."""
    return mse_pred_gt(y_val, rf.predict(X_val))


def train_rt(X_train,
             y_train,
             X_val,
             y_val,
             X_test,
             y_test,
             n_trees_list = [10, 50, 100],
             max_depth_list =[15, 20],
             num_epochs=1,
             model_folder="models/random_forest",
             image_folder="images/random_forest"):
    """Grid-search a RandomForest over ``n_trees`` x ``max_depth``.

    For every combination a fresh forest is fitted on the training split,
    scored (MSE) on the validation and test splits, saved to
    ``model_folder`` as ``rf_{n_trees}_{max_depth}.pth`` and reported with a
    progress line. Both score grids are finally drawn as heatmaps into
    ``image_folder``.

    ``num_epochs`` is accepted but not used by this function. The list
    defaults are only read, never modified.

    Returns:
        tuple: ``(performance, test_performance)`` — the validation and test
        MSE grids, rows following ``max_depth_list`` and columns following
        ``n_trees_list``.
    """
    for folder in (model_folder, image_folder):
        os.makedirs(folder, exist_ok=True)

    # One cell per (max_depth, n_trees) combination.
    grid_shape = (len(max_depth_list), len(n_trees_list))
    performance = np.zeros(grid_shape)
    test_performance = np.zeros(grid_shape)

    for i, max_depth in enumerate(max_depth_list):
        for j, n_trees in enumerate(n_trees_list):
            # Fit a fresh model for this configuration.
            rf = RandomForest(n_trees=n_trees, max_depth=max_depth)
            rf.fit(X_train, y_train)

            # Score it on the validation split, then on the test split.
            val_loss = validate_rf(rf, X_val, y_val)
            performance[i, j] = val_loss
            test_loss = validate_rf(rf, X_test, y_test)
            test_performance[i, j] = test_loss

            # Persist the fitted model of this configuration.
            model_fp = os.path.join(model_folder, f"rf_{n_trees}_{max_depth}.pth")
            torch.save(rf, model_fp)

            print(
                f"n_trees={n_trees}, max_depth={max_depth}: Val MSE = {val_loss:.4f}, Test MSE = {test_loss:.4f}"
            )

    # One heatmap per score grid.
    visualize_heatmap(performance, n_trees_list, max_depth_list, image_folder,
                      title="Validation MSE", filename="rf_validation_performance_heatmap.jpg")
    visualize_heatmap(test_performance, n_trees_list, max_depth_list, image_folder,
                      title="Test MSE", filename="rf_test_performance_heatmap.jpg")

    return performance, test_performance

In [93]:
# Make sure the output roots exist before training starts.
os.makedirs("models", exist_ok=True)
os.makedirs("images", exist_ok=True)

model_folder="models/random_forest"
image_folder="images/random_forest"
import time
# Wall-clock timing of the whole grid search.
start_time = time.time()
# NOTE: train_rt returns (validation grid, test grid), so `train_performance`
# holds the validation MSEs, not training-set scores.
train_performance, test_performance= train_rt(X_train, y_train, 
             X_val, 
             y_val,
             X_test,
             y_test,
             model_folder=model_folder,
             image_folder=image_folder)
duration = time.time() - start_time  # elapsed seconds
print(f"Training duration = {duration}")

n_trees=10, max_depth=15: Val MSE = 11787.6729, Test MSE = 10645.1258
n_trees=50, max_depth=15: Val MSE = 11407.8913, Test MSE = 10081.1329


KeyboardInterrupt: 

In [94]:
def demo_rf_price(rf_model_fp, data_sample, categorical_features, numerical_features, encoder, scaler):
    """
    Predict the sale price of new sample data with a saved Random Forest model.

    Parameters:
        rf_model_fp (str): Path of a model file that torch.load can read
            (e.g. one written by torch.save during training).
        data_sample (dict): New sample data, mapping column name to a list of
            values (one entry per row); it is passed to pd.DataFrame.
        categorical_features (list): List of categorical feature names.
        numerical_features (list): List of numerical feature names.
        encoder: Fitted one-hot encoder for the categorical features; only
            its ``transform`` method is used.
        scaler: Fitted scaler for the numerical features; only its
            ``transform`` method is used.

    Returns:
        The model's predictions for all rows of ``data_sample``. The first
        one is also printed.
    """
    # Convert the sample data to a DataFrame
    data_sample_df = pd.DataFrame(data_sample)

    # Apply one-hot encoding to categorical columns
    new_data_cat = encoder.transform(data_sample_df[categorical_features])

    # Scale numerical columns
    new_data_num = scaler.transform(data_sample_df[numerical_features])

    # Combine the features: numerical columns first, then categorical ones.
    # NOTE(review): this order presumably has to match the column order used
    # when the training data was built — confirm against the preprocessing code.
    new_data_processed = np.hstack([new_data_num, new_data_cat])

    # SECURITY: weights_only=False unpickles arbitrary Python objects, so
    # only load model files that come from a trusted source.
    rf_model = torch.load(rf_model_fp, weights_only=False)

    # Predict sale price using the Random Forest model
    predicted_price = rf_model.predict(new_data_processed)

    # Print the predicted sale price of the first row
    print(f"Predicted Sale Price: {predicted_price[0]:.2f}")

    # Hand the predictions back so callers can use them programmatically.
    return predicted_price


In [95]:
# Model file used for the demo prediction.
# NOTE(review): the training run shown above was interrupted before the
# n_trees=100 / max_depth=20 configuration was reached, so this file presumably
# comes from an earlier run — confirm that it exists and is up to date.
best_model_fp = "models/random_forest/rf_100_20.pth"
# best_model_fp = "training_mae_2000.pt"
# Column groups handed to the encoder (categorical) and the scaler (numerical).
categorical_features = ['Order Date', 'Brand', 'Sneaker Name', 'Release Date', 'Buyer Region']
numerical_features = ['Retail Price', 'Shoe Size']

# One example row; every value is wrapped in a list because the dict is
# turned into a DataFrame by demo_rf_price.
new_data = {
    'Order Date': ['2022-01-01'],
    'Brand': ['Yeezy'],
    'Sneaker Name': ['Adidas-Yeezy-Boost-350-V2-Core-Black-Red'],
    'Retail Price': [220],
    'Release Date': ['2018-02-11'],
    'Shoe Size': [11.0],
    'Buyer Region': ['California']
}
# The imported names are not referenced directly in this cell.
# NOTE(review): presumably the import is needed so that torch.load can
# unpickle the saved encoder / scaler objects — confirm.
from utils import CustomOneHotEncoder, StandardScaler

encoder_scaler_fp = os.path.join(data_folder_fp, "encoder_scaler.pt")
# SECURITY: weights_only=False unpickles arbitrary Python objects; only load
# files that come from a trusted source.
encoder, scaler = torch.load(encoder_scaler_fp, weights_only=False)
demo_rf_price(best_model_fp, new_data, categorical_features, numerical_features, encoder, scaler)

Predicted Sale Price: 583.12
