In [1]:
# import joblib
import numpy as np
import pandas as pd
# import openpyxl

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib
import matplotlib.pyplot as plt


In [2]:
class CustomOneHotEncoder:
    """Minimal one-hot encoder for 2-D arrays of categorical values.

    Each input column is expanded into one indicator column per category seen
    during ``fit``; the per-column groups are concatenated left to right in
    input-column order. Values that were not seen during ``fit`` are encoded
    as an all-zero group for that column.

    Attributes:
        categories_ (dict): Maps column position -> {category value: offset of
            its indicator inside that column's one-hot group}. Empty until
            ``fit`` has been called.
    """

    def __init__(self):
        self.categories_ = {}

    @staticmethod
    def _as_2d(X):
        """Return ``X`` as a 2-D ndarray, raising ValueError for other ranks."""
        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError(f"Expected a 2-D array, got {X.ndim} dimension(s).")
        return X

    def fit(self, X):
        """Learn the sorted unique values of every column of ``X``.

        Parameters:
            X (array-like, 2-D): Categorical data, one feature per column.

        Returns:
            CustomOneHotEncoder: ``self``, so calls can be chained.
        """
        X = self._as_2d(X)
        # Rebuild the mapping from scratch: updating it in place would keep
        # stale columns around when the encoder is refitted on narrower data.
        self.categories_ = {
            col_idx: {val: idx for idx, val in enumerate(np.unique(X[:, col_idx]))}
            for col_idx in range(X.shape[1])
        }
        return self

    def transform(self, X):
        """One-hot encode ``X`` using the categories learned by ``fit``.

        Parameters:
            X (array-like, 2-D): Data with the same number of columns as the
                data passed to ``fit``.

        Returns:
            numpy.ndarray: Float array of shape (n_rows, total categories).

        Raises:
            ValueError: If the encoder is not fitted, ``X`` is not 2-D, or the
                column count differs from the fitted data.
        """
        if not self.categories_:
            raise ValueError("Encoder has not been fitted yet.")
        X = self._as_2d(X)
        if X.shape[1] != len(self.categories_):
            raise ValueError(
                f"X has {X.shape[1]} column(s) but the encoder was fitted on "
                f"{len(self.categories_)}."
            )

        n_rows = X.shape[0]
        encoded_columns = []
        for col_idx in range(X.shape[1]):
            mapping = self.categories_[col_idx]
            # -1 marks a value unseen during fit; its row stays all zeros.
            codes = np.array(
                [mapping.get(value, -1) for value in X[:, col_idx]], dtype=np.intp
            )
            known = codes >= 0
            encoded_col = np.zeros((n_rows, len(mapping)))
            encoded_col[np.flatnonzero(known), codes[known]] = 1
            encoded_columns.append(encoded_col)
        return np.hstack(encoded_columns)

    def fit_transform(self, X):
        """Fit on ``X`` and return its one-hot encoding in one step."""
        return self.fit(X).transform(X)


In [3]:
class StandardScaler:
    """Standardize features to zero mean and unit (population) variance.

    Statistics are computed column-wise (``axis=0``) with ``np.mean`` and
    ``np.std``. Constant features, whose standard deviation is 0, are centred
    but not rescaled, so ``transform`` never divides by zero.

    Attributes:
        means_: Per-feature mean learned by ``fit`` (``None`` before fitting).
        stds_: Per-feature standard deviation learned by ``fit``
            (``None`` before fitting). Zeros are kept as-is here; they are
            only replaced by 1 when scaling.
    """

    def __init__(self):
        self.means_ = None
        self.stds_ = None

    def fit(self, X):
        """Compute the mean and standard deviation for each feature.

        Returns:
            StandardScaler: ``self``, so calls can be chained.
        """
        self.means_ = np.mean(X, axis=0)
        self.stds_ = np.std(X, axis=0)
        return self

    def _check_fitted(self):
        """Raise ValueError if ``fit`` has not been called yet."""
        if self.means_ is None or self.stds_ is None:
            raise ValueError("Scaler has not been fitted yet.")

    def _scale(self):
        """Return ``stds_`` with zeros replaced by 1 (type-preserving)."""
        # (stds_ == 0) is 1 exactly where the std is 0, so the sum turns a zero
        # divisor into 1 and leaves every other entry unchanged.
        return self.stds_ + (self.stds_ == 0)

    def transform(self, X):
        """Standardize features by subtracting the mean and dividing by the standard deviation.

        Raises:
            ValueError: If the scaler has not been fitted.
        """
        self._check_fitted()
        return (X - self.means_) / self._scale()

    def fit_transform(self, X):
        """Fit to data, then transform it."""
        return self.fit(X).transform(X)

    def inverse_transform(self, X_scaled):
        """Revert the standardization (return to original scale).

        Raises:
            ValueError: If the scaler has not been fitted.
        """
        self._check_fitted()
        return (X_scaled * self._scale()) + self.means_


In [4]:
import numpy as np
import pandas as pd

def custom_train_test_split_old(X, y, test_size=0.2, random_state=None, stratify=None):
    """
    Split arrays or DataFrame into random train and test subsets with optional stratification.

    Rows are always selected by position, so pandas inputs with arbitrary
    (non-default) index labels are split correctly; the returned pandas
    objects keep the index labels of the rows they were taken from.

    Parameters:
        X (array-like or DataFrame): Features data.
        y (array-like or Series): Target data.
        test_size (float): Proportion of the dataset to include in the test split.
        random_state (int): Random seed for reproducibility. When given, the
            global NumPy RNG is seeded with it.
        stratify (array-like or Series): Column for stratified splitting. The
            split is done per class, and the outputs list the rows class by
            class (in sorted class order).

    Returns:
        X_train, X_test, y_train, y_test: Split data with same type as input
        (DataFrame/Series stay pandas objects, anything else becomes ndarray).
    """
    is_X_dataframe = isinstance(X, pd.DataFrame)
    is_y_series = isinstance(y, pd.Series)

    if not is_X_dataframe:
        X = np.array(X)
    if not is_y_series:
        y = np.array(y)

    # Random seed
    if random_state is not None:
        np.random.seed(random_state)

    if stratify is not None:
        # class_codes[i] is the position of row i's class in unique_classes.
        unique_classes, class_codes = np.unique(np.asarray(stratify), return_inverse=True)
        train_indices = []
        test_indices = []
        for class_code in range(len(unique_classes)):
            class_rows = np.where(class_codes == class_code)[0]
            np.random.shuffle(class_rows)

            # Truncating keeps at least the train share for small classes.
            test_size_class = int(len(class_rows) * test_size)
            test_indices.extend(class_rows[:test_size_class])
            train_indices.extend(class_rows[test_size_class:])
    else:
        indices = np.arange(len(X))
        np.random.shuffle(indices)

        test_size_count = int(len(X) * test_size)
        test_indices = indices[:test_size_count]
        train_indices = indices[test_size_count:]

    # Integer arrays (never float, even when empty) are valid positional indexers.
    train_indices = np.asarray(train_indices, dtype=np.intp)
    test_indices = np.asarray(test_indices, dtype=np.intp)

    # Plain `obj[indices]` on a DataFrame selects columns by label and on a
    # Series looks labels up; `.iloc` selects rows by position for both.
    if is_X_dataframe:
        X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
    else:
        X_train, X_test = X[train_indices], X[test_indices]

    if is_y_series:
        y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
    else:
        y_train, y_test = y[train_indices], y[test_indices]

    return X_train, X_test, y_train, y_test

def custom_train_test_split(X, y, test_size=0.2, random_state=None, stratify=None):
    """
    Split arrays or DataFrame into random train and test subsets with optional stratification.

    pandas inputs are re-indexed to 0..n-1 before splitting, so the index of
    each returned DataFrame/Series holds the row's position in the input
    rather than its original label.

    Parameters:
        X (DataFrame or array-like): Features data.
        y (Series or array-like): Target data; must have as many rows as X.
        test_size (float): Proportion of the dataset to include in the test
            split, between 0 and 1.
        random_state (int): Random seed for reproducibility. When given, the
            global NumPy RNG is seeded with it.
        stratify (array-like or Series): Column for stratified splitting; must
            have as many entries as X. The split is done per class, and the
            outputs list the rows class by class (in sorted class order).

    Returns:
        X_train, X_test, y_train, y_test: Split data, retaining original data types.

    Raises:
        ValueError: If test_size is outside [0, 1] or the inputs differ in length.
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}.")

    # Check if inputs are DataFrame/Series
    is_X_dataframe = isinstance(X, pd.DataFrame)
    is_y_series = isinstance(y, pd.Series)

    # Positional 0..n-1 index so that positions and labels coincide below.
    X_reset = X.reset_index(drop=True) if is_X_dataframe else np.array(X)
    y_reset = y.reset_index(drop=True) if is_y_series else np.array(y)

    n_samples = len(X_reset)
    if len(y_reset) != n_samples:
        raise ValueError(
            f"X and y have inconsistent lengths: {n_samples} vs {len(y_reset)}."
        )

    # Set random seed for reproducibility
    if random_state is not None:
        np.random.seed(random_state)

    if stratify is not None:
        stratify_values = np.asarray(stratify)
        if len(stratify_values) != n_samples:
            # A shorter stratify column would silently drop the trailing rows.
            raise ValueError(
                f"stratify has {len(stratify_values)} entries but X has {n_samples} rows."
            )

        # class_codes[i] is the position of row i's class in unique_classes.
        unique_classes, class_codes = np.unique(stratify_values, return_inverse=True)
        train_indices = []
        test_indices = []
        for class_code in range(len(unique_classes)):
            class_rows = np.where(class_codes == class_code)[0]
            np.random.shuffle(class_rows)

            test_size_class = int(len(class_rows) * test_size)
            test_indices.extend(class_rows[:test_size_class])
            train_indices.extend(class_rows[test_size_class:])
    else:
        # Random split without stratification
        indices = np.arange(n_samples)
        np.random.shuffle(indices)

        test_size_count = int(n_samples * test_size)
        test_indices = indices[:test_size_count]
        train_indices = indices[test_size_count:]

    # Integer arrays (never float, even when empty) are valid positional indexers.
    train_indices = np.asarray(train_indices, dtype=np.intp)
    test_indices = np.asarray(test_indices, dtype=np.intp)

    # Use iloc to split DataFrame/Series
    if is_X_dataframe:
        X_train, X_test = X_reset.iloc[train_indices], X_reset.iloc[test_indices]
    else:
        X_train, X_test = X_reset[train_indices], X_reset[test_indices]

    if is_y_series:
        y_train, y_test = y_reset.iloc[train_indices], y_reset.iloc[test_indices]
    else:
        y_train, y_test = y_reset[train_indices], y_reset[test_indices]

    return X_train, X_test, y_train, y_test




In [5]:
# Load the 'Raw Data' sheet of the workbook into a DataFrame.
df = pd.read_excel(
    'StockX-Data-Contest-2019-3.xlsx',
    sheet_name='Raw Data',
    skiprows=0,
)
# Displayed as the cell output: (rows, columns).
df.shape

(99956, 8)

In [6]:
# Remove duplicated rows.
# `duplicated()` flags every repeat of an earlier row; summing the boolean
# mask counts them without a Python-level loop over all rows.
print(f"Total number of duplicasted rows are: {df.duplicated().sum()}")

# keep=False marks every member of a duplicate group, including the first.
duplicated_rows = df[df.duplicated(keep=False)]
print("==="*30)

# One line per distinct duplicated row together with how often it occurs.
# NOTE(review): groupby drops keys containing NaN by default, so duplicated
# rows with missing values would not show up in this summary — confirm.
duplicates_summary = duplicated_rows.groupby(list(df.columns)).size().reset_index(name='Count')
duplicates_summary = duplicates_summary[duplicates_summary['Count'] > 1]
print(duplicates_summary)

# Keep the first occurrence of each row (the drop_duplicates default).
df_cleaned = df.drop_duplicates()
print("==="*30)
print(f"Number of rows before clean: {df.shape[0]}")
print(f"Number of rows after clean : {df_cleaned.shape[0]}")
print(f"Number of columns: {df_cleaned.shape[1]}")

Total number of duplicasted rows are: 2840
     Order Date      Brand                           Sneaker Name  Sale Price  \
0    2017-10-13      Yeezy  Adidas-Yeezy-Boost-350-V2-Cream-White       425.0   
1    2017-10-13      Yeezy        Adidas-Yeezy-Boost-350-V2-Zebra       560.0   
2    2017-11-09  Off-White              Nike-Blazer-Mid-Off-White       550.0   
3    2017-11-10      Yeezy  Adidas-Yeezy-Boost-350-V2-Cream-White       460.0   
4    2017-11-11  Off-White              Nike-Air-Presto-Off-White       975.0   
...         ...        ...                                    ...         ...   
2413 2019-02-13      Yeezy       adidas-Yeezy-Boost-350-V2-Static       330.0   
2414 2019-02-13      Yeezy       adidas-Yeezy-Boost-350-V2-Static       330.0   
2415 2019-02-13      Yeezy       adidas-Yeezy-Boost-350-V2-Static       350.0   
2416 2019-02-13  Off-White        Nike-Air-Max-90-Off-White-Black       565.0   
2417 2019-02-13  Off-White   Nike-Air-Max-90-Off-White-Desert-Ore 

In [7]:
# Hold out 20% of the de-duplicated rows as the test set, stratified by
# 'Brand'; 'Sale Price' is the target and every other column is a feature.
X_train_val, X_test, y_train_val, y_test = custom_train_test_split(
    df_cleaned.drop(columns=['Sale Price']),
    df_cleaned['Sale Price'],
    test_size=0.2,
    random_state=42,
    stratify=df_cleaned['Brand'],
)

In [8]:
# Sanity check: number of target values left in the combined train+validation split.
print(y_train_val.shape)

(77694,)


In [9]:
# Show the shape and Python type of the target column and of the feature
# frame that are passed to the split function.
print(df_cleaned['Sale Price'].shape, type(df_cleaned['Sale Price']))
print(df_cleaned.drop(columns=['Sale Price']).shape, type(df_cleaned.drop(columns=['Sale Price'])))

(97116,) <class 'pandas.core.series.Series'>
(97116, 7) <class 'pandas.core.frame.DataFrame'>


In [10]:
# Carve the validation set out of the train+validation pool, again stratified
# by 'Brand'. Taking 25% of that pool (80% of the data after the 20% test
# hold-out) makes validation roughly 20% of all rows.
X_train, X_val, y_train, y_val = custom_train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.25,
    random_state=42,
    stratify=X_train_val['Brand'],
)


In [11]:
# Columns that are one-hot encoded.
categorical_features = ['Order Date', 'Brand', 'Sneaker Name', 'Release Date', 'Buyer Region']

# Learn the categories from the training split only, then encode all three
# splits with that same fitted encoder.
encoder = CustomOneHotEncoder()
encoder.fit(X_train[categorical_features].to_numpy())
X_train_cat, X_val_cat, X_test_cat = (
    encoder.transform(split[categorical_features].to_numpy())
    for split in (X_train, X_val, X_test)
)

In [12]:
# Columns that are standardized.
numerical_features = ['Retail Price', 'Shoe Size']

# Mean and standard deviation come from the training split only; the same
# statistics are then applied to every split.
scaler = StandardScaler()
scaler.fit(X_train[numerical_features])
X_train_num, X_val_num, X_test_num = (
    scaler.transform(split[numerical_features])
    for split in (X_train, X_val, X_test)
)

In [13]:
# Final feature matrix per split: scaled numerical columns first, followed by
# the one-hot encoded categorical columns.
X_train_processed, X_val_processed, X_test_processed = (
    np.hstack([num_part, cat_part])
    for num_part, cat_part in (
        (X_train_num, X_train_cat),
        (X_val_num, X_val_cat),
        (X_test_num, X_test_cat),
    )
)

In [14]:
# Feature matrices as float32 tensors.
X_train_tensor, X_val_tensor, X_test_tensor = (
    torch.tensor(features, dtype=torch.float32)
    for features in (X_train_processed, X_val_processed, X_test_processed)
)

# Targets as float32 column vectors of shape (n_rows, 1).
y_train_tensor, y_val_tensor, y_test_tensor = (
    torch.tensor(target.values, dtype=torch.float32).view(-1, 1)
    for target in (y_train, y_val, y_test)
)

In [15]:
# Persist the processed splits, once as tensors and once as NumPy arrays.
import os

data_folder_fp = "data_folder"
os.makedirs(data_folder_fp, exist_ok=True)
tensor_data_fp = os.path.join(data_folder_fp, "pytorch_data.pt")
numpy_data_fp = os.path.join(data_folder_fp, "numpy_data.pt")

# Both files hold a 6-tuple ordered (X_train, X_val, X_test, y_train, y_val, y_test).
torch.save(
    (
        X_train_tensor, X_val_tensor, X_test_tensor,
        y_train_tensor, y_val_tensor, y_test_tensor,
    ),
    tensor_data_fp,
)
torch.save(
    (
        X_train_processed, X_val_processed, X_test_processed,
        *(target.to_numpy() for target in (y_train, y_val, y_test)),
    ),
    numpy_data_fp,
)

In [16]:
# Number of training targets, followed by a preview of the series itself
# (the bare last expression is rendered as the cell output).
print(y_train.size)
y_train

58271


41977    260.0
45598    480.0
4201     439.0
34155    350.0
3470     241.0
         ...  
69806    451.0
65122    821.0
76747    695.0
69422    707.0
76792    587.0
Name: Sale Price, Length: 58271, dtype: float64

In [17]:
# Save the training feature frame as returned by the split, i.e. before the
# one-hot encoding and scaling steps were applied to it.
xtrain_data_fp = os.path.join(data_folder_fp, "xtrain.pt")
torch.save(X_train, xtrain_data_fp)

In [18]:
# Save the fitted encoder and scaler together as an (encoder, scaler) tuple.
# NOTE(review): these are instances of classes defined in this notebook, so
# the loading code presumably needs the same class definitions available —
# confirm against wherever this file is read back.
encoder_scaler_fp = os.path.join(data_folder_fp, "encoder_scaler.pt")
torch.save((encoder, scaler), encoder_scaler_fp)