In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import tensorflow as tf
import math

In [None]:
# importing data
DATA_PATH = "/content/Aus_grocery_synthetic_dataset2.csv"
data = pd.read_csv(DATA_PATH)

# Fill missing prices with the mean price of the same item (same Sku).
# Assign the result back instead of calling fillna(..., inplace=True) on the
# selected column: the in-place form is a chained assignment that does not
# update `data` once pandas Copy-on-Write is enabled.
mean_prices = data.groupby('Sku')['unit_price_x'].transform('mean')
data['unit_price_x'] = data['unit_price_x'].fillna(mean_prices)

# Forward fill the remaining missing values. DataFrame.ffill() replaces the
# deprecated fillna(method='ffill'). Values missing in the very first rows have
# nothing above them to copy from and therefore stay missing.
data = data.ffill()

  data.fillna(method='ffill', inplace=True) #forward fill remaining missing values


In [None]:
def preprocess(data):
  """Turn the raw grocery frame into a numeric feature frame.

  Steps, in order: drop ``Product_Name``, expand ``RunDate`` into calendar
  columns, add three lagged copies of ``unit_price_x`` (scaled with
  ``RobustScaler``), then one-hot encode the categorical and calendar columns.

  Args:
    data: DataFrame holding the columns ``Product_Name``, ``RunDate``
      (strings in ``%m/%d/%Y`` format), ``unit_price_x``, ``Category``,
      ``Sub_category``, ``Product_Group``, ``Brand`` and ``Sku``.

  Returns:
    A new DataFrame. The frame passed in is not modified, so the function can
    be called again on the same raw frame.

  Raises:
    KeyError: if one of the columns listed above is missing.
  """
  # Parse the dates first; the raw column itself is dropped right below.
  run_date = pd.to_datetime(data['RunDate'], format='%m/%d/%Y')

  # Product_Name is dropped because it has a one to one relationship with Sku.
  # drop() returns a new frame, so every assignment below works on a copy and
  # the caller's frame stays intact.
  data = data.drop(columns=['Product_Name', 'RunDate'])

  # Calendar features extracted from the run date.
  data['year'] = run_date.dt.year
  data['month'] = run_date.dt.month
  data['day_of_month'] = run_date.dt.day
  data['day_of_week'] = run_date.dt.dayofweek  # Monday=0, Sunday=6

  # Lag features for price, shifted within each Sku so that a row never
  # receives the price of a different product as its "previous" price.
  # NOTE(review): rows are assumed to already be in chronological order within
  # each Sku - confirm, and sort by RunDate beforehand if they are not.
  price_by_sku = data.groupby('Sku')['unit_price_x']
  lag_columns = []
  for lag in (1, 2, 3):
    column = f'unit_price_x_lag{lag}'
    # Rows without enough history fall back to their own current price.
    data[column] = price_by_sku.shift(lag).fillna(data['unit_price_x'])
    lag_columns.append(column)

  # Scale the lag features.
  # NOTE(review): the scaler is fitted on the whole frame passed in and is not
  # returned, so the same scaling cannot be re-applied to other rows later.
  scaler = RobustScaler()
  data[lag_columns] = scaler.fit_transform(data[lag_columns])

  # One hot encoding of the categorical and calendar columns.
  data = pd.get_dummies(data, columns=['Category', 'Sub_category', 'Product_Group', 'Brand', 'Sku', 'year', 'month', 'day_of_month', 'day_of_week'])

  return data

data = preprocess(data)

In [None]:
# Separate the target (unit price) from the feature columns.
y = data['unit_price_x']
X = data.drop(columns='unit_price_x')

# 70% of the rows go to training; the remaining 30% is split in half into
# validation and test.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)

In [None]:
# Create the tf.data datasets
def data_generator(X, y, batch_size):
    """Yield consecutive ``(X, y)`` slice pairs of at most ``batch_size`` items.

    ``X`` and ``y`` are sliced with the same ``[start:stop]`` bounds; the last
    pair is shorter when ``len(X)`` is not a multiple of ``batch_size``.
    """
    total = len(X)
    for lo in range(0, total, batch_size):
        hi = lo + batch_size
        if hi > total:
            hi = total
        yield X[lo:hi], y[lo:hi]

TRAIN_BATCH_SIZE = 32    # rows per batch for the training and validation datasets
TEST_BATCH_SIZE = 3200   # rows per batch for the test dataset


def make_dataset(X, y, batch_size):
    """Wrap ``data_generator`` in a ``tf.data.Dataset`` of (features, target) batches.

    Args:
        X: 2-D feature table; ``X.shape[1]`` fixes the feature width of the dataset.
        y: targets, sliced with the same bounds as ``X``.
        batch_size: maximum number of rows per yielded batch.

    Returns:
        A dataset whose elements are ``(features, targets)`` pairs of float32
        tensors with shapes ``(None, X.shape[1])`` and ``(None,)``.
    """
    return tf.data.Dataset.from_generator(
        lambda: data_generator(X, y, batch_size=batch_size),
        output_signature=(
            tf.TensorSpec(shape=(None, X.shape[1]), dtype=tf.float32),
            tf.TensorSpec(shape=(None,), dtype=tf.float32),
        ),
    )


# repeat() makes the training and validation datasets endless, so whoever
# consumes them has to bound each pass with an explicit number of steps.
train_dataset = (
    make_dataset(X_train, y_train, TRAIN_BATCH_SIZE)
    .cache()
    .repeat()
    .prefetch(tf.data.AUTOTUNE)
)
val_dataset = (
    make_dataset(X_val, y_val, TRAIN_BATCH_SIZE)
    .cache()
    .repeat()
    .prefetch(tf.data.AUTOTUNE)
)
# The test dataset is finite: it is iterated exactly once per evaluation.
test_dataset = (
    make_dataset(X_test, y_test, TEST_BATCH_SIZE)
    .cache()
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping


# Seed Python, NumPy and TensorFlow so that weight initialisation is
# repeatable from one run of the notebook to the next.
tf.keras.utils.set_random_seed(42)

# Must equal the batch size train_dataset / val_dataset were built with;
# it is only used to work out how many steps make up one epoch.
BATCH_SIZE = 32

# Fully connected regression network: three ReLU hidden layers and a single
# linear output for the price. The input width is declared with an explicit
# Input layer rather than `input_shape=` on the first Dense layer, which
# Keras warns against for Sequential models.
nn = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)
])


early_stopping = EarlyStopping(
    monitor='val_loss',  # Monitor the validation loss (MSE)
    patience=5,          # Number of epochs with no improvement to wait
    verbose=1,
    restore_best_weights=True  # Restore the model weights from the epoch with the best value of the monitored metric
)


nn.compile(optimizer=Adam(learning_rate=0.001),
              loss='mean_squared_error',
              metrics=['mae'])

# Train the model
train_size = X_train.shape[0]
val_size = X_val.shape[0]

# Both datasets repeat forever, so the number of batches per epoch is passed
# explicitly: enough steps to see every row once.
history = nn.fit(
    train_dataset,
    epochs=100,
    steps_per_epoch = math.ceil(train_size/BATCH_SIZE),
    validation_data = val_dataset,
    validation_steps = math.ceil(val_size/BATCH_SIZE),
    verbose=1,
    callbacks = [early_stopping]
)



Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m4095/4095[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 25ms/step - loss: 94.3943 - mae: 3.6107 - val_loss: 10.4703 - val_mae: 1.3801
Epoch 2/100
[1m4095/4095[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - loss: 8.6656 - mae: 1.3587 - val_loss: 4.0403 - val_mae: 1.0633
Epoch 3/100
[1m4095/4095[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - loss: 4.7921 - mae: 1.0897 - val_loss: 4.0399 - val_mae: 0.9279
Epoch 4/100
[1m4095/4095[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - loss: 4.9901 - mae: 0.9839 - val_loss: 2.3332 - val_mae: 0.8420
Epoch 5/100
[1m4095/4095[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - loss: 4.5304 - mae: 0.8796 - val_loss: 1.8095 - val_mae: 0.7597
Epoch 6/100
[1m4095/4095[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 3.1198 - mae: 0.7785 - val_loss: 2.0465 - val_mae: 0.7095
Epoch 7/100
[1m4095/4095[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

In [None]:
#test
# One predict() call over the whole test dataset instead of one call per batch
# inside a Python loop; predict() only uses the feature half of each
# (features, target) element. ravel() flattens the (n, 1) output to shape (n,).
y_pred = nn.predict(test_dataset).ravel()

# Read the targets from the same dataset so they are in the same order as the
# predictions.
y_real = np.concatenate([target.numpy() for _, target in test_dataset])

mse = mean_squared_error(y_real, y_pred)
mse

[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step


0.33110225

In [None]:
# #test on new data
# new_data = pd.DataFrame({
#     'Category': ['Meat & seafood'],
#     'Sub_category': ['Poultry'],
#     'Product_Group': ['Crumbed chicken'],
#     'Product_Name': ['RSPCA Approved Chicken Breast Schnitzel Plain Crumb'],
#     'Brand': ['Coles'],
#     'Sku': ['5969865P'],
#     'RunDate': ['10/11/2022']
# })

# def preprocess_new(new_data, data):
#   #new df
#   new_data = preprocess(new_data)

#   # Drop columns from new_data that are not in original data
#   cols_to_drop = [col for col in new_data.columns if col not in data.columns]
#   new_data.drop(columns=cols_to_drop, inplace=True)

#   #Add missing columns in one go using pd.concat
#   cols_to_add = [col for col in data.columns if col not in new_data.columns]
#   if cols_to_add:
#       # Create a DataFrame with missing columns initialized with False
#       missing_cols_df = pd.DataFrame(False, index=new_data.index, columns=cols_to_add)
#       # Concatenate along columns (axis=1)
#       new_data = pd.concat([new_data, missing_cols_df], axis=1)

#   #Sort columns in the same order as the original dataframe
#   new_data = new_data[data.columns]

#   return new_data

# new_data = preprocess_new(new_data, X)
# new_data

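# # TODO: create the lag feature columns for new_data before calling preprocess().
# # new_data has no 'unit_price_x' column, so preprocess() currently fails with KeyError: 'unit_price_x'.
# # Plan: fill the lag feature columns with the prices at the closest date in the original data.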

KeyError: 'unit_price_x'

In [None]:
# pred = nn.predict(new_data)[0][0]
# pred