In [115]:
# libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

In [110]:
# importing data
data = pd.read_csv("Aus_grocery_synthetic_dataset2.csv")

# handeling missing data
data.fillna(method='ffill', inplace=True)

# seperating into features and target
X = data.drop('unit_price_x', axis=1)
y = data['unit_price_x']

# categorical columns
categorical_cols = ['Category', 'Sub_category', 'Product_Group', 'Product_Name', 'Brand', 'Sku', 'RunDate']

# one-hot encoding categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# preprocessing pipelne
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('scaler', StandardScaler(with_mean=False))])

# preprocess data
X_preprocessed = pipeline.fit_transform(X)

# splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=14)

In [111]:
# building model
network = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=[X_train.shape[1]]),
    tf.keras.layers.Dense(60, activation='relu'),
    tf.keras.layers.Dense(31, activation='relu'),
    tf.keras.layers.Dense(1)
])

# Compile the model with a fresh optimizer
network.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mse', metrics=['mae'])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [112]:
# training model
network.fit(X_train, y_train, epochs=30, batch_size=50)

# evaluating model
test_loss, test_mae = network.evaluate(X_test, y_test)

# calculating MSE
y_pred = network.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Test Mean Squared Error: {mse}')


Epoch 1/30
[1m2996/2996[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 13ms/step - loss: 69.7220 - mae: 3.2601
Epoch 2/30
[1m2996/2996[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 11ms/step - loss: 5.5162 - mae: 1.3249
Epoch 3/30
[1m2996/2996[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 11ms/step - loss: 4.9721 - mae: 1.2202
Epoch 4/30
[1m2996/2996[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 12ms/step - loss: 3.5859 - mae: 1.0937
Epoch 5/30
[1m2996/2996[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 12ms/step - loss: 3.0515 - mae: 1.0192
Epoch 6/30
[1m2996/2996[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 12ms/step - loss: 2.4496 - mae: 0.9383
Epoch 7/30
[1m2996/2996[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 12ms/step - loss: 2.4158 - mae: 0.8699
Epoch 8/30
[1m2996/2996[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 13ms/step - loss: 2.1146 - mae: 0.7814
Epoch 9/30
[1m2996/2996[0m [32m━━━━━━━━━━━━━

In [116]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Test RMSE: {rmse}')

# comparing RMSE to the mean of the target variable
print(f'Mean of target variable: {y_test.mean()}')

# r squred
r_squared = r2_score(y_test, y_pred)
print(f'R-squared: {r_squared}')

Test RMSE: 0.940859874263501
Mean of target variable: 8.53962232905983
R-squared: 0.9964675663247832


In [123]:
# test case
new_data = pd.DataFrame({
    'Category': ['Meat & seafood'], 
    'Sub_category': ['Poultry'], 
    'Product_Group': ['Crumbed chicken'], 
    'Product_Name': ['RSPCA Approved Chicken Breast Schnitzel Plain Crumb'], 
    'Brand': ['Coles'], 
    'Sku': ['5969865P'], 
    'RunDate': ['10/11/2022']  # Ensure this matches the format you used in training
})

# preprocessing the new data using the same pipeline
X_new_preprocessed = pipeline.transform(new_data)

# Predict the unit_price_x
predicted_price = network.predict(X_new_preprocessed)
print(f'Predicted unit_price_x: {predicted_price [0][0]:.2f}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 98ms/step
Predicted unit_price_x: 16.82
