In [1]:
# libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten
import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import matplotlib.pyplot as plt

In [2]:
# importing data
df = pd.read_csv("Aus_grocery_synthetic_dataset2.csv")
data = df.sample(n=8000, random_state=42)

# handeling missing data
data.fillna(method='ffill', inplace=True)

# seperating into features and target
X = data.drop('unit_price_x', axis=1)
y = data['unit_price_x']

# categorical columns
categorical_cols = ['Category', 'Sub_category', 'Product_Group', 'Product_Name', 'Brand', 'Sku', 'RunDate']

# one-hot encoding categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# preprocessing pipelne
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('scaler', StandardScaler(with_mean=False))])

# preprocess data
X_preprocessed = pipeline.fit_transform(X)

# splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=14)

In [3]:
# building the CNN model
network = Sequential([
    Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(X_train.shape[1], 1)),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(1)  
])

# Compile the model
network.compile(optimizer='adam', loss='mean_squared_error')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [4]:
# Train the model
history = network.fit(X_train, y_train, epochs=10, batch_size=10)

# Evaluate the model
loss = network.evaluate(X_test, y_test)
print(f'Test Loss: {loss}')

Epoch 1/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m249s[0m 383ms/step - loss: 234.1201
Epoch 2/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m219s[0m 342ms/step - loss: 87.8139
Epoch 3/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m219s[0m 341ms/step - loss: 57.3271
Epoch 4/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m217s[0m 339ms/step - loss: 45.3636
Epoch 5/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m217s[0m 338ms/step - loss: 30.4836
Epoch 6/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m218s[0m 341ms/step - loss: 17.6093
Epoch 7/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m272s[0m 426ms/step - loss: 12.8414
Epoch 8/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m348s[0m 544ms/step - loss: 10.8147
Epoch 9/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m238s[0m 371ms/step - loss: 7.4590
Epoch 10/10
[1m640/640[0m [32m━━━━━━━━━━━━━

In [8]:
# calculating MSE
y_pred = network.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Test RMSE: {rmse}')

# comparing RMSE to the mean of the target variable
print(f'Mean of target variable: {y_test.mean()}')

# r squred
r_squared = r2_score(y_test, y_pred)
print(f'R-squared: {r_squared}')

[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 72ms/step
Test RMSE: 7.877552791513611
Mean of target variable: 8.17933125
R-squared: 0.712618647817475


In [7]:
# test case
new_data = pd.DataFrame({
    'Category': ['Meat & seafood'], 
    'Sub_category': ['Poultry'], 
    'Product_Group': ['Crumbed chicken'], 
    'Product_Name': ['RSPCA Approved Chicken Breast Schnitzel Plain Crumb'], 
    'Brand': ['Coles'], 
    'Sku': ['5969865P'], 
    'RunDate': ['10/11/2022']  # Ensure this matches the format you used in training
})

# preprocessing the new data using the same pipeline
X_new_preprocessed = pipeline.transform(new_data)

# Predict the unit_price_x
predicted_price = network.predict(X_new_preprocessed)
print(f'Predicted unit_price_x: {predicted_price [0][0]:.2f}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step
Predicted unit_price_x: 11.49
