In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Conv2D, MaxPooling2D, Flatten, Dense,
                                     Dropout, BatchNormalization, InputLayer)
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

# Load the two source tables (update both paths to match the local machine).
image_table = pd.read_csv('D:\\CPEN355_project\\Data\\Image_table.csv')  # Image_table.csv
price_table = pd.read_csv('D:\\CPEN355_project\\Data\\Price_table.csv')  # Price_table.csv

# Attach a price to every image: image_table supplies 'Image_name',
# price_table supplies 'Entry_price', and 'Genmodel_ID' is the shared key.
# NOTE(review): if Price_table holds several rows per Genmodel_ID (confirm
# against the CSV), this inner join repeats each image once per price row,
# which would inflate the per-model counts computed below.
image_columns = image_table[['Genmodel_ID', 'Image_name']]
price_columns = price_table[['Genmodel_ID', 'Entry_price']]
merged_data = image_columns.merge(price_columns, on='Genmodel_ID')

print("data merged\n")

# Number of merged rows available for each Genmodel_ID.
image_counts = merged_data['Genmodel_ID'].value_counts()

# Genmodel_IDs with at least 1000 rows are kept; the rest are dropped.
valid_genmodels = image_counts.loc[image_counts >= 1000].index

# Boolean mask over merged_data selecting rows of the retained Genmodel_IDs.
has_enough_images = merged_data['Genmodel_ID'].isin(valid_genmodels)
filtered_data = merged_data[has_enough_images]
# For Genmodel_IDs with counts > 3000, randomly select 3000 images
def sample_images(group: pd.DataFrame, max_images: int = 3000,
                  random_state: int = 42) -> pd.DataFrame:
    """Cap one group of rows at ``max_images`` by random down-sampling.

    Intended to be passed to ``DataFrameGroupBy.apply``; the defaults
    reproduce the original fixed behaviour (at most 3000 rows, seed 42).

    Args:
        group: Rows belonging to a single group.
        max_images: Largest number of rows a group may keep.
        random_state: Seed forwarded to ``DataFrame.sample`` so the selection
            is reproducible.

    Returns:
        ``group`` itself when it has ``max_images`` rows or fewer, otherwise
        a random sample of exactly ``max_images`` rows drawn without
        replacement.
    """
    # Small groups are returned untouched (same object, no copy).
    if len(group) <= max_images:
        return group
    return group.sample(n=max_images, random_state=random_state)

# Down-sample oversized groups. The columns are selected explicitly after
# groupby() so apply() still receives 'Genmodel_ID' in every group without
# triggering pandas' deprecation warning about operating on grouping columns.
filtered_data = (
    filtered_data.groupby('Genmodel_ID')[list(filtered_data.columns)]
    .apply(sample_images)
    .reset_index(drop=True)
)

print("Data filtered\n")

# path to the training images
image_paths = filtered_data['Image_name'].apply(lambda x: f"D:\\CPEN355_project\\355DataSet\\DVM_noNest\\{x}") # Update with the directory path where images are stored
prices = filtered_data['Entry_price'].values

# Prepare image data and prices
img_size = 224  # Resize images to 224*224
batch_size = 32

# Split BEFORE scaling: the scaler must only see training targets, otherwise
# the test-set mean/variance leak into the values the model is trained on.
# Same test_size/random_state as before, so the partition itself is unchanged.
X_train_paths, X_test_paths, y_train_raw, y_test_raw = train_test_split(
    image_paths, prices.reshape(-1, 1), test_size=0.2, random_state=42)

# Fit on the training prices only, then reuse those statistics for the test
# prices (and later for inverse_transform of the predictions).
scaler = StandardScaler()
y_train_scaled = scaler.fit_transform(y_train_raw)
y_test_scaled = scaler.transform(y_test_raw)

# Kept so any code that still reads `prices_scaled` keeps working; it now uses
# the training-set statistics as well.
prices_scaled = scaler.transform(prices.reshape(-1, 1))

print("data splited\n")

def preprocess_image(image_path, label):
    """Load one JPEG from disk and turn it into a model-ready tensor.

    Args:
        image_path: Path of the image file to read.
        label: Target value paired with the image; passed through unchanged.

    Returns:
        A ``(image, label)`` tuple where ``image`` is the 3-channel picture
        resized to ``img_size`` x ``img_size`` and divided by 255.
    """
    raw_bytes = tf.io.read_file(image_path)
    # channels=3 forces an RGB result regardless of how the file was stored.
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, [img_size, img_size])
    return resized / 255.0, label

# Pair every image path with its scaled price and map preprocess_image over
# the pairs so each element becomes (image tensor, scaled price).
train_dataset = tf.data.Dataset.from_tensor_slices(
    (X_train_paths.values, y_train_scaled)
).map(preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)

test_dataset = tf.data.Dataset.from_tensor_slices(
    (X_test_paths.values, y_test_scaled)
).map(preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)

# Random augmentation pipeline: horizontal flip, then rotation and zoom,
# each with a factor of 0.1.
data_augmentation = tf.keras.Sequential()
data_augmentation.add(tf.keras.layers.RandomFlip('horizontal'))
data_augmentation.add(tf.keras.layers.RandomRotation(0.1))
data_augmentation.add(tf.keras.layers.RandomZoom(0.1))

def augment(image, label):
    """Apply the random augmentation pipeline to one training example.

    Args:
        image: Image tensor to augment.
        label: Target value paired with the image; returned unchanged.

    Returns:
        ``(augmented_image, label)``.
    """
    # training=True is passed explicitly: inside Dataset.map there is no
    # fit() call context, so the random layers should not rely on Keras
    # inferring the training mode to decide whether to augment.
    return data_augmentation(image, training=True), label

# Only the training images are augmented; the test pipeline is left as is.
train_dataset = train_dataset.map(augment, num_parallel_calls=tf.data.AUTOTUNE)

# Training pipeline: shuffle with a 1000-element buffer -> batch -> prefetch.
train_dataset = train_dataset.shuffle(buffer_size=1000)
train_dataset = train_dataset.batch(batch_size)
train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)

# Test pipeline: no shuffling, so element order follows the input arrays.
test_dataset = test_dataset.batch(batch_size)
test_dataset = test_dataset.prefetch(tf.data.AUTOTUNE)

# CNN: four Conv2D -> BatchNormalization -> MaxPooling2D stages with
# 32, 64, 128 and 256 filters, followed by a dense regression head.
model = Sequential()
for stage_index, n_filters in enumerate((32, 64, 128, 256)):
    # Only the very first layer declares the input shape.
    if stage_index == 0:
        first_layer_kwargs = {'input_shape': (img_size, img_size, 3)}
    else:
        first_layer_kwargs = {}
    model.add(Conv2D(n_filters, (3, 3), activation='relu', padding='same',
                     **first_layer_kwargs))
    model.add(BatchNormalization())
    model.add(MaxPooling2D(2, 2))

# Regression head: a single linear output unit predicts the scaled price.
model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1))

# MSE is the training loss; MAE is tracked as an additional metric.
optimizer = Adam(learning_rate=1e-4)
model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mae'])

print("model compiled\n")

# Stop when val_loss has not improved for 5 epochs and roll back to the best
# weights; independently, save the best model seen so far to disk.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)
model_checkpoint = ModelCheckpoint(
    'best_cnn_model.h5',
    monitor='val_loss',
    save_best_only=True,
)
training_callbacks = [early_stopping, model_checkpoint]

# NOTE(review): test_dataset is used as the validation set here and is also
# the set evaluated afterwards, so early stopping / checkpointing are tuned on
# the same data the final metrics are reported on — consider a separate
# validation split.
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=50,
    callbacks=training_callbacks,
)

print("finished training\n")

# Loss and MAE on the test set, both measured on the standardized prices.
loss, mae = model.evaluate(test_dataset)

print(f"standarlized MAE: {mae}")

# predict() returns one row per test sample; flatten it to a 1-D vector
# directly instead of concatenating the rows one by one.
predictions_scaled = model.predict(test_dataset).reshape(-1)

# test_dataset is built from y_test_scaled and is never shuffled, so the
# labels can be taken from the in-memory array in the same order. Iterating
# the dataset for them would re-read and decode every test image.
y_test_scaled_flat = np.asarray(y_test_scaled).reshape(-1)

# Undo the standardization so the error is expressed in original price units.
predictions = scaler.inverse_transform(predictions_scaled.reshape(-1, 1))
y_test = scaler.inverse_transform(y_test_scaled_flat.reshape(-1, 1))

mae_original = mean_absolute_error(y_test, predictions)
print(f"original MAE: {mae_original}")
print("testsample number:", len(y_test))

# Already imported at the top of the file; the repeat is harmless.
import matplotlib.pyplot as plt

# Pull the first five (image, scaled price) pairs out of the batched test set.
test_images = []
test_labels = []
for sample_image, sample_label in test_dataset.unbatch().take(5):
    test_images.append(sample_image.numpy())
    test_labels.append(sample_label.numpy())

# Predict on those five images and convert the outputs back to price units.
test_predictions = scaler.inverse_transform(model.predict(np.array(test_images)))

# One figure per sample, titled with the predicted and the actual price.
for shown_image, scaled_label, price_row in zip(test_images, test_labels, test_predictions):
    plt.imshow(shown_image)
    plt.axis('off')

    actual_price = scaler.inverse_transform(scaled_label.reshape(-1, 1))[0][0]
    predicted_price = price_row[0]
    plt.title(f"perdicted price: {predicted_price:.2f}, actual price: {actual_price:.2f}")

    plt.show()

data merged



  filtered_data = filtered_data.groupby('Genmodel_ID').apply(sample_images).reset_index(drop=True)


Data filtered

data splited

model compiled

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50