In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
from tensorflow.keras.utils import load_img, img_to_array
from keras.applications.vgg16 import VGG16, preprocess_input

# Step 1: Read data and process
# Load the image table and the price table from the local project folder.
image_table = pd.read_csv('D:\\CPEN355_project\\Data\\Image_table.csv')  # Update with actual path
price_table = pd.read_csv('D:\\CPEN355_project\\Data\\Price_table.csv')  # Update with actual path

# Inner-join the two tables on 'Genmodel_ID', carrying only the columns used
# downstream: the image file name and the entry price.
# NOTE(review): if Price_table holds more than one row per Genmodel_ID, this
# join repeats every image once per price row -- confirm that is intended.
merged_data = image_table[['Genmodel_ID', 'Image_name']].merge(
    price_table[['Genmodel_ID', 'Entry_price']],  # Assuming 'Entry_price' is the column name
    on='Genmodel_ID',
)

# Number of merged rows available for each Genmodel_ID
image_counts = merged_data['Genmodel_ID'].value_counts()

# Genmodel_IDs with at least 300 rows are kept; the rest are dropped
valid_genmodels = image_counts.loc[image_counts.ge(300)].index

# Restrict the merged table to the Genmodel_IDs that passed the threshold
filtered_data = merged_data.loc[merged_data['Genmodel_ID'].isin(valid_genmodels)]

# Cap oversized groups: a group with more than `max_images` rows is randomly
# down-sampled to exactly `max_images` rows; smaller groups pass through.
def sample_images(group, max_images=500, random_state=42):
    """Return at most ``max_images`` rows of ``group``.

    Parameters
    ----------
    group
        Object supporting ``len()`` and ``.sample(n=..., random_state=...)``;
        in this script it is the DataFrame of one Genmodel_ID handed over by
        ``groupby(...).apply``.
    max_images : int, default 500
        Largest number of rows a group may keep.
    random_state : int, default 42
        Seed forwarded to ``group.sample`` so the selection is reproducible.

    Returns
    -------
    ``group`` itself when it has ``max_images`` rows or fewer, otherwise a
    random sample of exactly ``max_images`` rows.
    """
    if len(group) <= max_images:
        return group
    return group.sample(n=max_images, random_state=random_state)

# Run sample_images on every Genmodel_ID group, then discard the index that
# results so the frame is numbered 0..n-1 again.
filtered_data = (
    filtered_data
    .groupby('Genmodel_ID')
    .apply(sample_images)
    .reset_index(drop=True)
)

print("Step 1: Data processing completed. Filtered and sampled dataset.\n")

# Step 2: Extract features from images using pre-trained VGG16

# VGG16 loaded with ImageNet weights and without its top layers
model = VGG16(include_top=False, weights='imagenet')

# Function to load image and extract features
def extract_features(img_path):
    """Return the flattened VGG16 output for a single image file.

    The image is loaded at 224x224, converted to an array, given a leading
    batch axis, passed through ``preprocess_input`` and then through the
    module-level ``model``. The model output is flattened before returning.

    Parameters
    ----------
    img_path
        Path of the image file, handed to ``load_img``.

    Returns
    -------
    The flattened (1-D) result of ``model.predict`` for this one image.
    """
    img = load_img(img_path, target_size=(224, 224))
    batch = np.expand_dims(img_to_array(img), axis=0)
    batch = preprocess_input(batch)
    # verbose=0: this function is called once per image, and the default
    # verbosity would print a progress bar for every single call.
    return model.predict(batch, verbose=0).flatten()

# Prepare lists to hold features and prices
features_list = []
prices_list = []
missing_images = 0  # rows whose image file could not be found on disk

# Directory holding the image files named in the 'Image_name' column.
# You may need to adjust the path accordingly
image_directory = 'D:\\CPEN355_project\\Data\\DVM_noNest_test\\'

# Process images. Only the two columns that are actually used are walked,
# which avoids building a full row object for every record.
for img_name, price in zip(filtered_data['Image_name'], filtered_data['Entry_price']):
    img_path = os.path.join(image_directory, img_name)
    if not os.path.exists(img_path):
        # Keep the best-effort behaviour (skip the row) but remember it, so
        # the gap between listed and processed images is visible.
        missing_images += 1
        continue
    features = extract_features(img_path)
    features_list.append(features)
    prices_list.append(price)

# Fail here with a clear message instead of letting an empty feature matrix
# reach the scaler and fail there with an unrelated-looking error.
if not features_list:
    raise FileNotFoundError(
        f"No image features were extracted: none of the {len(filtered_data)} "
        f"listed images exist under {image_directory!r}"
    )

if missing_images:
    print(f"Step 2: Skipped {missing_images} rows whose image file was not found.")

print("Step 2: Feature extraction completed. Extracted features from all images.\n")

# Convert lists to numpy arrays
X = np.array(features_list)
y = np.array(prices_list)

# Step 3: Normalize features and split data
# Draw the train/test partition first, as row indices. The partition depends
# only on the number of samples, test_size and random_state, so it selects
# the same rows as splitting the feature matrix directly with these settings.
train_idx, test_idx = train_test_split(
    np.arange(len(X)), test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, so that no statistics from the
# held-out rows leak into the features the model is trained on, then apply
# that same transformation to every row.
scaler = StandardScaler()
scaler.fit(X[train_idx])
X_scaled = scaler.transform(X)

# Split data into training and test sets
X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

print("Step 3: Data normalization and splitting completed. Prepared training and test sets.\n")

# Step 4: Train SVM, collect loss over iterations
from sklearn.linear_model import SGDRegressor

# Linear regressor with the epsilon-insensitive loss, optimised by SGD.
# NOTE(review): the model is only ever updated through partial_fit below,
# which per the scikit-learn docs performs a single pass per call -- confirm
# whether max_iter / tol are meant to have any effect here.
sgd_regressor = SGDRegressor(
    loss='epsilon_insensitive',
    epsilon=0.1,
    max_iter=1000,
    tol=1e-3,
    learning_rate='invscaling',
    eta0=0.01,
    random_state=42,
)

# One partial_fit call per epoch; after each call the mean absolute error on
# the training set is recorded.
n_epochs = 10
train_losses = []
for epoch in range(n_epochs):
    # partial_fit returns the estimator itself, so predict can be chained
    y_pred = sgd_regressor.partial_fit(X_train, y_train).predict(X_train)
    mae = mean_absolute_error(y_train, y_pred)
    train_losses.append(mae)

print("Step 4: Model training completed. Collected training loss over epochs.\n")

# Step 5: Plot loss vs. iterations
# train_losses holds one mean_absolute_error value per epoch, so the y-axis
# is labelled MAE. The x-axis is derived from the number of recorded values
# so it always lines up with the data being plotted.
plt.figure()
plt.plot(range(1, len(train_losses) + 1), train_losses, marker='o')
plt.title('Training Loss over Epochs')
plt.xlabel('Epoch')
plt.ylabel('MAE')
plt.grid()
plt.show()

print("Step 5: Loss plot generated.\n")

# Step 6: Evaluate model on test set
# Predict the held-out rows once and score those predictions three ways.
y_pred_test = sgd_regressor.predict(X_test)

# R^2 and MAE are reported in the message below; the MSE is computed and
# kept in test_mse but is not printed.
r2 = r2_score(y_test, y_pred_test)
mae_original = mean_absolute_error(y_test, y_pred_test)
test_mse = mean_squared_error(y_test, y_pred_test)

print(f"Step 6: Model evaluation completed. Mean Absolute Error: {mae_original}, R^2: {r2}\n")


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Recompute the test-set mean absolute error from the stored predictions and
# print it on its own.
mae_original = mean_absolute_error(y_true=y_test, y_pred=y_pred_test)
print(f"Mean Absolute Error: {mae_original}")