In [None]:
# Mount Google Drive at /content/drive; the dataset paths used further down
# all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd

In [4]:
# Read the training CSV from the mounted Drive and preview its first rows.
train = pd.read_csv('/content/drive/MyDrive/Amazon_ML/student_resource/dataset/train.csv')
train.head()

Unnamed: 0,sample_id,catalog_content,image_link,price
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97
3,55858,Item Name: Judee’s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49


In [5]:
# Read the test CSV from the mounted Drive and preview its first rows.
test = pd.read_csv('/content/drive/MyDrive/Amazon_ML/student_resource/dataset/test.csv')
test.head()

Unnamed: 0,sample_id,catalog_content,image_link
0,100179,Item Name: Rani 14-Spice Eshamaya's Mango Chut...,https://m.media-amazon.com/images/I/71hoAn78AW...
1,245611,Item Name: Natural MILK TEA Flavoring extract ...,https://m.media-amazon.com/images/I/61ex8NHCIj...
2,146263,Item Name: Honey Filled Hard Candy - Bulk Pack...,https://m.media-amazon.com/images/I/61KCM61J8e...
3,95658,Item Name: Vlasic Snack'mm's Kosher Dill 16 Oz...,https://m.media-amazon.com/images/I/51Ex6uOH7y...
4,36806,"Item Name: McCormick Culinary Vanilla Extract,...",https://m.media-amazon.com/images/I/71QYlrOMoS...


In [None]:
# Report the (rows, columns) of both frames, then preview the training rows.
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
train.head()

Train shape: (75000, 4)
Test shape: (75000, 3)


Unnamed: 0,sample_id,catalog_content,image_link,price
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97
3,55858,Item Name: Judee’s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49


In [6]:
# Count missing values per column of the training frame.
train.isnull().sum()

Unnamed: 0,0
sample_id,0
catalog_content,0
image_link,0
price,0


In [8]:
# =======================
# IMPORTS
# =======================
import pandas as pd
import numpy as np
from tqdm import tqdm
from urllib.request import urlopen
import io
from PIL import Image
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [9]:
# =======================
# CONFIG
# =======================
# Size passed to the image loader when resizing each downloaded picture.
IMAGE_SIZE = (224, 224)
# Training CSV on the mounted Drive (the same file is read into `train` earlier
# in the notebook).
CSV_PATH = '/content/drive/MyDrive/Amazon_ML/student_resource/dataset/train.csv'
# Destination file used by the model-saving step.
MODEL_SAVE_PATH = '/content/drive/MyDrive/Amazon_ML/student_resource/dataset/price_model_vgg.h5'

In [10]:
# =======================
# STEP 1: LOAD CSV
# =======================
# Read the training CSV and keep only the rows that have both an image URL and
# a price; the index is renumbered so it stays contiguous after the filter.
train_df = (
    pd.read_csv(CSV_PATH)
    .dropna(subset=['image_link', 'price'])
    .reset_index(drop=True)
)

print(f"Total samples: {len(train_df)}")

Total samples: 75000


In [None]:
# =======================
# STEP 2: LOAD & PREPROCESS IMAGES FROM URL
# =======================
def load_image_from_url(url, target_size=(224, 224)):
    '''
    Download one image and convert it into a VGG16-ready array.

    Parameters:
    - url: Address of the image to fetch
    - target_size: Size handed to PIL's `Image.resize` (default (224, 224))

    Returns:
    - The output of `preprocess_input` for the resized RGB image, or None when
      the download, decoding or preprocessing raised (the error is printed)
    '''
    try:
        # Use the response as a context manager so the HTTP connection is
        # closed right away instead of lingering until garbage collection;
        # this function is called once per row of a large dataset.
        with urlopen(url, timeout=10) as response:
            image_data = response.read()

        # Decode from memory and force 3 channels (drops alpha / palette modes).
        img = Image.open(io.BytesIO(image_data)).convert('RGB')

        # Resize
        img = img.resize(target_size)

        # Convert to array
        img_array = img_to_array(img)

        # Preprocess for VGG16 (mean subtraction, BGR format, etc.)
        img_array = preprocess_input(img_array)

        return img_array

    except Exception as e:
        # Best effort by design: a bad URL or corrupt file must not abort the
        # whole loading loop, so report it and let the caller skip the sample.
        print(f"Error loading image: {url} - {e}")
        return None

print("Loading and preprocessing images from URLs...")
images, prices = [], []

# NOTE(review): every successfully loaded image stays in RAM. Assuming float32
# arrays of 224x224x3 (~0.6 MB each), the full 75,000-row set needs roughly
# 45 GB before `np.array` makes a second copy -- confirm this fits the runtime,
# or load a subset / cache extracted features instead.
for url, price in tqdm(zip(train_df['image_link'], train_df['price']), total=len(train_df)):
    img = load_image_from_url(url, IMAGE_SIZE)
    if img is None:
        # Download or decoding failed; drop the sample so X and y stay aligned.
        continue
    images.append(img)
    prices.append(price)

# Stack the per-sample results into the arrays used by the split/training cells.
X = np.array(images)
y = np.array(prices)

Loading and preprocessing images from URLs...


 23%|██▎       | 16950/75000 [53:07<2:11:50,  7.34it/s]

In [None]:
# Sanity-check that features and targets ended up with matching sample counts.
print(f"Image shape: {X.shape}")
print(f"Price shape: {y.shape}")

In [None]:
# =======================
# STEP 3: TRAIN-TEST SPLIT
# =======================
# Hold out 20% of the loaded samples for validation; the fixed random_state
# makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2)

print(f"Train samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")

In [None]:
# =======================
# STEP 4: BUILD MODEL (VGG16 + DNN)
# =======================
# ImageNet-pretrained VGG16 without its classifier head, used as a fixed
# feature extractor.
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
for vgg_layer in base_model.layers:
    # Freeze pretrained layers so only the new head is trained.
    vgg_layer.trainable = False

# Small dense regression head stacked on top of the frozen features.
model = Sequential([
    base_model,
    GlobalAveragePooling2D(),
    Dropout(0.3),
    Dense(256, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    # Single linear unit: the regression output (predicted price).
    Dense(1, activation='linear'),
])

In [None]:
# Optimise mean absolute error with Adam at a learning rate of 1e-4, then print
# the layer/parameter overview.
model.compile(
    loss='mean_absolute_error',
    optimizer=Adam(learning_rate=1e-4),
)

model.summary()

In [None]:
# =======================
# STEP 5: TRAIN MODEL
# =======================
# Train for 10 epochs in mini-batches of 32, evaluating on the held-out
# validation split after every epoch; `history` keeps the returned training log.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_val, y_val),
)

In [None]:
# =======================
# STEP 6: EVALUATE MODEL
# =======================
# Predict on the validation images and flatten the model output to 1-D.
y_pred = model.predict(X_val).flatten()

def smape(y_true, y_pred):
    '''
    Symmetric mean absolute percentage error, in percent.

    Parameters:
    - y_true: Actual values (array-like)
    - y_pred: Predicted values (array-like, same shape as y_true)

    Returns:
    - Mean of |y_pred - y_true| / ((|y_pred| + |y_true|) / 2), times 100.
      A pair where both values are 0 contributes 0 instead of NaN.
    '''
    # Accept plain lists as well as arrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_pred) + np.abs(y_true)) / 2

    # The denominator is zero only when truth and prediction are both zero,
    # i.e. a perfect prediction; leave those entries at 0 rather than letting
    # 0/0 turn the whole mean into NaN.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(ratio) * 100

# Score the validation predictions with both metrics and report them.
mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)
smape_score = smape(y_true=y_val, y_pred=y_pred)

print(f"Validation MAE: {mae:.2f}")
print(f"Validation SMAPE: {smape_score:.2f}%")

In [None]:
# =======================
# STEP 7: SAVE MODEL
# =======================
model.save(MODEL_SAVE_PATH)
print(f"Model saved to {MODEL_SAVE_PATH}")

In [None]:
import numpy as np
from sklearn.metrics import mean_absolute_error

# Assuming you already have these from your training script
# y_val     = actual prices
# y_pred    = model.predict(X_val).flatten()

def smape(y_true, y_pred):
    '''
    Symmetric mean absolute percentage error, in percent.

    Parameters:
    - y_true: Actual values (array-like)
    - y_pred: Predicted values (array-like, same shape as y_true)

    Returns:
    - Mean of |y_pred - y_true| / ((|y_pred| + |y_true|) / 2), times 100.
      A pair where both values are 0 contributes 0 instead of NaN.
    '''
    # Accept plain lists as well as arrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_pred) + np.abs(y_true)) / 2

    # A zero denominator means truth and prediction are both zero (a perfect
    # prediction); keep that term at 0 so 0/0 cannot make the mean NaN.
    per_sample = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    smape_val = np.mean(per_sample) * 100
    return smape_val

# Predict on validation images (already preprocessed as X_val)
y_pred = model.predict(X_val).flatten()

# Recompute both validation metrics from the fresh predictions.
mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)
smape_score = smape(y_true=y_val, y_pred=y_pred)

print(f"✅ Validation MAE: {mae:.2f}")
print(f"✅ Validation SMAPE: {smape_score:.2f}%")


In [None]:
import os
import pandas as pd
import numpy as np
from urllib.request import urlopen
import io
from PIL import Image
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications.vgg16 import preprocess_input

In [None]:
# =======================
# PREDICTOR FUNCTION
# =======================
def predictor(sample_id, catalog_content, image_link):
    '''
    Predict price using trained VGG16 model on image

    Relies on a module-level `model` being defined before the first call.

    Parameters:
    - sample_id: Unique identifier for the sample (only used in the error message)
    - catalog_content: Text (ignored for now)
    - image_link: URL of product image

    Returns:
    - price: Predicted price as float rounded to 2 decimals, or -1.0 when the
      image could not be downloaded, decoded or scored
    '''
    try:
        # Load image from URL; the context manager closes the HTTP response
        # immediately, which matters because this runs once per test row.
        with urlopen(image_link, timeout=10) as response:
            image_data = response.read()

        img = Image.open(io.BytesIO(image_data)).convert('RGB')
        img = img.resize((224, 224))
        img_array = img_to_array(img)
        img_array = preprocess_input(img_array)
        img_array = np.expand_dims(img_array, axis=0)  # shape (1, 224, 224, 3)

        # Predict using the loaded model. verbose=0 suppresses the per-call
        # progress bar, which would otherwise be printed for every sample.
        price = model.predict(img_array, verbose=0).flatten()[0]

        # Round to 2 decimal places
        return round(float(price), 2)

    except Exception as e:
        print(f"Error processing sample {sample_id}: {e}")
        # Sentinel for a failed prediction; callers can detect it because a
        # real price is never negative.
        return -1.0

# =======================
# MAIN SCRIPT
# =======================
if __name__ == "__main__":
    # Locations are relative to the current working directory.
    # NOTE(review): MODEL_PATH differs from the MODEL_SAVE_PATH the training
    # cells write to -- confirm the model file is copied/renamed, or point both
    # at the same location.
    DATASET_FOLDER = 'dataset'
    MODEL_PATH = os.path.join('models', 'vgg16_model.h5')
    test_csv_path = os.path.join(DATASET_FOLDER, 'test.csv')
    output_path = os.path.join(DATASET_FOLDER, 'test_out.csv')

    # `model` is the global that predictor() reads.
    print(f"Loading model from {MODEL_PATH}...")
    model = load_model(MODEL_PATH)

    test_df = pd.read_csv(test_csv_path)
    print(f"Total test samples: {len(test_df)}")

    # Score every row, one image download + prediction per sample.
    # NOTE(review): rows whose prediction failed carry the -1.0 sentinel into
    # the output file -- decide whether a fallback price should replace it.
    test_df['price'] = test_df.apply(
        lambda row: predictor(
            sample_id=row['sample_id'],
            catalog_content=row['catalog_content'],
            image_link=row['image_link'],
        ),
        axis=1,
    )

    # Only the identifier and the predicted price are written out.
    output_df = test_df[['sample_id', 'price']]
    output_df.to_csv(output_path, index=False)

    print(f"Predictions saved to {output_path}")
    print(f"Total predictions: {len(output_df)}")
    print(f"Sample predictions:\n{output_df.head()}")