In [None]:
# Mount Google Drive at /content/drive; later cells read and write files under
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
# Location of the training CSV on the mounted Drive.
file_path = '/content/drive/MyDrive/train.csv'
# Read the whole CSV into a DataFrame.
df = pd.read_csv(file_path)
# Show the first rows as a quick sanity check of the loaded columns.
display(df.head())

Unnamed: 0,sample_id,catalog_content,image_link,price
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97
3,55858,Item Name: Judee’s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49


In [None]:
# (rows, columns) of the loaded training frame.
df.shape

(75000, 4)

In [None]:
from PIL import Image
import requests
from io import BytesIO

In [None]:
import os
# Drive directory that extract_and_save_features writes its .npy feature files to.
save_dir = "/content/drive/MyDrive/image_features"
# Create the directory if needed; exist_ok=True makes re-running this cell harmless.
os.makedirs(save_dir, exist_ok=True)


In [None]:
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing import image
from PIL import Image
from io import BytesIO
import numpy as np
import requests
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import h5py


In [None]:

# ResNet50 with ImageNet weights and no classification head (include_top=False);
# pooling='avg' requests average pooling of the final feature maps, so each image
# yields one flat feature vector.
model = ResNet50(weights='imagenet', include_top=False, pooling='avg')


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [None]:
def load_and_preprocess(url):
    """Download one image and return it as a 224x224 RGB array, or None on failure.

    Args:
        url: HTTP(S) address of the image to fetch.

    Returns:
        The result of ``image.img_to_array`` on the image after conversion to
        RGB and resizing to 224x224, or ``None`` when the download or decoding
        fails for any reason (timeout, HTTP error status, corrupt image data).
    """
    try:
        response = requests.get(url, timeout=5)
        # Treat 4xx/5xx replies as failures up front instead of trying to
        # decode an error page as an image.
        response.raise_for_status()
        # The context manager releases the decoder's handle on the source
        # image once the converted/resized copy has been produced.
        with Image.open(BytesIO(response.content)) as raw:
            img = raw.convert('RGB').resize((224, 224))
        return image.img_to_array(img)
    except Exception:
        # Deliberately best-effort: one bad URL must not abort a long run.
        # Unlike a bare ``except:``, this lets KeyboardInterrupt and SystemExit
        # propagate so the job can still be interrupted.
        return None


In [None]:
def extract_and_save_features(urls, batch_size=32, save_every=5000):
    """Embed images with the global ``model`` and checkpoint the features to disk.

    Images are downloaded concurrently with ``load_and_preprocess``, passed
    through ``preprocess_input`` and ``model.predict`` one batch at a time, and
    the accumulated feature rows are written to
    ``{save_dir}/features_part_{k}.npy`` (``save_dir`` is a module-level global).

    Images that fail to load are left out of the saved arrays, so the saved
    rows follow the order of ``urls`` with the failed entries removed. Every
    failed URL is returned so the caller can drop the matching rows from its
    own data and keep features and labels aligned.

    Args:
        urls: Sliceable sequence of image URLs.
        batch_size: Number of URLs downloaded and embedded per step.
        save_every: Write a checkpoint each time this many URLs have been
            processed (and once more at the end for any remainder).

    Returns:
        List of every URL whose image could not be loaded.
    """
    n = len(urls)
    features_list = []
    batch_count = 0
    failed_urls = []

    with ThreadPoolExecutor(max_workers=8) as executor:
        for i in tqdm(range(0, n, batch_size), desc="Extracting image features"):
            batch_urls = urls[i:i+batch_size]
            batch_images = list(executor.map(load_and_preprocess, batch_urls))

            # Record each individual failure, not only batches that failed
            # completely; otherwise dropped rows cannot be identified later.
            valid_images = []
            for url, img in zip(batch_urls, batch_images):
                if img is None:
                    failed_urls.append(url)
                else:
                    valid_images.append(img)

            if valid_images:
                batch_array = preprocess_input(np.array(valid_images))
                features_list.append(model.predict(batch_array, verbose=0))

            # Checkpoint whenever this batch crossed a multiple of save_every
            # URLs, or when it was the last batch. Comparing the quotients
            # (rather than testing for an exact multiple) also works when
            # batch_size does not divide save_every. This check runs even for
            # an all-failed batch, so features gathered before it still get
            # written.
            processed = min(i + batch_size, n)
            crossed_checkpoint = processed // save_every > i // save_every
            reached_end = processed >= n
            if features_list and (crossed_checkpoint or reached_end):
                features_np = np.vstack(features_list)
                np.save(f"{save_dir}/features_part_{batch_count}.npy", features_np)
                features_list.clear()
                batch_count += 1
                print(f"✅ Saved batch {batch_count}")

    return failed_urls


In [None]:

import re

save_dir = "/content/drive/MyDrive/image_features"


def _part_index(path):
    """Return N for a ``features_part_N.npy`` path; other names sort last."""
    match = re.search(r"features_part_(\d+)\.npy$", path)
    return int(match.group(1)) if match else float("inf")


# Sort by the numeric part index. A plain string sort places
# features_part_10.npy before features_part_2.npy.
saved_files = sorted(
    (os.path.join(save_dir, f) for f in os.listdir(save_dir) if f.endswith('.npy')),
    key=lambda p: (_part_index(p), p),
)
print("Found feature files:", saved_files)

# Fail with a clear message instead of an IndexError on an empty directory.
if not saved_files:
    raise FileNotFoundError(f"No .npy feature files found in {save_dir}")

# Load only the first part as a spot check of shape and values.
features = np.load(saved_files[0])
print("Shape:", features.shape)
print("Sample embedding vector (first image):")
print(features[0][:10])

Found feature files: ['/content/drive/MyDrive/image_features/features_part_0.npy', '/content/drive/MyDrive/image_features/features_part_1.npy', '/content/drive/MyDrive/image_features/features_part_10.npy', '/content/drive/MyDrive/image_features/features_part_11.npy', '/content/drive/MyDrive/image_features/features_part_12.npy', '/content/drive/MyDrive/image_features/features_part_13.npy', '/content/drive/MyDrive/image_features/features_part_14.npy', '/content/drive/MyDrive/image_features/features_part_2.npy', '/content/drive/MyDrive/image_features/features_part_3.npy', '/content/drive/MyDrive/image_features/features_part_4.npy', '/content/drive/MyDrive/image_features/features_part_5.npy', '/content/drive/MyDrive/image_features/features_part_6.npy', '/content/drive/MyDrive/image_features/features_part_7.npy', '/content/drive/MyDrive/image_features/features_part_8.npy', '/content/drive/MyDrive/image_features/features_part_9.npy']
Shape: (5000, 2048)
Sample embedding vector (first image):

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF text featurizer capped at 1000 vocabulary terms.
vectorizer = TfidfVectorizer(max_features=1000)

In [None]:
# Learn the TF-IDF vocabulary from catalog_content and transform every row of df.
# NOTE(review): this is fit on the full frame, before the train/test split that a
# later cell performs, so held-out rows also contribute to the fitted vocabulary
# and weights — confirm that is acceptable for the reported metrics.
text_embeddings = vectorizer.fit_transform(df['catalog_content'])
print(text_embeddings.shape)

(75000, 1000)


In [None]:
import glob
import re

import numpy as np


def _part_number(path):
    """Return N for a ``features_part_N.npy`` path; other names sort last."""
    match = re.search(r"features_part_(\d+)\.npy$", path)
    return int(match.group(1)) if match else float("inf")


# The part files must be stacked in the order they were written (part 0, 1, 2, ...).
# A plain string sort yields 0, 1, 10, 11, ..., 2, 3, ..., which shuffles whole
# chunks of rows so the stacked embeddings no longer follow the original row order.
feature_files = sorted(
    glob.glob("/content/drive/MyDrive/image_features/features_part_*.npy"),
    key=lambda path: (_part_number(path), path),
)
image_embeddings = np.vstack([np.load(f) for f in feature_files])
print(image_embeddings.shape)

(74999, 2048)


In [None]:

# Trim df to as many rows as there are image embeddings.
# NOTE(review): this keeps the first N rows, which matches the embeddings only if
# the rows without an embedding are the trailing ones. extract_and_save_features
# drops failed downloads wherever they occur, so confirm the rows really line up
# before relying on this.
df = df.iloc[:image_embeddings.shape[0]]

In [None]:
# Preview the first rows of df after trimming it to the embedding count.
df.head()

Unnamed: 0,sample_id,catalog_content,image_link,price
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97
3,55858,Item Name: Judee’s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49


In [None]:

# Densify the sparse TF-IDF matrix so it can be joined with the image features.
text_embeddings_dense = text_embeddings.toarray()

# Report the sizes of everything that is about to be joined.
print("Shape of image_embeddings:", image_embeddings.shape)
print("Shape of text_embeddings_dense:", text_embeddings_dense.shape)
print("Shape of df:", df.shape)

# Keep only as many text rows as there are image rows; this relies on both
# matrices sharing the same leading row order.
n_image_rows = image_embeddings.shape[0]
text_embeddings_dense_aligned = text_embeddings_dense[:n_image_rows]

# Place image features first and text features after them, column-wise.
combined_features = np.concatenate(
    [image_embeddings, text_embeddings_dense_aligned],
    axis=1,
)

# Wrap the joined matrix in a DataFrame and attach the target column.
combined_df = pd.DataFrame(combined_features)
combined_df['price'] = df['price'].to_numpy()

display(combined_df.head())

Shape of image_embeddings: (74999, 2048)
Shape of text_embeddings_dense: (75000, 1000)
Shape of df: (74999, 4)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3039,3040,3041,3042,3043,3044,3045,3046,3047,price
0,0.014956,0.0,0.282973,0.014469,0.283499,0.508511,0.008285,0.09506,0.0,0.022204,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.89
1,0.186282,0.311111,0.0,0.106105,0.0,0.456322,0.428575,0.0,0.0,0.289942,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.12
2,0.0,0.301527,0.687368,0.0,0.007372,0.070838,0.0,0.0,0.0,2.518153,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.97
3,0.155558,0.216651,0.279735,0.301102,0.0,0.0,0.058509,0.192694,0.094762,0.911756,...,0.0,0.0,0.0,0.0,0.0,0.150455,0.0,0.0,0.0,30.34
4,0.106239,0.053126,0.423736,0.309885,0.0,0.132846,0.083443,0.79516,0.0,0.337355,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,66.49


In [None]:
# (rows, feature columns + the price column) of the combined frame.
combined_df.shape

(74999, 3049)

In [None]:
from sklearn.model_selection import train_test_split

# Separate features (X) and target (y)
# The target is the price column; every other column is a feature.
y = combined_df['price']
X = combined_df.drop(columns=['price'])

# Hold out 20% of the rows for evaluation, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of each split.
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (59999, 3048)
Shape of X_test: (15000, 3048)
Shape of y_train: (59999,)
Shape of y_test: (15000,)


In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:

from xgboost import XGBRegressor

In [None]:
from xgboost import XGBRegressor

# Fit on a fixed-seed 20,000-row subsample of the training split; the matching
# targets are looked up by the sampled index labels.
X_sample = X_train.sample(n=20000, random_state=42)
y_sample = y_train.loc[X_sample.index]

# Hyperparameters gathered in one place so they are easy to inspect and tweak.
xgb_params = dict(
    n_estimators=150,       # number of boosted trees
    learning_rate=0.1,      # shrinkage applied to each tree's contribution
    max_depth=6,            # maximum depth of each tree
    subsample=0.8,          # fraction of rows sampled per tree
    colsample_bytree=0.8,   # fraction of columns sampled per tree
    tree_method='hist',
    n_jobs=-1,
    random_state=42,
)
xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(X_sample, y_sample)


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Predict on test set
# Score the held-out split with the fitted model.
y_pred = xgb_model.predict(X_test)

# Evaluate each metric on the same (truth, prediction) pair.
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
# RMSE is the square root of the mean squared error.
rmse = np.sqrt(mse)

print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R² Score: {r2:.4f}")


Mean Absolute Error (MAE): 15.4251
Root Mean Squared Error (RMSE): 32.6036
R² Score: 0.0206
