In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import (Input, Dense, Concatenate, GlobalAveragePooling2D)
from tensorflow.keras.models import Model
from tensorflow.keras.applications import EfficientNetV2B0
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

  _warn(("h5py is running against HDF5 {0} when it was built against {1}, "
  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# --- 1. Combined Preprocessing and Feature Engineering ---
# This section combines the logic from your preprocessing.ipynb

df = pd.read_csv(r"C:\Users\rauna\Downloads\68e8d1d70b66d_student_resource\student_resource\dataset\train.csv")

# a. Extract structured data from 'catalog_content'
def parse_catalog(content):
    lines = content.split('\n')
    data = {'name': '', 'description': [], 'value': np.nan, 'unit': 'unknown'}
    for line in lines:
        if line.startswith('Item Name:'):
            data['name'] = line.replace('Item Name:', '').strip()
        elif line.startswith('Bullet Point'):
            data['description'].append(line.split(':', 1)[1].strip())
        elif line.startswith('Value:'):
            # Use regex for safety to find numbers
            val = ''.join(filter(lambda i: i.isdigit() or i=='.', line))
            if val: data['value'] = float(val)
        elif line.startswith('Unit:'):
            data['unit'] = line.replace('Unit:', '').strip()
    data['description'] = ' '.join(data['description'])
    return pd.Series(data)

parsed_df = df['catalog_content'].apply(parse_catalog)
df = pd.concat([df, parsed_df], axis=1)

# b. Standardize the 'unit' column
unit_mapping = {
    'Ounce': 'ounce', 'ounce': 'ounce', 'Ounces': 'ounce', 'oz': 'ounce',
    'Fl Oz': 'fluid_ounce', 'Fluid Ounce': 'fluid_ounce',
    'Count': 'count', 'count': 'count', 'gram': 'gram'
}
df['unit_cleaned'] = df['unit'].map(unit_mapping).fillna('unknown')

# c. One-Hot Encode the 'unit' column
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
unit_encoded = ohe.fit_transform(df[['unit_cleaned']])
unit_encoded_df = pd.DataFrame(unit_encoded, columns=ohe.get_feature_names_out())

# d. Combine tabular features and handle missing values
df['value'].fillna(df['value'].median(), inplace=True)
tabular_features = pd.concat([df[['value']], unit_encoded_df], axis=1)

# e. Log transform the target variable 'price' for better training stability
df['price_log'] = np.log1p(df['price'])

# Add image paths (assuming images are downloaded to 'dataset/images/')
df['image_path'] = 'dataset/images/' + df['sample_id'].astype(str) + '.jpg'

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['value'].fillna(df['value'].median(), inplace=True)


In [3]:

# --- 2. Data Splitting ---
X_train, X_val = train_test_split(df, test_size=0.2, random_state=42)
tabular_train = tabular_features.loc[X_train.index]
tabular_val = tabular_features.loc[X_val.index]
y_train = X_train['price_log']
y_val = X_val['price_log']

In [4]:

# --- 3. Model Constants and Tokenizer ---
IMG_SIZE = 224
BERT_MODEL_NAME = 'bert-base-uncased'
MAX_TEXT_LENGTH = 128
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

In [22]:

# --- 4. TensorFlow Data Pipeline - Load images from URLs ---
import requests
from io import BytesIO
from PIL import Image

def data_generator(df, tabular_df):
    for i, row in df.iterrows():
        try:
            # Load image from URL
            response = requests.get(row['image_link'], timeout=5)
            img = Image.open(BytesIO(response.content)).convert('RGB')
            img = img.resize((IMG_SIZE, IMG_SIZE))
            img = np.array(img, dtype=np.float32)
            img = tf.keras.applications.efficientnet_v2.preprocess_input(img)
        except Exception as e:
            # If image fails to load, use a black placeholder
            print(f"Failed to load image for {row['sample_id']}: {e}")
            img = np.zeros((IMG_SIZE, IMG_SIZE, 3), dtype=np.float32)

        # Tokenize text
        text_input = tokenizer(
            row['name'] + " " + row['description'],
            max_length=MAX_TEXT_LENGTH,
            truncation=True,
            padding='max_length',
            return_tensors='tf'
        )
        input_ids = text_input['input_ids'][0]
        attention_mask = text_input['attention_mask'][0]

        # Get tabular data
        tabular_data = tabular_df.loc[row.name].values.astype('float32')

        yield {
            'image_input': img,
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'tabular_input': tabular_data
        }, row['price_log']

def create_dataset(df, tabular_df, batch_size=32):
    output_signature = (
        {
            'image_input': tf.TensorSpec(shape=(IMG_SIZE, IMG_SIZE, 3), dtype=tf.float32),
            'input_ids': tf.TensorSpec(shape=(MAX_TEXT_LENGTH,), dtype=tf.int32),
            'attention_mask': tf.TensorSpec(shape=(MAX_TEXT_LENGTH,), dtype=tf.int32),
            'tabular_input': tf.TensorSpec(shape=(tabular_df.shape[1],), dtype=tf.float32)
        },
        tf.TensorSpec(shape=(), dtype=tf.float32)
    )
    dataset = tf.data.Dataset.from_generator(
        lambda: data_generator(df, tabular_df),
        output_signature=output_signature
    )
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Create datasets
train_dataset = create_dataset(X_train, tabular_train, batch_size=16)  # Smaller batch for stability
val_dataset = create_dataset(X_val, tabular_val, batch_size=16)

In [None]:
# --- 5. Building the Multi-Modal Model (Functional API) ---

# a. Image Branch
image_input = Input(shape=(IMG_SIZE, IMG_SIZE, 3), name='image_input')
base_cnn = EfficientNetV2B0(include_top=False, weights='imagenet', input_tensor=image_input)
base_cnn.trainable = False # Freeze pre-trained layers
image_features = GlobalAveragePooling2D(name='image_features')(base_cnn.output)

In [9]:
!pip install --upgrade transformers tensorflow safetensors



In [8]:
!pip install safetensors==0.3.1

Collecting safetensors==0.3.1
  Downloading safetensors-0.3.1.tar.gz (34 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: safetensors
  Building wheel for safetensors (pyproject.toml): started
  Building wheel for safetensors (pyproject.toml): finished with status 'error'
Failed to build safetensors


  error: subprocess-exited-with-error
  
  × Building wheel for safetensors (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [36 lines of output]
      !!
      
              ********************************************************************************
              Please consider removing the following classifiers in favor of a SPDX license expression:
      
              License :: OSI Approved :: Apache Software License
      
              See https://packaging.python.org/en/latest/guides/writing-pyproject-toml/#license for details.
              ********************************************************************************
      
      !!
        self._finalize_license_expression()
      running bdist_wheel
      running build
      running build_py
      creating build\lib.win-amd64-cpython-312\safetensors
      copying py_src\safetensors\flax.py -> build\lib.win-amd64-cpython-312\safetensors
      copying py_src\safetensors\numpy.py -> build\lib.win-amd6

In [20]:
# b. Text Branch
input_ids = Input(shape=(MAX_TEXT_LENGTH,), dtype=tf.int32, name='input_ids')
attention_mask = Input(shape=(MAX_TEXT_LENGTH,), dtype=tf.int32, name='attention_mask')
base_bert = TFBertModel.from_pretrained(BERT_MODEL_NAME, from_pt=True)
base_bert.trainable = False

# Wrap BERT in a Lambda layer to handle KerasTensors
def bert_encode(inputs):
    input_ids, attention_mask = inputs
    return base_bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]

from tensorflow.keras.layers import Lambda
# Specify output_shape: BERT base outputs 768-dim vectors for the [CLS] token
text_features = Lambda(bert_encode, output_shape=(768,))([input_ids, attention_mask])

# c. Tabular Branch
tabular_input = Input(shape=(tabular_features.shape[1],), name='tabular_input')
tabular_dense = Dense(32, activation='relu')(tabular_input)
tabular_features_out = Dense(16, activation='relu')(tabular_dense)

# d. Concatenate all branches
combined_features = Concatenate()([image_features, text_features, tabular_features_out])

# e. Final Regression Head
x = Dense(256, activation='relu')(combined_features)
x = Dense(128, activation='relu')(x)
output = Dense(1, activation='linear', name='price_output')(x)

# Create the final model
model = Model(
    inputs=[image_input, input_ids, attention_mask, tabular_input],
    outputs=output
)

model.summary()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

In [None]:



import tensorflow.keras.backend as K

def smape(y_true, y_pred):
    """
    Calculates the Symmetric Mean Absolute Percentage Error (SMAPE).
    The model predicts log1p(price), so we must convert predictions and
    labels back to the original price scale with expm1.
    """
    # Un-log the values
    y_true_unlogged = tf.math.expm1(y_true)
    y_pred_unlogged = tf.math.expm1(y_pred)

    # Calculate SMAPE
    numerator = K.abs(y_pred_unlogged - y_true_unlogged)
    denominator = (K.abs(y_true_unlogged) + K.abs(y_pred_unlogged)) / 2.0

    # Add a small epsilon to avoid division by zero
    ratio = numerator / (denominator + K.epsilon())

    return K.mean(ratio) * 100.0




# --- 6. Compile and Train the Model (Updated) ---

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss='mean_squared_error',  # This is what the model tries to minimize
    metrics=[smape]              # This is what we monitor for the competition
)

print("Starting model training...")
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=5
)
print("Model training complete.")

# You can access the validation SMAPE scores like this:
print("\nValidation SMAPE per epoch:", history.history['val_smape'])

Starting model training...
Epoch 1/5


TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.


    155/Unknown [1m828s[0m 5s/step - loss: 1.4189 - smape: 87.4931