In [1]:
import pandas as pd
import numpy as np
import re
import warnings
import os
import joblib
import gc
from tqdm import tqdm
from collections import Counter
import matplotlib.pyplot as plt
from textblob import TextBlob
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb

try:
    from google.colab import drive
    drive.mount('/content/drive')
    BASE_PATH = '/content/drive/MyDrive/NLPProject/'
    print("Google Drive mounted.")
except Exception as e:
    print(f"Not in Colab or Drive mount failed: {e}. Using local paths.")
    BASE_PATH = './'

Mounted at /content/drive
Google Drive mounted.


In [2]:
data_path = os.path.join(BASE_PATH, 'train.csv')
embeddings_path = os.path.join(BASE_PATH, 'train_embeddings.parquet') # Assuming this is your CLIP parquet
MODEL_SAVE_PATH = BASE_PATH # Save models to the same folder

try:
    df = pd.read_csv(data_path)
    print(f"Dataset: {df.shape}")

    img_embeddings = pd.read_parquet(embeddings_path)
    print(f"Image embeddings: {img_embeddings.shape}")
    clip_col_name = 'clip_embedding'
    if clip_col_name not in img_embeddings.columns:
        raise ValueError(f"Column '{clip_col_name}' not found in embeddings parquet.")

    df = df.merge(img_embeddings[['sample_id', clip_col_name]], on='sample_id', how='left')
    df = df.rename(columns={clip_col_name: 'clip'})
    print(f"Merged dataset: {df.shape}")

except FileNotFoundError as e:
    print(f"Error loading files: {e}")
    df = pd.DataFrame()
except Exception as e:
    print(f"An error occurred: {e}")
    df = pd.DataFrame()

Dataset: (75000, 4)
Image embeddings: (74999, 3)
Merged dataset: (75000, 5)


In [3]:
def parse_item_name(t):
    if pd.isna(t): return ""
    m = re.search(r'Item Name:\s*(.+?)(?:\n|$)', t)
    return m.group(1).strip() if m else ""

def parse_product_desc(t):
    if pd.isna(t): return ""
    m = re.search(r'Product Description:\s*(.+?)(?:\n(?:Bullet Point|Value|Unit|$))', t, re.DOTALL)
    return m.group(1).strip() if m else ""

def parse_value(t):
    if pd.isna(t): return np.nan
    m = re.search(r'Value:\s*([0-9.]+)', t)
    return float(m.group(1)) if m else np.nan

def extract_brand(name):
    if pd.isna(name) or not name: return "unknown"
    words = name.split()
    if words and len(words[0]) > 2:
        return words[0]
    return "unknown"

def extract_pack_size(text):
    if pd.isna(text): return 1
    patterns = [r'\(Pack of (\d+)\)', r'Pack of (\d+)', r'(\d+)-Pack']
    for p in patterns:
        m = re.search(p, text, re.IGNORECASE)
        if m: return int(m.group(1))
    return 1

def clean_text(t):
    if pd.isna(t): return ""
    t = t.lower()
    t = re.sub(r'\(pack of \d+\)', '', t)
    t = re.sub(r'pack of \d+', '', t)
    t = re.sub(r'\d+-pack', '', t)
    t = re.sub(r'\d+ count', '', t)
    t = re.sub(r'\d+\.?\d*\s*(oz|ounce|lb|pound|gram|kg|ml|liter|fl oz)', '', t)
    t = re.sub(r'[^a-z0-9\s]', ' ', t)
    t = re.sub(r'\s+', ' ', t).strip()
    return t

print("Feature engineering functions defined.")

Feature engineering functions defined.


In [4]:
if not df.empty:
    print("Applying feature engineering...")
    df['item_name'] = df['catalog_content'].apply(parse_item_name)
    df['desc'] = df['catalog_content'].apply(parse_product_desc)
    df['parsed_value'] = df['catalog_content'].apply(parse_value)

    df['full_text'] = (df['item_name'].fillna('') + ' ' + df['desc'].fillna(''))
    df['full_text_cleaned'] = df['full_text'].apply(clean_text)

    df['item_name_length'] = df['item_name'].str.len().fillna(0)
    df['desc_length'] = df['desc'].str.len().fillna(0)
    df['has_description'] = (df['desc_length'] > 0).astype(int)

    df['brand'] = df['item_name'].apply(extract_brand)
    df['brand_count'] = df['brand'].map(df['brand'].value_counts()).fillna(0)

    df['pack_size'] = df['item_name'].apply(extract_pack_size)
    df['total_volume'] = df['parsed_value'].fillna(0)
    df['log_pack_size'] = np.log1p(df['pack_size'])
    df['log_total_volume'] = np.log1p(df['total_volume'])

    df['log_price'] = np.log1p(df['price'])
    print("Feature engineering complete.")

Applying feature engineering...
Feature engineering complete.


In [5]:
if not df.empty and 'clip' in df.columns:
    print("Processing CLIP embeddings...")

    # Handle missing values
    missing_count = df['clip'].isna().sum()
    df['has_clip'] = (~df['clip'].isna()).astype(int)
    print(f"Found {missing_count} missing CLIP embeddings.")

    def parse_embedding(embed_obj):
        if pd.isna(embed_obj):
            return np.zeros(512)
        if isinstance(embed_obj, str):
            try:
                clean_str = embed_obj.strip('[]')
                return np.fromstring(clean_str, sep=',')
            except:
                return np.zeros(512)
        if hasattr(embed_obj, '__iter__'):
            try:
                return np.asarray(embed_obj, dtype=float)
            except:
                 return np.zeros(512)

        return np.zeros(512)
    print("Converting embeddings column to numpy matrix...")
    embeddings_matrix = np.stack(df['clip'].apply(parse_embedding).values)
    embedding_dim = embeddings_matrix.shape[1]

    if embedding_dim != 512:
        print(f"Warning: Detected embedding dimension is {embedding_dim}, not 512.")

    clip_cols = [f'clip_{i}' for i in range(embedding_dim)]
    clip_df = pd.DataFrame(embeddings_matrix, columns=clip_cols, index=df.index)
    df = pd.concat([df, clip_df], axis=1)

    print(f"CLIP embeddings processed: {embedding_dim} dimensions.")
    del clip_df, embeddings_matrix
    gc.collect()

Processing CLIP embeddings...
Found 1 missing CLIP embeddings.
Converting embeddings column to numpy matrix...
CLIP embeddings processed: 512 dimensions.


In [6]:
def smape(y_true, y_pred):
    y_true_orig = np.expm1(y_true)
    y_pred_orig = np.expm1(y_pred)

    denom = (np.abs(y_true_orig) + np.abs(y_pred_orig))
    diff = np.abs(y_true_orig - y_pred_orig) / denom
    diff[denom == 0] = 0.0
    return 100 * np.mean(diff)

def mae(y_true, y_pred):
    y_true_orig = np.expm1(y_true)
    y_pred_orig = np.expm1(y_pred)
    return mean_absolute_error(y_true_orig, y_pred_orig)

print("Metrics defined.")

Metrics defined.


In [7]:
if not df.empty:
    y = df['log_price']
    drop_cols = ['price', 'log_price', 'catalog_content', 'image_link', 'sample_id',
                 'item_name', 'desc', 'full_text', 'clip']

    X = df.drop(columns=[c for c in drop_cols if c in df.columns])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"Train: {X_train.shape}, Test: {X_test.shape}")
    del df
    gc.collect()

Train: (60000, 524), Test: (15000, 524)


In [8]:
numeric_features = ['parsed_value', 'pack_size', 'total_volume',
                    'brand_count', 'item_name_length', 'desc_length',
                    'has_description', 'log_pack_size', 'log_total_volume', 'has_clip']
categorical_features = ['brand']
text_feature = 'full_text_cleaned'
clip_features = [col for col in X_train.columns if col.startswith('clip_')]

# Ensure all features exist
numeric_features = [f for f in numeric_features if f in X_train.columns]
categorical_features = [f for f in categorical_features if f in X_train.columns]

print(f"Using {len(numeric_features)} numeric features.")
print(f"Using {len(categorical_features)} categorical features.")
print(f"Using {len(clip_features)} CLIP features.")

Using 10 numeric features.
Using 1 categorical features.
Using 512 CLIP features.


In [9]:
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', max_categories=50))
])

text_transformer = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
    ngram_range=(1, 2)
)

text_preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('tfidf', text_transformer, text_feature)
    ],
    remainder='drop'
)

# Create the full pipeline
text_model = Pipeline([
    ('preprocessor', text_preprocessor),
    ('model', lgb.LGBMRegressor(
        n_estimators=1000,
        learning_rate=0.05,
        num_leaves=31,
        n_jobs=-1,
        random_state=42,
        objective='huber'
    ))
])

# Train
print("Fitting text_model...")
text_model.fit(X_train, y_train)

# Evaluate
pred_text = text_model.predict(X_test)
smape_text = smape(y_test, pred_text)
mae_text = mae(y_test, pred_text)
print(f"Text Model SMAPE: {smape_text:.4f}%")
print(f"Text Model MAE: ${mae_text:.4f}")

# Save
text_model_path = os.path.join(MODEL_SAVE_PATH, 'text_model.pkl')
joblib.dump(text_model, text_model_path)
print(f"Text model saved to {text_model_path}")

Fitting text_model...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 3.661375 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 431601
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 5048
[LightGBM] [Info] Start training from score 2.740904
Text Model SMAPE: 26.8093%
Text Model MAE: $12.1466
Text model saved to /content/drive/MyDrive/NLPProject/text_model.pkl


In [10]:
image_preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough'
)

clip_indices = [X_train.columns.get_loc(c) for c in clip_features if c in X_train]
non_clip_indices = [X_train.columns.get_loc(c) for c in numeric_features + categorical_features if c in X_train]

image_preprocessor_v2 = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('clip', StandardScaler(), clip_features)
    ],
    remainder='drop'
)

image_model = Pipeline([
    ('preprocessor', image_preprocessor_v2),
    ('model', lgb.LGBMRegressor(
        n_estimators=1000,
        learning_rate=0.05,
        num_leaves=31,
        n_jobs=-1,
        random_state=42,
        objective='huber'
    ))
])

# Train
print("Fitting image_model...")
image_model.fit(X_train, y_train)

# Evaluate
pred_image = image_model.predict(X_test)
smape_image = smape(y_test, pred_image)
mae_image = mae(y_test, pred_image)
print(f"Image Model SMAPE: {smape_image:.4f}%")
print(f"Image Model MAE: ${mae_image:.4f}")

# Save
image_model_path = os.path.join(MODEL_SAVE_PATH, 'image_model.pkl')
joblib.dump(image_model, image_model_path)
print(f"Image model saved to {image_model_path}")

Fitting image_model...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.979587 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 132211
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 571
[LightGBM] [Info] Start training from score 2.740904
Image Model SMAPE: 27.8563%
Image Model MAE: $12.7837
Image model saved to /content/drive/MyDrive/NLPProject/image_model.pkl


In [11]:
pred_ensemble = (pred_text + pred_image) / 2.0

smape_ensemble = smape(y_test, pred_ensemble)
mae_ensemble = mae(y_test, pred_ensemble)

print(f"Text Model SMAPE:   {smape_text:.4f}% | MAE: ${mae_text:.4f}")
print(f"Image Model SMAPE:  {smape_image:.4f}% | MAE: ${mae_image:.4f}")
print(f"Ensemble SMAPE: {smape_ensemble:.4f}% | MAE: ${mae_ensemble:.4f}")

Text Model SMAPE:   26.8093% | MAE: $12.1466
Image Model SMAPE:  27.8563% | MAE: $12.7837
Ensemble SMAPE: 26.6320% | MAE: $12.1415
