In [None]:
!pip install lightgbm tqdm scikit-learn joblib --quiet

import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import joblib
from tqdm import tqdm
tqdm.pandas(desc="Progress")

In [23]:
train_columns = ['sample_id', 'catalog_content', 'image_link', 'price']
df_train = pd.read_csv("train.csv", names=train_columns, engine='python', on_bad_lines='skip')

print(f"✅ Train rows loaded: {len(df_train)}")

# Fill missing prices
df_train['price'] = pd.to_numeric(df_train['price'], errors='coerce')
df_train['price'] = df_train['price'].fillna(df_train['price'].mean())


✅ Train rows loaded: 75001


In [24]:
def clean_text(text):
    """Lowercase, remove unwanted symbols, keep meaningful text."""
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [25]:
weight_units = {
    'gram': ['g','gm','grams'],
    'kilogram': ['kg','kgs','kilogram','kilograms'],
    'milliliter': ['ml','milliliter','milliliters'],
    'liter': ['l','ltr','liter','liters'],
    'ounce': ['oz','ounce','ounces'],
    'pound': ['lb','lbs','pound','pounds']
}

def extract_quantity(text):
    text = str(text).lower()
    match = re.search(r'(\d+(?:\.\d+)?)\s*(pack|packs|pcs|pieces|bottle|can|jar|bunch|box|set|sets|dozen)?', text)
    if match:
        return float(match.group(1))
    nums = re.findall(r'\b\d+(?:\.\d+)?\b', text)
    return float(nums[0]) if nums else 1.0

def extract_weight(text):
    """Handle compound weights like '1 kg x 2 pack'"""
    text = str(text).lower()
    quantity = 1.0
    # handle 'x' pattern for multiple packs
    x_match = re.search(r'(\d+(?:\.\d+)?)\s*(kg|g|ml|l|oz|lb)s?\s*(?:x|×)\s*(\d+)', text)
    if x_match:
        val, unit, mult = x_match.groups()
        val = float(val)
        mult = float(mult)
        if unit.startswith('kg'): val *= 1000
        elif unit.startswith('l'): val *= 1000
        elif unit.startswith('lb'): val *= 453.592
        elif unit.startswith('oz'): val *= 28.3495
        return round(val * mult,2)

    # fallback: simple weight extraction
    total_weight = 0.0
    for unit, variants in weight_units.items():
        for v in variants:
            matches = re.findall(r'(\d+(?:\.\d+)?)\s*' + re.escape(v), text)
            for val in matches:
                val = float(val)
                if unit == 'kilogram': val *= 1000
                elif unit == 'liter': val *= 1000
                elif unit == 'pound': val *= 453.592
                elif unit == 'ounce': val *= 28.3495
                total_weight += val
    if total_weight == 0: total_weight = 250
    return round(total_weight,2)


In [26]:
def extract_brand(text):
    words = re.findall(r'\b[A-Z][a-zA-Z]+\b', str(text))
    generic_words = {'Pack','Set','Box','New','Best','The'}
    for w in words:
        if w not in generic_words:
            return w
    return "Unknown"

def detect_category(text):
    text = str(text).lower()
    categories = {
        'electronics': ['phone','laptop','charger','headphone','camera','battery','speaker','tablet','cable'],
        'clothing': ['shirt','pants','dress','jacket','jeans','skirt','blouse','t-shirt'],
        'grocery': ['food','snack','rice','oil','juice','drink','spice','vegetable','fruit'],
        'home_kitchen': ['kitchen','appliance','utensil','furniture','bed','decor','pillow','towel'],
        'beauty_personal': ['cosmetic','makeup','skin','hair','shampoo','lotion','perfume','soap']
    }
    for cat, keys in categories.items():
        if any(k in text for k in keys):
            return cat
    return 'other'

def detect_flags(text):
    t = str(text).lower()
    return pd.Series({
        'is_premium': int(any(k in t for k in ['premium','luxury','exclusive'])),
        'is_organic': int(any(k in t for k in ['organic','natural','eco'])),
        'is_discount': int(any(k in t for k in ['sale','discount','offer','deal'])),
        'has_warranty': int(any(k in t for k in ['warranty','guarantee'])),
        'is_new': int(any(k in t for k in ['new','latest','2023','2024']))
    })

def detect_color(text):
    colors = ['red','blue','green','black','white','yellow','pink','orange','brown','silver','gold','purple','grey']
    t = str(text).lower()
    for c in colors:
        if c in t: return c
    return 'unknown'


In [27]:
df_train['quantity'] = df_train['catalog_content'].progress_apply(extract_quantity)
df_train['total_weight_g'] = df_train['catalog_content'].progress_apply(extract_weight)
df_train['brand'] = df_train['catalog_content'].progress_apply(extract_brand)
df_train['category'] = df_train['catalog_content'].progress_apply(detect_category)
flags_df = df_train['catalog_content'].progress_apply(detect_flags)
df_train = pd.concat([df_train, flags_df], axis=1)
df_train['dominant_color'] = df_train['catalog_content'].progress_apply(detect_color)
df_train['clean_text'] = df_train['catalog_content'].progress_apply(clean_text)


Progress: 100%|██████████| 75001/75001 [00:03<00:00, 22099.65it/s]
Progress: 100%|██████████| 75001/75001 [01:08<00:00, 1095.22it/s]
Progress: 100%|██████████| 75001/75001 [00:02<00:00, 30119.87it/s]
Progress: 100%|██████████| 75001/75001 [00:02<00:00, 26521.43it/s]
Progress: 100%|██████████| 75001/75001 [00:21<00:00, 3475.09it/s]
Progress: 100%|██████████| 75001/75001 [00:00<00:00, 94890.78it/s]
Progress: 100%|██████████| 75001/75001 [00:09<00:00, 8061.87it/s]


In [28]:
# Load model
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Compute embeddings
df_train['text_embedding'] = df_train['clean_text'].progress_apply(lambda x: sbert_model.encode(x))


Progress: 100%|██████████| 75001/75001 [1:43:30<00:00, 12.08it/s]


In [29]:
numeric_features = ['quantity','total_weight_g']
categorical_features = ['brand','category','dominant_color','is_premium','is_organic','is_discount','has_warranty','is_new']

X_numeric = df_train[numeric_features].values
X_categorical = pd.get_dummies(df_train[categorical_features])
X_text = np.stack(df_train['text_embedding'].values)

# Concatenate all features
X = np.hstack([X_numeric, X_categorical.values, X_text])
y = np.log1p(df_train['price'])


In [30]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=42)

lgb_model = lgb.LGBMRegressor(
    n_estimators=1800,
    learning_rate=0.03,
    num_leaves=50fir ,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    reg_alpha=0.2,
    reg_lambda=0.4,
    random_state=42,
    n_jobs=-1
)

lgb_model.fit(X_train, y_train)
print("✅ Model trained with SentenceTransformer embeddings!")


✅ Model trained with SentenceTransformer embeddings!


In [31]:
def smape(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2
    diff = np.abs(y_pred - y_true) / np.where(denom==0, 1, denom)
    return np.mean(diff) * 100

y_val_pred = np.expm1(lgb_model.predict(X_val))
y_val_true = np.expm1(y_val)
score = smape(y_val_true, y_val_pred)
print(f"✅ Validation SMAPE: {score:.2f}%")


✅ Validation SMAPE: 56.64%


In [35]:
df_test = pd.read_csv("test.csv", names=['sample_id','catalog_content','image_link'], engine='python', on_bad_lines='skip')
df_test = df_test.dropna(subset=['catalog_content']).reset_index(drop=True)

df_test['quantity'] = df_test['catalog_content'].progress_apply(extract_quantity)
df_test['total_weight_g'] = df_test['catalog_content'].progress_apply(extract_weight)
df_test['brand'] = df_test['catalog_content'].progress_apply(extract_brand)
df_test['category'] = df_test['catalog_content'].progress_apply(detect_category)
flags_test = df_test['catalog_content'].progress_apply(detect_flags)
df_test = pd.concat([df_test, flags_test], axis=1)
df_test['dominant_color'] = df_test['catalog_content'].progress_apply(detect_color)
df_test['clean_text'] = df_test['catalog_content'].progress_apply(clean_text)
df_test['text_embedding'] = df_test['clean_text'].progress_apply(lambda x: sbert_model.encode(x))


In [35]:
X_test_numeric = df_test[numeric_features].values
X_test_categorical = pd.get_dummies(df_test[categorical_features])

# Ensure same columns as train
X_test_categorical = X_test_categorical.reindex(columns=X_categorical.columns, fill_value=0)
X_test_text = np.stack(df_test['text_embedding'].values)

X_test = np.hstack([X_test_numeric, X_test_categorical.values, X_test_text])

y_test_pred = np.expm1(lgb_model.predict(X_test))

submission = pd.DataFrame({
    'sample_id': df_test['sample_id'],
    'price': y_test_pred
})
submission.to_csv("test_predictions_sbert.csv", index=False)
print("✅ Test predictions saved to 'test_predictions_sbert.csv'")
