In [None]:
pip install pandas scikit-learn lightgbm torch torchvision timm requests Pillow tqdm



In [2]:
import pandas as pd
import numpy as np
import re
import csv
from tqdm import tqdm

# Scikit-learn & SciPy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack, csr_matrix

# LightGBM Model
import lightgbm as lgb

# PyTorch & Timm for Image Processing
import torch
import timm
from PIL import Image
import requests
from io import BytesIO
from torchvision import transforms

# --- 0. Configuration ---
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {DEVICE}")

# --- 1. Robust Data Loading Functions ---
def load_data_robustly(filepath):
    """Reads the entire training CSV line-by-line."""
    texts, prices, image_links = [], [], []
    with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
        next(f, None)
        reader = csv.reader(f)
        for row in tqdm(reader, desc=f"Loading {filepath}"):
            try:
                if len(row) >= 4:
                    price_val, text_val, image_link = float(row[3]), str(row[1]), str(row[2])
                    if text_val and price_val > 0 and image_link:
                        texts.append(text_val)
                        prices.append(price_val)
                        image_links.append(image_link)
            except (ValueError, IndexError):
                continue
    return pd.DataFrame({'catalog_content': texts, 'price': prices, 'image_link': image_links})

def load_final_test_data_robustly(filepath):
    """Reads the entire final test CSV for prediction."""
    ids, texts, image_links = [], [], []
    with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
        next(f, None)
        reader = csv.reader(f)
        for row in tqdm(reader, desc=f"Loading {filepath}"):
            try:
                if len(row) >= 3:
                    ids.append(str(row[0]))
                    texts.append(str(row[1]))
                    image_links.append(str(row[2]))
            except (ValueError, IndexError):
                continue
    return pd.DataFrame({'sample_id': ids, 'catalog_content': texts, 'image_link': image_links})

# --- 2. Data Preparation ---
print("Loading the full train.csv dataset...")
df = load_data_robustly('train.csv')
df['log_price'] = np.log1p(df['price'])
print(f"âœ… Successfully loaded {len(df)} clean rows from train.csv.")

# --- 3. Feature Extraction Setup ---
print("Loading image model (EfficientNet)...")
image_model = timm.create_model('efficientnet_b0', pretrained=True, num_classes=0).to(DEVICE)
image_model.eval()
config = image_model.default_cfg
transform = transforms.Compose([
    transforms.Resize(config['input_size'][1:]),
    transforms.CenterCrop(config['input_size'][1:]),
    transforms.ToTensor(),
    transforms.Normalize(config['mean'], config['std']),
])
output_dim = image_model.num_features

def get_image_embeddings(image_links: pd.Series):
    embeddings = []
    print(f"Generating image embeddings for {len(image_links)} images (this will take a long time)...")
    for url in tqdm(image_links):
        try:
            response = requests.get(url, timeout=20)
            response.raise_for_status()
            image = Image.open(BytesIO(response.content)).convert("RGB")
            image_tensor = transform(image).unsqueeze(0).to(DEVICE)
            with torch.no_grad():
                embedding = image_model(image_tensor).cpu().numpy().flatten()
            embeddings.append(embedding)
        except Exception:
            embeddings.append(np.zeros(output_dim))
    return np.vstack(embeddings)

def extract_ipq(text):
    match = re.search(r'(\d+)\s*(pk|pack|count|ct|pairs)', str(text).lower())
    return int(match.group(1)) if match else 1

# --- 4. Create Features, Split Data, and Train ---
# Create features for the entire training dataframe
image_features = get_image_embeddings(df['image_link'])
df['ipq'] = df['catalog_content'].apply(extract_ipq)
tfidf = TfidfVectorizer(stop_words='english', max_features=15000, ngram_range=(1, 2))
text_features = tfidf.fit_transform(df['catalog_content'])

# Combine features
X = hstack([csr_matrix(image_features), text_features, csr_matrix(df['ipq'].values.reshape(-1, 1))]).tocsr()
y = df['log_price']

# Split the full dataset into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"âœ… Data split: {X_train.shape[0]} for training, {X_test.shape[0]} for testing.")

# Train model
print("\nTraining LightGBM model on 80% of the data...")
lgb_model = lgb.LGBMRegressor(random_state=42, n_estimators=1000, learning_rate=0.05, num_leaves=31)
lgb_model.fit(X_train, y_train)
print("âœ… Model training complete.")

# --- 5. Evaluate Model ---
log_predictions = lgb_model.predict(X_test)
predicted_prices = np.expm1(log_predictions)
actual_prices = np.expm1(y_test)
predicted_prices[predicted_prices < 0] = 0

def smape(y_true, y_pred):
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    ratio = np.where(denominator == 0, 0, numerator / (denominator + 1e-8))
    return np.mean(ratio) * 100

validation_smape = smape(actual_prices, predicted_prices)
print(f"\nðŸ“ˆ **Validation SMAPE on {len(y_test)} items: {validation_smape:.4f}%**")

# --- 6. Generate Final Predictions on test.csv ---
print(f"\nLoading the full test.csv for final prediction...")
final_test_df = load_final_test_data_robustly('test.csv')

print("Creating features for the final test set...")
final_image_features = get_image_embeddings(final_test_df['image_link'])
final_test_df['ipq'] = final_test_df['catalog_content'].apply(extract_ipq)
# Use the already-fitted TF-IDF to transform the new text data
final_text_features = tfidf.transform(final_test_df['catalog_content'])

X_final_pred = hstack([csr_matrix(final_image_features), final_text_features, csr_matrix(final_test_df['ipq'].values.reshape(-1, 1))]).tocsr()

print("Generating final predictions for all items...")
final_log_preds = lgb_model.predict(X_final_pred)
final_prices = np.expm1(final_log_preds)
final_prices[final_prices < 0] = 0

submission_df = pd.DataFrame({'sample_id': final_test_df['sample_id'], 'price': final_prices})
submission_df.to_csv('predicted_prices.csv', index=False)

print("\nâœ… Final predictions saved to 'predicted_prices.csv'")
print(f"Total predictions generated: {len(submission_df)}")
print(submission_df.head())

Using device: cpu
Loading the full train.csv dataset...


Loading train.csv: 75000it [00:01, 66710.21it/s]


âœ… Successfully loaded 75000 clean rows from train.csv.
Loading image model (EfficientNet)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/21.4M [00:00<?, ?B/s]

Generating image embeddings for 75000 images (this will take a long time)...


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 75000/75000 [4:14:08<00:00,  4.92it/s]


âœ… Data split: 60000 for training, 15000 for testing.

Training LightGBM model on 80% of the data...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 29.759883 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1341834
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 16121
[LightGBM] [Info] Start training from score 2.740904
âœ… Model training complete.





ðŸ“ˆ **Validation SMAPE on 15000 items: 52.9658%**

Loading the full test.csv for final prediction...


Loading test.csv: 75000it [00:01, 68079.52it/s]


Creating features for the final test set...
Generating image embeddings for 75000 images (this will take a long time)...


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 75000/75000 [4:10:12<00:00,  5.00it/s]


Generating final predictions for all items...





âœ… Final predictions saved to 'predicted_prices.csv'
Total predictions generated: 75000
  sample_id      price
0    100179  16.905832
1    245611  20.462277
2    146263  20.007089
3     95658   8.364760
4     36806  22.766728
