In [None]:
# --- SETUP AND IMPORTS ---
import pandas as pd
import numpy as np
import re
import lightgbm as lgb
from sklearn.model_selection import KFold
from sentence_transformers import SentenceTransformer
import torch
from google.colab import drive

# --- GOOGLE DRIVE ---
# The project files live on Drive, so it must be mounted before any read below.
print("üöÄ Mounting Google Drive...")
drive.mount('/content/drive')

# --- PATH CONFIGURATION ---
# DRIVE_PATH is the project folder on Drive; DATASET_PATH is the competition
# dataset folder inside it. Edit DRIVE_PATH if your project lives elsewhere.
DRIVE_PATH = "/content/drive/MyDrive/ML SCHOOL 2025/"
DATASET_PATH = DRIVE_PATH + "68e8d1d70b66d_student_resource/student_resource/dataset/"

# Resolve every input file up front so all locations are visible in one place.
train_csv_path = DATASET_PATH + "train.csv"
test_csv_path = DATASET_PATH + "test.csv"
# Update these two if your image features are stored in a different location.
train_image_csv_path = DRIVE_PATH + "train_image_features.csv"
test_image_csv_path = DRIVE_PATH + "test_image_features.csv"

# --- LOAD ALL NECESSARY FILES ---
print("\nüìÇ Loading your data files...")
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

# Image features are read from CSVs produced earlier, outside this notebook section.
print("Loading your pre-computed image features...")
train_image_features = pd.read_csv(train_image_csv_path)
test_image_features = pd.read_csv(test_image_csv_path)

print("‚úÖ All data loaded successfully!")

üöÄ Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

üìÇ Loading your data files...
Loading your pre-computed image features...
‚úÖ All data loaded successfully!


In [None]:
# --- 1. TEXT CLEANING ---
# Progress message only; the cleaning itself is done by clean_text() defined
# below and applied to the 'catalog_content' column of both data frames.
print("üßπ Cleaning text data...")
def clean_text(text):
    """Lowercase ``text`` and strip every character outside a small whitelist.

    Only lowercase a-z, digits, whitespace, commas, periods and hyphens
    survive. Any non-string input (for example a missing value) yields "".
    """
    if isinstance(text, str):
        # Lowercasing first is what lets the character class list only a-z.
        return re.sub(r'[^a-z0-9\s,.-]', '', text.lower())
    return ""

# Add a 'cleaned_content' column to both splits using the same cleaner, so the
# train and test text go through identical preprocessing.
for split_df in (train_df, test_df):
    split_df['cleaned_content'] = split_df['catalog_content'].apply(clean_text)
print("‚úÖ Text cleaning complete.")

# --- 2. CRITICAL FEATURE EXTRACTION ---
print("üíé Extracting IPQ and Brand features...")
# This IPQ function is designed to catch many different patterns
def extract_ipq(text):
    """Extract the item-pack quantity (IPQ) mentioned in a product description.

    Two phrasings are recognised, case-insensitively:

    * number first:  "10 count", "12 pack", "12-pack", "2 sets"
    * keyword first: "pack of 12", "set of 2", "count 6", "count: 6"

    The first (leftmost) occurrence in ``text`` wins.

    Args:
        text: Raw product text. Non-string input (e.g. a missing value) is
            treated as "no quantity information".

    Returns:
        int: The quantity found, or 1 when nothing is found. The result is
        never below 1, so a stray "0 pack" does not produce a zero quantity.
    """
    if not isinstance(text, str):
        return 1
    match = re.search(
        # Number first. The trailing \b stops "2 settings" being read as
        # "2 set"; the optional hyphen covers "12-pack".
        r'(\d+)\s*-?\s*(?:packs?|counts?|sets?)\b'
        # Keyword first. The optional colon covers "count: 6".
        r'|\b(?:pack\s+of|set\s+of|count)\s*:?\s*(\d+)',
        text,
        re.IGNORECASE,
    )
    if match is None:
        return 1
    # Exactly one of the two capture groups is populated, depending on which
    # alternative matched.
    quantity = int(match.group(1) or match.group(2))
    return max(quantity, 1)

# A simple but effective brand extractor
def extract_brand(text):
    """Guess a brand name from the very start of ``text``.

    The guess is the leading run of ASCII letters, provided it begins with an
    uppercase letter and is at least two letters long. Everything else,
    including non-string input, maps to the placeholder "unknown".
    """
    if isinstance(text, str):
        leading_word = re.match(r'^[A-Z][a-zA-Z]+', text)
        if leading_word is not None:
            return leading_word.group(0)
    return "unknown"

# Both extractors read the ORIGINAL 'catalog_content' column rather than
# 'cleaned_content': the brand extractor depends on the original capitalisation,
# which the cleaning step lowercases away.
for split_df in (train_df, test_df):
    raw_text = split_df['catalog_content']
    split_df['ipq'] = raw_text.apply(extract_ipq)
    split_df['brand'] = raw_text.apply(extract_brand)
print("‚úÖ IPQ and Brand features extracted.")

# --- 3. GENERATE TEXT EMBEDDINGS ---
print("\nüß† Generating high-quality text embeddings...")

# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

# Sentence-embedding model used for both splits.
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Encode the cleaned text of each split; the result is indexed below as a
# 2-D array with one row per input text.
train_sentences = train_df['cleaned_content'].tolist()
test_sentences = test_df['cleaned_content'].tolist()
train_text_embeddings = model.encode(train_sentences, show_progress_bar=True)
test_text_embeddings = model.encode(test_sentences, show_progress_bar=True)

# Wrap the arrays in DataFrames with one 'text_<i>' column per embedding dimension.
embedding_dim = train_text_embeddings.shape[1]
text_feature_cols = [f'text_{i}' for i in range(embedding_dim)]
train_text_features = pd.DataFrame(train_text_embeddings, columns=text_feature_cols)
test_text_features = pd.DataFrame(test_text_embeddings, columns=text_feature_cols)

print("‚úÖ Text embedding generation complete!")

üßπ Cleaning text data...
‚úÖ Text cleaning complete.
üíé Extracting IPQ and Brand features...
‚úÖ IPQ and Brand features extracted.

üß† Generating high-quality text embeddings...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Batches:   0%|          | 0/2344 [00:00<?, ?it/s]

Batches:   0%|          | 0/2344 [00:00<?, ?it/s]

In [None]:
# --- MERGE ALL FEATURES ---
print("\nüß© Assembling the final master dataset...")

# 0. Fail fast on stale kernel state.
# If the data-loading cell is re-run after the feature-extraction cell, the
# freshly loaded frames no longer carry 'ipq'/'brand' and the selection below
# dies with an opaque "not in index" KeyError. Check the preconditions
# explicitly and say how to recover.
_engineered_cols = ['sample_id', 'ipq', 'brand']
for _frame_name, _frame, _text_feats in (
    ('train_df', train_df, train_text_features),
    ('test_df', test_df, test_text_features),
):
    _missing = [col for col in _engineered_cols if col not in _frame.columns]
    if _missing:
        raise KeyError(
            f"{_frame_name} is missing {_missing}. Re-run the feature-extraction "
            "cell after (re)loading the data, then run this cell again."
        )
    if len(_frame) != len(_text_feats):
        raise ValueError(
            f"{_frame_name} has {len(_frame)} rows but its text embeddings have "
            f"{len(_text_feats)}; regenerate the embeddings for the current data."
        )

# 1. Combine the hand-made text features with the text embeddings.
# concat(axis=1) aligns on index labels, so both sides are reset to a plain
# positional index to guarantee row i is paired with embedding i.
train_features = pd.concat(
    [train_df[_engineered_cols].reset_index(drop=True),
     train_text_features.reset_index(drop=True)],
    axis=1,
)
test_features = pd.concat(
    [test_df[_engineered_cols].reset_index(drop=True),
     test_text_features.reset_index(drop=True)],
    axis=1,
)

# 2. Merge with the image embeddings (rows without image features keep NaNs).
train_master = pd.merge(train_features, train_image_features, on='sample_id', how='left')
test_master = pd.merge(test_features, test_image_features, on='sample_id', how='left')

# 3. Add the target variable ('price') to the training set.
train_master = pd.merge(train_master, train_df[['sample_id', 'price']], on='sample_id', how='left')

# A left merge multiplies rows when the right side repeats a key. That would
# silently duplicate training rows and emit duplicate ids in the submission.
if len(train_master) != len(train_df) or len(test_master) != len(test_df):
    raise ValueError(
        "Merging on 'sample_id' changed the number of rows "
        f"(train: {len(train_df)} -> {len(train_master)}, "
        f"test: {len(test_df)} -> {len(test_master)}). "
        "Check for duplicate sample_id values in the input files."
    )

# --- HANDLE THE 'brand' CATEGORICAL FEATURE ---
# NOTE: this is NOT one-hot encoding. Each of the N_TOP_BRANDS most frequent
# brands (counted over train + test together) gets an integer code equal to its
# frequency rank (0 = most frequent); every other brand gets -1.
N_TOP_BRANDS = 50
all_brands = pd.concat([train_master[['brand']], test_master[['brand']]])
top_brands = all_brands['brand'].value_counts().nlargest(N_TOP_BRANDS).index
train_master['brand_encoded'] = pd.Categorical(train_master['brand'], categories=top_brands).codes
test_master['brand_encoded'] = pd.Categorical(test_master['brand'], categories=top_brands).codes
# The raw string column cannot be fed to the model; keep only the codes.
train_master = train_master.drop(columns='brand')
test_master = test_master.drop(columns='brand')

# --- FINAL PREPARATION ---
# Train on log1p(price); predictions are mapped back with expm1 downstream.
train_master['log_price'] = np.log1p(train_master['price'])

# Every column except the identifier and the two target columns is a feature.
feature_columns = [col for col in train_master.columns if col not in ['sample_id', 'price', 'log_price']]
target_column = 'log_price'

X = train_master[feature_columns]
y = train_master[target_column]
X_test = test_master[feature_columns]

print("‚úÖ Master dataset is ready for training!")
print(f"Total features being used: {len(feature_columns)}")


üß© Assembling the final master dataset...


KeyError: "['ipq', 'brand'] not in index"

In [None]:
# --- DEFINE THE SMAPE METRIC ---
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, for log1p-scaled inputs.

    Both arguments are mapped back to the original scale with ``expm1`` before
    scoring, so callers pass values on the log1p scale.

    Args:
        y_true: Array-like of true values on the log1p scale.
        y_pred: Array-like of predicted values on the log1p scale, same length.

    Returns:
        float: mean(|pred - true| / ((|true| + |pred|) / 2)) * 100, where a pair
        with true == pred == 0 contributes 0 (a perfect prediction) instead of
        the NaN that a literal 0/0 would produce.
    """
    # asarray makes the arithmetic purely positional for lists, arrays and Series.
    actual = np.expm1(np.asarray(y_true, dtype=float))
    predicted = np.expm1(np.asarray(y_pred, dtype=float))
    abs_error = np.abs(predicted - actual)
    scale = (np.abs(actual) + np.abs(predicted)) / 2
    # Divide only where the denominator is non-zero; the pre-filled zeros stand
    # in for the 0/0 case. NaN denominators still propagate as NaN.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return np.mean(ratio) * 100

# --- K-FOLD MODEL TRAINING ---
print("\nüöÄ Starting 5-Fold Cross-Validation Training...")
NFOLDS = 5
folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=42)

# oof_preds: one out-of-fold prediction per training row.
# sub_preds: running average of the per-fold predictions for the test rows.
oof_preds = np.zeros(len(X))
sub_preds = np.zeros(len(X_test))

# Tuned LGBM parameters
lgbm_params = {
    'objective': 'regression_l1',
    'metric': 'mae',
    'n_estimators': 2000,
    'learning_rate': 0.01,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'num_leaves': 31,
    'verbose': -1,
    'n_jobs': -1,
    'seed': 42,
}

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, y)):
    X_train = X.iloc[train_idx]
    y_train = y.iloc[train_idx]
    X_valid = X.iloc[valid_idx]
    y_valid = y.iloc[valid_idx]

    print(f"--- Training Fold {n_fold + 1} ---")
    # The held-out fold doubles as the early-stopping set; a fresh callback is
    # created for every fold.
    fit_options = {
        'eval_set': [(X_valid, y_valid)],
        'eval_metric': 'mae',
        'callbacks': [lgb.early_stopping(100, verbose=False)],
    }
    model = lgb.LGBMRegressor(**lgbm_params)
    model.fit(X_train, y_train, **fit_options)

    oof_preds[valid_idx] = model.predict(X_valid)
    # Each fold contributes an equal share to the final test prediction.
    sub_preds += model.predict(X_test) / folds.n_splits

# --- VALIDATION SCORE ---
# y and oof_preds are both on the log scale; smape() converts them back itself.
validation_score = smape(y, oof_preds)
print(f"\n\nüéâüéâüéâ 5-Fold CV SMAPE Score: {validation_score:.4f}% üéâüéâüéâ")
print("This is your new, reliable score!")

# --- CREATE SUBMISSION FILE ---
print("\n‚úçÔ∏è Creating final submission file...")
# Undo the log1p transform, then floor any negative price at zero.
final_predictions = np.expm1(sub_preds)
negative_mask = final_predictions < 0
final_predictions[negative_mask] = 0

submission_path = DRIVE_PATH + "submission_tier1.csv"
submission_df = pd.DataFrame({'sample_id': test_master['sample_id'], 'price': final_predictions})
submission_df.to_csv(submission_path, index=False)

print(f"‚úÖ Submission file saved to: {submission_path}")
display(submission_df.head())


üöÄ Starting 5-Fold Cross-Validation Training...
--- Training Fold 1 ---
--- Training Fold 2 ---
--- Training Fold 3 ---
--- Training Fold 4 ---
--- Training Fold 5 ---


üéâüéâüéâ 5-Fold CV SMAPE Score: 57.5124% üéâüéâüéâ
This is your new, reliable score!
