In [6]:
import pandas as pd

# Load datasets
train_csv = "C:/Users/pm5cd/Downloads/68e8d1d70b66d_student_resource/student_resource/dataset/train.csv"
test_csv = "C:/Users/pm5cd/Downloads/68e8d1d70b66d_student_resource/student_resource/dataset/test.csv"
train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

# Sanity check: list the columns of each split (train first, then test)
for heading, frame in (("Train columns:", train_df), ("Test columns:", test_df)):
    print(heading, frame.columns.tolist())

# Show the first rows of each split, in the same order
for frame in (train_df, test_df):
    print(frame.head())


Train columns: ['sample_id', 'catalog_content', 'image_link', 'price']
Test columns: ['sample_id', 'catalog_content', 'image_link']
   sample_id                                    catalog_content  \
0      33127  Item Name: La Victoria Green Taco Sauce Mild, ...   
1     198967  Item Name: Salerno Cookies, The Original Butte...   
2     261251  Item Name: Bear Creek Hearty Soup Bowl, Creamy...   
3      55858  Item Name: Judeeâ€™s Blue Cheese Powder 11.25 oz...   
4     292686  Item Name: kedem Sherry Cooking Wine, 12.7 Oun...   

                                          image_link  price  
0  https://m.media-amazon.com/images/I/51mo8htwTH...   4.89  
1  https://m.media-amazon.com/images/I/71YtriIHAA...  13.12  
2  https://m.media-amazon.com/images/I/51+PFEe-w-...   1.97  
3  https://m.media-amazon.com/images/I/41mu0HAToD...  30.34  
4  https://m.media-amazon.com/images/I/41sA037+Qv...  66.49  
   sample_id                                    catalog_content  \
0     100179  Item Name:

In [12]:
import sys
import subprocess
# -----------------------------
# Import libraries
# -----------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

# -----------------------------
# Load Dataset
# -----------------------------
# Read both splits in one pass; order of the paths fixes which frame is which.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        r"C:/Users/pm5cd/Downloads/68e8d1d70b66d_student_resource/student_resource/dataset/train.csv",
        r"C:/Users/pm5cd/Downloads/68e8d1d70b66d_student_resource/student_resource/dataset/test.csv",
    )
)

# -----------------------------
# Feature Engineering from catalog_content
# -----------------------------
text_column = 'catalog_content'
for df in [train_df, test_df]:
    # Stringify every entry once; each statistic below is measured on this
    # str() form, so non-string values are counted by their printed text.
    as_text = df[text_column].map(str)

    # Raw counts: characters, whitespace-separated tokens, digit characters
    df['content_length'] = as_text.map(len)
    df['word_count'] = as_text.map(lambda s: len(s.split()))
    df['digit_count'] = as_text.map(lambda s: sum(ch.isdigit() for ch in s))

    # Ratios; the +1 in each denominator keeps empty text from dividing by zero
    df['avg_word_length'] = df['content_length'] / (df['word_count'] + 1)
    df['digit_ratio'] = df['digit_count'] / (df['content_length'] + 1)
    uppercase_count = as_text.map(lambda s: sum(1 for ch in s if ch.isupper()))
    df['caps_ratio'] = uppercase_count / (df['content_length'] + 1)

# -----------------------------
# TF-IDF + TruncatedSVD
# -----------------------------
# Word 1- to 3-gram TF-IDF, capped at 8000 terms, with English stop words
# removed and sublinear (1 + log tf) term-frequency scaling.
tfidf = TfidfVectorizer(
    max_features=8000,
    ngram_range=(1,3),
    stop_words='english',
    sublinear_tf=True
)

# TfidfVectorizer rejects NaN / non-string documents, while the hand-made
# text statistics already tolerate them via str(x). Normalise the column
# first so one missing description cannot abort the whole run; for data
# that is already all strings this is a no-op.
train_text = train_df[text_column].fillna('').astype(str)
test_text = test_df[text_column].fillna('').astype(str)

# Vocabulary and IDF weights are learned from train only, then reused on test
tfidf_train = tfidf.fit_transform(train_text)
tfidf_test = tfidf.transform(test_text)

# Compress the sparse TF-IDF matrix to 150 dense components (fit on train only)
svd = TruncatedSVD(n_components=150, random_state=42)
svd_train = svd.fit_transform(tfidf_train)
svd_test = svd.transform(tfidf_test)

# -----------------------------
# Numeric Features
# -----------------------------
numeric_features = [
    'content_length',
    'word_count',
    'digit_count',
    'avg_word_length',
    'digit_ratio',
    'caps_ratio',
]

# Learn mean/variance on the training rows only, then apply the same
# standardisation to both splits.
scaler = StandardScaler()
scaler.fit(train_df[numeric_features].to_numpy())
X_train_numeric = scaler.transform(train_df[numeric_features].to_numpy())
X_test_numeric = scaler.transform(test_df[numeric_features].to_numpy())

# Final design matrices: scaled hand-made columns first, SVD components after
X_train = np.concatenate((X_train_numeric, svd_train), axis=1)
X_test = np.concatenate((X_test_numeric, svd_test), axis=1)

# -----------------------------
# Target
# -----------------------------
# Models are trained on log(1 + price); the ensemble step undoes this with expm1
y_train = np.log1p(train_df['price'].to_numpy())

# -----------------------------
# K-Fold Setup
# -----------------------------
# Shuffled 5-fold split with a fixed seed, shared by every model's CV loop
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# One zero-initialised accumulator per model for fold-averaged test predictions
n_test_rows = len(test_df)
pred_lgb, pred_cat, pred_xgb = (np.zeros(n_test_rows) for _ in range(3))

import lightgbm as lgb

# -----------------------------
# LightGBM Training
# -----------------------------
# `lgb_params` was used below without ever being defined in this notebook,
# so a fresh "Restart Kernel -> Run All" stopped here with a NameError.
# The L1 objective/metric matches the `l1` values in the recorded training
# log; learning rate and seed mirror the other models in this notebook.
# NOTE(review): the originally tuned values are not recoverable from the
# notebook -- everything not listed here is left at LightGBM defaults.
lgb_params = {
    'objective': 'regression_l1',
    'metric': 'l1',
    'learning_rate': 0.02,
    'seed': 42
}

for fold, (tr_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f"Training LightGBM fold {fold+1}")
    X_tr, X_val = X_train[tr_idx], X_train[val_idx]
    y_tr, y_val = y_train[tr_idx], y_train[val_idx]

    # The validation Dataset references the training one so both share binning
    train_data = lgb.Dataset(X_tr, label=y_tr)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    # Up to 5000 rounds, stopping once the held-out metric has not improved
    # for 100 rounds; metrics are logged every 200 rounds.
    model = lgb.train(
        params=lgb_params,
        train_set=train_data,
        num_boost_round=5000,
        valid_sets=[train_data, val_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=200)
        ]
    )

    # Predict with the best round only and add this fold's share of the average
    pred_lgb += model.predict(X_test, num_iteration=model.best_iteration) / kf.n_splits

# -----------------------------
# CatBoost Training
# -----------------------------
# CatBoost settings: MAE loss, depth-8 trees, up to 3000 iterations at a
# 0.02 learning rate, fixed seed.
# NOTE(review): 'verbose' is 200 here but fit() below passes verbose=False --
# presumably the fit() argument wins; confirm which logging level is intended.
cat_params = {
    'iterations': 3000,
    'learning_rate': 0.02,
    'depth': 8,
    'loss_function': 'MAE',
    'verbose': 200,
    'random_seed': 42,
}

for fold, (fit_rows, holdout_rows) in enumerate(kf.split(X_train)):
    print(f"Training CatBoost fold {fold+1}")

    # Fresh model per fold; the held-out rows drive early stopping (100 rounds)
    model = CatBoostRegressor(**cat_params)
    model.fit(
        X_train[fit_rows],
        y_train[fit_rows],
        eval_set=(X_train[holdout_rows], y_train[holdout_rows]),
        early_stopping_rounds=100,
        verbose=False,
    )

    # Add this fold's share of the fold-averaged test prediction
    fold_test_pred = model.predict(X_test)
    pred_cat += fold_test_pred / kf.n_splits



Training LightGBM fold 1
Training until validation scores don't improve for 100 rounds
[200]	training's l1: 0.523731	valid_1's l1: 0.591136
[400]	training's l1: 0.469814	valid_1's l1: 0.574228
[600]	training's l1: 0.43609	valid_1's l1: 0.566999
[800]	training's l1: 0.411979	valid_1's l1: 0.563157
[1000]	training's l1: 0.393489	valid_1's l1: 0.560521
[1200]	training's l1: 0.378902	valid_1's l1: 0.558563
[1400]	training's l1: 0.366565	valid_1's l1: 0.556958
[1600]	training's l1: 0.356163	valid_1's l1: 0.555895
[1800]	training's l1: 0.347214	valid_1's l1: 0.554724
[2000]	training's l1: 0.339169	valid_1's l1: 0.553695
[2200]	training's l1: 0.332201	valid_1's l1: 0.552824
[2400]	training's l1: 0.326554	valid_1's l1: 0.552043
[2600]	training's l1: 0.321313	valid_1's l1: 0.551502
[2800]	training's l1: 0.316396	valid_1's l1: 0.550898
[3000]	training's l1: 0.311822	valid_1's l1: 0.5504
[3200]	training's l1: 0.307756	valid_1's l1: 0.549984
[3400]	training's l1: 0.303913	valid_1's l1: 0.549636
[3

TypeError: XGBModel.fit() got an unexpected keyword argument 'early_stopping_rounds'

In [20]:
# -----------------------------
# XGBoost Training (Universal Compatible)
# -----------------------------
import xgboost as xgb

# Native-API XGBoost settings: squared-error regression on the log1p target,
# depth-8 trees, 80% row and column subsampling, RMSE as the eval metric.
xgb_params = {
    'objective': 'reg:squarederror',
    'learning_rate': 0.02,
    'max_depth': 8,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eval_metric': 'rmse',
    'seed': 42
}

# Reset the accumulator so re-running this cell does not double-count folds
pred_xgb = np.zeros(X_test.shape[0])

# The test matrix is identical for every fold, so build it once up front
dtest = xgb.DMatrix(X_test)

for fold, (tr_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f"Training XGBoost fold {fold+1}")

    X_tr, X_val = X_train[tr_idx], X_train[val_idx]
    y_tr, y_val = y_train[tr_idx], y_train[val_idx]

    dtrain = xgb.DMatrix(X_tr, label=y_tr)
    dval = xgb.DMatrix(X_val, label=y_val)

    # Early stopping watches the last entry of `evals`, i.e. the 'val' set
    model = xgb.train(
        params=xgb_params,
        dtrain=dtrain,
        num_boost_round=5000,
        evals=[(dtrain, 'train'), (dval, 'val')],
        early_stopping_rounds=100,
        verbose_eval=200
    )

    # iteration_range is half-open [begin, end) and best_iteration is 0-based,
    # so the end must be best_iteration + 1 to include the best round itself
    # (and to avoid (0, 0), which XGBoost treats as "use every tree").
    pred_xgb += model.predict(dtest, iteration_range=(0, model.best_iteration + 1)) / kf.n_splits

# -----------------------------
# Ensemble
# -----------------------------
# Weighted blend in log1p space: LightGBM 0.5, CatBoost 0.35, XGBoost 0.15
test_ensemble = 0.5 * pred_lgb
test_ensemble = test_ensemble + 0.35 * pred_cat
test_ensemble = test_ensemble + 0.15 * pred_xgb

# Back to price units, then bound predictions to the price range seen in train
price_floor = train_df['price'].min()
price_ceiling = train_df['price'].max()
test_preds = np.expm1(test_ensemble).clip(price_floor, price_ceiling)

# -----------------------------
# Create Submission
# -----------------------------
# Two-column submission: the test ids alongside the blended price predictions
submission_columns = {'sample_id': test_df['sample_id'], 'price': test_preds}
submission = pd.DataFrame(submission_columns)

# Written to the current working directory, without the index column
submission.to_csv("submission_optimized1.csv", index=False)
print("Submission file created successfully: submission_optimized1.csv")


Training XGBoost fold 1
[0]	train-rmse:0.93473	val-rmse:0.94771
[200]	train-rmse:0.65130	val-rmse:0.76070
[400]	train-rmse:0.55549	val-rmse:0.74075
[600]	train-rmse:0.48918	val-rmse:0.73283
[800]	train-rmse:0.43662	val-rmse:0.72806
[1000]	train-rmse:0.39098	val-rmse:0.72498
[1200]	train-rmse:0.35111	val-rmse:0.72272
[1400]	train-rmse:0.31503	val-rmse:0.72130
[1600]	train-rmse:0.28221	val-rmse:0.71975
[1800]	train-rmse:0.25334	val-rmse:0.71891
[2000]	train-rmse:0.22853	val-rmse:0.71814
[2200]	train-rmse:0.20502	val-rmse:0.71732
[2400]	train-rmse:0.18465	val-rmse:0.71692
[2600]	train-rmse:0.16634	val-rmse:0.71657
[2800]	train-rmse:0.15058	val-rmse:0.71639
[3000]	train-rmse:0.13639	val-rmse:0.71611
[3200]	train-rmse:0.12412	val-rmse:0.71584
[3400]	train-rmse:0.11342	val-rmse:0.71572
[3600]	train-rmse:0.10365	val-rmse:0.71554
[3800]	train-rmse:0.09512	val-rmse:0.71537
[4000]	train-rmse:0.08752	val-rmse:0.71522
[4200]	train-rmse:0.08095	val-rmse:0.71510
[4400]	train-rmse:0.07536	val-rmse:0.