In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from scipy.sparse import hstack
import warnings

warnings.filterwarnings("ignore")

In [2]:
train_df = pd.read_csv(r'C:\Rahul\GitHub\AmazonML-Hackathon\dataset\train.csv')
test_df = pd.read_csv(r'C:\Rahul\GitHub\AmazonML-Hackathon\dataset\test.csv')

In [3]:
def extract_quantity(text):
    nums = re.findall(r'\d+', str(text))
    return int(nums[-1]) if nums else 1

def extract_weight(text):
    match = re.search(r'(\d*\.?\d+)\s*(kg|g|l|ml|oz)', str(text).lower())
    if match:
        val, unit = match.groups()
        val = float(val)
        if unit == 'g':
            val /= 1000
        elif unit == 'ml':
            val /= 1000
        elif unit == 'oz':
            val *= 0.02835
        return val
    return 0.0

train_df['quantity'] = train_df['catalog_content'].apply(extract_quantity)
train_df['weight'] = train_df['catalog_content'].apply(extract_weight)
test_df['quantity'] = test_df['catalog_content'].apply(extract_quantity)
test_df['weight'] = test_df['catalog_content'].apply(extract_weight)

In [4]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

train_df['clean_text'] = train_df['catalog_content'].apply(clean_text)
test_df['clean_text'] = test_df['catalog_content'].apply(clean_text)

In [5]:
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
    ngram_range=(1,2)
)

X_train_text = vectorizer.fit_transform(train_df['clean_text'])
X_test_text = vectorizer.transform(test_df['clean_text'])

In [6]:
numeric_train = train_df[['quantity','weight']].values
numeric_test = test_df[['quantity','weight']].values

X_train_full = hstack([X_train_text, numeric_train])
X_test_full = hstack([X_test_text, numeric_test])

In [7]:
y_train = np.log1p(train_df['price'])

In [8]:
X_tr, X_val, y_tr, y_val = train_test_split(X_train_full, y_train, test_size=0.1, random_state=42)

lgbm = LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=8,
    num_leaves=64,
    verbose=50
)

lgbm.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], eval_metric='rmse', early_stopping_rounds=50)

TypeError: LGBMRegressor.fit() got an unexpected keyword argument 'early_stopping_rounds'

In [9]:
lgbm.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric='rmse'
)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.986823
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.984814
[LightGBM] [Debug] init for col-wise cost 0.571586 seconds, init for row-wise cost 0.542315 seconds
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.741454 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 768547
[LightGBM] [Info] Number of data points in the train set: 67500, number of used features: 5002
[LightGBM] [Info] Start training from score 2.738202
[LightGBM] [Debug] Trained a tree with leaves = 64 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 64 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 64 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 64 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 64 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 64 and depth = 8
[LightG

0,1,2
,boosting_type,'gbdt'
,num_leaves,64
,max_depth,8
,learning_rate,0.05
,n_estimators,1000
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [10]:
ridge = Ridge(alpha=1.0)
ridge.fit(X_train_text, y_train)

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [11]:
pred_lgbm = np.expm1(lgbm.predict(X_test_full))
pred_ridge = np.expm1(ridge.predict(X_test_text))

predictions = 0.7*pred_lgbm + 0.3*pred_ridge
predictions = np.maximum(predictions, 0.01)
predictions = np.round(predictions, 2)

In [12]:
submission = pd.DataFrame({
    'sample_id': test_df['sample_id'],
    'price': predictions
})

submission.to_csv('test_out.csv', index=False)
print("✅ Submission saved: test_out.csv")

✅ Submission saved: test_out.csv
