In [None]:
!pip install category_encoders

In [None]:
pip install lightgbm

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
folder_path = "/content/drive/My Drive/Backpack Prediction Challenge/"

train_data = pd.read_csv(folder_path + "train.csv")
train_extra_data = pd.read_csv(folder_path + "training_extra.csv")
test_data = pd.read_csv(folder_path + "test.csv")

train_data = pd.concat([train_data, train_extra_data], ignore_index=True)

In [None]:
size_mapping = {'Small': 1, 'Medium': 2, 'Large': 3}
train_data['Size'] = train_data['Size'].map(size_mapping)
test_data['Size'] = test_data['Size'].map(size_mapping)

In [None]:
train_data['Brand_Material'] = train_data['Brand'] + '_' + train_data['Material']
test_data['Brand_Material'] = test_data['Brand'] + '_' + test_data['Material']
train_data['Compartments_per_Size'] = train_data['Compartments'] / train_data['Size']
test_data['Compartments_per_Size'] = test_data['Compartments'] / test_data['Size']

In [None]:
X = train_data.drop(columns=["Price"])
y = train_data["Price"]

In [None]:
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist() + ['Brand_Material']
numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist() + ['Compartments_per_Size']

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median'))
])

In [None]:
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

In [None]:
preprocessor.fit(X_train)

In [None]:
X_train_preprocessed = preprocessor.transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)

In [None]:
cat_feature_indices = list(range(len(numerical_cols), len(numerical_cols) + len(categorical_cols)))

In [None]:
train_dataset = lgb.Dataset(X_train_preprocessed, label=y_train, categorical_feature=cat_feature_indices)
val_dataset = lgb.Dataset(X_val_preprocessed, label=y_val, categorical_feature=cat_feature_indices, reference=train_dataset)

In [None]:
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

num_round = 1000
bst = lgb.train(params, train_dataset, num_round, valid_sets=[val_dataset])

LightGBMError: Cannot set categorical feature after freed raw data, set free_raw_data=False when construct Dataset to avoid this.

In [None]:
# Make predictions on validation set
y_pred = bst.predict(X_val_preprocessed, num_iteration=bst.best_iteration)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f'Validation RMSE: {rmse}')

Validation RMSE: 38.87588244794277


In [None]:
# Preprocess full training data
X_preprocessed = preprocessor.fit_transform(X)

In [None]:
# Create full training dataset
full_train_dataset = lgb.Dataset(X_preprocessed, label=y, categorical_feature=cat_feature_indices)

In [None]:
print(f"Best iteration: {bst.best_iteration}")

Best iteration: 0


In [None]:
bst_full = lgb.train(params, full_train_dataset, num_boost_round=bst.best_iteration)

ValueError: num_boost_round must be greater than 0. Got 0.

In [None]:
X_test = test_data[X.columns]

In [None]:
X_test_preprocessed = preprocessor.transform(X_test)

In [None]:
predictions = bst_full.predict(X_test_preprocessed, num_iteration=bst_full.best_iteration)

In [None]:
submission = pd.DataFrame({"id": test_data["id"], "Price": predictions})
submission.to_csv("submission10.csv", index=False)
print("Submission file created successfully!")