In [52]:
import pandas as pd
import numpy as np
# Load main datasets
train = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/train.csv')
test = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/test.csv')
stores = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/stores.csv')
oil = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/oil.csv')
holidays = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv')

# Display first few rows
holidays.head()

Unnamed: 0,date,type,locale,locale_name,description,transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False


In [53]:
# Convert 'date' columns to datetime format
train['date'] = pd.to_datetime(train['date'])
test['date'] = pd.to_datetime(test['date'])
oil['date'] = pd.to_datetime(oil['date'])
holidays['date'] = pd.to_datetime(holidays['date'])

# Merge store metadata into train/test
train = train.merge(stores, on='store_nbr', how='left')
test = test.merge(stores, on='store_nbr', how='left')

# Merge oil prices into train/test
train = train.merge(oil, on='date', how='left')
test = test.merge(oil, on='date', how='left')

# Merge holiday data into train/test
# We only keep "holiday" type rows (not Work Day, Transfer, Bridge for now)
holiday_filtered = holidays[holidays['type'].isin(['Holiday', 'Additional'])]
train = train.merge(holiday_filtered[['date', 'locale', 'description']], on='date', how='left')
test = test.merge(holiday_filtered[['date', 'locale', 'description']], on='date', how='left')

In [54]:
# Extract date features
for df in [train, test]:
    df['day'] = df['date'].dt.day
    df['weekday'] = df['date'].dt.weekday
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['is_weekend'] = df['weekday'] >= 5

In [55]:
from sklearn.preprocessing import LabelEncoder

# List of columns to encode
cat_cols = ['family', 'city', 'type', 'cluster', 'locale', 'description']

# Label encode all categorical columns
le_dict = {}
for col in cat_cols:
    le = LabelEncoder()
    # Fill NaNs with a placeholder so encoding doesn't fail
    train[col] = train[col].fillna('Unknown')
    test[col] = test[col].fillna('Unknown')
    train[col] = le.fit_transform(train[col])
    test[col] = le.transform(test[col])  # Use the same encoder as train
    le_dict[col] = le  # Save the encoder in case needed later
train.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,city,state,type,cluster,dcoilwtico,locale,description,day,weekday,month,year,is_weekend
0,0,2013-01-01,1,0,0.0,0,18,Pichincha,3,12,,1,38,1,1,1,2013,False
1,1,2013-01-01,1,1,0.0,0,18,Pichincha,3,12,,1,38,1,1,1,2013,False
2,2,2013-01-01,1,2,0.0,0,18,Pichincha,3,12,,1,38,1,1,1,2013,False
3,3,2013-01-01,1,3,0.0,0,18,Pichincha,3,12,,1,38,1,1,1,2013,False
4,4,2013-01-01,1,4,0.0,0,18,Pichincha,3,12,,1,38,1,1,1,2013,False


In [56]:
# Final feature list
features = [
    'store_nbr', 'family', 'onpromotion', 'city', 'type', 'cluster',
    'locale', 'description', 'day', 'weekday', 'month', 'year', 'is_weekend'
]

target = 'sales'

# Final train set
X_train = train[features]
y_train = train[target]

In [57]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split

#Split Your Training Data for Validation
X_train_split, X_valid, y_train_split, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

#Train the Model
model = LGBMRegressor(n_estimators=1000, learning_rate=0.1,max_depth=10,
    num_leaves=128, random_state=42)
model.fit(X_train_split, y_train_split)

#Evaluate Model on Validation Set
#Use Root Mean Squared Log Error (RMSLE), which is the competition metric
y_pred = model.predict(X_valid)
y_pred = y_pred.clip(0, None)  # Sales can't be negative

rmsle = mean_squared_log_error(y_valid, y_pred, squared=False)
print("Validation RMSLE:", rmsle)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.164909 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 446
[LightGBM] [Info] Number of data points in the train set: 2429222, number of used features: 13
[LightGBM] [Info] Start training from score 358.746721
Validation RMSLE: 1.4035127666472211


In [58]:
test_preds = model.predict(test[features])
test_preds = test_preds.clip(0, None)  # No negative sales allowed


In [59]:
submission = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv')
submission['sales'] = test_preds
submission.to_csv('submission.csv', index=False)