In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_percentage_error, make_scorer
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load data
train_df = pd.read_csv('train.csv')

def extract_date_features(df):
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['day_of_week'] = df['date'].dt.dayofweek
    df['week_of_year'] = df['date'].dt.isocalendar().week
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    df['quarter'] = df['date'].dt.quarter
    return df.drop(columns=['date'])

train_df = extract_date_features(train_df)
train_df = train_df.dropna(subset=['num_sold'])

X = train_df.drop(columns=['num_sold', 'id'])
y = train_df['num_sold']


In [3]:
# Preprocessing
numerical_features = ['year', 'month', 'day', 'day_of_week', 'week_of_year', 'is_weekend', 'quarter']
categorical_features = ['country', 'store', 'product']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', OrdinalEncoder(), categorical_features)
    ])

In [4]:
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', TransformedTargetRegressor(
        regressor=lgb.LGBMRegressor(
            objective='mae',
            random_state=12,
            n_jobs=2, 
            verbose=-1
        ),
        func=np.log,
        inverse_func=np.exp))
])


In [5]:
param_grid = {
    'regressor__regressor__n_estimators': [300, 500],
    'regressor__regressor__learning_rate': [0.05, 0.1],
    'regressor__regressor__max_depth': [5, 7],
    'regressor__regressor__num_leaves': [31, 63],
    'regressor__regressor__subsample': [0.8],
}

In [6]:
scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

In [9]:
grid_search = GridSearchCV(
    model,
    param_grid,
    cv=5,  
    scoring=scorer,
    n_jobs=1,  
    verbose=2
)

In [10]:
best_model = grid_search.fit(X, y)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END regressor__regressor__learning_rate=0.05, regressor__regressor__max_depth=5, regressor__regressor__n_estimators=300, regressor__regressor__num_leaves=31, regressor__regressor__subsample=0.8; total time=   3.7s
[CV] END regressor__regressor__learning_rate=0.05, regressor__regressor__max_depth=5, regressor__regressor__n_estimators=300, regressor__regressor__num_leaves=31, regressor__regressor__subsample=0.8; total time=   2.6s
[CV] END regressor__regressor__learning_rate=0.05, regressor__regressor__max_depth=5, regressor__regressor__n_estimators=300, regressor__regressor__num_leaves=31, regressor__regressor__subsample=0.8; total time=   2.7s
[CV] END regressor__regressor__learning_rate=0.05, regressor__regressor__max_depth=5, regressor__regressor__n_estimators=300, regressor__regressor__num_leaves=31, regressor__regressor__subsample=0.8; total time=   2.6s
[CV] END regressor__regressor__learning_rate=0.05, regressor__r

In [11]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X)
mape = mean_absolute_percentage_error(y, y_pred)
print(f"Best Model MAPE: {mape:.4f}")

Best Model MAPE: 0.0452


In [17]:
def process_data(df):
    # Date features
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['day_of_week'] = df['date'].dt.dayofweek
    df['week_of_year'] = df['date'].dt.isocalendar().week
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    df['quarter'] = df['date'].dt.quarter
    
    # Ensure required columns exist
    required_columns = [
        'year', 'month', 'day', 'day_of_week',
        'week_of_year', 'is_weekend', 'quarter',
        'country', 'store', 'product'
    ]
    
    # Add missing columns with NaN
    for col in required_columns:
        if col not in df.columns:
            df[col] = np.nan
            
    return df.drop(columns=['date'])

In [18]:
df = pd.read_csv('test.csv')

In [19]:
test_df = process_data(df)

In [20]:
test_predictions = best_model.predict(test_df)

In [21]:
test_df['num_sold'] = test_predictions
submission = test_df[['id', 'num_sold']]
submission.to_csv("Submission_Apurva.csv", index=False)
print("Predictions saved to 'Submission_Apurva.csv'.")

Predictions saved to 'Submission_Apurva.csv'.
