<a href="https://colab.research.google.com/github/DimuthuPHD/api/blob/main/ci_using_lightgbm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import joblib
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [28]:

# Load Data
# Directory that holds every input CSV. Change this single value to point the
# notebook at a different location.
# NOTE(review): the path sits under /content/drive, so Google Drive presumably
# has to be mounted before this cell runs — confirm.
DATA_DIR = "/content/drive/MyDrive/ICBT/Collab/CI"

train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
oil = pd.read_csv(f"{DATA_DIR}/oil.csv")
stores = pd.read_csv(f"{DATA_DIR}/stores.csv")
transactions = pd.read_csv(f"{DATA_DIR}/transactions.csv")
holidays = pd.read_csv(f"{DATA_DIR}/holidays_events.csv")

In [114]:
# Feature Engineering
# Flag the origin of every row so the combined frame can be split back into
# train and test once the features are built.
train['test'] = 0
test['test'] = 1
data = pd.concat([train, test], axis=0)

# A left merge emits one row per matching right-hand row, so a date that occurs
# more than once in `holidays` would replicate the sales rows for that date
# (and any test ids on it). Keep a single holiday row per date: the holiday
# columns referenced in this cell are all dropped further down, so which row
# survives does not change the final feature set.
holidays_by_date = holidays.drop_duplicates(subset='date')

data = data.merge(holidays_by_date, on='date', how='left')
data = data.merge(stores, on='store_nbr', how='left')
data = data.merge(oil, on='date', how='left')
data = data.merge(transactions, on=['date', 'store_nbr'], how='left')
# Remove every row dated 2013-01-01 (matched on index level 1, i.e. 'date').
# NOTE(review): presumably a day without meaningful sales — confirm the reason.
data = data.set_index(['store_nbr', 'date', 'family']).drop(index='2013-01-01', level=1)

# Back to a flat frame; the three index levels become the leading columns.
data_ = data.reset_index()

# Ensure 'date' column is in datetime format so the .dt accessor is available
data_['date'] = pd.to_datetime(data_['date'])

# Extract Date Features
date_features = ['day', 'month', 'year']
for feature in date_features:
    data_[feature] = getattr(data_['date'].dt, feature)

# Drop the columns that are not used as model inputs before any dtype work, so
# nothing is converted only to be thrown away.
data_ = data_.drop(['date', 'description', 'transferred', 'type_x', 'locale', 'locale_name', 'city', 'state', 'transactions'], axis=1)

# Encoding Categorical Variables
# Only columns that survive the drop above are listed.
# NOTE(review): 'onpromotion' is cast to category here; if it is a numeric
# count, leaving it numeric may be preferable — confirm the intent.
categorical_cols = ['store_nbr', 'type_y', 'cluster', 'family', 'onpromotion',
                    'day', 'month', 'year']
data_ = data_.astype({col: 'category' for col in categorical_cols})

In [111]:
# Prepare Train/Test Data
# Split the combined frame back apart using the 0/1 'test' flag.
train_data = data_.loc[data_['test'].eq(0)]
test_data = data_.loc[data_['test'].eq(1)]

# The target is 'sales'; the model inputs exclude the flag, the target and the
# row id.
y_train = train_data['sales']
X_train = train_data.drop(columns=['test', 'sales', 'id'])
X_test = test_data.drop(columns=['test', 'sales', 'id'])

# Show the feature names as the cell output.
X_train.columns

Index(['store_nbr', 'family', 'onpromotion', 'type_y', 'cluster',
       'dcoilwtico'],
      dtype='object')

In [112]:
# Train Model

# LightGBM Parameters: regression objective evaluated with RMSE, plus feature
# and row subsampling settings.
lgb_params = dict(
    objective='regression',
    metric='rmse',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
)

# Wrap the training features and target, then fit for a fixed 100 boosting
# rounds (no validation set or early stopping is configured here).
lgb_train = lgb.Dataset(data=X_train, label=y_train)
gbm = lgb.train(
    params=lgb_params,
    train_set=lgb_train,
    num_boost_round=100,
)

In [96]:
# Make Predictions
# Score the test features with the trained booster, passing the booster's own
# best_iteration as the iteration limit.
y_pred = gbm.predict(
    data=X_test,
    num_iteration=gbm.best_iteration,
)

In [97]:
# Post-processing
# Build the output frame keyed by the test 'id', with negative predictions
# floored at zero.
output = pd.DataFrame(
    {'sales': y_pred.clip(min=0)},
    index=test_data['id'],
)

In [108]:
# Save Predictions
# Write the predictions (index included) to CSV, then persist the trained
# booster. joblib.dump is the last expression, so the list of written files is
# shown as the cell output.
output.to_csv(path_or_buf='submission.csv')
joblib.dump(value=gbm, filename='model.pkl')

['model.pkl']