In [11]:
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load your data
data = pd.read_csv('/content/drive/MyDrive/train_0irEZ2H.csv')  # Replace 'your_data.csv' with the actual file path

In [4]:
# Convert 'week' to datetime format
data['week'] = pd.to_datetime(data['week'])

# Extract features from the 'week' column
data['day_of_week'] = data['week'].dt.dayofweek
data['month'] = data['week'].dt.month
data['year'] = data['week'].dt.year

In [5]:
# Define features (X) and target variable (y)
X = data.drop(['record_ID', 'week', 'units_sold'], axis=1)
y = data['units_sold']

In [6]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Define the parameter grid for LightGBM
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [5, 10, 15],
    'num_leaves': [31, 50, 100],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

In [8]:
# Initialize the LGBM model
lgb_model = LGBMRegressor(random_state=42)

In [9]:
# Grid search for the best parameters
grid_search = GridSearchCV(estimator=lgb_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 729 candidates, totalling 2187 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011896 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 615
[LightGBM] [Info] Number of data points in the train set: 120120, number of used features: 9
[LightGBM] [Info] Start training from score 51.789352


In [10]:
# Get the best parameters
best_params = grid_search.best_params_
print(f'Best Parameters: {best_params}')


Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 15, 'n_estimators': 200, 'num_leaves': 100, 'subsample': 0.8}


In [12]:
# Train the model with the best parameters
best_model = LGBMRegressor(**best_params, random_state=42)
best_model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011636 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 615
[LightGBM] [Info] Number of data points in the train set: 120120, number of used features: 9
[LightGBM] [Info] Start training from score 51.789352


In [13]:
# Make predictions on the testing set
y_pred = best_model.predict(X_test)

In [14]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 498.32541049484416
