In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import lightgbm as lgb

# Load datasets
train = pd.read_csv('train.csv')
stores = pd.read_csv('stores.csv')
features = pd.read_csv('features.csv')
test = pd.read_csv('test.csv')

# Merge datasets
data = train.merge(stores, on="Store").merge(features, on=["Store", "Date", "IsHoliday"])
test_data = test.merge(stores, on="Store").merge(features, on=["Store", "Date", "IsHoliday"])

# Drop unnecessary columns
data.drop(['MarkDown1', 'MarkDown5'], axis=1, inplace=True)
test_data.drop(['MarkDown1', 'MarkDown5'], axis=1, inplace=True)

# Handle missing values
data.fillna(0, inplace=True)
test_data.fillna(0, inplace=True)

# Convert Date to datetime and extract features
data['Date'] = pd.to_datetime(data['Date'])
test_data['Date'] = pd.to_datetime(test_data['Date'])
data['Year'] = data['Date'].dt.year
test_data['Year'] = test_data['Date'].dt.year
data['Month'] = data['Date'].dt.month
test_data['Month'] = test_data['Date'].dt.month
data['Week'] = data['Date'].dt.isocalendar().week
test_data['Week'] = test_data['Date'].dt.isocalendar().week

# One-hot encode categorical variables
data = pd.get_dummies(data, columns=['Type'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['Type'], drop_first=True)

# Define features and target
X = data.drop(['Weekly_Sales', 'Date'], axis=1)
y = data['Weekly_Sales']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define LightGBM parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.1,
    'num_leaves': 31
}

train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

# Train model
model = lgb.train(params, train_data, num_boost_round=500, valid_sets=[valid_data])

# Predict on validation set
predictions = model.predict(X_val, num_iteration=model.best_iteration)

# Calculate evaluation metrics
mse = mean_squared_error(y_val, predictions)
mae = mean_absolute_error(y_val, predictions)
r2 = r2_score(y_val, predictions)

print("LightGBM Model Performance:")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R^2: {r2}")

# Predict on test dataset
final_predictions = model.predict(test_data.drop(['Date'], axis=1))
test_data['Predicted_Sales'] = final_predictions

# Save predictions to CSV
test_data.to_csv('lightgbm_predictions.csv', index=False)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029864 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2019
[LightGBM] [Info] Number of data points in the train set: 337256, number of used features: 16
[LightGBM] [Info] Start training from score 15979.221909
LightGBM Model Performance:
Mean Squared Error (MSE): 20772018.366955057
Mean Absolute Error (MAE): 2642.499287739719
R^2: 0.9601664931498417
