In [2]:
import pandas as pd

# Load the datasets
train_path = 'data/train.csv'
test_path = 'data/test.csv'
submission_path = 'data/sample_submission.csv'

# Load the datasets into dataframes
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
submission_df = pd.read_csv(submission_path)

# Display the first few rows of each dataset to understand their structure
train_info = train_df.head(), train_df.info()
test_info = test_df.head(), test_df.info()
submission_info = submission_df.head(), submission_df.info()

train_info, test_info, submission_info


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11616 entries, 0 to 11615
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              11616 non-null  int64  
 1   Country         11616 non-null  object 
 2   Year            11616 non-null  int64  
 3   Month           11616 non-null  int64  
 4   FoodPriceIndex  11268 non-null  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 453.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2640 entries, 0 to 2639
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       2640 non-null   int64 
 1   Country  2640 non-null   object
 2   Year     2640 non-null   int64 
 3   Month    2640 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 82.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2640 entries, 0 to 2639
Data columns (total 2 columns):
 #   Column          Non-Nul

((      id       Country  Year  Month  FoodPriceIndex
  0   3501        Mexico  1971     11             0.0
  1  60601  South Africa  2002      7            43.9
  2  64399         Japan  2004      8            89.8
  3  24227        France  1983      1            48.9
  4  63779         Japan  2004      4            90.2,
  None),
 (      id        Country  Year  Month
  0  99116          Italy  2023      4
  1  97687        Denmark  2022      7
  2  92061  United States  2019      6
  3  95122    Netherlands  2021      2
  4  92296          Italy  2019      8,
  None),
 (      id  FoodPriceIndex
  0  99116            20.5
  1  97687            20.5
  2  92061            20.5
  3  95122            20.5
  4  92296            20.5,
  None))

In [3]:
# Check for missing values in the training data
missing_values_summary = train_df.isnull().sum()

# Fill missing values for `FoodPriceIndex` using mean value grouped by `Country`, `Year`, and `Month`.
# If not possible, fill with the overall mean.
train_df['FoodPriceIndex'] = train_df.groupby(['Country', 'Year', 'Month'])['FoodPriceIndex']\
                                     .transform(lambda x: x.fillna(x.mean()))
train_df['FoodPriceIndex'].fillna(train_df['FoodPriceIndex'].mean(), inplace=True)

# Verify that no missing values remain in the training data
remaining_missing_values = train_df['FoodPriceIndex'].isnull().sum()

missing_values_summary, remaining_missing_values


(id                  0
 Country             0
 Year                0
 Month               0
 FoodPriceIndex    348
 dtype: int64,
 0)

In [4]:
from sklearn.preprocessing import LabelEncoder

# Feature Engineering
# Encode the 'Country' column using Label Encoding
label_encoder = LabelEncoder()
train_df['Country_encoded'] = label_encoder.fit_transform(train_df['Country'])
test_df['Country_encoded'] = label_encoder.transform(test_df['Country'])

# Create cyclical features for 'Month' to represent seasonality
import numpy as np
train_df['Month_sin'] = np.sin(2 * np.pi * train_df['Month'] / 12)
train_df['Month_cos'] = np.cos(2 * np.pi * train_df['Month'] / 12)
test_df['Month_sin'] = np.sin(2 * np.pi * test_df['Month'] / 12)
test_df['Month_cos'] = np.cos(2 * np.pi * test_df['Month'] / 12)

# Drop original 'Country' and 'Month' columns (they are now encoded or transformed)
train_df.drop(['Country', 'Month'], axis=1, inplace=True)
test_df.drop(['Country', 'Month'], axis=1, inplace=True)

# Final check on the transformed training data
train_transformed_info = train_df.head()
train_transformed_info

Unnamed: 0,id,Year,FoodPriceIndex,Country_encoded,Month_sin,Month_cos
0,3501,1971,0.0,11,-0.5,0.866025
1,60601,2002,43.9,14,-0.5,-0.866025
2,64399,2004,89.8,9,-0.866025,-0.5
3,24227,1983,48.9,4,0.5,0.866025
4,63779,2004,90.2,9,0.866025,-0.5


In [5]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb

# Define input features and target variable
X = train_df.drop(['id', 'FoodPriceIndex'], axis=1)
y = train_df['FoodPriceIndex']

# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Define sMAPE metric
def smape(y_true, y_pred):
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    diff = np.abs(y_true - y_pred) / denominator
    diff[denominator == 0] = 0.0  # Avoid division by zero
    return np.mean(diff) * 100

# Initialize LightGBM Regressor
lgb_model = lgb.LGBMRegressor(random_state=42)

# Define hyperparameter grid for tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1, 0.2],
    'num_leaves': [31, 50, 70],
    'max_depth': [-1, 10, 20],
}

# Grid Search with cross-validation
grid_search = GridSearchCV(estimator=lgb_model, param_grid=param_grid, cv=3, scoring='neg_mean_absolute_error', verbose=1)
grid_search.fit(X_train, y_train)

# Best model and hyperparameters
best_lgb_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Validate the model
y_valid_pred = best_lgb_model.predict(X_valid)
validation_smape = smape(y_valid, y_valid_pred)

best_params, validation_smape

Fitting 3 folds for each of 81 candidates, totalling 243 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000917 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 88
[LightGBM] [Info] Number of data points in the train set: 6194, number of used features: 4
[LightGBM] [Info] Start training from score 54.342366
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000454 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 88
[LightGBM] [Info] Number of data points in the train set: 6195, number of used features: 4
[LightGBM] [Info] Start training from score 54.166974
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000049 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you c

({'learning_rate': 0.2,
  'max_depth': -1,
  'n_estimators': 300,
  'num_leaves': 70},
 17.306078977047015)

In [10]:
# Train a baseline XGBoost model with default parameters
from xgboost import XGBRegressor

baseline_xgb_model = XGBRegressor(random_state=42, n_estimators=100, learning_rate=0.1, max_depth=5)
baseline_xgb_model.fit(X_train, y_train)

# Validate the baseline model
y_valid_pred_baseline = baseline_xgb_model.predict(X_valid)
validation_smape_baseline = smape(y_valid, y_valid_pred_baseline)

validation_smape_baseline

22.189265892933406

In [12]:
# Train a baseline XGBoost model with default parameters
from sklearn.ensemble import RandomForestRegressor
baseline_rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
baseline_rf_model.fit(X_train, y_train)

# Validate the baseline model
y_valid_pred_baseline = baseline_rf_model.predict(X_valid)
validation_smape_baseline = smape(y_valid, y_valid_pred_baseline)

validation_smape_baseline

9.448715249605181