In [1]:
import pandas as pd
import numpy as np
import joblib
import os
from datetime import datetime, timedelta
import warnings
# NOTE(review): with no category/module argument this filter hides *every*
# warning for the rest of the session, including pandas/xgboost deprecation
# and dtype warnings. Consider narrowing it to the specific categories that
# are known to be noisy.
warnings.filterwarnings('ignore')

# Simple marker so a Run-All makes it obvious the import cell has finished.
print("Libraries imported.")

Libraries imported.


In [2]:
# Location of the persisted XGBoost model used for forecasting.
# The baseline model is loaded by default; to use the tuned variant instead,
# swap in the alternative path kept below.
model_path = "../models/baseline_models/xgboost_model.pkl"  # baseline model
# model_path = "../models/tuned_models/xgboost_tuned.pkl"

# Load the model.
# NOTE: joblib.load deserializes via pickle, which can execute arbitrary code
# while loading -- only load model files that come from a trusted source.
xgb_model = joblib.load(model_path)
print(f"Model loaded from {model_path}")
print(f"Model type: {type(xgb_model)}")

Model loaded from ../models/baseline_models/xgboost_model.pkl
Model type: <class 'xgboost.sklearn.XGBRegressor'>


In [3]:
# Input files: the cleaned price history and its feature-engineered counterpart.
original_data_path = "../data/preprocessed/vegetable_prices_clean.csv"
fe_path = "../data/feature_engineered/vegetable_prices_fe.csv"

# The clean frame is the basis for historical averages; the feature-engineered
# frame defines which columns (and in what order) are fed to the model.
original_df = pd.read_csv(original_data_path)
fe_df = pd.read_csv(fe_path)

# Every column except the target 'price' is treated as a model input,
# keeping the column order found in the feature-engineered file.
is_feature = fe_df.columns != 'price'
feature_cols = fe_df.columns[is_feature].tolist()
print(f"Model expects {len(feature_cols)} features: {feature_cols[:5]}...")

Model expects 39 features: ['vegetable', 'Badulla_actual_class', 'Hambantota_actual_class', 'Jaffna_actual_class', 'Kurunegala_actual_class']...


In [4]:
# Forecast horizon: every week of 2026, 2027 and 2028.
years = [2026, 2027, 2028]
weeks_per_year = 52  # assuming 52 weeks per year (adjust if your data uses 53)

# One record per (year, week):
#   week      -> label such as "W1", "W2", ...
#   week_num  -> the same week as an integer
#   year_week -> zero-padded key such as "202601"
future_rows = [
    {
        'year': year,
        'week': f'W{week}',
        'week_num': week,
        'year_week': f"{year}{week:02d}",
    }
    for year in years
    for week in range(1, weeks_per_year + 1)
]

future_calendar = pd.DataFrame(future_rows)
print(f"Created {len(future_calendar)} week rows for 2026–2028")
future_calendar.head()

Created 156 week rows for 2026–2028


Unnamed: 0,year,week,week_num,year_week
0,2026,W1,1,202601
1,2026,W2,2,202602
2,2026,W3,3,202603
3,2026,W4,4,202604
4,2026,W5,5,202605


In [5]:
# Vegetable name -> integer ID used for the model's 'vegetable' feature.
veg_mapping = {
    'Bitter Gourd': 1,
    'Brinjals': 2,
    'Cabbage': 3,
    'Carrot': 4,
    'Pumpkin': 5,
    'Tomatoes': 6
}
# Inverse lookup (ID -> name) for turning encoded values back into labels.
veg_reverse = dict(zip(veg_mapping.values(), veg_mapping.keys()))

# Cross every calendar week with every vegetable: one row per
# (year, week, vegetable). The readable name is kept next to its ID so the
# final output can show names while the model receives the ID.
vegetables = list(veg_mapping)
future_data = [
    {
        'year': week_row.year,
        'week': week_row.week,
        'vegetable': veg,
        'vegetable_id': veg_mapping[veg],
        'week_num': week_row.week_num,
        'year_week': week_row.year_week,
    }
    for week_row in future_calendar.itertuples(index=False)
    for veg in vegetables
]

future_df = pd.DataFrame(future_data)
print(f"Total rows to predict: {len(future_df)}")
future_df.head()

Total rows to predict: 936


Unnamed: 0,year,week,vegetable,vegetable_id,week_num,year_week
0,2026,W1,Bitter Gourd,1,1,202601
1,2026,W1,Brinjals,2,1,202601
2,2026,W1,Cabbage,3,1,202601
3,2026,W1,Carrot,4,1,202601
4,2026,W1,Pumpkin,5,1,202601


In [6]:
# Step 1: derive stand-in values for the future from historical data.
# Weather-related inputs use the per-week-of-year mean; economic inputs use the
# overall mean; class inputs use the most frequent encoded value.

# Numeric week-of-year parsed from labels such as "W12".
original_df['week_num'] = original_df['week'].str.replace('W', '').astype(int)

# Location prefixes used in the per-location column names.
locations = ['Badulla', 'Hambantota', 'Jaffna', 'Kurunegala', 'Matale', 'Nuwara_Eliya', 'Ratnapura']

# Group once by week number and reuse the grouping for every column.
by_week = original_df.groupby('week_num')

# weekly_stats maps column name -> {week_num: historical mean of that column}.
# Columns absent from the clean dataset are simply left out.
weekly_stats = {}
for loc in locations:
    candidate_cols = [f'{loc}_precipitation']
    candidate_cols += [f'{loc}_{prob}' for prob in ['prob_drought', 'prob_flood_risk', 'prob_normal']]
    for col in candidate_cols:
        if col in original_df.columns:
            weekly_stats[col] = by_week[col].mean().to_dict()

# Economic inputs: overall historical mean (the last observed value would be
# an alternative choice).
usd_avg = original_df['USD_LKR_avg'].mean()
rate_change_avg = original_df['RateChange_avg_%'].mean()

# Future classes are unknown, so each location's *_actual_class column is
# represented by the most frequent value found in the feature-engineered data,
# where these columns are already encoded.
class_mode = {
    col: fe_df[col].mode()[0]
    for col in (f'{loc}_actual_class' for loc in locations)
    if col in fe_df.columns
}

print("Historical averages computed.")

Historical averages computed.


In [7]:
# Build the model input matrix: one row per (year, week, vegetable) in future_df.
#
# Columns are gathered in a dict and materialised as a DataFrame in a single
# step (instead of inserting them one at a time), then aligned to feature_cols.
#   - vegetable            : encoded vegetable ID
#   - week_num             : integer week of the year
#   - <loc>_precipitation,
#     <loc>_prob_*         : historical per-week mean, or 0 if no stats exist
#   - <loc>_actual_class   : historical mode of the encoded class, or 0
#   - USD_LKR_avg,
#     RateChange_avg_%     : overall historical mean
week_index = future_df['week_num']

feature_data = {
    'vegetable': future_df['vegetable_id'],  # already encoded
    'week_num': week_index,
}

for loc in locations:
    # Weather features: look up the historical mean for each row's week number.
    weather_cols = [f'{loc}_precipitation']
    weather_cols += [f'{loc}_{prob}' for prob in ['prob_drought', 'prob_flood_risk', 'prob_normal']]
    for col in weather_cols:
        if col in weekly_stats:
            feature_data[col] = week_index.map(weekly_stats[col])
        else:
            feature_data[col] = 0  # fallback when the column had no history

    # Class feature: constant mode per location, defaulting to 0 ('normal').
    col = f'{loc}_actual_class'
    feature_data[col] = class_mode.get(col, 0)

# Economic features (same constant for every row).
feature_data['USD_LKR_avg'] = usd_avg
feature_data['RateChange_avg_%'] = rate_change_avg

future_features = pd.DataFrame(feature_data, index=future_df.index)

# Ensure every feature the model expects is present; anything this cell does
# not know how to build is filled with 0 and reported.
missing_cols = set(feature_cols) - set(future_features.columns)
if missing_cols:
    print(f"Warning: Missing columns: {missing_cols}. Adding with default 0.")
    for col in missing_cols:
        future_features[col] = 0

# Reorder columns to match the model's expected order.
future_features = future_features[feature_cols]

# Series.map yields NaN for any week number that has no historical average.
# Surface that explicitly rather than passing NaNs to the model unnoticed.
nan_cols = future_features.columns[future_features.isna().any()].tolist()
if nan_cols:
    print(f"Warning: NaN values found in columns: {nan_cols}. "
          "Some week numbers have no historical average.")

print("Future feature matrix shape:", future_features.shape)
future_features.head()

Future feature matrix shape: (936, 39)


Unnamed: 0,vegetable,Badulla_actual_class,Hambantota_actual_class,Jaffna_actual_class,Kurunegala_actual_class,Matale_actual_class,Nuwara_Eliya_actual_class,Ratnapura_actual_class,Badulla_precipitation,Hambantota_precipitation,...,Badulla_prob_normal,Hambantota_prob_normal,Jaffna_prob_normal,Kurunegala_prob_normal,Matale_prob_normal,Nuwara_Eliya_prob_normal,Ratnapura_prob_normal,USD_LKR_avg,RateChange_avg_%,week_num
0,1,0,0,0,0,0,0,0,6.960357,7.917321,...,0.202411,0.263839,0.301429,0.347589,0.345982,0.230982,0.36125,188.203154,0.128123,1
1,2,0,0,0,0,0,0,0,6.960357,7.917321,...,0.202411,0.263839,0.301429,0.347589,0.345982,0.230982,0.36125,188.203154,0.128123,1
2,3,0,0,0,0,0,0,0,6.960357,7.917321,...,0.202411,0.263839,0.301429,0.347589,0.345982,0.230982,0.36125,188.203154,0.128123,1
3,4,0,0,0,0,0,0,0,6.960357,7.917321,...,0.202411,0.263839,0.301429,0.347589,0.345982,0.230982,0.36125,188.203154,0.128123,1
4,5,0,0,0,0,0,0,0,6.960357,7.917321,...,0.202411,0.263839,0.301429,0.347589,0.345982,0.230982,0.36125,188.203154,0.128123,1


In [8]:
# Run the loaded model over the assembled future feature matrix.
predicted_prices = xgb_model.predict(future_features)

# Attach the predictions to the (year, week, vegetable) rows they belong to.
future_df = future_df.assign(price_predicted=predicted_prices)

print("Predictions complete.")
print(f"Predicted price range: {predicted_prices.min():.2f} – {predicted_prices.max():.2f}")

Predictions complete.
Predicted price range: 31.02 – 212.43


In [9]:
# Keep only the requested columns and expose the prediction as plain 'price'.
# Vegetable names are already strings at this point, so no decoding is needed.
output_df = (
    future_df[['year', 'week', 'vegetable', 'price_predicted']]
    .rename(columns={'price_predicted': 'price'})
)

print("Output DataFrame preview:")
print(output_df.head(10))

# Sanity check: list which vegetables made it into the output.
print(f"\nVegetables present: {output_df['vegetable'].unique()}")

Output DataFrame preview:
   year week     vegetable       price
0  2026   W1  Bitter Gourd  190.014267
1  2026   W1      Brinjals  105.558220
2  2026   W1       Cabbage   61.929848
3  2026   W1        Carrot  156.161575
4  2026   W1       Pumpkin   68.984688
5  2026   W1      Tomatoes  137.699509
6  2026   W2  Bitter Gourd  184.416580
7  2026   W2      Brinjals  103.047707
8  2026   W2       Cabbage   65.121834
9  2026   W2        Carrot  157.868988

Vegetables present: ['Bitter Gourd' 'Brinjals' 'Cabbage' 'Carrot' 'Pumpkin' 'Tomatoes']


In [10]:
# Destination for the forecast file; the directory is created if it is missing.
output_dir = "../data/predictions"
output_path = os.path.join(output_dir, "vegetable_prices_2026_2028_xgboost.csv")
os.makedirs(output_dir, exist_ok=True)

# Write the predictions without the DataFrame index column.
output_df.to_csv(output_path, index=False)
print(f"Predictions saved to: {output_path}")

Predictions saved to: ../data/predictions\vegetable_prices_2026_2028_xgboost.csv
