In [7]:
# Cell 1: Import libraries and load data
import pandas as pd
import os
import numpy as np

# Read the feature-engineered vegetable price table from disk
input_path = '../data/feature_engineered/vegetable_prices_fe.csv'
df = pd.read_csv(input_path)

# Quick sanity summary: overall dimensions, then only the leading column
# names so the printout stays readable for a wide table
leading_columns = list(df.columns[:10])
print("Original data shape:", df.shape)
print("Columns:", leading_columns, "...")

Original data shape: (4842, 171)
Columns: ['vegetable', 'price', 'Badulla_actual_class', 'Hambantota_actual_class', 'Jaffna_actual_class', 'Kurunegala_actual_class', 'Matale_actual_class', 'Nuwara_Eliya_actual_class', 'Ratnapura_actual_class', 'Badulla_precipitation'] ...


In [8]:
# Cell 2: Fail fast when year_num is absent, otherwise summarise the years present
has_year_column = 'year_num' in df.columns
if not has_year_column:
    raise KeyError("Column 'year_num' not found in the dataset. Please ensure it exists.")

# Pull the column once and reuse it for both summaries below
year_values = df['year_num']

print("Unique years in data (from year_num):", sorted(year_values.unique()))
print("\nRows per year:")
# sort_index orders the counts chronologically rather than by frequency
print(year_values.value_counts().sort_index())

Unique years in data (from year_num): [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025]

Rows per year:
year_num
2010    198
2011    312
2012    312
2013    312
2014    312
2015    312
2016    312
2017    312
2018    312
2019    312
2020    312
2021    312
2022    312
2023    312
2024    312
2025    276
Name: count, dtype: int64


In [9]:
# Cell 3: Chronological train / validation / test split driven by year_num
# range() excludes its upper bound, so each stop value is the last wanted year + 1
train_years = range(2010, 2021)      # covers 2010 through 2020
val_years   = range(2021, 2023)      # covers 2021 through 2022
test_years  = range(2023, 2026)      # covers 2023 through 2025

# Build one boolean mask per split from the shared year column; .copy() gives
# each split its own data so later edits do not touch df
year_column = df['year_num']
train = df.loc[year_column.isin(train_years)].copy()
val   = df.loc[year_column.isin(val_years)].copy()
test  = df.loc[year_column.isin(test_years)].copy()

# Report the size of every split and its share of the full dataset
for split_label, split_frame in (("Train", train), ("Validation", val), ("Test", test)):
    share_pct = len(split_frame) / len(df) * 100
    print(f"{split_label} set: {len(split_frame)} rows ({share_pct:.1f}%)")

Train set: 3318 rows (68.5%)
Validation set: 624 rows (12.9%)
Test set: 900 rows (18.6%)


In [10]:
# Cell 4: Verify that no data is lost and years are correctly assigned
# The three split sizes must add up to the size of the source frame.
assert len(train) + len(val) + len(test) == len(df), "Row count mismatch!"
print("\n✅ All rows accounted for.")

# Compute each split's year bounds once; they are printed and then checked.
train_min, train_max = train['year_num'].min(), train['year_num'].max()
val_min, val_max = val['year_num'].min(), val['year_num'].max()
test_min, test_max = test['year_num'].min(), test['year_num'].max()

print("\nTrain year range:", train_min, "-", train_max)
print("Validation year range:", val_min, "-", val_max)
print("Test year range:", test_min, "-", test_max)

# An empty split would make the min/max above NaN and every comparison below
# False, so report that case with its own message first.
assert len(train) > 0 and len(val) > 0 and len(test) > 0, "At least one split is empty!"

# Enforce (rather than only display) strict chronological order between splits.
# Strictly ordered year ranges mean no year, and therefore no row, can sit in
# two splits; combined with the row-count check this shows every row is in
# exactly one split and that no later year leaks into an earlier split.
assert train_max < val_min, "Train years overlap or follow validation years!"
assert val_max < test_min, "Validation years overlap or follow test years!"


✅ All rows accounted for.

Train year range: 2010 - 2020
Validation year range: 2021 - 2022
Test year range: 2023 - 2025


In [11]:
# Cell 5: Write each split to its own CSV file under ../data/splits/
output_dir = '../data/splits'
os.makedirs(output_dir, exist_ok=True)  # no error if the folder is already there

# Single mapping of output filename -> split, reused for writing and reporting
split_files = {
    'train.csv': train,
    'validation.csv': val,
    'test.csv': test,
}

# index=False keeps the DataFrame index out of the saved files
for file_name, split_frame in split_files.items():
    split_frame.to_csv(os.path.join(output_dir, file_name), index=False)

print(f"\n✅ Splits saved to: {output_dir}/")
print("Files created:")
for file_name, split_frame in split_files.items():
    print(f"  - {file_name} ({len(split_frame)} rows)")


✅ Splits saved to: ../data/splits/
Files created:
  - train.csv (3318 rows)
  - validation.csv (624 rows)
  - test.csv (900 rows)
