In [9]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

# Read the feature-engineered vegetable price table produced by the previous stage.
input_path = "../data/feature_engineered/vegetable_prices_fe.csv"
df = pd.read_csv(input_path)

# Confirm the load and report the row count in a single call.
print("Dataset loaded successfully.", f"Total samples: {len(df)}", sep="\n")

# Last expression of the cell: rich preview of the first rows.
df.head()

Dataset loaded successfully.
Total samples: 4944


Unnamed: 0,vegetable,price,Badulla_actual_class,Hambantota_actual_class,Jaffna_actual_class,Kurunegala_actual_class,Matale_actual_class,Nuwara_Eliya_actual_class,Ratnapura_actual_class,Badulla_precipitation,...,Matale_prob_normal,Nuwara_Eliya_prob_normal,Ratnapura_prob_normal,USD_LKR_avg,RateChange_avg_%,week_num,price_lag1,price_lag2,price_roll_mean_4,price_roll_std_4
0,1,75.6,-1,-1,-1,-1,-1,-1,-1,5.428571,...,0.012857,0.004286,0.008571,113.83,-0.06,1,67.5,52.4,65.166667,11.77469
1,1,60.0,1,1,1,1,1,-1,-1,21.025714,...,0.758571,0.014286,0.015714,126.25,-0.9,1,75.6,67.5,63.875,9.955024
2,1,66.0,-1,-1,-1,0,0,0,0,2.248571,...,0.992857,0.974286,0.998571,130.7,0.0,1,60.0,75.6,67.275,6.426702
3,1,132.0,-1,-1,-1,-1,-1,-1,-1,0.792857,...,0.025714,0.021429,0.015714,131.5,0.15,1,66.0,60.0,83.4,33.030895
4,1,134.0,-1,-1,-1,-1,-1,-1,-1,6.584286,...,0.017143,0.004286,0.007143,143.72,-0.13,1,132.0,66.0,98.0,40.496913


In [10]:
# Target share of the full dataset assigned to each split.
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

# One seed shared by both split calls so the partition is reproducible.
SEED = 42

# Fail fast on inconsistent ratios: the first split derives the hold-out share
# from train_ratio alone, so a val/test pair that does not add up to
# (1 - train_ratio) would otherwise pass unnoticed and yield split sizes that
# differ from the values declared above.
assert min(train_ratio, val_ratio, test_ratio) > 0, "Split ratios must be positive!"
assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-9, "Split ratios must sum to 1!"

# NOTE(review): train_test_split shuffles rows by default, and this dataset
# carries lagged/rolling price columns (price_lag1, price_lag2,
# price_roll_mean_4, price_roll_std_4). If the rows are time-ordered, a shuffled
# split can place neighbouring observations in different sets -- confirm that a
# random split (rather than a chronological one) is intended.

# First split: separate train from a temporary hold-out (validation + test).
train_df, temp_df = train_test_split(df, test_size=(1 - train_ratio), random_state=SEED)

# Second split: divide the hold-out into validation and test.
# test_size is expressed relative to the hold-out, hence the renormalisation
# by (val_ratio + test_ratio).
val_df, test_df = train_test_split(
    temp_df,
    test_size=test_ratio / (val_ratio + test_ratio),
    random_state=SEED,
)

# Report the absolute size of each split and its share of the full dataset.
print(f"Train set size: {len(train_df)} ({len(train_df)/len(df):.1%})")
print(f"Validation set size: {len(val_df)} ({len(val_df)/len(df):.1%})")
print(f"Test set size: {len(test_df)} ({len(test_df)/len(df):.1%})")

Train set size: 3460 (70.0%)
Validation set size: 742 (15.0%)
Test set size: 742 (15.0%)


In [11]:
# Sanity-check the partition: no row label may appear in two splits, and the
# three splits together must account for every row of the source frame.
assert not set(train_df.index).intersection(val_df.index), "Train and validation overlap!"
assert not set(train_df.index).intersection(test_df.index), "Train and test overlap!"
assert not set(val_df.index).intersection(test_df.index), "Validation and test overlap!"
assert sum(len(part) for part in (train_df, val_df, test_df)) == len(df), "Row count mismatch!"

print("All checks passed: splits are correct and cover the whole dataset.")

All checks passed: splits are correct and cover the whole dataset.


In [12]:
# Destination folder for the split files; created on demand.
output_dir = "../data/splits"
os.makedirs(output_dir, exist_ok=True)

# Write each split to its own CSV. index=False drops the row labels, so the
# saved files contain only the data columns.
for file_name, split_frame in (
    ("train.csv", train_df),
    ("validation.csv", val_df),
    ("test.csv", test_df),
):
    split_frame.to_csv(os.path.join(output_dir, file_name), index=False)

print(f"Splits saved to {output_dir}")
print("Files created: train.csv, validation.csv, test.csv")

Splits saved to ../data/splits
Files created: train.csv, validation.csv, test.csv
