In [41]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

# Read the feature-engineered vegetable price table from disk.
input_path = "../data/feature_engineered/vegetable_prices_fe.csv"
df = pd.read_csv(input_path)

# Report the load and how many rows were read.
print("Dataset loaded successfully.", f"Total samples: {df.shape[0]}", sep="\n")

# Keep the preview as the cell's last expression so it is rendered as output.
df.head(n=5)

Dataset loaded successfully.
Total samples: 4956


Unnamed: 0,vegetable,price,Badulla_actual_class,Hambantota_actual_class,Jaffna_actual_class,Kurunegala_actual_class,Matale_actual_class,Nuwara_Eliya_actual_class,Ratnapura_actual_class,Badulla_precipitation,...,Badulla_prob_normal,Hambantota_prob_normal,Jaffna_prob_normal,Kurunegala_prob_normal,Matale_prob_normal,Nuwara_Eliya_prob_normal,Ratnapura_prob_normal,USD_LKR_avg,RateChange_avg_%,week_num
0,1,52.4,-1,0,-1,-1,-1,-1,0,2.89,...,0.004286,0.994286,0.017143,0.015714,0.015714,0.035714,0.967143,114.4,0.0,1
1,1,58.6,-1,0,-1,-1,-1,-1,0,7.071429,...,0.012857,0.911429,0.04,0.047143,0.05,0.055714,0.905714,114.25,-0.13,2
2,1,58.6,-1,0,-1,-1,-1,-1,0,4.504286,...,0.011429,0.942857,0.002857,0.01,0.01,0.004286,0.912857,114.35,0.09,3
3,1,58.6,-1,0,-1,-1,-1,-1,0,0.911429,...,0.002857,0.985714,0.002857,0.008571,0.007143,0.011429,0.954286,114.65,0.26,4
4,1,54.3,-1,1,-1,-1,-1,-1,-1,8.752857,...,0.012857,0.325714,0.004286,0.011429,0.011429,0.012857,0.431429,114.7,0.04,5


In [42]:
# Target share of rows for each split; the three values must add up to 1.
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

# Fail early on a misconfigured split instead of silently producing
# partitions whose sizes disagree with the ratios declared above.
ratio_total = train_ratio + val_ratio + test_ratio
if abs(ratio_total - 1.0) > 1e-9:
    raise ValueError(f"Split ratios must sum to 1.0, got {ratio_total}")
if min(train_ratio, val_ratio, test_ratio) <= 0:
    raise ValueError("Each split ratio must be strictly positive")

# One seed shared by both shuffles so the partition is reproducible.
split_seed = 42

# NOTE(review): train_test_split shuffles rows by default, and the dataset
# carries a week_num column. If the downstream model forecasts future prices,
# confirm that a random (non-chronological) split is really intended.

# First split: carve off the training rows; the remainder (temp_df) holds
# the validation and test rows together.
train_df, temp_df = train_test_split(
    df, test_size=(1 - train_ratio), random_state=split_seed
)

# Second split: temp_df contains only val + test, so the test share has to be
# re-expressed relative to that remainder rather than to the full dataset.
test_share_of_temp = test_ratio / (val_ratio + test_ratio)
val_df, test_df = train_test_split(
    temp_df, test_size=test_share_of_temp, random_state=split_seed
)

# Report absolute and relative size of every split.
for label, part in (("Train", train_df), ("Validation", val_df), ("Test", test_df)):
    print(f"{label} set size: {len(part)} ({len(part)/len(df):.1%})")

Train set size: 3469 (70.0%)
Validation set size: 743 (15.0%)
Test set size: 744 (15.0%)


In [43]:
# Sanity-check the partition: no row label may appear in two splits, and the
# split sizes must add up to the size of the full frame.
train_idx = set(train_df.index)
val_idx = set(val_df.index)
test_idx = set(test_df.index)

# Each entry: two index sets that must not share a label, and the failure message.
pairwise_checks = (
    (train_idx, val_idx, "Train and validation overlap!"),
    (train_idx, test_idx, "Train and test overlap!"),
    (val_idx, test_idx, "Validation and test overlap!"),
)
for left, right, message in pairwise_checks:
    assert not (left & right), message

total_rows = sum(len(part) for part in (train_df, val_df, test_df))
assert total_rows == len(df), "Row count mismatch!"

print("All checks passed: splits are correct and cover the whole dataset.")

All checks passed: splits are correct and cover the whole dataset.


In [44]:
# Directory that receives the split files; created if it does not exist yet.
output_dir = "../data/splits"
os.makedirs(output_dir, exist_ok=True)

# Write every split under its own file name, leaving the row index out.
for file_name, split_df in (
    ("train.csv", train_df),
    ("validation.csv", val_df),
    ("test.csv", test_df),
):
    split_df.to_csv(os.path.join(output_dir, file_name), index=False)

print(f"Splits saved to {output_dir}")
print("Files created: train.csv, validation.csv, test.csv")

Splits saved to ../data/splits
Files created: train.csv, validation.csv, test.csv
