In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Load the original cleaned dataset
input_path = "../data/preprocessed/vegetable_prices_clean.csv"
df = pd.read_csv(input_path)

print(f"Original shape: {df.shape}")
df.head()

Original shape: (4956, 43)


Unnamed: 0,year,week,vegetable,price,Badulla_actual_class,Hambantota_actual_class,Jaffna_actual_class,Kurunegala_actual_class,Matale_actual_class,Nuwara_Eliya_actual_class,...,Hambantota_prob_normal,Jaffna_prob_normal,Kurunegala_prob_normal,Matale_prob_normal,Nuwara_Eliya_prob_normal,Ratnapura_prob_normal,USD_LKR_avg,RateChange_avg_%,week_num,year_week
0,2010,W1,Bitter Gourd,52.4,drought,normal,drought,drought,drought,drought,...,0.994286,0.017143,0.015714,0.015714,0.035714,0.967143,114.4,0.0,1,201001
1,2010,W2,Bitter Gourd,58.6,drought,normal,drought,drought,drought,drought,...,0.911429,0.04,0.047143,0.05,0.055714,0.905714,114.25,-0.13,2,201002
2,2010,W3,Bitter Gourd,58.6,drought,normal,drought,drought,drought,drought,...,0.942857,0.002857,0.01,0.01,0.004286,0.912857,114.35,0.09,3,201003
3,2010,W4,Bitter Gourd,58.6,drought,normal,drought,drought,drought,drought,...,0.985714,0.002857,0.008571,0.007143,0.011429,0.954286,114.65,0.26,4,201004
4,2010,W5,Bitter Gourd,54.3,drought,flood_risk,drought,drought,drought,drought,...,0.325714,0.004286,0.011429,0.011429,0.012857,0.431429,114.7,0.04,5,201005


In [3]:
# Columns to drop
columns_to_drop = ['year', 'week', 'year_week']
df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

print("Remaining columns:", list(df.columns))
df.head()

Remaining columns: ['vegetable', 'price', 'Badulla_actual_class', 'Hambantota_actual_class', 'Jaffna_actual_class', 'Kurunegala_actual_class', 'Matale_actual_class', 'Nuwara_Eliya_actual_class', 'Ratnapura_actual_class', 'Badulla_precipitation', 'Hambantota_precipitation', 'Jaffna_precipitation', 'Kurunegala_precipitation', 'Matale_precipitation', 'Nuwara_Eliya_precipitation', 'Ratnapura_precipitation', 'Badulla_prob_drought', 'Hambantota_prob_drought', 'Jaffna_prob_drought', 'Kurunegala_prob_drought', 'Matale_prob_drought', 'Nuwara_Eliya_prob_drought', 'Ratnapura_prob_drought', 'Badulla_prob_flood_risk', 'Hambantota_prob_flood_risk', 'Jaffna_prob_flood_risk', 'Kurunegala_prob_flood_risk', 'Matale_prob_flood_risk', 'Nuwara_Eliya_prob_flood_risk', 'Ratnapura_prob_flood_risk', 'Badulla_prob_normal', 'Hambantota_prob_normal', 'Jaffna_prob_normal', 'Kurunegala_prob_normal', 'Matale_prob_normal', 'Nuwara_Eliya_prob_normal', 'Ratnapura_prob_normal', 'USD_LKR_avg', 'RateChange_avg_%', 'week_n

Unnamed: 0,vegetable,price,Badulla_actual_class,Hambantota_actual_class,Jaffna_actual_class,Kurunegala_actual_class,Matale_actual_class,Nuwara_Eliya_actual_class,Ratnapura_actual_class,Badulla_precipitation,...,Badulla_prob_normal,Hambantota_prob_normal,Jaffna_prob_normal,Kurunegala_prob_normal,Matale_prob_normal,Nuwara_Eliya_prob_normal,Ratnapura_prob_normal,USD_LKR_avg,RateChange_avg_%,week_num
0,Bitter Gourd,52.4,drought,normal,drought,drought,drought,drought,normal,2.89,...,0.004286,0.994286,0.017143,0.015714,0.015714,0.035714,0.967143,114.4,0.0,1
1,Bitter Gourd,58.6,drought,normal,drought,drought,drought,drought,normal,7.071429,...,0.012857,0.911429,0.04,0.047143,0.05,0.055714,0.905714,114.25,-0.13,2
2,Bitter Gourd,58.6,drought,normal,drought,drought,drought,drought,normal,4.504286,...,0.011429,0.942857,0.002857,0.01,0.01,0.004286,0.912857,114.35,0.09,3
3,Bitter Gourd,58.6,drought,normal,drought,drought,drought,drought,normal,0.911429,...,0.002857,0.985714,0.002857,0.008571,0.007143,0.011429,0.954286,114.65,0.26,4
4,Bitter Gourd,54.3,drought,flood_risk,drought,drought,drought,drought,drought,8.752857,...,0.012857,0.325714,0.004286,0.011429,0.011429,0.012857,0.431429,114.7,0.04,5


In [4]:
# Mapping for actual_class
class_mapping = {
    'normal': 0,
    'flood_risk': 1,
    'drought': -1
}

class_columns = [
    'Badulla_actual_class', 'Hambantota_actual_class', 'Jaffna_actual_class',
    'Kurunegala_actual_class', 'Matale_actual_class', 'Nuwara_Eliya_actual_class',
    'Ratnapura_actual_class'
]

for col in class_columns:
    if col in df.columns:
        df[col] = df[col].map(class_mapping)

print("Unique values after encoding:")
for col in class_columns:
    if col in df.columns:
        print(f"{col}: {df[col].unique()[:5]}")
df.head()

Unique values after encoding:
Badulla_actual_class: [-1  0  1]
Hambantota_actual_class: [ 0  1 -1]
Jaffna_actual_class: [-1  0  1]
Kurunegala_actual_class: [-1  0  1]
Matale_actual_class: [-1  0  1]
Nuwara_Eliya_actual_class: [-1  0  1]
Ratnapura_actual_class: [ 0 -1  1]


Unnamed: 0,vegetable,price,Badulla_actual_class,Hambantota_actual_class,Jaffna_actual_class,Kurunegala_actual_class,Matale_actual_class,Nuwara_Eliya_actual_class,Ratnapura_actual_class,Badulla_precipitation,...,Badulla_prob_normal,Hambantota_prob_normal,Jaffna_prob_normal,Kurunegala_prob_normal,Matale_prob_normal,Nuwara_Eliya_prob_normal,Ratnapura_prob_normal,USD_LKR_avg,RateChange_avg_%,week_num
0,Bitter Gourd,52.4,-1,0,-1,-1,-1,-1,0,2.89,...,0.004286,0.994286,0.017143,0.015714,0.015714,0.035714,0.967143,114.4,0.0,1
1,Bitter Gourd,58.6,-1,0,-1,-1,-1,-1,0,7.071429,...,0.012857,0.911429,0.04,0.047143,0.05,0.055714,0.905714,114.25,-0.13,2
2,Bitter Gourd,58.6,-1,0,-1,-1,-1,-1,0,4.504286,...,0.011429,0.942857,0.002857,0.01,0.01,0.004286,0.912857,114.35,0.09,3
3,Bitter Gourd,58.6,-1,0,-1,-1,-1,-1,0,0.911429,...,0.002857,0.985714,0.002857,0.008571,0.007143,0.011429,0.954286,114.65,0.26,4
4,Bitter Gourd,54.3,-1,1,-1,-1,-1,-1,-1,8.752857,...,0.012857,0.325714,0.004286,0.011429,0.011429,0.012857,0.431429,114.7,0.04,5


In [5]:
# Mapping for vegetables
veg_mapping = {
    'Bitter Gourd': 1,
    'Brinjals': 2,
    'Cabbage': 3,
    'Carrot': 4,
    'Pumpkin': 5,
    'Tomatoes': 6
}

if 'vegetable' in df.columns:
    df['vegetable'] = df['vegetable'].map(veg_mapping)
    print("Unique vegetable codes:", df['vegetable'].unique())
else:
    print("Warning: 'vegetable' column not found.")
df.head()

Unique vegetable codes: [1 2 3 4 5 6]


Unnamed: 0,vegetable,price,Badulla_actual_class,Hambantota_actual_class,Jaffna_actual_class,Kurunegala_actual_class,Matale_actual_class,Nuwara_Eliya_actual_class,Ratnapura_actual_class,Badulla_precipitation,...,Badulla_prob_normal,Hambantota_prob_normal,Jaffna_prob_normal,Kurunegala_prob_normal,Matale_prob_normal,Nuwara_Eliya_prob_normal,Ratnapura_prob_normal,USD_LKR_avg,RateChange_avg_%,week_num
0,1,52.4,-1,0,-1,-1,-1,-1,0,2.89,...,0.004286,0.994286,0.017143,0.015714,0.015714,0.035714,0.967143,114.4,0.0,1
1,1,58.6,-1,0,-1,-1,-1,-1,0,7.071429,...,0.012857,0.911429,0.04,0.047143,0.05,0.055714,0.905714,114.25,-0.13,2
2,1,58.6,-1,0,-1,-1,-1,-1,0,4.504286,...,0.011429,0.942857,0.002857,0.01,0.01,0.004286,0.912857,114.35,0.09,3
3,1,58.6,-1,0,-1,-1,-1,-1,0,0.911429,...,0.002857,0.985714,0.002857,0.008571,0.007143,0.011429,0.954286,114.65,0.26,4
4,1,54.3,-1,1,-1,-1,-1,-1,-1,8.752857,...,0.012857,0.325714,0.004286,0.011429,0.011429,0.012857,0.431429,114.7,0.04,5


In [6]:
# Sort by vegetable and week number to ensure correct order
df = df.sort_values(['vegetable', 'week_num']).reset_index(drop=True)

print("Data sorted.")
df[['vegetable', 'week_num', 'price']].head(10)

Data sorted.


Unnamed: 0,vegetable,week_num,price
0,1,1,52.4
1,1,1,67.5
2,1,1,75.6
3,1,1,60.0
4,1,1,66.0
5,1,1,132.0
6,1,1,134.0
7,1,1,164.0
8,1,1,160.0
9,1,1,112.0


In [7]:
# For each vegetable, shift price to get previous weeks' prices
df['price_lag1'] = df.groupby('vegetable')['price'].shift(1)
df['price_lag2'] = df.groupby('vegetable')['price'].shift(2)

print("Lag features created.")
df[['vegetable', 'week_num', 'price', 'price_lag1', 'price_lag2']].head(10)

Lag features created.


Unnamed: 0,vegetable,week_num,price,price_lag1,price_lag2
0,1,1,52.4,,
1,1,1,67.5,52.4,
2,1,1,75.6,67.5,52.4
3,1,1,60.0,75.6,67.5
4,1,1,66.0,60.0,75.6
5,1,1,132.0,66.0,60.0
6,1,1,134.0,132.0,66.0
7,1,1,164.0,134.0,132.0
8,1,1,160.0,164.0,134.0
9,1,1,112.0,160.0,164.0


In [8]:
# Rolling mean and standard deviation over 4 weeks (minimum 1 observation)
df['price_roll_mean_4'] = df.groupby('vegetable')['price'].transform(
    lambda x: x.rolling(4, min_periods=1).mean()
)
df['price_roll_std_4'] = df.groupby('vegetable')['price'].transform(
    lambda x: x.rolling(4, min_periods=1).std()
)

print("Rolling features created.")
df[['vegetable', 'week_num', 'price', 'price_roll_mean_4', 'price_roll_std_4']].head(10)

Rolling features created.


Unnamed: 0,vegetable,week_num,price,price_roll_mean_4,price_roll_std_4
0,1,1,52.4,52.4,
1,1,1,67.5,59.95,10.677312
2,1,1,75.6,65.166667,11.77469
3,1,1,60.0,63.875,9.955024
4,1,1,66.0,67.275,6.426702
5,1,1,132.0,83.4,33.030895
6,1,1,134.0,98.0,40.496913
7,1,1,164.0,124.0,41.344085
8,1,1,160.0,147.5,16.842407
9,1,1,112.0,142.5,24.296776


In [9]:
# Lag features will be NaN for the first two weeks of each vegetable.
# We drop those rows to have a complete dataset.
initial_len = len(df)
df.dropna(subset=['price_lag1', 'price_lag2'], inplace=True)
df.reset_index(drop=True, inplace=True)

print(f"Dropped {initial_len - len(df)} rows with missing lags.")
print(f"New shape: {df.shape}")

Dropped 12 rows with missing lags.
New shape: (4944, 44)


In [10]:
missing = df.isnull().sum()
print("Missing values per column:")
print(missing[missing > 0] if any(missing > 0) else "No missing values found.")

Missing values per column:
No missing values found.


In [11]:
# Define output directory and filename
output_dir = "../data/feature_engineered"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "vegetable_prices_fe.csv")

# Save to CSV
df.to_csv(output_path, index=False)
print(f"Enhanced feature set saved to: {output_path}")

Enhanced feature set saved to: ../data/feature_engineered\vegetable_prices_fe.csv
