In [18]:
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv('../data/raw/Vegetable_prices_weekly.csv')

# Convert week column (e.g., "W1") to integer week number
df['week_num'] = df['week'].str.replace('W', '').astype(int)

# Create a numeric year_week column for chronological ordering
# This works because week numbers are between 1 and 53, so year*100 + week gives a unique increasing value
df['year_week'] = df['year'] * 100 + df['week_num']

In [19]:
# Identify W53 rows
w53_mask = df['week_num'] == 53

# Keep W53 rows with nonâ€‘null price, drop those with null price
df = df[~(w53_mask & df['price'].isna())].copy()

In [20]:
# List of vegetables of interest (optional: you can filter now or later)
vegetables = ['Carrot', 'Cabbage', 'Tomatoes', 'Brinjals', 'Pumpkin', 'Bitter Gourd']
df = df[df['vegetable'].isin(vegetables)]

# Sort globally by vegetable and year_week to ensure correct order per group
df = df.sort_values(['vegetable', 'year_week']).reset_index(drop=True)

# Interpolate missing prices per vegetable
df['price'] = df.groupby('vegetable')['price'].transform(
    lambda group: group.interpolate(method='linear', limit_direction='both')
)

In [21]:
df = df.dropna(subset=['price']).reset_index(drop=True)

In [22]:
# For each vegetable, check if there are any missing weeks between min and max year_week
for veg in vegetables:
    veg_df = df[df['vegetable'] == veg].sort_values('year_week')
    expected_weeks = set(range(veg_df['year_week'].min(), veg_df['year_week'].max() + 1))
    actual_weeks = set(veg_df['year_week'])
    missing = expected_weeks - actual_weeks
    if missing:
        print(f"{veg} missing weeks: {missing}")
    else:
        print(f"{veg} is continuous")

Carrot missing weeks: {201053, 201054, 201055, 201056, 201057, 201058, 201059, 201060, 201061, 201062, 201063, 201064, 201065, 201066, 201067, 201068, 201069, 201070, 201071, 201072, 201073, 201074, 201075, 201076, 201077, 201078, 201079, 201080, 201081, 201082, 201083, 201084, 201085, 201086, 201087, 201088, 201089, 201090, 201091, 201092, 201093, 201094, 201095, 201096, 201097, 201098, 201099, 201100, 201153, 201154, 201155, 201156, 201157, 201158, 201159, 201160, 201161, 201162, 201163, 201164, 201165, 201166, 201167, 201168, 201169, 201170, 201171, 201172, 201173, 201174, 201175, 201176, 201177, 201178, 201179, 201180, 201181, 201182, 201183, 201184, 201185, 201186, 201187, 201188, 201189, 201190, 201191, 201192, 201193, 201194, 201195, 201196, 201197, 201198, 201199, 201200, 201253, 201254, 201255, 201256, 201257, 201258, 201259, 201260, 201261, 201262, 201263, 201264, 201265, 201266, 201267, 201268, 201269, 201270, 201271, 201272, 201273, 201274, 201275, 201276, 201277, 201278, 2

In [23]:
import os

# Create the directory if it doesn't exist
os.makedirs('../data/preprocessed', exist_ok=True)

# Save the cleaned DataFrame
df.to_csv('../data/preprocessed/vegetable_prices_clean.csv', index=False)

In [24]:
df = pd.read_csv('../data/preprocessed/vegetable_prices_clean.csv')

# Check for missing values in each column
missing = df.isnull().sum()
print("Missing values per column:")
if missing.sum() == 0:
    print("No missing values found!")
else:
    print(missing[missing > 0])

# Total missing cells
total_missing = missing.sum()
print(f"\nTotal missing cells: {total_missing}")

# If there are missing values, show the rows that contain them
if total_missing > 0:
    print("\nRows with missing values:")
    print(df[df.isnull().any(axis=1)])

Missing values per column:
Badulla_actual_class            42
Hambantota_actual_class         42
Jaffna_actual_class             42
Kurunegala_actual_class         42
Matale_actual_class             42
Nuwara_Eliya_actual_class       42
Ratnapura_actual_class          42
Badulla_precipitation           42
Hambantota_precipitation        42
Jaffna_precipitation            42
Kurunegala_precipitation        42
Matale_precipitation            42
Nuwara_Eliya_precipitation      42
Ratnapura_precipitation         42
Badulla_prob_drought            42
Hambantota_prob_drought         42
Jaffna_prob_drought             42
Kurunegala_prob_drought         42
Matale_prob_drought             42
Nuwara_Eliya_prob_drought       42
Ratnapura_prob_drought          42
Badulla_prob_flood_risk         42
Hambantota_prob_flood_risk      42
Jaffna_prob_flood_risk          42
Kurunegala_prob_flood_risk      42
Matale_prob_flood_risk          42
Nuwara_Eliya_prob_flood_risk    42
Ratnapura_prob_flood_risk   

In [25]:
file_path = '../data/preprocessed/vegetable_prices_clean.csv'
df = pd.read_csv(file_path)

# Check for missing values
initial_rows = len(df)
missing_mask = df.isnull().any(axis=1)
rows_with_missing = missing_mask.sum()

if rows_with_missing > 0:
    print(f"Found {rows_with_missing} rows with missing values. Removing them...")
    df_clean = df.dropna().reset_index(drop=True)
    print(f"Rows before: {initial_rows}, after: {len(df_clean)}")
    # Save back to the same file
    df_clean.to_csv(file_path, index=False)
    print("Cleaned file saved.")
else:
    print("No missing values found. File remains unchanged.")

Found 42 rows with missing values. Removing them...
Rows before: 4998, after: 4956
Cleaned file saved.
