In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [2]:
# Prices datasets
farmgate = pd.read_csv('../data/raw/FarmgatePrice.csv')
wholesale = pd.read_csv('../data/raw/WholesalePrice.csv')
retail = pd.read_csv('../data/raw/RetailPrice.csv')

# Additional datasets
cpi = pd.read_csv('../data/raw/ConsumerPriceIndex.csv')
vop = pd.read_csv('../data/raw/VolumeOfProductionByMetricTons.csv')
harvested = pd.read_csv('../data/raw/AreaHarvestedInHectares.csv')

In [None]:
# clean data and reshaped data
def clean_data(df, column_1, column_2, value_column_name, monthly, is_cpi = False):
    df = df.copy()

    df_long = df.melt(id_vars=[column_1, column_2], var_name='Date', value_name=value_column_name) if monthly else distribute_quarterly_to_monthly(df, column_1, column_2, value_column_name)

    df_long = clean_geolocation(df_long)
    
    df_long[value_column_name] = pd.to_numeric(df_long[value_column_name], errors='coerce')
    
    if is_cpi:
        df_long['Date'] = pd.to_datetime(df_long['Date'], format='%Y %b')
    else:
        df_long['Date'] = pd.to_datetime(df_long['Date'], format='%Y %B')
    
    df_long = df_long.sort_values(by=[column_1, "Commodity", 'Date'])
    
    df_long[column_1] = df_long[column_1].str.lower().str.strip()
    
    df_long["Commodity"] = df_long["Commodity"].str.lower().str.strip()
    
    return df_long

def distribute_quarterly_to_monthly(df, column_1, column_2, value_column_name):
    df = df.copy()
    
    # Melt dataset to long format
    df_long = df.melt(id_vars=[column_1, column_2], var_name="Quarter", value_name=value_column_name)
    
    # Extract year and quarter from the Quarter column
    df_long["Year"] = df_long["Quarter"].str[:4].astype(int)
    df_long["Quarter"] = df_long["Quarter"].str[-1].astype(int)

    quarter_to_months = {
        1: [1, 2, 3],
        2: [4, 5, 6],
        3: [7, 8, 9],
        4: [10, 11, 12]
    }
    
    expanded_rows = []
    
    for _, row in df_long.iterrows():
        for month in quarter_to_months[row["Quarter"]]:
            expanded_rows.append({
                column_1: row[column_1],
                column_2: row[column_2],
                "Date": pd.Timestamp(year=row["Year"], month=month, day=1),
                value_column_name: row[value_column_name] / 3
            })
            
    df_monthly = pd.DataFrame(expanded_rows)
    
    return df_monthly

# remove .. in geolocation values and replace .. values to na
def clean_geolocation(df):
    df = df.rename(columns={df.columns.values[0]: "Geolocation", df.columns.values[1]: "Commodity"})
    
    
    
    return df

def clean_cpi_commodity(commodity):
    return commodity.split(' - ')[-1].strip().lower()

def map_commodity_to_standard(commodity):
    
    commodity_lower = commodity.lower()
    
    if 'corn' in commodity_lower or 'maize' in commodity_lower:
        return 'corn'
    elif 'palay' in commodity_lower or 'rice' in commodity_lower:
        return 'rice'
    else:
        return None
    
def merge_datasets(price_df, cpi_df, vop_df, harvested_df):
    
    price_df = price_df.merge(cpi_df, left_on=["Geolocation", "Commodity_Type", "Date"], 
                              right_on=["Geolocation", "Commodity", "Date"], how="left").drop(columns=["Commodity_y"]).rename(columns={"Commodity_x": "Commodity"})
    
    price_df = price_df.merge(vop_df, left_on=["Geolocation", "Commodity_Type", "Date"], 
                              right_on=["Geolocation", "Commodity_Type", "Date"], how="left").drop(columns=["Commodity_y"]).rename(columns={"Commodity_x": "Commodity"})
    
    price_df = price_df.merge(harvested_df, left_on=["Geolocation", "Commodity_Type", "Date"], 
                              right_on=["Geolocation", "Commodity_Type", "Date"], how="left").drop(columns=["Commodity_y"]).rename(columns={"Commodity_x": "Commodity"})

    # Drop CPI_Commodity, Standard_Commodity, and duplicate Commodity columns
    price_df = price_df.drop(columns=["CPI_Commodity"], errors="ignore")

    return price_df

In [4]:
farmgate = clean_data(farmgate, "Geolocation", "Commodity", "Price", True)
wholesale = clean_data(wholesale, "Geolocation", "Commodity", "Price", True)
retail = clean_data(retail, "Geolocation", "Commodity", "Price", True)

vop = clean_data(vop, "Geolocation", "Ecosystem/Croptype", "Production_Volume_by_MetricTons", False)
harvested = clean_data(harvested, "Geolocation", "Ecosystem/Croptype", "Area_Harvested_in_Hectares", False)
cpi = clean_data(cpi, "Geolocation", "Commodity Description", "CPI", True, is_cpi=True)

cpi["Commodity"] = cpi["Commodity"].apply(clean_cpi_commodity)

In [5]:
farmgate['Commodity_Type'] = farmgate['Commodity'].apply(map_commodity_to_standard)
wholesale['Commodity_Type'] = wholesale['Commodity'].apply(map_commodity_to_standard)
retail['Commodity_Type'] = retail['Commodity'].apply(map_commodity_to_standard)
vop['Commodity_Type'] = vop['Commodity'].apply(map_commodity_to_standard)
harvested['Commodity_Type'] = harvested['Commodity'].apply(map_commodity_to_standard)

In [6]:
# remove commodtities with high missing values per region
def filter_missing_values(df, threshold = 0.4):
    filtered_dfs = []
    
    for region in df['Geolocation'].unique():
        region_df = df[df['Geolocation'] == region]
        
        missing_ratio = region_df.groupby('Commodity')['Price'].apply(lambda x: x.isna().mean())
        
        valid_commodities = missing_ratio[missing_ratio < threshold].index
        filtered_region_df = region_df[region_df['Commodity'].isin(valid_commodities)]
        
        filtered_dfs.append(filtered_region_df)
        
    return pd.concat(filtered_dfs, ignore_index=True)

In [7]:
# apply filtering
farmgate = filter_missing_values(farmgate)
wholesale = filter_missing_values(wholesale)
retail = filter_missing_values(retail)

In [8]:
# Fill remaining missing values using forward and backward fill per region & commodity
def fill_missing_values(df):
    return df.groupby(["Geolocation", "Commodity"], group_keys=False).apply(lambda group: group.ffill().bfill()).reset_index(drop=True)

In [9]:
cleaned_farmgate = fill_missing_values(farmgate)
cleaned_wholesale = fill_missing_values(wholesale)
cleaned_retail = fill_missing_values(retail)

  return df.groupby(["Geolocation", "Commodity"], group_keys=False).apply(lambda group: group.ffill().bfill()).reset_index(drop=True)
  return df.groupby(["Geolocation", "Commodity"], group_keys=False).apply(lambda group: group.ffill().bfill()).reset_index(drop=True)
  return df.groupby(["Geolocation", "Commodity"], group_keys=False).apply(lambda group: group.ffill().bfill()).reset_index(drop=True)


In [10]:
REGIONS = {
    "autonomous region in muslim mindanao (armm)" : "bangsamoro autonomous region in muslim mindanao (barmm)",
    "bangsamoro autonomous region in muslim mindanao (barmm)" : "bangsamoro autonomous region in muslim mindanao (barmm)",
    "cordillera administrative region (car)" : "cordillera administrative region (car)",
    "mimaropa region" : "mimaropa region",
    "region i (ilocos region)" : "region i (ilocos region)",
    "region ii (cagayan valley)" : "region ii (cagayan valley)",
    "region iii (central luzon)" : "region iii (central luzon)",
    "region iv-a (calabarzon)" : "region iv-a (calabarzon)",
    "region ix (zamboanga peninsula)" : "region ix (zamboanga peninsula)",
    "region v (bicol region)" : "region v (bicol region)",
    "region vi (western visayas)" : "region vi (western visayas)",
    "region v (bicol region)" : "region vi (western visayas)",
    "region vii (central visayas)" : "region vii (central visayas)",
    "region viii (eastern visayas)" : "region viii (eastern visayas)",
    "region ix (zamboanga peninsula)" : "region x (northern mindanao)",
    "region x (northern mindanao)" : "region x (northern mindanao)",
    "region xi (davao region)" : "region xi (davao region)",
    "region xii (soccsksargen)" : "region xii (soccsksargen)",
    "region xiii (caraga)" : "region xiii (caraga)"
}

In [11]:
def region_mapping(df, column_name, region_mapping):
    df = df.copy()
    
    df[column_name] = df[column_name].map(region_mapping)
    
    return df

cleaned_farmgate = region_mapping(cleaned_farmgate, 'Geolocation', REGIONS)
cleaned_retail = region_mapping(cleaned_retail, 'Geolocation', REGIONS)
cleaned_wholesale = region_mapping(cleaned_wholesale, 'Geolocation', REGIONS)
cleaned_vop = region_mapping(vop, 'Geolocation', REGIONS)
cleaned_harvested = region_mapping(harvested, 'Geolocation', REGIONS)
cleaned_cpi = region_mapping(cpi, 'Geolocation', REGIONS)

In [12]:
# save cleaned data
cleaned_farmgate.to_csv('../data/cleaned/cleaned_farmgate.csv', index=False)
cleaned_wholesale.to_csv('../data/cleaned/cleaned_wholesale.csv', index=False)
cleaned_retail.to_csv('../data/cleaned/cleaned_retail.csv', index=False)

cleaned_vop.to_csv('../data/cleaned/cleaned_vop.csv', index=False)
cleaned_harvested.to_csv('../data/cleaned/cleaned_harvested.csv', index=False)
cleaned_cpi.to_csv('../data/cleaned/cleaned_cpi.csv', index=False)

In [13]:
merged_farmgate = merge_datasets(cleaned_farmgate, cleaned_cpi, cleaned_vop, cleaned_harvested)
merged_wholesale = merge_datasets(cleaned_wholesale, cleaned_cpi, cleaned_vop, cleaned_harvested)
merged_retail = merge_datasets(cleaned_retail, cleaned_cpi, cleaned_vop, cleaned_harvested)

In [21]:
merged_farmgate.to_csv('../data/merged/merged_farmgate.csv', index=False)
merged_wholesale.to_csv('../data/merged/merged_wholesale.csv', index=False)
merged_retail.to_csv('../data/merged/merged_retail.csv', index=False)

In [15]:
def feature_engineering(df):
    df = df.copy()
    
    df['Date'] = pd.to_datetime(df['Date'])
    
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Quarter'] = df['Date'].dt.quarter
    
    df['Price_Lag_1'] = df.groupby(['Geolocation', 'Commodity_Type'])['Price'].shift(1)
    
    df["Price_MA_3"] = df.groupby(["Geolocation", "Commodity_Type"])["Price"].rolling(window=3, min_periods=1).mean().reset_index(level=[0,1], drop=True)
    df["Price_MA_6"] = df.groupby(["Geolocation", "Commodity_Type"])["Price"].rolling(window=6, min_periods=1).mean().reset_index(level=[0,1], drop=True)
    
    df["Price_Std_3"] = df.groupby(["Geolocation", "Commodity_Type"])["Price"].rolling(window=3, min_periods=1).std().reset_index(level=[0,1], drop=True)
    
    df["CPI_Lag_1"] = df.groupby(["Geolocation", "Commodity_Type"])["CPI"].shift(1)
    
    df = df.ffill().bfill()
    
    return df

In [16]:
processed_farmgate = feature_engineering(merged_farmgate)
processed_wholesale = feature_engineering(merged_wholesale)
processed_retail = feature_engineering(merged_retail)

In [17]:
processed_farmgate['Price_Type'] = 'Farmgate'
processed_wholesale['Price_Type'] = 'Wholesale'
processed_retail['Price_Type'] = 'Retail'

common_columns = ['Geolocation', 'Commodity', 'Commodity_Type', 'Date', 'Year', 'Month', 'Quarter', 
                  'Price', 'Price_Lag_1', 'Price_MA_3', 'Price_MA_6', 'Price_Std_3',
                  "CPI_Lag_1", 'CPI', 'Production_Volume_by_MetricTons', 'Area_Harvested_in_Hectares', 'Price_Type']

processed_farmgate = processed_farmgate[common_columns]
processed_wholesale = processed_wholesale[common_columns]
processed_retail = processed_retail[common_columns]

final_dataset = pd.concat([processed_farmgate, processed_wholesale, processed_retail], ignore_index=True)
final_dataset.to_csv('../data/final/final_dataset.csv', index=False)

encoded_dataset = final_dataset.copy()

le_geo = LabelEncoder()
le_commodity = LabelEncoder()
le_commodity_type = LabelEncoder()

encoded_dataset['Geolocation_Encoded'] = le_geo.fit_transform(encoded_dataset["Geolocation"])
encoded_dataset['Commodity_Encoded'] = le_geo.fit_transform(encoded_dataset["Commodity"])
encoded_dataset['Commodity_Type_Encoded'] = le_geo.fit_transform(encoded_dataset["Commodity_Type"])

encoded_dataset.to_csv('../data/final/final_dataset_encoded.csv', index=False)

In [18]:
def plot_trends(df, price_type):
    regions = df['Geolocation'].unique()
    
    for region in regions:
        plt.figure(figsize=(12, 6))
        
        region_df = df[df['Geolocation'] == region]
        
        commodities = region_df['Commodity'].unique()
        
        for commodity in commodities:
            subset = region_df[region_df['Commodity'] == commodity].groupby('Date')['Price'].mean()
            plt.plot(subset.index, subset.values, label=commodity, marker='o', markersize=5, linestyle='-')
            
        
        plt.xlabel('Date')
        plt.ylabel('Average Price (PHP)')
        plt.title(f'{price_type} Trends for Each Commodity in {region} {2020-2024}')
        plt.xticks(rotation=45)
        plt.legend()
        plt.grid(True)
        
        plt.show()