In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [48]:
# Prices datasets
farmgate = pd.read_csv('../data/raw/FarmgatePrice.csv')
wholesale = pd.read_csv('../data/raw/WholesalePrice.csv')
retail = pd.read_csv('../data/raw/RetailPrice.csv')

# Additional datasets
cpi = pd.read_csv('../data/raw/ConsumerPriceIndex.csv')
vop = pd.read_csv('../data/raw/VolumeOfProductionByMetricTons.csv')
harvested = pd.read_csv('../data/raw/AreaHarvestedInHectares.csv')

In [49]:
# clean data and reshaped data
def clean_data(df, column_1, column_2):
    df = df.copy()
    
    df = clean_geolocation(df, column_1, column_2)

    df_long = df.melt(id_vars=[column_1, column_2], var_name='Date', value_name='Price')
    
    df_long['Price'] = pd.to_numeric(df_long['Price'], errors='coerce')
    
    df_long['Date'] = pd.to_datetime(df_long['Date'], format='%Y %B')
    
    df_long = df_long.sort_values(by=[column_1, column_2, 'Date'])
    
    df_long[column_1] = df_long[column_1].str.lower().str.strip()
    
    df_long[column_2] = df_long[column_2].str.lower().str.strip()
    
    return df_long

# remove .. in geolocation values and replace .. values to na
def clean_geolocation(df, column_1, column_2):
    df.columns.values[0] = column_1
    df.columns.values[1] = column_2
    
    df.replace('..', pd.NA, inplace=True)
    df[column_1] = df[column_1].str.replace(r'^\.\.', '', regex=True).str.strip()
    
    return df

In [50]:
farmgate = clean_data(farmgate, "Geolocation", "Commodity")
wholesale = clean_data(wholesale, "Geolocation", "Commodity")
retail = clean_data(retail, "Geolocation", "Commodity")

In [51]:
# remove commodtities with high missing values per region
def filter_missing_values(df, threshold = 0.4):
    filtered_dfs = []
    
    for region in df['Geolocation'].unique():
        region_df = df[df['Geolocation'] == region]
        
        missing_ratio = region_df.groupby('Commodity')['Price'].apply(lambda x: x.isna().mean())
        
        valid_commodities = missing_ratio[missing_ratio < threshold].index
        filtered_region_df = region_df[region_df['Commodity'].isin(valid_commodities)]
        
        filtered_dfs.append(filtered_region_df)
        
    return pd.concat(filtered_dfs, ignore_index=True)

In [52]:
# apply filtering
farmgate = filter_missing_values(farmgate)
wholesale = filter_missing_values(wholesale)
retail = filter_missing_values(retail)

In [53]:
# Fill remaining missing values using forward and backward fill per region & commodity
def fill_missing_values(df):
    return df.groupby(["Geolocation", "Commodity"], group_keys=False).apply(lambda group: group.ffill().bfill()).reset_index(drop=True)

In [54]:
cleaned_farmgate = fill_missing_values(farmgate)
cleaned_wholesale = fill_missing_values(wholesale)
cleaned_retail = fill_missing_values(retail)

  return df.groupby(["Geolocation", "Commodity"], group_keys=False).apply(lambda group: group.ffill().bfill()).reset_index(drop=True)
  return df.groupby(["Geolocation", "Commodity"], group_keys=False).apply(lambda group: group.ffill().bfill()).reset_index(drop=True)
  return df.groupby(["Geolocation", "Commodity"], group_keys=False).apply(lambda group: group.ffill().bfill()).reset_index(drop=True)


In [55]:
# save cleaned data
cleaned_farmgate.to_csv('../data/cleaned/cleaned_farmgate.csv', index=False)
cleaned_wholesale.to_csv('../data/cleaned/cleaned_wholesale.csv', index=False)
cleaned_retail.to_csv('../data/cleaned/cleaned_retail.csv', index=False)

In [56]:
def plot_trends(df, price_type):
    regions = df['Geolocation'].unique()
    
    for region in regions:
        plt.figure(figsize=(12, 6))
        
        region_df = df[df['Geolocation'] == region]
        
        commodities = region_df['Commodity'].unique()
        
        for commodity in commodities:
            subset = region_df[region_df['Commodity'] == commodity].groupby('Date')['Price'].mean()
            plt.plot(subset.index, subset.values, label=commodity, marker='o', markersize=5, linestyle='-')
            
        
        plt.xlabel('Date')
        plt.ylabel('Average Price (PHP)')
        plt.title(f'{price_type} Price Trends for Each Commodity in {region} {2020-2024}')
        plt.xticks(rotation=45)
        plt.legend()
        plt.grid(True)
        
        plt.show()
    

In [57]:
def distribute_quarterly_to_monthly(df, column_name):
    df = df.copy()

    # Melt dataset to long format
    df_long = df.melt(id_vars=["Geolocation", "Ecosystem/Croptype"], var_name="Quarter", value_name=column_name)

    # Extract year and quarter from the Quarter column
    df_long["Year"] = df_long["Quarter"].str[:4].astype(int)
    df_long["Quarter"] = df_long["Quarter"].str[-1].astype(int)

    quarter_to_months = {
        1: [1, 2, 3],
        2: [4, 5, 6],
        3: [7, 8, 9],
        4: [10, 11, 12]
    }
    
    expanded_rows = []
    
    for _, row in df_long.iterrows():
        for month in quarter_to_months[row["Quarter"]]:
            expanded_rows.append({
                "Geolocation": row["Geolocation"],
                "Ecosystem/Croptype": row["Ecosystem/Croptype"],
                "Date": pd.Timestamp(year=row["Year"], month=month, day=1),
                column_name: row[column_name] / 3
            })
            
    df_monthly = pd.DataFrame(expanded_rows)
    
    return df_monthly

In [58]:
cleaned_vop = distribute_quarterly_to_monthly(vop, "Volume_of_Production_in_MetricTons")
cleaned_harvested = distribute_quarterly_to_monthly(harvested, "Area_Harvested_in_Hectares")

In [59]:
cleaned_vop

Unnamed: 0,Geolocation,Ecosystem/Croptype,Date,Volume_of_Production_in_MetricTons
0,..CORDILLERA ADMINISTRATIVE REGION (CAR),Irrigated Palay,2020-01-01,10281.910000
1,..CORDILLERA ADMINISTRATIVE REGION (CAR),Irrigated Palay,2020-02-01,10281.910000
2,..CORDILLERA ADMINISTRATIVE REGION (CAR),Irrigated Palay,2020-03-01,10281.910000
3,..REGION I (ILOCOS REGION),Irrigated Palay,2020-01-01,110109.616667
4,..REGION I (ILOCOS REGION),Irrigated Palay,2020-02-01,110109.616667
...,...,...,...,...
5755,..REGION XIII (CARAGA),Corn,2024-11-01,22739.066667
5756,..REGION XIII (CARAGA),Corn,2024-12-01,22739.066667
5757,..BANGSAMORO AUTONOMOUS REGION IN MUSLIM MINDA...,Corn,2024-10-01,82519.426667
5758,..BANGSAMORO AUTONOMOUS REGION IN MUSLIM MINDA...,Corn,2024-11-01,82519.426667


In [60]:
cleaned_harvested

Unnamed: 0,Geolocation,Ecosystem/Croptype,Date,Area_Harvested_in_Hectares
0,..CORDILLERA ADMINISTRATIVE REGION (CAR),Irrigated Palay,2020-01-01,2753.060000
1,..CORDILLERA ADMINISTRATIVE REGION (CAR),Irrigated Palay,2020-02-01,2753.060000
2,..CORDILLERA ADMINISTRATIVE REGION (CAR),Irrigated Palay,2020-03-01,2753.060000
3,..REGION I (ILOCOS REGION),Irrigated Palay,2020-01-01,22528.666667
4,..REGION I (ILOCOS REGION),Irrigated Palay,2020-02-01,22528.666667
...,...,...,...,...
5755,..REGION XIII (CARAGA),Corn,2024-11-01,5561.666667
5756,..REGION XIII (CARAGA),Corn,2024-12-01,5561.666667
5757,..BANGSAMORO AUTONOMOUS REGION IN MUSLIM MINDA...,Corn,2024-10-01,23769.516667
5758,..BANGSAMORO AUTONOMOUS REGION IN MUSLIM MINDA...,Corn,2024-11-01,23769.516667
