<a href="https://colab.research.google.com/github/Bourbon-Rye/Baesian-Cropability/blob/main/PilipiNuts_2023_Baesian_Cropability.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [64]:
# @Libraries
import numpy as np
import pandas as pd
import seaborn as sns

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import matplotlib.pyplot as plt
import json
import re
import functools

from pathlib import Path
from sklearn import preprocessing
from plotly.subplots import make_subplots
from plotly import offline
from scipy import stats
from sklearn.impute import SimpleImputer
from matplotlib import figure

In [65]:
# @title Corrections
with open('datasets/region_provinces.json') as jsonfile:
    regions_provinces = json.load(jsonfile)['PHILIPPINES']
regions_provinces = {key.lower():regions_provinces[key] for key in regions_provinces}
regions = regions_provinces.keys()
provinces = set([item.lower() for key in regions for item in regions_provinces[key]])
regions = set(regions)
regions.add("philippines")

# NOTE: Fix for bad regions, thanks PSA
bad_regions = ['AONCR', 'BARMM', 'CAR', 'MIMAROPA', 'NCR',
           'Region 1', 'Region 2', 'Region 3', 'Region 4A',
           'Region 5', 'Region 6', 'Region 7', 'Region 8',
           'Region 9', 'Region 10', 'Region 11', 'Reg12', 'CARAGA']
region_mapping = {bad.lower():good.lower() for (bad,good) in zip(bad_regions, regions_provinces.keys())}
corrections = {
    "AUTONOMOUS REGION IN MUSLIM MINDANAO (ARMM)": "bangsamoro autonomous region in muslim mindanao (barmm)",
    "autonomous region in muslim mindanao (armm)": "bangsamoro autonomous region in muslim mindanao (barmm)",
    "mimaropa region": "mimaropa region (mimaropa)"
}
region_mapping.update(corrections)

temp = [key.split('(')[1].rstrip(')') for key in regions_provinces]
region_short_to_short_long = {bad:good for (bad,good) in zip(temp, regions_provinces.keys())}

In [66]:
# @title Utilities
def read_csv_to_df(csvfile: Path, comment_symbol='#') -> pd.DataFrame:
    """Read CSV to DataFrame with comment validation.
        Allows comment lines in CSVs where line[0] == comment_symbol.
        Also removes newlines.
        Does not recognize comment_symbol anywhere else. 
    """
    tempfile = Path('temp.csv')
    with open(csvfile, 'r') as csv, open(tempfile, 'w+') as temp:
        lines = csv.readlines()
        for line in lines:
            if line[0] != comment_symbol:
                temp.write(line)
    return pd.read_csv(tempfile)

def is_region(x: str, regions=regions) -> bool:
    x = x.strip(' .')
    x = x.lower()
    if x == "cagayan":
        return False
    for region in regions:
        if x in region:
            return True
    else:
        return False
    
def is_province(x: str, provinces=provinces) -> bool:
    x = x.strip(' .')
    x = x.lower()
    for province in provinces:
        if x in province:
            return True
    else:
        return False

def get_quarter_columns(df: pd.DataFrame, year_range: range, period_idx: int):
    """Assumes contiguous period (Year Month) columns and that columns before period_idx are ID columns"""
    df_quarter = df.iloc[:, :period_idx].copy()
    for year in year_range:
        for q in range(0, 12, 3):
            df_quarter[f"{year} Q{q//3+1}"] = df.filter(regex=str(year), axis=1).iloc[:, q:q+3].mean(axis=1)
    return df_quarter

def get_annual_columns(df: pd.DataFrame, year_range: range, period_idx: int):
    """Assumes contiguous period (Year Month|Quarter) columns and that columns before period_idx are ID columns
    Note that this also works with Quarters"""
    df_annual = df.iloc[:, :period_idx].copy()
    for year in year_range:
        df_annual[f"{year}"] = df.filter(regex=str(year), axis=1).mean(axis=1)
    return df_annual

def swap_columns(df: pd.DataFrame, col1: str, col2: str):
    col_list = list(df.columns)
    x, y = col_list.index(col1), col_list.index(col2)
    col_list[y], col_list[x] = col_list[x], col_list[y]
    df = df[col_list]
    return df

def normalize(df: pd.DataFrame, col: str, minmax = True):
    """Can use mean normalization and minmax normalization."""
    tdf = df[col]
    if minmax:
        tdf = (tdf-tdf.min())/(tdf.max()-tdf.min())
    else:
        tdf = (tdf-tdf.mean())/tdf.std()
    df[col] = tdf
    return df

def drop_rows_with_zeros(df: pd.DataFrame, ref_col_idx: int, all_zeros=False):
    """Drop rows if some values are zeros, or if all values are zeros.
    Assumes contiguous reference columns, i.e. columns to use in deciding whether to drop."""
    return df[~(df.iloc[:, ref_col_idx:] == 0).all(axis=1)] if all_zeros else df[~(df.iloc[:, ref_col_idx:] == 0).any(axis=1)]

def dual_plot(df: pd.DataFrame, x: str, y1: str, y2: str,
              title=None, xtitle=None, y1title=None, y2title=None):
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    # Add traces
    fig.add_trace(
        go.Scatter(x=df[x], y=df[y1], name=y1),
        secondary_y=False,
    )
    fig.add_trace(
        go.Scatter(x=df[x], y=df[y2], name=y2),
        secondary_y=True,
    )
    # # Add titles
    if title is not None: fig.update_layout(title_text=title)
    if xtitle is not None: fig.update_xaxes(title_text=xtitle)
    if y1title is not None: fig.update_yaxes(title_text=y1title, secondary_y=False)
    if y2title is not None: fig.update_yaxes(title_text=y2title, secondary_y=True)

    return fig

def move_column(df: pd.DataFrame, col: str, new_idx: int):
    """This is an inplace method."""
    df.insert(new_idx, col, df.pop(col))
    
def preprocess_baesians_1(df: pd.DataFrame, commodity: str, rename_to: str, melt_value: str|None, regional=True):
    """Assumes Geolocation | Commodity | Period columns.
    Filters to regional if regional=True, else filters to provincial. Retains "philippines".
    """
    # df.dropna(inplace=True)
    df["Geolocation"] = df["Geolocation"].str.lstrip(".").str.lower()
    df["Geolocation"] = df["Geolocation"].replace(region_mapping)
    df = df[df["Geolocation"].apply(is_region)] if regional else df[df["Geolocation"].apply(is_province)]
    df = df[df["Commodity"] == commodity]
    df["Commodity"] = rename_to # Rename everything in Commodity to Rice
    if melt_value:
        df = df.melt(id_vars=["Geolocation", "Commodity"], value_vars=df.columns[2:], var_name="Period", value_name=melt_value)
    return df

def imputer(df: pd.DataFrame, start_idx: int, end_idx=None):
    """If only start index is provided, will impute from start index column to last column,
    else limit from start_idx to end_idx. Inplace imputation"""
    imp = SimpleImputer(missing_values=pd.NA, strategy='mean')
    if end_idx:
        imp.fit(df.iloc[:, start_idx:end_idx])
        df[df.columns[start_idx:end_idx]] = imp.transform(df.iloc[:, start_idx:])
    else:
        imp.fit(df.iloc[:, start_idx:])
        df[df.columns[start_idx:]] = imp.transform(df.iloc[:, start_idx:])

def preprocess_baesians_2(df: pd.DataFrame, commodity_map: dict, melt_value: str|None, regional=True, impute=False):
    """Assumes Geolocation | Commodity | Period ... columns.
    Filters to regional if regional=True, else filters to provincial. Retains "philippines".
    Filters to raw commodity name (key in commodity_map), and then renames to standard commodity_map[key]
    Ex. {"RICE, REGULAR-MILLED, 1 KG" : "Rice"}
    """
    df["Geolocation"] = df["Geolocation"].str.lstrip(".").str.lower()
    df["Geolocation"] = df["Geolocation"].replace(region_mapping)
    df = df[df["Geolocation"].apply(is_region)] if regional else df[df["Geolocation"].apply(is_province)]
    df = df[df["Commodity"].isin(commodity_map)]
    df.loc[:, "Commodity"] = df["Commodity"].replace(commodity_map)
    if melt_value:
        df = df.melt(id_vars=["Geolocation", "Commodity"], value_vars=df.columns[2:], var_name="Period", value_name=melt_value)
    if impute:
        imputer(df, 3)  # inplace imputation of melt_value
    return df

def filter_to_regions(df: pd.DataFrame):
    return df[df["Geolocation"].apply(is_region)]
    
def filter_to_provinces(df: pd.DataFrame):
    return df[df["Geolocation"].apply(is_province)]

def fig_to_div(fig: figure.Figure, filename: Path):
    """Optional: Add pretiffication."""
    filename = Path(filename)
    filename.parent.mkdir(exist_ok=True, parents=True)
    with open(filename, "w+") as f:
        f.write(offline.plot(fig, include_plotlyjs=False, output_type='div'))
        
def fig_to_html(fig: figure.Figure, filename: Path):
    filename = Path(filename)
    filename.parent.mkdir(exist_ok=True, parents=True)
    fig.write_html(filename)
    
def fig_to_png(fig: figure.Figure, filename: Path):
    """NOTE: Requires ORCA to be installed!"""
    filename = Path(filename)
    filename.parent.mkdir(exist_ok=True, parents=True)
    pio.write_image(fig, filename,scale=6, width=1080, height=720)

In [67]:
# @title OTG Cleanup
# VALUE OF PRODUCTION (5/24/2024)
datapath = Path("datasets/agricultural-indicators/value-of-production/")
writepath = Path("datasets")

files = list(datapath.glob("*.csv"))
df = pd.DataFrame()
for file in files:
    with open(file) as f:
        region = f.readlines()[0].split(':')[0].lstrip("\"")
        if region == u"\ufeffTest\n":
            region = "Cordillera Administrative Region (CAR)"
        tdf = pd.read_csv(file, skiprows=1, na_values=".")
        tdf["Geolocation"] = region
        df = pd.concat([df, tdf])
df["Subsector"] = "Rice"
df.drop(columns=["Type of Valuation"], inplace=True)
df.rename({"Subsector": "Commodity"}, axis=1, inplace=True)

# Imputation of VOP df
imp = SimpleImputer(missing_values=pd.NA, strategy='mean')
imp.fit(df.iloc[:, 1:-1])
df[df.columns[1:-1]] = imp.transform(df.iloc[:, 1:-1])
move_column(df, "Geolocation", 0)
df_val_of_prod = df

# ANNUAL RICE STOCKS (5/24/2024) NOTE: Perhaps try to match this with the prices as this has monthly, for national only
df = read_csv_to_df("datasets/agricultural-indicators/stocks-palay-corn_yearly_1980-2024.csv")
df = df[(df["Sector"] == "Rice: Total Stock") | (df["Sector"] == "Corn: Total Stock")]
df["Sector"] = df["Sector"].replace({"Rice: Total Stock": "Rice", "Corn: Total Stock": "Corn"})
df = pd.concat([df.iloc[:, :2].copy(), df.mean(axis=1, numeric_only=True)], axis=1)
df.rename(columns={"Sector": "Commodity", "Year": "Period", 0: "Stocks"}, inplace=True)
df = df[df["Period"].isin(range(2012, 2024))]
df_stocks = df

# Baesian Plots


Hypotheses:
- **H0.1:** There is no significant difference in the productivity of major food crops when grouped according to their crop type, geolocation, and/or market profile.
- **H0.2:** There is no significant relationship between market conditions and food crop production.

Goal: Visually and statistically assess the relationship between market indicators and crop yield indicators.

Visual tests are dual plots and scatterplots. Statistics for relationship testing: contingency tables and t-tests (?).

In [68]:
# Sample Relationship Test
data = [[207, 282, 241], [234, 242, 232]]
stat, p, dof, expected = chi2_contingency(data)
 
# interpret p-value
alpha = 0.05
print("p value is " + str(p))
if p <= alpha:
    print('Dependent (reject H0)')
else:
    print('Independent (H0 holds true)')

p value is 0.10319714047309392
Independent (H0 holds true)


## Data Overview

Selected major crops (analyzed are checked):
- ✅ Rice / Palay
- ✅ Corn / Maize
- Sweet Potato / Camote
- Mongo / Monggo / Mung Beans
- Banana
- Coconut
- Onion
- Garlic
- Sugarcane

In [69]:
# Volume of Production of Selected Major Crops
df = pd.read_csv("datasets/agricultural-indicators/volume_rice-corn.csv", skiprows=2)
df = df[df["Geolocation"] == "PHILIPPINES"]
df = df.filter(regex="Commodity|Geolocation|Annual", axis=1)
df.columns = map(lambda x: x.replace(" Annual", "") if "Annual" in x else x, df.columns)
df = df.melt(id_vars=["Commodity", "Geolocation"], value_vars=df.columns[2:], var_name="Period", value_name="Volume")
fig = px.line(df, x="Period", y="Volume", color="Commodity", title='Annual Volume of Production of Rice and Corn<br>National average in metric tons').update_layout(
    xaxis_title="Period", yaxis_title = "Volume of Production")
fig.show()

df = pd.read_csv("datasets/agricultural-indicators/volume_other-crops.csv", skiprows=2)
df = df[df["Geolocation"] == "PHILIPPINES"]
df = df.filter(regex="Commodity|Geolocation|Annual", axis=1)
df.columns = map(lambda x: x.replace(" Annual", "") if "Annual" in x else x, df.columns)
df = df.melt(id_vars=["Commodity", "Geolocation"], value_vars=df.columns[2:], var_name="Period", value_name="Volume")
fig = px.line(df, x="Period", y="Volume", color="Commodity", title='Annual Volume of Production of Other Major Crops<br>National average in metric tons').update_layout(
    xaxis_title="Period", yaxis_title = "Volume of Production")
fig.show()

## Annual Analysis
Annual-National and Annual-Regional analysis of the commodities Rice and Corn. Other major crops to follow.

**NOTE:** The core agricultural indicators are Stocks, Volume, and Area Harvested, while the core market indicators are Farmgate Price, Wholesale Price, and Retail Price.

This is important to know especially during visualization where Farmgate price is used as a slicer.

In [70]:
# @title Annual megadataset for rice and corn
# NOTE: Filter this during visualization and analysis to just Philippines or Regions
def filter_period(df: pd.DataFrame, _type: str):
    if _type == "Annual":
        df = df.filter(regex="Commodity|Geolocation|Annual", axis=1)
        df.columns = map(lambda x: x.replace(" Annual", "") if "Annual" in x else x, df.columns)
    elif _type == "Quarterly":
        df = df.filter(regex="Commodity|Geolocation|Q\d", axis=1)
    return df

# Volume of Rice and Corn
df1 = pd.read_csv("datasets/agricultural-indicators/volume_rice-corn.csv", skiprows=2, na_values=[".."])
df1 = filter_period(df1, "Annual")
df1 = preprocess_baesians_2(df1, {"Palay": "Rice", "Corn": "Corn"}, melt_value="Volume")

# Farmgate Price of Rice and Corn
df2 = pd.read_csv("datasets/prices/prices_farmgate-new-series_2010-2023.csv")
df2 = get_annual_columns(df2, range(2012, 2024), 2)
df2 = preprocess_baesians_2(df2, {"Palay [Paddy] Other Variety, dry (conv. to 14% mc)": "Rice",
                                  "Corngrain [Maize] Yellow, matured": "Corn"}, melt_value="Farmgate Price")

# Wholesale Price of Rice and Corn
df3 = pd.read_csv("datasets/prices/prices_wholesale-new-series_2010-2023.csv",)
df3 = get_annual_columns(df3, range(2012, 2024), 2)
df3 = preprocess_baesians_2(df3, {"Well Milled Rice (WMR)": "Rice",
                                  "Corngrits White": "Corn"}, melt_value="Wholesale Price")

# Retail Price of Rice and Corn
df4 = pd.read_csv("datasets/prices/prices_retail_2012-2023.csv")
df4 = get_annual_columns(df4, range(2012, 2024), 2)
df4 = preprocess_baesians_2(df4, {"RICE, REGULAR-MILLED, 1 KG": "Rice",
                                  'WHOLE CORN GRAIN, YELLOW, 1 KG': "Corn"}, melt_value="Retail Price")

# Area Harvested of Rice and Corn
df5 = pd.read_csv("datasets/agricultural-indicators/area-harvested-palay-corn_quarterly-annual_2010-2023.csv", skiprows=2)
df5 = filter_period(df5, "Annual")
df5 = preprocess_baesians_2(df5, {"Palay": "Rice", "Corn": "Corn"}, melt_value="Area Harvested")

# Consumer Price Index (All Income) per Region of Rice and Corn
df6 = pd.read_csv("datasets/price-indices-2018-based/cpi_all-income-households-by-cg-with-backcasting_1994-2023.csv")
df6 = get_annual_columns(df6, range(2012, 2024), 2)
df6 = preprocess_baesians_2(df6, {"01.1.1.12 - Rice": "Rice",
                                  "01.1.1.16 - Corn": "Corn"}, melt_value="CPI All Income")

# Consumer Price Index (Bottom 30) per Region of Rice and Corn
df7 = pd.read_csv("datasets/price-indices-2018-based/cpi_bottom-30-by-cg-with-backcasting_2012-2017.csv")
df7 = preprocess_baesians_2(df7, {"01.1.1.12 - Rice": "Rice", "01.1.1.16 - Corn": "Corn"}, melt_value=None)
tdf = pd.read_csv("datasets/price-indices-2018-based/cpi_bottom-30-by-cg_2018-2023.csv")
tdf = preprocess_baesians_2(tdf, {"01.1.1.12 - Rice": "Rice", "01.1.1.16 - Corn": "Corn"}, melt_value=None)
df7 = pd.merge(df7, tdf, on=["Geolocation", "Commodity"])
df7 = get_annual_columns(df7, range(2012, 2024), 2)
df7 = preprocess_baesians_2(df7, {"Rice": "Rice", "Corn": "Corn"}, melt_value="CPI Bottom 30")

# Costs and Returns per Region of Rice and Corn
df8 = pd.read_csv("datasets/agricultural-indicators/costs-and-returns_rice-and-corn.csv", skiprows=1, na_values=[".."])
df8 = df8[df8["Item"] == "NET RETURNS"]
df8.drop("Item", axis=1, inplace=True)
df8.rename({"Type": "Commodity"}, axis=1, inplace=True)
df8 = preprocess_baesians_2(df8, {"All Palay": "Rice", "All Corn": "Corn"}, melt_value="Net Returns")
df8 = df8[df8["Period"].str.contains("Average")]
df8["Period"] = df8["Period"].apply(lambda x: x.split()[1])

# National Inflation Rate
df9 = pd.read_csv("datasets/statista_inflation-rate-in-the-philippines-2029.csv", dtype={"Period": object, "Inflation Rate": float})

# NCR Retail Price Index on Food
df10 = pd.read_csv("datasets/price-indices-2018-based/rpi-in-ncr_food-only_1998-2023.csv")
df10 = get_annual_columns(df10, range(2010, 2024), 2)
df10 = preprocess_baesians_2(df10, {"Food": "Food"}, melt_value="NCR RPI").drop(["Geolocation", "Commodity"], axis=1)

# Agricultural Self-Sufficiency for Rice and Corn
df11 = pd.read_csv("datasets/agricultural-indicators/agri-self-sufficiency-ratio.csv", skiprows=1)
df11 = df11[(df11["Commodity"] == "Rice") | (df11["Commodity"] == "Corn")]
df11 = df11.melt("Commodity", df11.columns[1:], "Period", "Self-Sufficiency Ratio")

# Agricultural Import-Dependency for Rice and Corn
df12 = pd.read_csv("datasets/agricultural-indicators/agri-import-dependency-ratio.csv", skiprows=1)
df12 = df12[(df12["Commodity"] == "Rice") | (df12["Commodity"] == "Corn")]
df12 = df12.melt("Commodity", df12.columns[1:], "Period", "Import-Dependency Ratio")

# SU Gross Supply and UT Total Net Food Disposable for Rice and Corn
# Explained here: https://openstat.psa.gov.ph/Metadata/2B5FSUA0
df13 = pd.read_csv("datasets/agricultural-indicators/SUT_Rice and Corn.csv")
df13 = df13.filter(regex="Commodity|Year|Gross|Food")
df13.rename({"Year": "Period", "UT Total Net Food Disposable": "UT Consumable"}, axis=1, inplace=True)
df13.Period = df13.Period.astype(str)

# Merge all dfs into a single df
# NOTE: Remove df8 or add more year data or commodities to increase number of samples
# NOTE: You may adjust here which dfs are included in the final df
dfs = [df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12, df13]
df = functools.reduce(lambda left, right: pd.merge(left, right), dfs)
move_column(df, "Area Harvested", 4)
move_column(df, "Self-Sufficiency Ratio", 5)
move_column(df, "Import-Dependency Ratio", 6)
move_column(df, "SU Gross Supply", 7)
move_column(df, "UT Consumable", 8)

# # Stocks of Rice (Annual)
tdf1 = df_stocks.copy()
tdf2 = df[["Commodity", "Period"]].copy()
tdf2["Period"] = pd.to_numeric(tdf2["Period"])
df.insert(3, "Stocks", pd.merge(tdf2, tdf1)["Stocks"])

# Additional Datasets -------------------------------------
# Uncomment the merge to include
# Poverty Incidence per Region (only on 2015, 2018, 2021)
df_poverty = pd.read_csv("datasets/poverty_incidence.csv", skiprows=1)
df_poverty["Commodity"] = "Rice"
df_poverty = preprocess_baesians_2(df_poverty, {"Rice": "Rice"}, melt_value="Poverty Incidence")
df_poverty.drop("Commodity", axis=1, inplace=True)

# df = pd.merge(df, df_poverty) # NOTE: Uncomment to include

# Enrollment Rate per Region
df_enrollment = pd.read_csv("datasets/enrollment.csv", na_values=[".."])
df_enrollment["Geolocation"] = df["Geolocation"].str.lower().replace(region_short_to_short_long)
df_enrollment = df_enrollment.filter(regex="Geolocation|Both Sexes")
df_enrollment.rename(lambda x: x.split()[2] if "Both" in x else x, axis=1, inplace=True)
df_enrollment = df_enrollment.melt(["Geolocation"], df_enrollment.columns[1:], "Period", "Enrollment Rate")

# df = pd.merge(df, df_enrollment) # NOTE: Uncomment to include

# Cleanup
del dfs, df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12, df13

df

Unnamed: 0,Geolocation,Commodity,Period,Stocks,Volume,Area Harvested,Self-Sufficiency Ratio,Import-Dependency Ratio,SU Gross Supply,UT Consumable,Farmgate Price,Wholesale Price,Retail Price,CPI All Income,CPI Bottom 30,Net Returns,Inflation Rate,NCR RPI
0,philippines,Rice,2012,2227.093846,18032525.47,4690061.17,91.9,8.1,15465,11473,15.923333,32.800833,33.476667,81.632653,82.987552,19891.0,3.16,100.000000
1,region i (ilocos region),Rice,2012,2227.093846,1737695.00,403169.00,91.9,8.1,15465,11473,17.065000,31.941667,31.070000,79.113924,79.617834,21639.0,3.16,100.000000
2,region ii (cagayan valley),Rice,2012,2227.093846,2425536.47,582557.17,91.9,8.1,15465,11473,16.246667,30.702500,31.335833,80.971660,83.752094,16638.0,3.16,100.000000
3,region iii (central luzon),Rice,2012,2227.093846,3220607.00,675781.00,91.9,8.1,15465,11473,17.145000,32.866667,31.469167,79.617834,79.808460,30333.0,3.16,100.000000
4,region iv-a (calabarzon),Rice,2012,2227.093846,389272.00,113010.00,91.9,8.1,15465,11473,14.383333,32.295000,31.097500,81.168831,81.234768,8926.0,3.16,100.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315,region x (northern mindanao),Corn,2021,794.027692,1455030.45,384864.96,94.8,5.2,9661,2609,14.165000,29.346667,24.740833,100.408755,100.126349,29280.0,3.93,125.858333
316,region xi (davao region),Corn,2021,794.027692,266894.04,175295.00,94.8,5.2,9661,2609,12.265000,30.242500,20.895000,90.597157,90.246810,,3.93,125.858333
317,region xii (soccsksargen),Corn,2021,794.027692,1105238.85,397363.00,94.8,5.2,9661,2609,15.142500,30.406667,,102.613642,102.581773,14274.0,3.93,125.858333
318,region xiii (caraga),Corn,2021,794.027692,141648.84,40933.00,94.8,5.2,9661,2609,12.490833,32.963333,,115.890568,115.778763,15564.0,3.93,125.858333


In [87]:
# @Visualizations

# for x in ["Volume", "Area Harvested"]:
#     for y in ["Retail Price", "Wholesale Price", "Farmgate Price", "CPI All Income", "CPI Bottom 30"]:
#         fig = px.scatter(df, x=x, y=y)
#         # fig.show()
        
# Correlation Matrix for Annual-Regions
# NOTE: Filter != or == philippines
df_ph = df[df["Geolocation"] != "philippines"]
for commodity in df_ph["Commodity"].unique():
    tdf = df_ph[df_ph["Commodity"] == commodity]
    samples = tdf.shape[0]
    # NOTE: pearson|kendall|spearman
    contingency = tdf.iloc[:, 3:].corr(method="pearson")
    idx = list(contingency.columns).index("Farmgate Price")
    cont_1 = contingency.iloc[:idx, idx:]
    fig = px.imshow(cont_1, text_auto=True, title=f"{commodity}: Correlation Matrix on {samples} Annual-Regional Samples",
                    labels=dict(x=f"Socioeconomic Indicator for {commodity}", y=f"Agricultural Indicator for {commodity}"))
    fig.update_traces(
        texttemplate='%{z:.2f}',
        hovertemplate=("x: %{x}<br>y: %{y}<br><b>Correlation: %{z}</b><extra></extra>")
    )
    fig.show()
    fig_to_div(fig, f"plots/annual-regional/{commodity}_heatmap_div.html")
    fig_to_html(fig, f"plots/annual-regional/{commodity}_heatmap.html")
    fig_to_png(fig, f"plots/annual-regional/{commodity}_heatmap.png")
    
    mask = (df["Geolocation"] == "philippines") & (df["Commodity"] == commodity)
    fig = dual_plot(df[mask], "Period", "Volume", "Retail Price",
                    f"{commodity}: Volume of Production and Retail Price in the Philippines",
                    "<b>Year</b>", "<b>Volume of Production</b> (metric tons)", "<b>Retail Price</b> (in PHP)")
    fig.show()
    fig_to_div(fig, f"plots/annual-regional/{commodity}_volume-retail_div.html")
    fig_to_html(fig, f"plots/annual-regional/{commodity}_volume-retail.html")
    fig_to_png(fig, f"plots/annual-regional/{commodity}_volume-retail.png")

    fig = dual_plot(df[mask], "Period", "Stocks", "Inflation Rate",
                    f"{commodity}: Volume of Production and National Inflation Rate in the Philippines",
                    "<b>Year</b>", "<b>Stocks</b> (metric tons)", "<b>Inflation Rate</b> (in %)")
    fig.show()
    fig_to_div(fig, f"plots/annual-regional/{commodity}_stocks-inflation_div.html")
    fig_to_html(fig, f"plots/annual-regional/{commodity}_stocks-inflation.html")
    fig_to_png(fig, f"plots/annual-regional/{commodity}_stocks-inflation.png")


In [102]:
# @Hypotheses Testing H0.2 Significant Relationship Test using Pearson's r
from termcolor import cprint

samples = df_ph.shape[0]
# NOTE: pearson|kendall|spearman
contingency = df_ph.iloc[:, 3:].corr(method="pearson")
idx = list(contingency.columns).index("Farmgate Price")
cont_1 = contingency.iloc[:idx, idx:]
fig = px.imshow(cont_1, text_auto=True, title=f"ALL: Correlation Matrix on {samples} Annual-Regional Samples",
                labels=dict(x="Socioeconomic Indicator for ALL crops", y="Agricultural Indicator for ALL crops"))
fig.update_traces(
    texttemplate='%{z:.2f}',
    hovertemplate=("x: %{x}<br>y: %{y}<br><b>Correlation: %{z}</b><extra></extra>")
)
fig.show()
fig_to_div(fig, "plots/annual-regional/ALL_heatmap_div.html")
fig_to_html(fig, "plots/annual-regional/ALL_heatmap.html")
fig_to_png(fig, "plots/annual-regional/ALL_heatmap.png")

# Agri and Market Indicators for Annual-Regional Data
df_ph = df_ph.dropna()
agri_cols = df.columns[3:10]
market_cols = df.columns[10:]
alpha = 0.05
for x in agri_cols:
    for y in market_cols:
        agri, market = df_ph[x], df_ph[y]
        r_stat, p_value = stats.pearsonr(agri, market)  # Pearson's r
        if p_value < alpha:
            cprint(f"{x} <-> {y} = {round(p_value, 3)}: Significant relationship exists.", "red")
        else:
            print(f"{x} </> {y} = {round(p_value, 3)}: No significant relationship exists.")


[31mStocks <-> Farmgate Price = 0.0: Significant relationship exists.[0m
[31mStocks <-> Wholesale Price = 0.0: Significant relationship exists.[0m
[31mStocks <-> Retail Price = 0.0: Significant relationship exists.[0m
Stocks </> CPI All Income = 0.121: No significant relationship exists.
[31mStocks <-> CPI Bottom 30 = 0.0: Significant relationship exists.[0m
[31mStocks <-> Net Returns = 0.043: Significant relationship exists.[0m
[31mStocks <-> Inflation Rate = 0.002: Significant relationship exists.[0m
Stocks </> NCR RPI = 0.118: No significant relationship exists.
Volume </> Farmgate Price = 0.11: No significant relationship exists.
Volume </> Wholesale Price = 0.933: No significant relationship exists.
Volume </> Retail Price = 0.823: No significant relationship exists.
Volume </> CPI All Income = 0.204: No significant relationship exists.
Volume </> CPI Bottom 30 = 0.148: No significant relationship exists.
[31mVolume <-> Net Returns = 0.0: Significant relationship exis

## Quarterly-Regional Analysis

In [100]:
# @title Quarterly megadataset for rice
# Volume of Rice
df1 = pd.read_csv("datasets/agricultural-indicators/volume_rice-corn.csv", skiprows=2, na_values=["..", ""])
df1 = preprocess_baesians_1(df1, "Palay", "Rice", melt_value="Volume")

# Farmgate Price of Rice
df2 = read_csv_to_df("datasets/prices/prices_farmgate-new-series_2010-2023.csv")
df2 = get_quarter_columns(df2, range(2012, 2024), 2)
df2 = preprocess_baesians_1(df2, "Palay [Paddy] Other Variety, dry (conv. to 14% mc)", "Rice", melt_value="Farmgate Price")

# Wholesale Price of Rice
df3 = pd.read_csv("datasets/prices/prices_wholesale-new-series_2010-2023.csv",)
df3 = get_quarter_columns(df3, range(2012, 2024), 2)
df3 = preprocess_baesians_1(df3, "Well Milled Rice (WMR)", "Rice", melt_value="Wholesale Price")

# Retail Price of Rice
df4 = read_csv_to_df("datasets/prices/prices_retail_2012-2023.csv")
df4 = get_quarter_columns(df4, range(2012, 2024), 2)
df4 = preprocess_baesians_1(df4, "RICE, REGULAR-MILLED, 1 KG", "Rice", melt_value="Retail Price")

# Cropyield (Area Harvested) of Rice
df5 = pd.read_csv("datasets/agricultural-indicators/area-harvested-palay-corn_quarterly-annual_2010-2023.csv", skiprows=1)
df5 = preprocess_baesians_1(df5, "Palay", "Rice", melt_value="Area Harvested")
df5 = df5[df5["Period"].str.contains("Q")]

# Consumer Price Index (All Income) per Region of Rice
df6 = pd.read_csv("datasets/price-indices-2018-based/cpi_all-income-households-by-cg-with-backcasting_1994-2023.csv")
df6 = get_quarter_columns(df6, range(2012, 2024), 2)
df6 = preprocess_baesians_1(df6, "01.1.1.12 - Rice", "Rice", melt_value="CPI All Income")

# Consumer Price Index (Bottom 30) per Region of Rice
df7 = pd.read_csv("datasets/price-indices-2018-based/cpi_bottom-30-by-cg-with-backcasting_2012-2017.csv")
df7 = preprocess_baesians_1(df7, "01.1.1.12 - Rice", "Rice", melt_value=None)
tdf = pd.read_csv("datasets/price-indices-2018-based/cpi_bottom-30-by-cg_2018-2023.csv")
tdf = preprocess_baesians_1(tdf, "01.1.1.12 - Rice", "Rice", melt_value=None)
df7 = pd.merge(df7, tdf, on=["Geolocation", "Commodity"])
df7 = get_quarter_columns(df7, range(2012, 2024), 2)
df7 = preprocess_baesians_1(df7, "Rice", "Rice", melt_value="CPI Bottom 30")

# Merge all dfs into a single df NOTE: geolocation must be in lower case
# NOTE: barmm is removed because it has null values, unfortunately
dfs = [df1, df2, df3, df4, df5, df6, df7]
df = functools.reduce(lambda left, right: pd.merge(left, right), dfs)
move_column(df, "Area Harvested", 4)

# ------------------------------------

# Value of Production of Rice (Annual)
tdf1 = df_val_of_prod
tdf1 = preprocess_baesians_1(tdf1, "Rice", "Rice", melt_value="Value of Production")

tdf2 = df.iloc[:, :3].copy()
tdf2["Period"] = df["Period"].str.replace(" Q\d", "", regex=True)
df["Value of Production"] = pd.merge(tdf2, tdf1)["Value of Production"]

# # Stocks of Rice (Annual)
tdf1 = df_stocks[df_stocks["Commodity"] == "Rice"][["Period", "Stocks"]]
tdf2 = pd.DataFrame(df["Period"].str.replace(" Q\d", "", regex=True).astype(dtype="int64"))
df.insert(3, "Stocks", pd.merge(tdf2, tdf1)["Stocks"])

# Remove Philippines outlier
df = df[df["Geolocation"] != "philippines"]
for x in ["Volume", "Area Harvested", "Stocks"]:
    for y in ["Retail Price", "Wholesale Price", "Farmgate Price", "CPI All Income", "CPI Bottom 30", "Value of Production"]:
        fig = px.scatter(df, x=x, y=y, color="Geolocation")
        # fig.show()

# Correlation Matrix for Quarterly-Regions
df.info()
contingency = df.iloc[:, 3:].corr()
cont_1 = contingency.iloc[3:, 0:3]
fig = px.imshow(cont_1, text_auto=True)
fig.show()

contingency
df

<class 'pandas.core.frame.DataFrame'>
Index: 672 entries, 1 to 719
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Geolocation          672 non-null    object 
 1   Commodity            672 non-null    object 
 2   Period               672 non-null    object 
 3   Stocks               672 non-null    float64
 4   Volume               672 non-null    float64
 5   Area Harvested       672 non-null    float64
 6   Farmgate Price       672 non-null    float64
 7   Wholesale Price      670 non-null    float64
 8   Retail Price         672 non-null    float64
 9   CPI All Income       672 non-null    float64
 10  CPI Bottom 30        672 non-null    float64
 11  Value of Production  627 non-null    float64
dtypes: float64(9), object(3)
memory usage: 68.2+ KB


Unnamed: 0,Geolocation,Commodity,Period,Stocks,Volume,Area Harvested,Farmgate Price,Wholesale Price,Retail Price,CPI All Income,CPI Bottom 30,Value of Production
1,region i (ilocos region),Rice,2012 Q1,2227.093846,313643.00,67944.00,17.200000,31.533333,29.416667,77.630850,78.960939,39075.0
2,region ii (cagayan valley),Rice,2012 Q1,2227.093846,463414.00,118916.00,16.203333,30.670000,31.926667,82.072230,84.694894,56876.0
3,region iii (central luzon),Rice,2012 Q1,2227.093846,628879.00,131940.00,17.193333,32.796667,30.800000,79.460275,79.446607,5753.0
4,region iv-a (calabarzon),Rice,2012 Q1,2227.093846,101543.00,27805.00,14.473333,31.990000,30.676667,80.329063,80.496204,16263.0
5,mimaropa region (mimaropa),Rice,2012 Q1,2227.093846,211461.00,62191.00,15.146667,31.466667,30.976667,78.580435,80.003065,16368.0
...,...,...,...,...,...,...,...,...,...,...,...,...
715,region ix (zamboanga peninsula),Rice,2023 Q4,1786.925385,172537.76,45245.35,21.753333,50.176667,49.353333,104.480498,104.342834,
716,region x (northern mindanao),Rice,2023 Q4,1786.925385,229851.35,46539.00,24.213333,46.096667,48.320000,111.711589,109.703895,
717,region xi (davao region),Rice,2023 Q4,1786.925385,123140.78,26758.00,21.996667,49.786667,49.313333,110.723788,111.782338,
718,region xii (soccsksargen),Rice,2023 Q4,1786.925385,357203.95,96022.00,21.650000,46.526667,46.030000,103.242814,103.195269,


In [None]:
# Import the necessary libraries:
from scipy import stats
from termcolor import colored

sample_size = 500
# Randomly sample 500 rows from the Quarterly-Regional Market-Farm dataset:
df = df.sample(n=sample_size)

# Filter the dataset for the two variables we want to compare:
for x in ["Volume", "Area Harvested", "Stocks"]:
    for y in ["Retail Price", "Wholesale Price", "Farmgate Price", "CPI All Income", "CPI Bottom 30", "Value of Production"]:
        farm = df[x]
        market = df[y]

        # Perform the t-test:
        t_stat, p_value = stats.ttest_ind(farm, market)

        # Interpret the results:
        alpha = 0.05
        print(colored(p_value, "red"))
        if p_value < alpha:
            print(f"Reject the null hypothesis; there is a significant difference between {x} and {y}.")
        else:
            print(f"Fail to reject the null hypothesis; there is no significant difference between {x} and {y}.")