<a href="https://colab.research.google.com/github/Bourbon-Rye/Baesian-Cropability/blob/main/PilipiNuts_2023_Baesian_Cropability.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
# @Libraries
import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
import json
import re

from pathlib import Path
from sklearn import preprocessing
from plotly.subplots import make_subplots
from scipy.stats import chi2_contingency

In [None]:
# @Utilities
def read_csv_to_df(csvfile: Path, comment_symbol='#') -> pd.DataFrame:
    """Read a CSV into a DataFrame, skipping whole-line comments.

    A line counts as a comment only when it starts with ``comment_symbol``;
    the symbol is not recognized anywhere else in a line.

    The filtering is done in memory: ``csvfile`` is never rewritten (so its
    comment lines survive repeated reads) and no temporary file is created
    in the working directory.

    Args:
        csvfile: Path of the CSV file to read.
        comment_symbol: Prefix marking a comment line. An empty string
            disables comment filtering.

    Returns:
        The DataFrame parsed by ``pd.read_csv`` from the non-comment lines.
    """
    import io  # function-scope import keeps this helper self-contained

    # utf-8 matches the encoding pd.read_csv assumes when given a path.
    with open(csvfile, 'r', encoding='utf-8') as csv:
        kept_lines = [
            line for line in csv
            if not (comment_symbol and line.startswith(comment_symbol))
        ]
    return pd.read_csv(io.StringIO(''.join(kept_lines)))

# Load the region -> provinces mapping and derive lower-cased lookup sets.
with open('datasets/region_provinces.json') as jsonfile:
    regions_provinces = json.load(jsonfile)

# Every province listed under any region of the 'PHILIPPINES' entry.
provinces = {
    province.lower()
    for members in regions_provinces['PHILIPPINES'].values()
    for province in members
}
# The region names themselves (the keys of the 'PHILIPPINES' entry).
regions = {region.lower() for region in regions_provinces['PHILIPPINES']}

def is_region(x: str, regions=regions) -> bool:
    """Return True when ``x`` is a member of ``regions``.

    Leading/trailing spaces and dots are stripped from ``x`` and it is
    lower-cased before the membership test.
    """
    return x.strip(' .').lower() in regions
    
def is_province(x: str, provinces=provinces) -> bool:
    """Return True when ``x`` is a member of ``provinces``.

    Leading/trailing spaces and dots are stripped from ``x`` and it is
    lower-cased before the membership test.
    """
    return x.strip(' .').lower() in provinces

def get_quarter_columns(df: pd.DataFrame, year_range: range, period_idx: int):
    """Average each year's period columns in groups of three ("quarters").

    Assumes the period columns are contiguous, that every column before
    ``period_idx`` is an ID column, and that each year contributes up to
    twelve columns in chronological order (presumably monthly), so that
    positions 0-2 form Q1, 3-5 form Q2, and so on.

    Args:
        df: Wide table of ID columns followed by period columns whose names
            contain the year.
        year_range: Years to aggregate.
        period_idx: Positional index of the first period column.

    Returns:
        A new DataFrame holding the ID columns plus one ``"<year> Q<n>"``
        column per year and quarter. ``df`` itself is left untouched.
    """
    # Copy so that adding columns never writes into a slice of the caller's frame.
    df_quarter = df.iloc[:, :period_idx].copy()
    # Match years against period columns only, so an ID column whose name
    # happens to contain the year cannot shift the quarter positions.
    period_columns = df.iloc[:, period_idx:]
    for year in year_range:
        year_columns = period_columns.filter(regex=str(year), axis=1)
        for quarter, start in enumerate(range(0, 12, 3), start=1):
            df_quarter[f"{year} Q{quarter}"] = year_columns.iloc[:, start:start + 3].mean(axis=1)
    return df_quarter

def get_annual_columns(df: pd.DataFrame, year_range: range, period_idx: int):
    """Average each year's period columns into a single annual column.

    Assumes the period columns are contiguous and that every column before
    ``period_idx`` is an ID column.

    Args:
        df: Wide table of ID columns followed by period columns whose names
            contain the year.
        year_range: Years to aggregate.
        period_idx: Positional index of the first period column.

    Returns:
        A new DataFrame holding the ID columns plus one ``"<year>"`` column
        per year with the row-wise mean of that year's period columns.
        ``df`` itself is left untouched.
    """
    # Copy so that adding columns never writes into a slice of the caller's frame.
    df_annual = df.iloc[:, :period_idx].copy()
    # Match years against period columns only, so an ID column whose name
    # happens to contain the year is not averaged in.
    period_columns = df.iloc[:, period_idx:]
    for year in year_range:
        df_annual[f"{year}"] = period_columns.filter(regex=str(year), axis=1).mean(axis=1)
    return df_annual

def swap_columns(df: pd.DataFrame, col1: str, col2: str):
    """Return ``df`` with the positions of columns ``col1`` and ``col2`` exchanged.

    Raises ValueError (from ``list.index``) if either name is not a column.
    """
    order = list(df.columns)
    first, second = (order.index(name) for name in (col1, col2))
    order[first], order[second] = order[second], order[first]
    return df[order]

def normalize(df: pd.DataFrame):
    """Min-max scale the values of ``df`` with sklearn's ``MinMaxScaler``.

    The result is built from the scaled array alone, so it carries a
    default index and default integer column labels rather than those of
    ``df``.
    """
    scaler = preprocessing.MinMaxScaler()
    scaled_values = scaler.fit_transform(df.values)  # df.values is a numpy array
    return pd.DataFrame(scaled_values)

def dual_plot(df: pd.DataFrame, y1: str, y2: str,
              x_title: str, y1_title: str, y2_title: str, title_text: str):
    """Show two columns of ``df`` against its index on a dual-axis figure.

    ``y1`` is drawn on the primary y-axis and ``y2`` on the secondary one;
    each trace is named after its column. The figure is displayed with
    ``fig.show()`` and nothing is returned.
    """
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    # One Scatter trace per column: y1 on the primary axis, y2 on the secondary.
    for column, on_secondary in ((y1, False), (y2, True)):
        fig.add_trace(
            go.Scatter(x=df.index, y=df[column], name=column),
            secondary_y=on_secondary,
        )

    # Figure title and shared x-axis title.
    fig.update_layout(title_text=title_text)
    fig.update_xaxes(title_text=x_title)

    # One title per y-axis.
    for axis_title, on_secondary in ((y1_title, False), (y2_title, True)):
        fig.update_yaxes(title_text=axis_title, secondary_y=on_secondary)

    fig.show()

# Baesian Plots


## Yenzy Plots
H0.2: There is no significant relationship between market conditions and food crop production.

Goal: Visually and statistically assess the relationship between market indicators and crop yield.

Visual tests are dual plots and scatterplots. Statistics for relationship testing: contingency table and chi-square test.

**Tested Assessments**
| Crop Yield Indicator | Market Indicator | Geolocation | Commodity |
| --- | --- | --- | --- |
| Volume | Retail Price | Philippines | Palay and Corn | 

In [None]:
# Sample relationship test: chi-square test of independence on a 2x3 contingency table
data = [[207, 282, 241], [234, 242, 232]]
stat, p, dof, expected = chi2_contingency(data)

# Compare the p-value with the significance level
alpha = 0.05
print(f"p value is {p!s}")
print('Dependent (reject H0)' if p <= alpha else 'Independent (H0 holds true)')

In [None]:
# Quarterly volume of production per commodity, using only the PHILIPPINES rows
df2 = read_csv_to_df("datasets/agricultural-indicators/Volume_Rice and Corn_quarterly.csv")
df2 = df2.loc[df2["Geolocation"].eq("PHILIPPINES")]

# Wide -> long: one row per (Commodity, Geolocation, Period)
df2 = df2.melt(
    id_vars=["Commodity", "Geolocation"],
    value_vars=df2.columns[2:],
    var_name="Period",
    value_name="Volume",
)

fig = px.line(
    df2,
    x="Period",
    y="Volume",
    color="Commodity",
    title='Quarterly Volume of Production of Crops across the Years<br>National Averages in Metric Tons',
)
fig.update_layout(xaxis_title="Period", yaxis_title="Volume of Production")
fig.show()

In [None]:
# Annual Volume and Retail Price
# Volume of Rice: regional palay rows, relabelled "Rice" so they merge with the price table
df1 = read_csv_to_df("datasets/agricultural-indicators/Volume_Rice and Corn_annual.csv")
df1 = df1[df1["Geolocation"].apply(is_region)]
# assign() returns a new frame, so nothing is written into a filtered slice
df1 = df1[df1["Commodity"] == "Palay"].assign(Commodity="Rice")
df1 = df1.melt(id_vars=["Commodity", "Geolocation"], value_vars=df1.columns[2:], var_name="Period", value_name="Volume")
df1 = swap_columns(df1, "Commodity", "Geolocation")

# Retail Price of Rice: regional rows averaged per year
df2 = read_csv_to_df("datasets/prices/prices_retail_2012-2023.csv")
# Non-inplace dropna avoids mutating a slice of the frame that was just filtered
df2 = df2[df2["Geolocation"].apply(is_region)].dropna()
df2 = get_annual_columns(df2, range(2012, 2024), 2).dropna()
df2 = df2[df2["Commodity"] == 'RICE, REGULAR-MILLED, 1 KG'].assign(Commodity="Rice")
df2 = df2.melt(id_vars=["Geolocation", "Commodity"], value_vars=df2.columns[2:], var_name="Period", value_name="Price")

# Volume and Price DataFrame: join on the three columns both tables share
df = pd.merge(df1, df2, on=["Geolocation", "Commodity", "Period"])
# Drop the leading dots from the region labels before plotting
df["Geolocation"] = df["Geolocation"].str.lstrip(".")
fig = px.scatter(df, x="Volume", y="Price", color="Geolocation")
fig.show()

In [None]:
# Annual
# Production volume in long format, keeping only the PHILIPPINES rows
df2 = read_csv_to_df("datasets/agricultural-indicators/Volume_Rice and Corn_annual.csv")
df2 = df2.melt(id_vars=["Geolocation", "Commodity"], value_vars=df2.columns[2:], var_name="Period", value_name="Volume")
df_volume = df2[df2["Geolocation"] == "PHILIPPINES"]

# Regional retail prices averaged per year
df2 = read_csv_to_df("datasets/prices/prices_retail_2012-2023.csv")
# Non-inplace dropna avoids mutating a slice of the frame that was just filtered
df2 = df2[df2["Geolocation"].apply(is_region)].dropna()
df2 = get_annual_columns(df2, range(2012, 2024), 2)
# The melted values come from the retail price table, so label them "Price".
# The result is left as a bare expression so the notebook cell displays it.
df2.melt(id_vars=["Geolocation", "Commodity"], value_vars=df2.columns[2:], var_name="Year", value_name="Price")
