Notes
* It appears that the value of each parent entry is the simple (unweighted) average of the values of its child entries (the siblings beneath it).

In [None]:
import pandas as pd
import datetime as dt
from pathlib import Path

# Debug switch; none of the visible cells read it.
DEBUG = False

# Raw price CSVs are read from `datapath`. `writepath` is presumably the
# destination for pre-cleaned output -- no visible cell writes to it yet.
datapath = Path("..") / "datasets" / "prices"
writepath = Path("..") / "datasets" / "prices-preclean"

def fix_month_year_ordering(df):
    """Return *df* with its month columns arranged chronologically.

    The first two columns stay where they are. Every remaining column label
    must parse as ``"%Y %b"`` (for example ``"2018 Jan"``); those labels are
    sorted by date and the frame is re-selected in that order.

    Raises:
        ValueError: if a label after the second column does not parse as
            ``"%Y %b"``.
    """
    month_format = "%Y %b"
    labels = df.columns.tolist()
    fixed, monthly = labels[:2], labels[2:]

    stamps = [dt.datetime.strptime(label, month_format) for label in monthly]
    stamps.sort()
    # The columns are selected by the re-formatted text, so a label only
    # resolves if it round-trips through strptime/strftime unchanged.
    ordered = [stamp.strftime(month_format) for stamp in stamps]
    return df[fixed + ordered]

# Load the first CSV returned by the glob, using its first column as the index.
files = [*datapath.glob("*.csv")]
file = files[0]
df = pd.read_csv(file, index_col=0)
# Remove Ave, Annual, and 2024 columns (any label matching the regex below).
df = df.drop(columns=df.filter(regex="Ave|Annual|2024").columns)
df = fix_month_year_ordering(df)
# Number of rows that still contain at least one NA value.
print(df.isna().any(axis="columns").sum())
 


In [None]:
# Many of the "<year> Annual" columns appear to contain NA values.
# For each year, count the rows with at least one NA in the columns whose
# label matches "<year> Annual".
for year in range(2018, 2025):
    annual_view = df.filter(regex=f"{year} Annual")
    # print(annual_view)  # uncomment to inspect the matched columns
    year_na = annual_view.isna().any(axis="columns").sum()
    print(f"Dataset filtered to {year} has {year_na} rows with NA values")
# df = df[df.columns.drop(list(df.filter(regex='Annual')))]

# null_count = df.isna().any(axis=1).sum()
# print('Number of rows with null values:', null_count)
# # df.dropna(axis=0, inplace=True)
# output_df = df[(df.drop(["Geolocation", "Commodity"], axis=1) != float(0)).any(axis=1)]
# output_df.to_csv("datasets/clean/dummy.csv")

In [None]:
import json

# Build a {national: {region: [province, ...]}} mapping from the Geolocation
# labels of the "0 - ALL ITEMS" rows. The nesting level is read from the
# label's leading dots: the national row equals `natl`, a region row starts
# with "..", and a province / highly urbanized city (HUC) row starts with "....".
df_grouped = df.groupby("Commodity Description")
geolocs = list(df_grouped.get_group("0 - ALL ITEMS").Geolocation)
geolocs_rels = {}
natl = "PHILIPPINES"
region = None  # most recently seen region; provinces are filed under it

# Iterate instead of draining the list with pop(0): each pop(0) shifts the
# whole list (quadratic overall) and leaves `geolocs` empty afterwards.
for loc in map(str, geolocs):
    if loc == natl:
        # National row: start a fresh region table.
        geolocs_rels[natl] = {}
    elif loc.startswith("...."):
        # Province or HUC row; HUCs (labels containing "City") are discarded.
        # This test must come before the ".." test, which it also satisfies.
        if "City" in loc:
            continue
        province = loc.strip(".")
        geolocs_rels[natl][region].append(province)
    elif loc.startswith(".."):
        # Region row: becomes the parent of the province rows that follow.
        region = loc.strip(".")
        geolocs_rels[natl][region] = []

# json.dump escapes non-ASCII by default; the explicit encoding just keeps the
# write independent of the platform's default encoding.
with open("region_provinces.json", "w", encoding="utf-8") as fp:
    json.dump(geolocs_rels, fp)

In [None]:
# Concatenator
import pandas as pd
from pathlib import Path

# Mode switches: CONCAT stacks every CSV row-wise; MERGE inner-joins the first
# two CSVs on their key columns. If both are set, the MERGE result is what
# gets written, because it is assigned last.
CONCAT = True
MERGE = False
datapath = Path("../datasets/New")

# glob() yields files in filesystem order, which is not guaranteed; sorting
# makes the row order (CONCAT) and the left/right sides (MERGE) reproducible.
files = sorted(datapath.glob("*.csv"))

print(files)

if CONCAT:
    # skiprows=2 skips the first two lines of every file before the header.
    # A single concat over all frames avoids re-copying the accumulated frame
    # once per file.
    df1 = pd.concat([pd.read_csv(file, skiprows=2) for file in files], axis=0)
    # Printed inside the branch: df1 does not exist yet when CONCAT is False.
    print(df1.columns)

if MERGE:
    df1 = pd.read_csv(files[0])
    df2 = pd.read_csv(files[1])
    df1 = pd.merge(df1, df2, on=["Geolocation", "Commodity Description"], how='inner')

# df1[~df1["Geolocation"].str.contains("City")]

# to_csv does not create missing directories, so make sure "done/" exists.
outpath = datapath / "done" / "test.csv"
outpath.parent.mkdir(parents=True, exist_ok=True)
df1.to_csv(outpath, index=False)
df1

In [None]:
# Basic Cleanup
# I think it's safe to assume that all 0s are from NAs
# Assumes that the first row is the header row
# (because when initially downloaded from OpenStat, the first two rows are description and newline)
import pandas as pd
import numpy as np
import datetime as dt
from pathlib import Path

def fix_month_year_ordering(df):
    """Return *df* with its month columns arranged chronologically.

    A ``"Type"`` column, if present, is removed from the selection, so the
    returned frame does not contain it. The first two remaining columns stay
    in place; all later labels are parsed as ``"%Y %b"`` (e.g. ``"2018 Jan"``),
    sorted by date and used to re-select the frame. If that parse fails with
    "unconverted data remains", the labels are parsed as ``"%Y %B"``
    (e.g. ``"2018 January"``) instead.

    The working column list is printed as a side effect.

    Raises:
        ValueError: if a month label matches neither format.
    """
    columns = df.columns.tolist()
    if "Type" in columns:
        columns.remove("Type")
    print(columns)

    def _chronological(fmt):
        # Parse, sort, then re-format: the frame is indexed by the formatted text.
        stamps = sorted(dt.datetime.strptime(name, fmt) for name in columns[2:])
        return [stamp.strftime(fmt) for stamp in stamps]

    try:
        month_labels = _chronological("%Y %b")
    except ValueError as err:
        # Text left over after the abbreviated month is the only error that
        # triggers the full-month-name retry; anything else is re-raised.
        has_leftover = len(err.args) > 0 and err.args[0].startswith('unconverted data remains: ')
        if not has_leftover:
            raise
        month_labels = _chronological("%Y %B")

    return df[columns[:2] + month_labels]

datapath = Path("../datasets/economic-indicators/")
files = [*datapath.glob("*.csv")]

print(files)

# Read each indicator CSV as cp1252 and write it back out as UTF-8 under
# "precleaned/", keeping the file name.
for file in files:
    # Both 0 and ".." are read as missing values.
    df = pd.read_csv(file, encoding="cp1252", na_values=[0, ".."])
    # Steps currently disabled, kept for reference:
    # df.drop(list(df.filter(regex="Ave|Annual|2024")), axis=1, inplace=True)  # Remove Ave, Annual, and 2024 Columns
    # zero_rows = df.iloc[:,2:].eq(0).all(axis=1)
    # df.iloc[zero_rows, 2:] = np.nan
    # df = fix_month_year_ordering(df)
    df.to_csv(datapath / "precleaned" / file.name, index=False, encoding="utf-8")


In [14]:
# Value of Agricultural Production
import pandas as pd
import numpy as np
import datetime as dt
from pathlib import Path

datapath = Path("../datasets/new/value")
# Sorted so the row order of the combined file is reproducible; glob() order
# depends on the filesystem.
files = sorted(datapath.glob("*.csv"))

# Read every file once, tag its rows with a Geolocation label, and collect the
# frames. Previously the result of pd.concat was discarded (concat returns a
# new frame rather than modifying df1), so only the first file was written.
frames = []
for file in files:
    df2 = pd.read_csv(file, na_values=[".", ".."])
    # The second "_"-separated token of the file stem is used as the label.
    region = file.stem.split("_")[1]
    df2.insert(1, "Geolocation", region)
    frames.append(df2)

# ignore_index gives the combined frame one continuous row index.
df1 = pd.concat(frames, ignore_index=True)

df1.to_csv("../datasets/value.csv", index=False)
df1
