Notes
* It appears that each parent entry's value is the simple (unweighted) average of its child entries (the siblings beneath it); this has not been verified rigorously.

In [136]:
import pandas as pd
import datetime as dt
from pathlib import Path

# Debug switch; not referenced by any code visible in this section.
DEBUG = False

# Directory that the raw price CSV files are globbed from in this cell.
datapath = Path("../datasets/prices")
# NOTE(review): presumably the output directory for pre-cleaned files -- it is
# not used by the code visible here; confirm against the rest of the notebook.
writepath = Path("../datasets/prices-preclean")

def fix_month_year_ordering(df, n_id_cols=2, date_format="%Y %b"):
    """Return a copy of ``df`` with its month columns in chronological order.

    The first ``n_id_cols`` columns stay where they are. Every remaining
    column label is parsed with ``date_format`` and those columns are
    rearranged from oldest to newest.

    Labels are used only as sort keys and are never rewritten, and the
    columns are selected by position. A label that parses correctly but is
    not in canonical form (for example ``"2019 feb"``) therefore keeps its
    original spelling instead of triggering a ``KeyError``.

    Args:
        df: DataFrame whose columns after the leading ``n_id_cols`` are
            string labels matching ``date_format``.
        n_id_cols: Number of leading non-date columns to leave in place.
        date_format: ``strptime`` pattern describing the date labels.

    Returns:
        A new DataFrame with the same labels and data, columns reordered.

    Raises:
        ValueError: If a label after the leading columns does not match
            ``date_format``.
    """
    labels = df.columns.tolist()
    # Sort column *positions* by the date parsed from each label; sorted() is
    # stable, so equal dates keep their original relative order.
    date_positions = sorted(
        range(n_id_cols, len(labels)),
        key=lambda pos: dt.datetime.strptime(labels[pos], date_format),
    )
    leading_positions = list(range(min(n_id_cols, len(labels))))
    return df.iloc[:, leading_positions + date_positions]

# glob() yields files in an OS-dependent order; sort so that "the first file"
# is the same one on every machine and every run.
files = sorted(datapath.glob("*.csv"))
if not files:
    raise FileNotFoundError(f"No CSV files found in {datapath.resolve()}")
file = files[0]
df = pd.read_csv(file, index_col=0)
# Remove the Ave, Annual, and 2024 columns (any column whose label matches the
# regex). A fresh frame is assigned rather than mutating in place so the cell
# can be re-run safely.
df = df.drop(columns=df.filter(regex="Ave|Annual|2024").columns)
df = fix_month_year_ordering(df)
# Number of rows that still contain at least one missing value.
print(df.isna().any(axis=1).sum())
 


40120


In [20]:
# We noticed that a lot of the "<year> Annual" values are NA, so we count the
# affected rows for each year.
# The working `df` already had every "Annual" column dropped when it was
# loaded, so filtering it would match no columns and always report 0 rows.
# Re-read the raw file to inspect the columns as they were exported.
raw_df = pd.read_csv(file, index_col=0)
for year in range(2018, 2025):
    year_na = raw_df.filter(regex=f"{year} Annual").isnull().any(axis=1).sum()
    print(f"Dataset filtered to {year} has {year_na} rows with NA values")
# df = df[df.columns.drop(list(df.filter(regex='Annual')))]

# null_count = df.isna().any(axis=1).sum()
# print('Number of rows with null values:', null_count)
# # df.dropna(axis=0, inplace=True)
# output_df = df[(df.drop(["Geolocation", "Commodity"], axis=1) != float(0)).any(axis=1)]
# output_df.to_csv("datasets/clean/dummy.csv")

Dataset filtered to 2018 has 6018 rows with NA values
Dataset filtered to 2019 has 6018 rows with NA values
Dataset filtered to 2020 has 6018 rows with NA values
Dataset filtered to 2021 has 6018 rows with NA values
Dataset filtered to 2022 has 6018 rows with NA values
Dataset filtered to 2023 has 6018 rows with NA values
Dataset filtered to 2024 has 6018 rows with NA values


In [None]:
import json
# Build a {national: {region: [province, ...]}} mapping from the Geolocation
# labels and write it to region_provinces.json.
df_grouped = df.groupby("Commodity Description")
# The loop below relies on row order: the national row must come first and each
# region row must precede its provinces.
# NOTE(review): assumes the "0 - ALL ITEMS" rows are in that order -- confirm.
geolocs = [str(loc) for loc in df_grouped.get_group("0 - ALL ITEMS")["Geolocation"]]
geolocs_rels = {}
natl = "PHILIPPINES"
region = None
for loc in geolocs:
    if loc == natl:
        # National case
        geolocs_rels[natl] = {}
    elif loc.startswith("...."):
        # Province or HUC case (discard if HUC). Must be tested before the
        # two-dot branch because a four-dot label also starts with "..".
        if "City" in loc:
            continue
        if region is None:
            raise ValueError(f"Province row {loc!r} appears before any region row")
        # lstrip: only the leading dots encode the hierarchy level.
        geolocs_rels[natl][region].append(loc.lstrip("."))
    elif loc.startswith(".."):
        # Region case
        if natl not in geolocs_rels:
            raise ValueError(f"Region row {loc!r} appears before the {natl!r} row")
        region = loc.lstrip(".")
        geolocs_rels[natl][region] = []

with open('region_provinces.json', 'w', encoding="utf-8") as fp:
    json.dump(geolocs_rels, fp)

In [126]:
# Concatenator
import pandas as pd
from pathlib import Path

# Mode switches: CONCAT stacks every CSV in `datapath` row-wise; MERGE
# inner-joins the first two CSVs on their key columns. If both are set, the
# merged frame replaces the concatenated one.
CONCAT = True
MERGE = False
datapath = Path("../datasets/New")

if not (CONCAT or MERGE):
    raise ValueError("Enable CONCAT or MERGE; otherwise there is no df1 to write")

# Sorted so the row order of the result does not depend on the OS glob order.
# NOTE(review): the folder listing contains names such as "2M4ARN01 (1).csv",
# which look like duplicate downloads -- confirm they should be included.
files = sorted(datapath.glob("*.csv"))
if not files:
    raise FileNotFoundError(f"No CSV files found in {datapath.resolve()}")

print(files)

if CONCAT:
    # Each file is read with its first two lines skipped, then all frames are
    # stacked in a single concat call instead of re-copying the accumulated
    # frame once per file.
    frames = [pd.read_csv(csv_path, skiprows=2) for csv_path in files]
    df1 = pd.concat(frames, axis=0)
    # Printed inside the branch: df1 does not exist yet when CONCAT is off.
    print(df1.columns)

if MERGE:
    df1 = pd.read_csv(files[0])
    df2 = pd.read_csv(files[1])
    df1 = pd.merge(df1, df2, on=["Geolocation", "Commodity Description"], how='inner')

# df1[~df1["Geolocation"].str.contains("City")]

# Make sure the output folder exists before writing into it.
outpath = Path(datapath, "done/test.csv")
outpath.parent.mkdir(parents=True, exist_ok=True)
df1.to_csv(outpath, index=False)
df1

[WindowsPath('../datasets/New/2M4ARN01 (1).csv'), WindowsPath('../datasets/New/2M4ARN01.csv'), WindowsPath('../datasets/New/2M4ARN02.csv'), WindowsPath('../datasets/New/2M4ARN03.csv'), WindowsPath('../datasets/New/2M4ARN04.csv'), WindowsPath('../datasets/New/2M4ARN05.csv'), WindowsPath('../datasets/New/2M4ARN06.csv'), WindowsPath('../datasets/New/2M4ARN07 (1).csv'), WindowsPath('../datasets/New/2M4ARN07.csv'), WindowsPath('../datasets/New/2M4ARN08.csv'), WindowsPath('../datasets/New/2M4ARN09.csv'), WindowsPath('../datasets/New/2M4ARN10.csv'), WindowsPath('../datasets/New/2M4ARN11 (1).csv'), WindowsPath('../datasets/New/2M4ARN11 (2).csv'), WindowsPath('../datasets/New/2M4ARN11.csv')]
Index(['Region/Province', 'Commodity', '2012 January', '2012 February',
       '2012 March', '2012 April', '2012 May', '2012 June', '2012 July',
       '2012 August', '2012 September', '2012 October', '2012 November',
       '2012 December', '2013 January', '2013 February', '2013 March',
       '2013 April'

Unnamed: 0,Region/Province,Commodity,2012 January,2012 February,2012 March,2012 April,2012 May,2012 June,2012 July,2012 August,...,2017 March,2017 April,2017 May,2017 June,2017 July,2017 August,2017 September,2017 October,2017 November,2017 December
0,PHILIPPINES,"CORN, WHOLE GRAIN, YELLOW, 1 KG",20.89,20.76,21.31,21.39,21.06,21.55,21.55,21.55,...,23.02,23.02,23.02,23.02,23.02,23.02,23.02,22.77,22.77,22.77
1,PHILIPPINES,"WHOLE CORN GRAIN, POPCORN, YELLOW, 1 KG",18.48,18.48,18.48,18.48,18.48,18.48,20.79,20.79,...,32.50,32.50,32.50,32.50,32.50,32.50,32.50,32.50,32.50,32.50
2,PHILIPPINES,"WHOLE CORN GRAIN, WHITE, 1 KG",25.02,24.69,23.85,24.18,24.22,24.45,24.72,24.74,...,26.94,27.06,27.01,26.63,26.88,26.88,27.18,27.26,27.58,27.68
3,PHILIPPINES,"WHOLE CORN GRAIN, YELLOW, 1 KG",24.31,25.10,25.13,25.04,25.01,24.86,24.77,24.58,...,25.08,25.14,25.17,25.30,25.33,25.46,25.90,25.96,25.66,25.74
4,PHILIPPINES,"WHOLE CORN ON THE COB, SWEET CORN, 1 KG",29.07,29.21,29.29,29.63,29.98,30.83,31.50,31.55,...,38.40,38.48,39.00,38.79,38.70,38.98,40.44,39.64,39.51,39.41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,....Tawi-Tawi,"FRESH FISH, MACKEREL, ALUMAHAN, MEDIUM, 1 KG",..,..,..,..,..,..,..,..,...,..,..,..,..,..,..,..,..,..,..
996,....Tawi-Tawi,"FRESH FISH, MACKEREL, HASA-HASA, MEDIUM, 1 KG",..,..,..,..,..,..,..,..,...,..,..,..,..,..,..,..,..,..,..
997,....Tawi-Tawi,"FRESH FISH, MILKFISH, BANGUS, DAGUPAN, MEDIUM,...",..,..,..,..,..,..,..,..,...,..,..,..,..,..,..,..,..,..,..
998,....Tawi-Tawi,"FRESH FISH, MILKFISH, BANGUS, MEDIUM, 1 KG",..,..,..,..,..,..,..,..,...,..,..,..,..,..,..,..,..,..,..


In [129]:
# Basic Cleanup
import pandas as pd
from pathlib import Path

datapath = Path("../datasets/prices")
# Materialise the glob as a list: a bare generator would be exhausted by the
# print below, leaving `files` empty for any later use. Sorted so the listing
# order is stable across platforms.
files = sorted(datapath.glob("*.csv"))

print(*files)



..\datasets\prices\prices_dealers-fertilizers_1990-2019.csv ..\datasets\prices\prices_farmgate-new-series_2010-2023.csv ..\datasets\prices\prices_retail-2012-based_2012-2017.csv ..\datasets\prices\prices_retail-2018-based_2018-2023.csv ..\datasets\prices\prices_wholesale-new-series_2010-2023.csv
