In [1]:
import os

import pandas as pd

# Data Inflasi

In [2]:
# Load the raw inflation workbook as-is; no header handling is done here,
# so the sheet's own title and header rows arrive as ordinary data rows.
inflation_workbook = 'data/Data Inflasi (2000-2025).xlsx'
df = pd.read_excel(inflation_workbook)

In [3]:
# Display the sheet exactly as loaded: every column is still "Unnamed: N", the real
# header ("No", "Periode", "Data Inflasi") sits in row 3 and the data starts at row 4.
df

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3
0,,,,
1,Data Inflasi,Data Inflasi,Data Inflasi,Data Inflasi
2,,,,
3,No,Periode,Data Inflasi,
4,1,Maret 2025,1.03 %,
...,...,...,...,...
267,264,April 2003,7.62 %,
268,265,Maret 2003,7.17 %,
269,266,Februari 2003,7.6 %,
270,267,Januari 2003,8.68 %,


In [4]:
# Remove the row-number column ('Unnamed: 0') and the trailing column
# ('Unnamed: 3', blank in the preview), then discard the four title/header
# rows that precede the first observation and renumber the index from 0.
df = (
    df.drop(columns=['Unnamed: 0', 'Unnamed: 3'])
    .iloc[4:]
    .reset_index(drop=True)
)

In [5]:
# Give the two remaining columns readable labels. At this point 'Tahun' still
# holds the full "month year" period text (e.g. "Maret 2025").
df = df.rename(
    columns={
        'Unnamed: 1': 'Tahun',
        'Unnamed: 2': 'Inflation Percentage',
    }
)

In [6]:
# Check the result: only the period ('Tahun') and 'Inflation Percentage' columns
# remain, and the index restarts at 0 on the most recent period.
df

Unnamed: 0,Tahun,Inflation Percentage
0,Maret 2025,1.03 %
1,Februari 2025,-0.09 %
2,Januari 2025,0.76 %
3,Desember 2024,1.57 %
4,November 2024,1.55 %
...,...,...
263,April 2003,7.62 %
264,Maret 2003,7.17 %
265,Februari 2003,7.6 %
266,Januari 2003,8.68 %


In [7]:
# Split the combined period label (e.g. "Maret 2025") into the month name
# ('Bulan') and the year ('Tahun').
# The guard keeps the cell re-runnable: once 'Tahun' holds only the year,
# splitting it again yields a single column and the two-column assignment raises.
if 'Bulan' not in df.columns:
    # Strip and split once on any run of whitespace so a doubled or trailing
    # space cannot produce extra (or empty) columns.
    df[['Bulan', 'Tahun']] = df['Tahun'].str.strip().str.split(n=1, expand=True)

In [8]:
# Check the split: 'Tahun' now holds only the year and 'Bulan' the month name;
# 'Inflation Percentage' is still text such as "1.03 %".
df

Unnamed: 0,Tahun,Inflation Percentage,Bulan
0,2025,1.03 %,Maret
1,2025,-0.09 %,Februari
2,2025,0.76 %,Januari
3,2024,1.57 %,Desember
4,2024,1.55 %,November
...,...,...,...
263,2003,7.62 %,April
264,2003,7.17 %,Maret
265,2003,7.6 %,Februari
266,2003,8.68 %,Januari


In [9]:
# Create the output folder first: on a fresh checkout it does not exist yet and
# to_csv does not create missing directories.
os.makedirs('data-cleaned', exist_ok=True)

# The row index is written as the first (unnamed) column, as before.
df.to_csv('data-cleaned/inflation_data_cleaned.csv')

# Data Clean UMP

In [81]:
import pandas as pd
import numpy as np

def _ump_years_from_header(raw: pd.DataFrame, fallback: list) -> list:
    """Return the year labels stored in the second row of a raw UMP export, or `fallback` if that row is not all numeric."""
    if len(raw) < 2 or raw.shape[1] < 2:
        return fallback
    # Skip the first column (province names); the remaining cells hold the year labels.
    labels = pd.to_numeric(raw.iloc[1, 1:], errors='coerce')
    if labels.isna().any():
        return fallback
    return [int(label) for label in labels]


def load_ump_data(start_year: int, end_year: int, folder: str = "data") -> pd.DataFrame:
    """
    Load and clean one provincial minimum-wage (UMP) CSV export.

    The file is looked up as
    ``{folder}/Upah Minimum Regional_Propinsi, {start_year}-{end_year}.csv``.
    Its first two rows (a title row and a row of year labels) are dropped from
    the returned data.

    Year columns are labelled from the year row inside the file, so an export
    that skips a year (for example 2015, 2016, 2018) is labelled correctly.
    If that row cannot be read as numbers, every year from ``start_year`` to
    ``end_year`` is assumed, one per value column.

    Parameters:
        start_year (int): First year in the file name (inclusive).
        end_year (int): Last year in the file name (inclusive).
        folder (str): Path to the folder where the CSV file is stored.

    Returns:
        pd.DataFrame: A 'Provinsi' column plus one numeric column per year,
        labelled with the year as an int. "-" entries and other non-numeric
        values become missing and are then filled with the mean of all year
        values in the file (all provinces, all years).

    Raises:
        KeyError: If the fallback year range names more years than the file
            has value columns.
    """
    # Build the filename
    filename = f"{folder}/Upah Minimum Regional_Propinsi, {start_year}-{end_year}.csv"
    raw = pd.read_csv(filename)

    # Prefer the labels written in the file over the assumed contiguous range
    year = _ump_years_from_header(raw, list(range(start_year, end_year + 1)))

    # Rename columns: pandas names the header-less value columns 'Unnamed: 1', 'Unnamed: 2', ...
    column_renames = {'38 Provinsi': 'Provinsi'}
    for i, y in enumerate(year):
        column_renames[f'Unnamed: {i + 1}'] = y
    df = raw.iloc[2:].reset_index(drop=True).rename(columns=column_renames)

    # Replace "-" with NaN and convert to numeric
    df = df.replace("-", np.nan)
    for y in year:
        df[y] = pd.to_numeric(df[y], errors='coerce')

    # Fill missing values with the global mean (stack() pools every year column into one series)
    global_mean = df[year].stack().mean()
    df[year] = df[year].fillna(global_mean)

    return df


In [98]:
import os
import pandas as pd
import numpy as np
import re

# Where the raw exports are read from and where cleaned files are written
raw_folder, clean_folder = "data", "data-cleaned"

# Create the output folder up front; exist_ok makes this safe to re-run
os.makedirs(clean_folder, exist_ok=True)

# Clean a single raw UMP export
def clean_umr_file(filepath, years):
    """
    Read one raw UMP CSV and return it with labelled, numeric year columns.

    Parameters:
        filepath: Path of the CSV file to read.
        years: Year labels for the value columns, in column order; the k-th
            entry names the column pandas loaded as 'Unnamed: k' (k from 1).

    Returns:
        pd.DataFrame: The data without its first two rows, with '38 Provinsi'
        renamed to 'Provinsi' and one numeric column per year, labelled with
        the year as a string. "-" and other non-numeric entries become NaN and
        are then filled with the mean of all values across the year columns.
    """
    year_cols = [str(y) for y in years]

    # Map pandas' placeholder column names onto the year labels
    new_names = {f'Unnamed: {pos}': col for pos, col in enumerate(year_cols, start=1)}
    new_names['38 Provinsi'] = 'Provinsi'

    table = (
        pd.read_csv(filepath)
        .iloc[2:]  # the first two rows are header rows, not data
        .reset_index(drop=True)
        .rename(columns=new_names)
        .replace("-", np.nan)
    )

    # Anything that still is not a number becomes NaN
    for col in year_cols:
        table[col] = pd.to_numeric(table[col], errors='coerce')

    # One mean over every year column, used to fill all remaining gaps
    overall_mean = table[year_cols].stack().mean()
    table[year_cols] = table[year_cols].fillna(overall_mean)
    return table

# Clean every CSV in the raw folder whose name carries a "YYYY-YYYY" year range
year_range_pattern = re.compile(r'(\d{4})-(\d{4})')

for filename in os.listdir(raw_folder):
    match = year_range_pattern.search(filename)
    if not (match and filename.endswith(".csv")):
        continue

    start_year, end_year = (int(part) for part in match.groups())

    # Files ending after 2014 are left out of this batch
    # (presumably because the 2015-2018 export skips 2017 - it is cleaned by hand below).
    if end_year > 2014:
        continue

    # This loop assumes one value column per year in the range
    years = list(range(start_year, end_year + 1))
    cleaned_df = clean_umr_file(os.path.join(raw_folder, filename), years)

    # Save cleaned DataFrame without the row index
    output_path = os.path.join(clean_folder, f"UMP_{start_year}-{end_year}.csv")
    cleaned_df.to_csv(output_path, index=False)
    print(f"✅ Saved cleaned file to: {output_path}")


✅ Saved cleaned file to: data-cleaned\UMP_2000-2002.csv
✅ Saved cleaned file to: data-cleaned\UMP_2003-2005.csv
✅ Saved cleaned file to: data-cleaned\UMP_2006-2008.csv
✅ Saved cleaned file to: data-cleaned\UMP_2009-2011.csv
✅ Saved cleaned file to: data-cleaned\UMP_2012-2014.csv


In [99]:
# The 2015-2018 export has no 2017 column, so its year labels are listed explicitly
year = [2015, 2016, 2018]
filename = 'data/Upah Minimum Regional_Propinsi, 2015-2018.csv'

# Preview the raw file: row 0 is a title row, row 1 carries the year labels
df = pd.read_csv(filename)
df

Unnamed: 0,38 Provinsi,Unnamed: 1,Unnamed: 2,Unnamed: 3
0,,Upah Minimum Regional/Propinsi (Rupiah),,
1,,2015,2016,2018
2,ACEH,1900000,2118500,2700000
3,SUMATERA UTARA,1625000,1811875,2132189
4,SUMATERA BARAT,1615000,1800725,2119067
5,RIAU,1878000,2095000,2464154
6,JAMBI,1710000,1906650,2243719
7,SUMATERA SELATAN,1974346,2206000,2595995
8,BENGKULU,1500000,1605000,1888741
9,LAMPUNG,1581000,1763000,2074673


In [100]:
# Re-read the raw 2015-2018 export and drop its two header rows
df = pd.read_csv(filename).iloc[2:].reset_index(drop=True)

# Rename columns: province column plus one column per listed year (int labels)
column_renames = {'38 Provinsi': 'Provinsi'}
column_renames.update({f'Unnamed: {i + 1}': y for i, y in enumerate(year)})
df.rename(columns=column_renames, inplace=True)

# Replace "-" with NaN and convert the year columns to numeric
df.replace("-", np.nan, inplace=True)
df[year] = df[year].apply(pd.to_numeric, errors='coerce')

# Fill missing values with the mean over all year columns
global_mean = df[year].stack().mean()
df[year] = df[year].fillna(global_mean)

# The row index is written as the first (unnamed) column of this file
df.to_csv('data-cleaned/ump_2015-2018.csv')