In [80]:
import pandas as pd
import os
from pathlib import Path

In [81]:
# Define column mapping to match data.md convention
column_mapping = {
    'pm10': 'pm_sepuluh',
    'so2': 'sulfur_dioksida',
    'co': 'karbon_monoksida',
    'o3': 'ozon',
    'no2': 'nitrogen_dioksida',
    'critical': 'parameter_pencemar_kritis',
    'categori': 'kategori',
    'lokasi_spku': 'stasiun'
}

# Define column order to match data.md
column_order = [
    'periode_data', 'tanggal', 'stasiun', 'pm_sepuluh',
    'sulfur_dioksida', 'karbon_monoksida', 'ozon', 'nitrogen_dioksida',
    'max', 'parameter_pencemar_kritis', 'kategori'
]


final_year = 2019

# Define the folder path and years to combine
ispu_folder = Path("data/ISPU")
years_to_combine = [i for i in range(2015, final_year + 1, 1)]

# List to store dataframes
dfs = []

# Load and standardize files that contain the desired years
for file_path in ispu_folder.glob("*.csv"):
    # Check if any of the desired years is in the filename
    for year in years_to_combine:
        if str(year) in file_path.name:
            df_temp = pd.read_csv(file_path)
            
            print(f"Loaded: {file_path.name}")
            print(f"  Original columns: {list(df_temp.columns)}")
            
            # Rename columns to match convention
            df_temp = df_temp.rename(columns=column_mapping)
            
            # Add missing columns with None
            for col in column_order:
                if col not in df_temp.columns:
                    df_temp[col] = None
            
            # Reorder columns to match data.md
            df_temp = df_temp[column_order]
            
            print(f"  Standardized to: {list(df_temp.columns)}")
            print(f"  Rows: {len(df_temp)}\n")
            
            dfs.append(df_temp)
            break

print(f"Total files loaded: {len(dfs)}")

# Combine all dataframes
df_combined = pd.concat(dfs, ignore_index=True)

Loaded: indeks-standar-pencemaran-udara-(ispu)-tahun-2015-komponen-data.csv
  Original columns: ['periode_data', 'tanggal', 'pm10', 'so2', 'co', 'o3', 'no2', 'max', 'critical', 'categori', 'lokasi_spku']
  Standardized to: ['periode_data', 'tanggal', 'stasiun', 'pm_sepuluh', 'sulfur_dioksida', 'karbon_monoksida', 'ozon', 'nitrogen_dioksida', 'max', 'parameter_pencemar_kritis', 'kategori']
  Rows: 365

Loaded: indeks-standar-pencemaran-udara-(ispu)-tahun-2016-komponen-data.csv
  Original columns: ['periode_data', 'tanggal', 'stasiun', 'pm10', 'so2', 'co', 'o3', 'no2', 'max', 'critical', 'categori']
  Standardized to: ['periode_data', 'tanggal', 'stasiun', 'pm_sepuluh', 'sulfur_dioksida', 'karbon_monoksida', 'ozon', 'nitrogen_dioksida', 'max', 'parameter_pencemar_kritis', 'kategori']
  Rows: 1830

Loaded: indeks-standar-pencemaran-udara-(ispu)-tahun-2017-komponen-data.csv
  Original columns: ['periode_data', 'tanggal', 'stasiun', 'pm10', 'so2', 'co', 'o3', 'no2', 'max', 'critical', 'cate

In [82]:
# Replace empty strings and "---" with NULL (NaN)
df_combined = df_combined.replace(['', '---', ' ', '  '], pd.NA)
df_combined = df_combined[df_combined['kategori'] != 'TIDAK ADA DATA']

df_combined['tanggal'] = pd.to_datetime(
    df_combined['tanggal'], 
    format='mixed',  # Allows multiple formats
    dayfirst=False,  # Month comes first in ambiguous cases
    errors='raise'
)

# Convert pollutant columns to integers (pd.NA will remain as NaN for nullable Int64)
numeric_columns = ['pm_sepuluh', 'sulfur_dioksida', 
                   'karbon_monoksida', 'ozon', 'nitrogen_dioksida', 'max']

for col in numeric_columns:
    df_combined[col] = pd.to_numeric(df_combined[col], errors='coerce').astype('Int64')

In [84]:
import re

# Function to extract station ID (DKIx format)
def extract_station_id(station_name):
    if pd.isna(station_name):
        return station_name
    # Extract DKI followed by a number
    match = re.search(r'DKI\d+', str(station_name), re.IGNORECASE)
    if match:
        return match.group(0).upper()
    return station_name

# Apply normalization to stasiun column
df_combined['stasiun'] = df_combined['stasiun'].apply(extract_station_id)

In [86]:
df_combined['ID'] = df_combined['tanggal'].astype(str) + '_' + df_combined['stasiun'].astype(str)
df_combined = df_combined[['ID'] + column_order]
df_combined = df_combined.sort_values(by=['tanggal']).reset_index(drop=True)
df_combined = df_combined.ffill()

In [87]:
df_combined.info()

<class 'pandas.DataFrame'>
RangeIndex: 4663 entries, 0 to 4662
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   ID                         4663 non-null   str           
 1   periode_data               4663 non-null   int64         
 2   tanggal                    4663 non-null   datetime64[us]
 3   stasiun                    4663 non-null   str           
 4   pm_sepuluh                 4663 non-null   Int64         
 5   sulfur_dioksida            4663 non-null   Int64         
 6   karbon_monoksida           4663 non-null   Int64         
 7   ozon                       4663 non-null   Int64         
 8   nitrogen_dioksida          4663 non-null   Int64         
 9   max                        4663 non-null   Int64         
 10  parameter_pencemar_kritis  4663 non-null   str           
 11  kategori                   4663 non-null   str           
dtypes: Int64(6), date

In [88]:
df_combined.to_csv('data/ispu_2015-2019.csv', index=False)

# QC

In [90]:
# Check for issues that might need cleanup
print("=== Data Quality Check ===\n")

# 1. Check for duplicate IDs
duplicates = df_combined[df_combined['ID'].duplicated(keep=False)]
print(f"Duplicate IDs: {len(duplicates)}")
if len(duplicates) > 0:
    print(duplicates[['ID', 'tanggal', 'stasiun', 'kategori']].head())

# 2. Check for missing values in key columns
print("\nMissing values in critical columns:")
print(df_combined[['ID', 'tanggal', 'stasiun', 'kategori']].isnull().sum())

# 3. Check for invalid station IDs
print("\nUnique station IDs:")
print(sorted(df_combined['stasiun'].unique()))

# 4. Check kategori values
print("\nKategori distribution:")
print(df_combined['kategori'].value_counts())

# 5. Check for invalid dates
print(f"\nInvalid dates (NaT): {df_combined['tanggal'].isna().sum()}")

# 6. Check for negative or unrealistic pollutant values
print("\nNegative pollutant values:")
for col in numeric_columns:
    neg_count = (df_combined[col] < 0).sum()
    if neg_count > 0:
        print(f"  {col}: {neg_count}")

=== Data Quality Check ===

Duplicate IDs: 345
                  ID    tanggal stasiun kategori
420  2016-01-12_DKI4 2016-01-12    DKI4   SEDANG
421  2016-01-12_DKI3 2016-01-12    DKI3     BAIK
423  2016-01-12_DKI3 2016-01-12    DKI3   SEDANG
424  2016-01-12_DKI2 2016-01-12    DKI2   SEDANG
425  2016-01-12_DKI5 2016-01-12    DKI5   SEDANG

Missing values in critical columns:
ID          0
tanggal     0
stasiun     0
kategori    0
dtype: int64

Unique station IDs:
['DKI1', 'DKI2', 'DKI3', 'DKI4', 'DKI5']

Kategori distribution:
kategori
SEDANG                2938
BAIK                   911
TIDAK SEHAT            778
SANGAT TIDAK SEHAT      36
Name: count, dtype: int64

Invalid dates (NaT): 0

Negative pollutant values:
