# ISPU Data Cleanup for 2025

This notebook cleans up the 2025 ISPU (Air Quality Index) data from Jakarta.

In [1]:
import pandas as pd
import numpy as np

# Load the 2025 ISPU data
df = pd.read_csv('data/ISPU/data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-komponen-data-2025.csv')

print("Original shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nColumns:", df.columns.tolist())
print("\nData types:")
print(df.dtypes)

Original shape: (1215, 13)

First few rows:
   periode_data  bulan  tanggal           stasiun  pm_sepuluh  pm_duakomalima  \
0        202504      4       24  DKI5 Kebon Jeruk        16.0            50.0   
1        202504      4       25  DKI5 Kebon Jeruk        23.0            81.0   
2        202504      4       26  DKI5 Kebon Jeruk        30.0            77.0   
3        202504      4       27  DKI5 Kebon Jeruk        31.0            85.0   
4        202504      4       28  DKI5 Kebon Jeruk        24.0            78.0   

   sulfur_dioksida  karbon_monoksida  ozon  nitrogen_dioksida   max  \
0             18.0               3.0  16.0                4.0  50.0   
1             19.0               9.0  24.0                3.0  81.0   
2             20.0              11.0  29.0                3.0  77.0   
3             21.0              12.0  47.0                4.0  85.0   
4             20.0               9.0  37.0                4.0  78.0   

  parameter_pencemar_kritis kategori  
0  

In [2]:
# Step 1: Drop rows where kategori == "TIDAK ADA DATA"
df = df[df['kategori'] != 'TIDAK ADA DATA'].copy()
print(f"After removing 'TIDAK ADA DATA': {df.shape[0]} rows")

# Step 2: Extract only the DKIx code from stasiun column
df['stasiun'] = df['stasiun'].str.extract(r'(DKI\d)')
print("\nUnique stations:", df['stasiun'].unique())

# Step 3: Create proper date column from periode_data and tanggal
# periode_data is YYYYMM format (e.g., 202504 for April 2025)
# tanggal is the day (e.g., 24 for day 24)
df['year'] = df['periode_data'] // 100  # 202504 // 100 = 2025
df['month'] = df['periode_data'] % 100  # 202504 % 100 = 4
df['day'] = df['tanggal'].astype(int)

# Create date using pd.to_datetime with year/month/day columns
date_df = df[['year', 'month', 'day']].copy()
date_df.columns = ['year', 'month', 'day']
df['date'] = pd.to_datetime(date_df, errors='coerce')

print("\nDate range:", df['date'].min(), "to", df['date'].max())
print(df[['periode_data', 'bulan', 'tanggal', 'date', 'stasiun']].head(10))

After removing 'TIDAK ADA DATA': 1203 rows

Unique stations: <StringArray>
['DKI5', 'DKI1', 'DKI2', 'DKI3', 'DKI4']
Length: 5, dtype: str

Date range: 2025-01-01 00:00:00 to 2025-08-31 00:00:00
   periode_data  bulan  tanggal       date stasiun
0        202504      4       24 2025-04-24    DKI5
1        202504      4       25 2025-04-25    DKI5
2        202504      4       26 2025-04-26    DKI5
3        202504      4       27 2025-04-27    DKI5
4        202504      4       28 2025-04-28    DKI5
5        202504      4       29 2025-04-29    DKI5
6        202504      4       30 2025-04-30    DKI5
7        202505      5        1 2025-05-01    DKI1
8        202505      5        2 2025-05-02    DKI1
9        202505      5        3 2025-05-03    DKI1


In [3]:
# Step 4: Sort temporally
df = df.sort_values('date').reset_index(drop=True)
print(f"Sorted by date: {df.shape[0]} rows")

# Step 5: Group by stasiun and fill NULL values using ffill()
pollution_cols = ['pm_sepuluh', 'pm_duakomalima', 'sulfur_dioksida', 'karbon_monoksida', 
                  'ozon', 'nitrogen_dioksida', 'max']

df[pollution_cols] = df.groupby('stasiun')[pollution_cols].ffill()
print(f"\nNull values after ffill():\n{df[pollution_cols].isnull().sum()}")

# Step 6: Format date column to %Y-%m-%d
df['tanggal'] = df['date'].dt.strftime('%Y-%m-%d')
print("\nFormatted dates (first 5):")
print(df[['date', 'tanggal']].head())

# Step 7: Create ID column as YYYY-MM-DD_DKIx
df['ID'] = df['tanggal'] + '_' + df['stasiun']
print("\nNew ID column (first 5):")
print(df['ID'].head())

Sorted by date: 1203 rows

Null values after ffill():
pm_sepuluh           6
pm_duakomalima       0
sulfur_dioksida      3
karbon_monoksida     0
ozon                 0
nitrogen_dioksida    0
max                  0
dtype: int64

Formatted dates (first 5):
        date     tanggal
0 2025-01-01  2025-01-01
1 2025-01-01  2025-01-01
2 2025-01-01  2025-01-01
3 2025-01-01  2025-01-01
4 2025-01-01  2025-01-01

New ID column (first 5):
0    2025-01-01_DKI1
1    2025-01-01_DKI2
2    2025-01-01_DKI3
3    2025-01-01_DKI4
4    2025-01-01_DKI5
Name: ID, dtype: str


In [4]:
# Step 8: Select and reorder columns to match ISPU_2010-2024.csv structure
final_df = df[[
    'ID', 'periode_data', 'tanggal', 'stasiun', 
    'pm_sepuluh', 'sulfur_dioksida', 'karbon_monoksida',
    'ozon', 'nitrogen_dioksida', 'max', 'parameter_pencemar_kritis', 'kategori'
]].copy()

print("Final cleaned data shape:", final_df.shape)
print("\nFinal data (first 10 rows):")
print(final_df.head(10))

print("\n\nFinal data (last 10 rows):")
print(final_df.tail(10))

print("\n\nData summary:")
print(f"Date range: {final_df['tanggal'].min()} to {final_df['tanggal'].max()}")
print(f"Unique stations: {final_df['stasiun'].unique()}")
print(f"Total rows: {len(final_df)}")

Final cleaned data shape: (1203, 12)

Final data (first 10 rows):
                ID  periode_data     tanggal stasiun  pm_sepuluh  \
0  2025-01-01_DKI1        202501  2025-01-01    DKI1        46.0   
1  2025-01-01_DKI2        202501  2025-01-01    DKI2        48.0   
2  2025-01-01_DKI3        202501  2025-01-01    DKI3        49.0   
3  2025-01-01_DKI4        202501  2025-01-01    DKI4        46.0   
4  2025-01-01_DKI5        202501  2025-01-01    DKI5         NaN   
5  2025-01-02_DKI2        202501  2025-01-02    DKI2        29.0   
6  2025-01-02_DKI3        202501  2025-01-02    DKI3        28.0   
7  2025-01-02_DKI4        202501  2025-01-02    DKI4        31.0   
8  2025-01-02_DKI1        202501  2025-01-02    DKI1        30.0   
9  2025-01-02_DKI5        202501  2025-01-02    DKI5         NaN   

   sulfur_dioksida  karbon_monoksida  ozon  nitrogen_dioksida   max  \
0             12.0              22.0  10.0               16.0  63.0   
1             25.0              13.0  12.0 

In [8]:
# Optional: Save the cleaned data to a new CSV file
# Uncomment the line below to save
# final_df.to_csv('data/ISPU/ISPU_2025_cleaned.csv', index=False)

# Or append to the main ISPU file
# You can use this to combine with the historical data
print("To save the cleaned data, uncomment the lines above.")

To save the cleaned data, uncomment the lines above.


In [5]:
# Step 9: Check compatibility with ISPU_2010-2024.csv and merge
historical_df = pd.read_csv('ISPU_2010-2024.csv')

print("=== COMPATIBILITY CHECK ===")
print(f"\nHistorical data shape: {historical_df.shape}")
print(f"New 2025 data shape: {final_df.shape}")

print("\nHistorical columns:", historical_df.columns.tolist())
print("New 2025 columns:", final_df.columns.tolist())

print("\nColumn differences:")
hist_cols = set(historical_df.columns)
new_cols = set(final_df.columns)
print(f"  - In historical but not in 2025: {hist_cols - new_cols}")
print(f"  - In 2025 but not in historical: {new_cols - hist_cols}")

# Check for duplicate rows (by ID)
print("\n=== CHECKING FOR DUPLICATES ===")
hist_ids = set(historical_df['ID'].unique())
new_ids = set(final_df['ID'].unique())
overlapping_ids = hist_ids & new_ids
print(f"Overlapping IDs: {len(overlapping_ids)}")
if len(overlapping_ids) > 0:
    print("Sample overlapping IDs:", list(overlapping_ids)[:5])

# Check historical data types
print("\nHistorical data types:")
print(historical_df.dtypes)
print("\n2025 data types:")
print(final_df.dtypes)

=== COMPATIBILITY CHECK ===

Historical data shape: (14208, 12)
New 2025 data shape: (1203, 12)

Historical columns: ['ID', 'periode_data', 'tanggal', 'stasiun', 'pm_sepuluh', 'sulfur_dioksida', 'karbon_monoksida', 'ozon', 'nitrogen_dioksida', 'max', 'parameter_pencemar_kritis', 'kategori']
New 2025 columns: ['ID', 'periode_data', 'tanggal', 'stasiun', 'pm_sepuluh', 'sulfur_dioksida', 'karbon_monoksida', 'ozon', 'nitrogen_dioksida', 'max', 'parameter_pencemar_kritis', 'kategori']

Column differences:
  - In historical but not in 2025: set()
  - In 2025 but not in historical: set()

=== CHECKING FOR DUPLICATES ===
Overlapping IDs: 0

Historical data types:
ID                               str
periode_data                   int64
tanggal                          str
stasiun                          str
pm_sepuluh                   float64
sulfur_dioksida              float64
karbon_monoksida             float64
ozon                         float64
nitrogen_dioksida            float64
max

In [6]:
# Step 10: Merge datasets to create 2010-2025 complete dataset
# Drop pm_duakomalima column from 2025 data to match historical data structure
if 'pm_duakomalima' in final_df.columns:
    final_df = final_df.drop('pm_duakomalima', axis=1)

# Reorder columns to match final_df column order
columns_order = ['ID', 'periode_data', 'tanggal', 'stasiun', 'pm_sepuluh', 
                 'sulfur_dioksida', 'karbon_monoksida', 'ozon', 'nitrogen_dioksida', 
                 'max', 'parameter_pencemar_kritis', 'kategori']

historical_df = historical_df[columns_order]
final_df_reordered = final_df[columns_order]

# Verify columns match
print("Historical columns (after reordering):", historical_df.columns.tolist())
print("2025 columns:", final_df_reordered.columns.tolist())
print("Columns match:", historical_df.columns.tolist() == final_df_reordered.columns.tolist())

# Merge the datasets
combined_df = pd.concat([historical_df, final_df_reordered], ignore_index=True)
print(f"\n=== MERGE RESULT ===")
print(f"Historical rows: {len(historical_df)}")
print(f"2025 rows: {len(final_df_reordered)}")
print(f"Combined rows: {len(combined_df)}")

# Sort by date and station for consistency
combined_df['tanggal_datetime'] = pd.to_datetime(combined_df['tanggal'])
combined_df = combined_df.sort_values(['tanggal_datetime', 'stasiun']).reset_index(drop=True)
combined_df = combined_df.drop('tanggal_datetime', axis=1)

print(f"\nCombined data date range: {combined_df['tanggal'].min()} to {combined_df['tanggal'].max()}")
print(f"Total unique stations: {combined_df['stasiun'].nunique()}")
print(f"Stations: {sorted(combined_df['stasiun'].unique())}")

print("\nFirst few rows:")
print(combined_df.head(10))

print("\nLast few rows:")
print(combined_df.tail(10))

Historical columns (after reordering): ['ID', 'periode_data', 'tanggal', 'stasiun', 'pm_sepuluh', 'sulfur_dioksida', 'karbon_monoksida', 'ozon', 'nitrogen_dioksida', 'max', 'parameter_pencemar_kritis', 'kategori']
2025 columns: ['ID', 'periode_data', 'tanggal', 'stasiun', 'pm_sepuluh', 'sulfur_dioksida', 'karbon_monoksida', 'ozon', 'nitrogen_dioksida', 'max', 'parameter_pencemar_kritis', 'kategori']
Columns match: True

=== MERGE RESULT ===
Historical rows: 14208
2025 rows: 1203
Combined rows: 15411

Combined data date range: 2010-01-01 to 2025-08-31
Total unique stations: 5
Stations: ['DKI1', 'DKI2', 'DKI3', 'DKI4', 'DKI5']

First few rows:
                ID  periode_data     tanggal stasiun  pm_sepuluh  \
0  2010-01-01_DKI1        201001  2010-01-01    DKI1        60.0   
1  2010-01-02_DKI1        201001  2010-01-02    DKI1        32.0   
2  2010-01-03_DKI1        201001  2010-01-03    DKI1        27.0   
3  2010-01-04_DKI1        201001  2010-01-04    DKI1        22.0   
4  2010-01

In [7]:
# Step 11: Check and clean any data quality issues before saving
print("=== DATA QUALITY CHECK ===")

# Check for NaN in tanggal
nan_tanggal = combined_df[combined_df['tanggal'].isna()]
print(f"\nRows with NaN tanggal: {len(nan_tanggal)}")
if len(nan_tanggal) > 0:
    print("\nSample rows with NaN tanggal:")
    print(nan_tanggal[['ID', 'tanggal', 'stasiun', 'periode_data']].head(10))

# Check the end of the dataset after sorting
print("\n\nData sorted correctly by date?")
combined_df['tanggal_temp'] = pd.to_datetime(combined_df['tanggal'], errors='coerce')
print(f"First date: {combined_df['tanggal_temp'].min()}")
print(f"Last date: {combined_df['tanggal_temp'].max()}")

# Check if there are any data quality issues
print(f"\nNull values per column:")
print(combined_df.isnull().sum())

print(f"\nDataset summary:")
print(f"  Total rows: {len(combined_df)}")
print(f"  Date range: {combined_df['tanggal_temp'].min().date()} to {combined_df['tanggal_temp'].max().date()}")
print(f"  Columns: {list(combined_df.columns)}")

=== DATA QUALITY CHECK ===

Rows with NaN tanggal: 30

Sample rows with NaN tanggal:
                   ID tanggal stasiun  periode_data
15381   9/4/2013_DKI2     NaN    DKI2        201309
15382   9/2/2013_DKI3     NaN    DKI3        201309
15383   9/8/2013_DKI3     NaN    DKI3        201309
15384   9/9/2013_DKI3     NaN    DKI3        201309
15385  9/10/2013_DKI3     NaN    DKI3        201309
15386  9/11/2013_DKI3     NaN    DKI3        201309
15387  9/13/2013_DKI3     NaN    DKI3        201309
15388  9/15/2013_DKI3     NaN    DKI3        201309
15389  9/20/2013_DKI3     NaN    DKI3        201309
15390  9/22/2013_DKI3     NaN    DKI3        201309


Data sorted correctly by date?
First date: 2010-01-01 00:00:00
Last date: 2025-08-31 00:00:00

Null values per column:
ID                             0
periode_data                   0
tanggal                       30
stasiun                        0
pm_sepuluh                   118
sulfur_dioksida               66
karbon_monoksida        

In [8]:
# Step 12: Final cleanup and save the combined 2010-2025 dataset
# Remove the temporary column if it exists
if 'tanggal_temp' in combined_df.columns:
    combined_df = combined_df.drop('tanggal_temp', axis=1)

# Remove rows with NaN tanggal (30 rows from original 2013 data)
combined_df_clean = combined_df[combined_df['tanggal'].notna()].copy()

print("=== FINAL DATASET ===")
print(f"Total rows after removing NaN tanggal: {len(combined_df_clean)}")
print(f"Rows removed: {len(combined_df) - len(combined_df_clean)}")

print(f"\nFinal date range: {combined_df_clean['tanggal'].min()} to {combined_df_clean['tanggal'].max()}")
print(f"Total unique stations: {combined_df_clean['stasiun'].nunique()}")
print(f"Stations: {sorted(combined_df_clean['stasiun'].unique())}")

print(f"\nColumn breakdown:")
for col in combined_df_clean.columns:
    null_count = combined_df_clean[col].isnull().sum()
    null_pct = (null_count / len(combined_df_clean)) * 100
    print(f"  {col}: {null_count:,} nulls ({null_pct:.1f}%)")

print(f"\nFinal rows by year:")
combined_df_clean['year'] = combined_df_clean['tanggal'].str[:4]
print(combined_df_clean['year'].value_counts().sort_index())

# Save the combined dataset
output_path = 'ISPU_2010-2025_combined.csv'
combined_df_clean.to_csv(output_path, index=False)
print(f"\n✓ Combined dataset saved to: {output_path}")
print(f"  File size: {combined_df_clean.memory_usage(deep=True).sum() / 1024:.1f} KB")

# Display sample from different years
print("\n\nSample from 2010:")
print(combined_df_clean[combined_df_clean['year'] == '2010'].head(3).to_string())

print("\n\nSample from 2024:")
print(combined_df_clean[combined_df_clean['year'] == '2024'].tail(3).to_string())

print("\n\nSample from 2025:")
print(combined_df_clean[combined_df_clean['year'] == '2025'].head(3).to_string())

=== FINAL DATASET ===
Total rows after removing NaN tanggal: 15381
Rows removed: 30

Final date range: 2010-01-01 to 2025-08-31
Total unique stations: 5
Stations: ['DKI1', 'DKI2', 'DKI3', 'DKI4', 'DKI5']

Column breakdown:
  ID: 0 nulls (0.0%)
  periode_data: 0 nulls (0.0%)
  tanggal: 0 nulls (0.0%)
  stasiun: 0 nulls (0.0%)
  pm_sepuluh: 118 nulls (0.8%)
  sulfur_dioksida: 66 nulls (0.4%)
  karbon_monoksida: 23 nulls (0.1%)
  ozon: 133 nulls (0.9%)
  nitrogen_dioksida: 64 nulls (0.4%)
  max: 0 nulls (0.0%)
  parameter_pencemar_kritis: 4 nulls (0.0%)
  kategori: 0 nulls (0.0%)

Final rows by year:
year
2010     523
2011     365
2012     366
2013     335
2014    1791
2015     365
2016    1793
2017    1795
2018     365
2019     345
2020     362
2021    1809
2022     491
2023    1649
2024    1824
2025    1203
Name: count, dtype: int64

✓ Combined dataset saved to: ISPU_2010-2025_combined.csv
  File size: 5900.8 KB


Sample from 2010:
                ID  periode_data     tanggal stasiun  p