In [2]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

In [3]:
from src.data.read_file import load_params, read_dataset
from sklearn.model_selection import train_test_split
import joblib
import pandas as pd


In [17]:
df = pd.DataFrame({
      'feature1': [1, 2, 3, 4, 5],
      'feature2': [5, 4, 3, 2, 1],
      'target': [0, 1, 0, 1, 0]
  })
X = df[['feature1', 'feature2']]
y = df['target']
# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)
print("X_train:\n", X_train)
print("X_test:\n", X_test)
print("y_train:\n", y_train)
print("y_test:\n", y_test)

X_train:
    feature1  feature2
1         2         4
2         3         3
4         5         1
3         4         2
X_test:
    feature1  feature2
0         1         5
y_train:
 1    1
2    0
4    0
3    1
Name: target, dtype: int64
y_test:
 0    0
Name: target, dtype: int64


**[ DATA PREPARATION ]**

In [18]:
params_dir = "../config/config.yaml"
params = load_params(params_dir)
params

{'dataset_dir': 'data/raw/',
 'datetime_columns': ['tanggal'],
 'int32_columns': ['pm10', 'pm25', 'so2', 'co', 'o3', 'no2', 'max'],
 'label': 'categori',
 'label_categories': ['BAIK', 'SEDANG', 'TIDAK SEHAT'],
 'label_categories_new': ['BAIK', 'TIDAK BAIK'],
 'missing_value_co': 11,
 'missing_value_no2': 18,
 'missing_value_o3': 29,
 'missing_value_pm10': {'BAIK': 28, 'TIDAK BAIK': 55},
 'missing_value_pm25': {'BAIK': 38, 'TIDAK BAIK': 82},
 'missing_value_so2': 35,
 'object_columns': ['stasiun', 'critical', 'categori'],
 'predictors': ['stasiun', 'pm10', 'pm25', 'so2', 'co', 'o3', 'no2'],
 'range_co': [-1, 100],
 'range_no2': [-1, 100],
 'range_o3': [-1, 160],
 'range_pm10': [-1, 800],
 'range_pm25': [-1, 400],
 'range_so2': [-1, 500],
 'range_stasiun': ['DKI1 (Bunderan HI)',
  'DKI2 (Kelapa Gading)',
  'DKI3 (Jagakarsa)',
  'DKI4 (Lubang Buaya)',
  'DKI5 (Kebon Jeruk) Jakarta Barat']}

In [19]:
dataset = read_dataset('../' + params["dataset_dir"])
dataset.head(4)

100%|██████████| 12/12 [00:00<00:00, 248.41it/s]


Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori
0,2021-09-01,DKI1 (Bunderan HI),63,88,29,15,24,38,88,PM25,SEDANG
1,2021-09-02,DKI1 (Bunderan HI),60,83,29,11,30,28,83,PM25,SEDANG
2,2021-09-03,DKI1 (Bunderan HI),60,82,27,11,37,30,82,PM25,SEDANG
3,2021-09-04,DKI1 (Bunderan HI),58,77,26,10,31,28,77,PM25,SEDANG


In [20]:
dataset.tanggal = pd.to_datetime(dataset.tanggal)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1830 entries, 0 to 1829
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   tanggal   1830 non-null   datetime64[ns]
 1   stasiun   1830 non-null   object        
 2   pm10      1830 non-null   object        
 3   pm25      1768 non-null   object        
 4   so2       1830 non-null   object        
 5   co        1830 non-null   object        
 6   o3        1830 non-null   object        
 7   no2       1830 non-null   object        
 8   max       1830 non-null   object        
 9   critical  1813 non-null   object        
 10  categori  1829 non-null   object        
dtypes: datetime64[ns](1), object(10)
memory usage: 157.4+ KB


In [21]:
dataset_int = dataset[params["int32_columns"]]
dataset_int.head(3)

Unnamed: 0,pm10,pm25,so2,co,o3,no2,max
0,63,88,29,15,24,38,88
1,60,83,29,11,30,28,83
2,60,82,27,11,37,30,82


In [40]:
# change datatype "pm10" to int
dataset.pm10[dataset.pm10 == '---'].value_counts()
dataset["pm10"] = dataset["pm10"].replace("---", -1).astype(int)
dataset.head(2)

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori
0,2021-09-01,DKI1 (Bunderan HI),63,88,29,15,24,38,88,PM25,SEDANG
1,2021-09-02,DKI1 (Bunderan HI),60,83,29,11,30,28,83,PM25,SEDANG


In [None]:
# change datatype "pm25" to int
dataset.pm25.fillna(-1, inplace = True) # Because there are missing values
dataset.pm25[dataset.pm25 == '---'].value_counts()
dataset["pm25"] = dataset["pm25"].replace("---", -1).astype(int)
dataset.head(2)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset.pm25.fillna(-1, inplace = True)


Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori
0,2021-09-01,DKI1 (Bunderan HI),63,88,29,15,24,38,88,PM25,SEDANG
1,2021-09-02,DKI1 (Bunderan HI),60,83,29,11,30,28,83,PM25,SEDANG


In [None]:
# change datatype "so2" to int
dataset.so2[dataset.so2 == '---'].value_counts()
dataset["so2"] = dataset["so2"].replace("---", -1).astype(int)
dataset.head(2)

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori
0,2021-09-01,DKI1 (Bunderan HI),63,88,29,15,24,38,88,PM25,SEDANG
1,2021-09-02,DKI1 (Bunderan HI),60,83,29,11,30,28,83,PM25,SEDANG


In [46]:
# change datatype "co" to int
dataset.co.isna().sum()
dataset.co[dataset.co == '---'].value_counts()
dataset["co"] = dataset["co"].replace("---", -1).astype(int)
dataset.head(2)

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori
0,2021-09-01,DKI1 (Bunderan HI),63,88,29,15,24,38,88,PM25,SEDANG
1,2021-09-02,DKI1 (Bunderan HI),60,83,29,11,30,28,83,PM25,SEDANG


In [49]:
# change datatype "o3" to int
dataset.o3.isna().sum()
dataset.o3[dataset.o3 == '---'].value_counts()
dataset["o3"] = dataset["o3"].replace("---", -1).astype(int)
dataset.head(2)

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori
0,2021-09-01,DKI1 (Bunderan HI),63,88,29,15,24,38,88,PM25,SEDANG
1,2021-09-02,DKI1 (Bunderan HI),60,83,29,11,30,28,83,PM25,SEDANG


In [50]:
# change datatype "no2" to int
dataset.no2[dataset.no2 == '---'].value_counts()
dataset["no2"] = dataset["no2"].replace("---", -1).astype(int)
dataset.head(2)

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori
0,2021-09-01,DKI1 (Bunderan HI),63,88,29,15,24,38,88,PM25,SEDANG
1,2021-09-02,DKI1 (Bunderan HI),60,83,29,11,30,28,83,PM25,SEDANG


In [60]:
# change datatype "max" to int
dataset["max"].isna().sum()
dataset[dataset["max"] == "PM25"]
# TRANSFORMATION
dataset.loc[1372, "max"] = 49
dataset.loc[1372, "critical"] = "PM10"
dataset.loc[1372, "categori"] = "BAIK"
dataset.iloc[1372]
dataset["max"][dataset["max"] == '---'].value_counts()
dataset["max"] = dataset["max"].replace("---", -1).astype(int)
dataset.head(2)

# Sanity Check
# dataset[dataset["max"] == 'PM25']

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori
0,2021-09-01,DKI1 (Bunderan HI),63,88,29,15,24,38,88,PM25,SEDANG
1,2021-09-02,DKI1 (Bunderan HI),60,83,29,11,30,28,83,PM25,SEDANG


In [65]:
# check columns "categori"
dataset.categori.value_counts()
# Remove that contains "TIDAK ADA DATA" because that indicates there are empty value
dataset[dataset["categori"] == "TIDAK ADA DATA"]
# Drop it
dataset.drop(index = dataset[dataset.categori == "TIDAK ADA DATA"].index, inplace = True)
# Sanity Check
dataset[dataset.categori == "TIDAK ADA DATA"]

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori


In [66]:
dataset.describe()

Unnamed: 0,pm10,pm25,so2,co,o3,no2,max
count,1813.0,1813.0,1813.0,1813.0,1813.0,1813.0,1813.0
mean,51.199117,75.34749,33.357419,11.541644,31.14396,19.105902,78.68781
std,17.455164,29.299559,14.73883,5.101838,15.357934,9.187574,24.060186
min,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,17.0
25%,42.0,60.0,25.0,8.0,21.0,13.0,62.0
50%,54.0,77.0,33.0,11.0,28.0,18.0,77.0
75%,62.0,93.0,44.0,14.0,38.0,25.0,93.0
max,179.0,174.0,82.0,47.0,151.0,65.0,179.0


In [67]:
joblib.dump(dataset, "../data/processed/dataset.pkl")

['../data/processed/dataset.pkl']

In [68]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1813 entries, 0 to 1829
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   tanggal   1813 non-null   object
 1   stasiun   1813 non-null   object
 2   pm10      1813 non-null   int64 
 3   pm25      1813 non-null   int64 
 4   so2       1813 non-null   int64 
 5   co        1813 non-null   int64 
 6   o3        1813 non-null   int64 
 7   no2       1813 non-null   int64 
 8   max       1813 non-null   int64 
 9   critical  1813 non-null   object
 10  categori  1813 non-null   object
dtypes: int64(7), object(4)
memory usage: 170.0+ KB


**[ DATA DEFENSE ]**

In [69]:
def check_data(input_data, params):
    # check data types
    assert input_data.select_dtypes("datetime").columns.to_list() == params["datetime_columns"], "an error occurs in datetime column(s)."
    assert input_data.select_dtypes("object").columns.to_list() == params["object_columns"], "an error occurs in object column(s)."
    assert input_data.select_dtypes("int").columns.to_list() == params["int32_columns"], "an error occurs in int32 column(s)."

    # check range of data
    assert set(input_data.stasiun).issubset(set(params["range_stasiun"])), "an error occurs in stasiun range."
    assert input_data.pm10.between(params["range_pm10"][0], params["range_pm10"][1]).sum() == len(input_data), "an error occurs in pm10 range."
    assert input_data.pm25.between(params["range_pm25"][0], params["range_pm25"][1]).sum() == len(input_data), "an error occurs in pm25 range."
    assert input_data.so2.between(params["range_so2"][0], params["range_so2"][1]).sum() == len(input_data), "an error occurs in so2 range."
    assert input_data.co.between(params["range_co"][0], params["range_co"][1]).sum() == len(input_data), "an error occurs in co range."
    assert input_data.o3.between(params["range_o3"][0], params["range_o3"][1]).sum() == len(input_data), "an error occurs in o3 range."
    assert input_data.no2.between(params["range_no2"][0], params["range_no2"][1]).sum() == len(input_data), "an error occurs in no2 range."

In [73]:
check_dataset = check_data(dataset, params)

In [74]:
x = dataset[params["predictors"]].copy()
y = dataset.categori.copy()

In [None]:
x.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1813 entries, 0 to 1829
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   stasiun  1813 non-null   object
 1   pm10     1813 non-null   int64 
 2   pm25     1813 non-null   int64 
 3   so2      1813 non-null   int64 
 4   co       1813 non-null   int64 
 5   o3       1813 non-null   int64 
 6   no2      1813 non-null   int64 
dtypes: int64(6), object(1)
memory usage: 113.3+ KB


In [76]:
y.value_counts()

categori
SEDANG         1305
TIDAK SEHAT     319
BAIK            189
Name: count, dtype: int64

In [77]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42, stratify = y)

In [78]:
x_valid, x_test, y_valid, y_test = train_test_split(x_test, y_test, test_size = 0.5, random_state = 42, stratify = y_test)

In [80]:
joblib.dump(x_train, "../data/processed/x_train.pkl")
joblib.dump(y_train, "../data/processed/y_train.pkl")
joblib.dump(x_valid, "../data/processed/x_valid.pkl")
joblib.dump(y_valid, "../data/processed/y_valid.pkl")
joblib.dump(x_test, "../data/processed/x_test.pkl")
joblib.dump(y_test, "../data/processed/y_test.pkl")

['../data/processed/y_test.pkl']