# Extract.py

In [21]:
import pandas as pd

def load_csv(filename):
    """
    Load a CSV file into a pandas DataFrame.

    Parameters:
        filename (str): Path of the CSV file; passed unchanged to
            ``pd.read_csv``.

    Returns:
        pd.DataFrame: The parsed contents of the file, read with the
            default ``pd.read_csv`` options.

    Example:
        df = load_csv("c/user/titanic.csv")
    """
    return pd.read_csv(filename)


# Transform.py

In [None]:
import pandas as pd

# Mengubah tipe data ke datetime
def convert_datetime(df, columns, format="%Y-%m-%d %H:%M"):
    """
    Convert one or more columns of a DataFrame to datetime.

    The conversion is done in place: the columns of the DataFrame passed in
    are overwritten, and that same DataFrame object is returned.

    Parameters:
        df           : DataFrame holding the columns to convert
        columns      : A single column name (str) or a list of column names
        format       : Datetime format handed to ``pd.to_datetime``
                       (default "%Y-%m-%d %H:%M")

    Return:
        df           : The same DataFrame with the columns converted to datetime

    Example:
        df = convert_datetime(df, ["Measurement date"], "%Y-%m-%d %H:%M")
        or
        df = convert_datetime(df, "Measurement date")
    """
    # A bare string would otherwise be iterated character by character and
    # each character looked up as a column name.
    if isinstance(columns, str):
        columns = [columns]

    for column in columns:
        df[column] = pd.to_datetime(df[column], format=format)
    return df

# measurement = convert_datetime(measurement, ['Measurement date'], "%Y-%m-%d %H:%M")
# -----------------------------------------------------------------------------------------------

# Drop kolom
def drop_column(df, column_name):
    """
    Return a DataFrame without the given column(s).

    Parameters:
        df           : Source DataFrame
        column_name  : Column name, or list of column names, to drop

    Return:
        df           : Result of ``DataFrame.drop`` with the listed columns removed

    Example:
        df = drop_column(df, "Ticket_id")
        or
        df = drop_column(df, ["Ticket_id", "User_ID"])
    """
    trimmed = df.drop(columns=column_name)
    return trimmed

# measurement_2 = drop_column(measurement, "Instrument status")
# -----------------------------------------------------------------------------------------------

# Pivot_table
def pivot_and_rename(df):
    """
    Pivot measurements from long to wide format and rename the item columns.

    Rows are keyed by 'Measurement date' and 'Station code'; every distinct
    'Item code' becomes its own column filled from 'Average value'.

    Parameters:
        df : DataFrame with the columns 'Measurement date', 'Station code',
             'Item code' and 'Average value'

    Returns:
        df_pivoted : Wide DataFrame with the key columns as regular columns
                     and the item-code columns renamed per the mapping below

    Example:
        measurement_pivoted = pivot_and_rename(measurement_2)
    """
    # Item code (as string) -> measurement item name
    item_names = {
        '1': 'SO2',
        '3': 'NO2',
        '5': 'CO',
        '6': 'O3',
        '8': 'PM10',
        '9': 'PM2.5',
    }

    wide = df.pivot_table(
        index=['Measurement date', 'Station code'],
        columns='Item code',
        values='Average value',
    )

    # Turn the item-code labels into plain strings so they match the keys of
    # the mapping, then move the index levels back into regular columns.
    wide.columns = [str(code) for code in wide.columns]
    return wide.reset_index().rename(columns=item_names)

#measurement_pivoted = pivot_and_rename(measurement_2)
# -----------------------------------------------------------------------------------------------

# Create date table
def generate_dim_date(start, end, freq):
    """
    Build a date dimension table covering a time range.

    Parameters:
        start : First timestamp of the range (e.g. 'YYYY-MM-DD').
        end   : Last timestamp of the range (e.g. 'YYYY-MM-DD').
        freq  : Frequency passed to ``pd.date_range``. May be an alias string
                or an offset object; hourly can be written e.g. 'h', '1h' or
                ``pd.offsets.Hour()``. The uppercase alias 'H' is deprecated
                in pandas >= 2.2.

    Returns:
        dim_date (DataFrame): One row per timestamp, with the columns
            'timetable', 'year', 'month', 'day', 'dayofweek' (Monday = 0),
            'weekofyear' (ISO week number) and 'hour'. 'hour' is filled only
            when the frequency is exactly one hour; otherwise it is None.

    Example:
        dim_date = generate_dim_date(start='2017-01-01', end='2022-01-01', freq='h')
    """
    date_range = pd.date_range(start=start, end=end, freq=freq)
    dim_date = pd.DataFrame({'timetable': date_range})

    # Derive the calendar attributes from the timetable column.
    stamps = dim_date['timetable'].dt
    dim_date['year'] = stamps.year
    dim_date['month'] = stamps.month
    dim_date['day'] = stamps.day
    dim_date['dayofweek'] = stamps.dayofweek
    dim_date['weekofyear'] = stamps.isocalendar().week

    # Compare the offset pandas resolved for the range instead of the raw
    # `freq` argument: this works for offset objects (which have no .upper())
    # and for any spelling of a one-hour step.
    is_hourly = date_range.freq == pd.offsets.Hour()
    dim_date['hour'] = stamps.hour if is_hourly else None

    return dim_date

#dim_date = generate_dim_date(start='2017-01-01', end='2022-01-01', freq='H')
# -----------------------------------------------------------------------------------------------

--------------------------------------------------------
# Transform (raw)

In [63]:
# Load the raw measurement CSV from a machine-specific local path.
measurement = load_csv(r"C:\Users\fahri\Downloads\Measurement_info.csv") # local directory (fahrizan), please change if you run locally
# Show 3 randomly sampled rows as a quick look at the data.
measurement.sample(3)

Unnamed: 0,Measurement date,Station code,Item code,Average value,Instrument status
2795089,2019-02-16 10:00,101,5,0.9,0
2941807,2019-04-07 10:00,117,9,18.0,0
544632,2017-06-01 06:00,123,1,0.005,0


### Convert 'Measurement date' from object to datetime

In [23]:
# Parse 'Measurement date' to datetime. No explicit format is passed, so
# pandas infers it from the values.
measurement['Measurement date'] = pd.to_datetime(measurement['Measurement date'])

In [24]:
# Print the column dtypes and memory usage to check the conversion result.
measurement.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3885066 entries, 0 to 3885065
Data columns (total 5 columns):
 #   Column             Dtype         
---  ------             -----         
 0   Measurement date   datetime64[ns]
 1   Station code       int64         
 2   Item code          int64         
 3   Average value      float64       
 4   Instrument status  int64         
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 148.2 MB


### drop instrument status

In [25]:
# Remove the 'Instrument status' column; the result is stored under a new
# name, so `measurement` keeps the column.
measurement_2 = measurement.drop("Instrument status", axis='columns')
# Display the result.
measurement_2

Unnamed: 0,Measurement date,Station code,Item code,Average value
0,2017-01-01 00:00:00,101,1,0.004
1,2017-01-01 00:00:00,101,3,0.059
2,2017-01-01 00:00:00,101,5,1.200
3,2017-01-01 00:00:00,101,6,0.002
4,2017-01-01 00:00:00,101,8,73.000
...,...,...,...,...
3885061,2019-12-31 23:00:00,123,9,13.000
3885062,2019-12-31 23:00:00,118,9,24.000
3885063,2019-12-31 23:00:00,105,8,19.000
3885064,2019-12-31 23:00:00,125,3,0.037


### pivot table

In [26]:
# Reshape from long to wide: one row per ('Measurement date', 'Station code')
# and one column per 'Item code', filled from 'Average value'. No aggfunc is
# given, so pivot_table's default (mean) is applied to duplicate keys.
measurement_pivoted = pd.pivot_table(measurement_2,
                                     index=['Measurement date', 'Station code'],
                                     columns='Item code',
                                     values='Average value')

In [27]:
# Display the pivoted table (keys are still in the index at this point).
measurement_pivoted

Unnamed: 0_level_0,Item code,1,3,5,6,8,9
Measurement date,Station code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-01-01 00:00:00,101,0.004,0.059,1.2,0.002,73.0,57.0
2017-01-01 00:00:00,102,0.006,0.068,1.3,0.002,77.0,63.0
2017-01-01 00:00:00,103,0.005,0.039,1.4,0.002,70.0,68.0
2017-01-01 00:00:00,104,0.005,0.045,0.6,0.003,73.0,46.0
2017-01-01 00:00:00,105,0.005,0.044,1.0,0.004,81.0,44.0
...,...,...,...,...,...,...,...
2019-12-31 23:00:00,121,0.003,0.042,0.5,0.013,29.0,16.0
2019-12-31 23:00:00,122,0.003,0.047,0.4,0.007,24.0,15.0
2019-12-31 23:00:00,123,0.003,0.039,0.5,0.003,19.0,13.0
2019-12-31 23:00:00,124,0.003,0.035,0.5,0.004,20.0,13.0


In [28]:
# Replace the item-code column labels with their string form.
measurement_pivoted.columns = [f'{a}' for a in measurement_pivoted.columns]

In [29]:
# Move the index levels ('Measurement date', 'Station code') back into
# regular columns.
measurement_pivoted = measurement_pivoted.reset_index()

In [None]:
# Rename the item-code columns (string labels) to measurement item names.
measurement_pivoted = measurement_pivoted.rename(columns={'1': 'SO2',
                                                          '3': 'NO2',
                                                          '5': 'CO',
                                                          '6': 'O3',
                                                          '8': 'PM10',
                                                          '9': 'PM2.5'})

In [31]:
# Display the final wide table with the renamed columns.
measurement_pivoted

Unnamed: 0,Measurement date,Station code,SO2,NO2,CO,O3,PM10,PM2.5
0,2017-01-01 00:00:00,101,0.004,0.059,1.2,0.002,73.0,57.0
1,2017-01-01 00:00:00,102,0.006,0.068,1.3,0.002,77.0,63.0
2,2017-01-01 00:00:00,103,0.005,0.039,1.4,0.002,70.0,68.0
3,2017-01-01 00:00:00,104,0.005,0.045,0.6,0.003,73.0,46.0
4,2017-01-01 00:00:00,105,0.005,0.044,1.0,0.004,81.0,44.0
...,...,...,...,...,...,...,...,...
647506,2019-12-31 23:00:00,121,0.003,0.042,0.5,0.013,29.0,16.0
647507,2019-12-31 23:00:00,122,0.003,0.047,0.4,0.007,24.0,15.0
647508,2019-12-31 23:00:00,123,0.003,0.039,0.5,0.003,19.0,13.0
647509,2019-12-31 23:00:00,124,0.003,0.035,0.5,0.004,20.0,13.0


### dim date tables

In [40]:
# Build an hourly DatetimeIndex from 2017-01-01 through 2022-01-01.
date_range = pd.date_range(start='1/1/2017', end='1/1/2022', freq='H')
# Display the index.
date_range

DatetimeIndex(['2017-01-01 00:00:00', '2017-01-01 01:00:00',
               '2017-01-01 02:00:00', '2017-01-01 03:00:00',
               '2017-01-01 04:00:00', '2017-01-01 05:00:00',
               '2017-01-01 06:00:00', '2017-01-01 07:00:00',
               '2017-01-01 08:00:00', '2017-01-01 09:00:00',
               ...
               '2021-12-31 15:00:00', '2021-12-31 16:00:00',
               '2021-12-31 17:00:00', '2021-12-31 18:00:00',
               '2021-12-31 19:00:00', '2021-12-31 20:00:00',
               '2021-12-31 21:00:00', '2021-12-31 22:00:00',
               '2021-12-31 23:00:00', '2022-01-01 00:00:00'],
              dtype='datetime64[ns]', length=43825, freq='H')

In [None]:
# Wrap the range in a DataFrame with a single 'timetable' column.
dim_date = pd.DataFrame({'timetable':date_range})
# Display the frame.
dim_date

Unnamed: 0,timetable
0,2017-01-01 00:00:00
1,2017-01-01 01:00:00
2,2017-01-01 02:00:00
3,2017-01-01 03:00:00
4,2017-01-01 04:00:00
...,...
43820,2021-12-31 20:00:00
43821,2021-12-31 21:00:00
43822,2021-12-31 22:00:00
43823,2021-12-31 23:00:00


In [None]:
# Derive calendar attributes from the 'timetable' column.
dim_date['year'] = dim_date.timetable.dt.year
dim_date['month'] = dim_date.timetable.dt.month
dim_date['day'] = dim_date.timetable.dt.day
# Day of week as an integer, Monday = 0 ... Sunday = 6.
dim_date['dayofweek'] = dim_date.timetable.dt.dayofweek
# ISO calendar week number.
dim_date['weekofyear'] = dim_date.timetable.dt.isocalendar().week
dim_date['hour'] = dim_date.timetable.dt.hour
# Preview the first rows.
dim_date.head()

Unnamed: 0,timetable,year,month,day,dayofweek,weekofyear,hour
0,2017-01-01 00:00:00,2017,1,1,6,52,0
1,2017-01-01 01:00:00,2017,1,1,6,52,1
2,2017-01-01 02:00:00,2017,1,1,6,52,2
3,2017-01-01 03:00:00,2017,1,1,6,52,3
4,2017-01-01 04:00:00,2017,1,1,6,52,4


# References:


- https://docs.vultr.com/python/third-party/pandas/date_range

- https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sample.html

- https://www.datacamp.com/tutorial/git-reset-revert-tutorial

- 