In [1]:
pip install psycopg2-binary

Note: you may need to restart the kernel to use updated packages.


In [73]:
import psycopg2
import pandas as pd
from sqlalchemy import create_engine

In [74]:
engine = create_engine(f"postgresql://neondb_owner:npg_f1uYCZ8nRWzQ@ep-floral-lake-a533zre9-pooler.us-east-2.aws.neon.tech/Air-Pollution-Seoul?sslmode=require")

# Extract

In [75]:
# EXTRACT.PY
def get_data_from_db(query):
    '''
    Fungsi ini ditujukan untuk mengambil file dari SQL cloud untuk selanjutnya di load ke pandas

    Parameters:
        query       : SQL query (engine: import dari sqlalchemy)
    
    Return:
        df          : csv dari SQL yang telah diload ke pandas dataframe

    Contoh penggunaan:
        query = "SELECT * FROM customers"
        df = get_data_from_db(query)
    '''
    try:
        df = pd.read_sql(query, engine)
        return df
    except Exception as e:
        print(e)

In [76]:
query = '''SELECT * FROM "Dim_Item";'''
df_item = get_data_from_db(query)
df_item.sample(1)

Unnamed: 0,Item_Code,Item_Name,Unit_Of_Measurement,Good,Normal,Bad,Very_Bad
3,6,O3,ppm,0.03,0.09,0.15,0.5


In [77]:
query = '''SELECT * FROM "Dim_Station";'''
df_station = get_data_from_db(query)
df_station.sample(1)

Unnamed: 0,Station_Code,Station_Name,Address,Latitude,Longitude
22,123,Gangnam-gu,"426, Hakdong-ro, Gangnam-gu, Seoul, Republic o...",37.517528,127.04747


In [78]:
#query = '''SELECT * FROM "Dim_Date";'''
#df_date = get_data_from_db(query)
# df_date_sample = df_date.dropna().sample(frac=0.1)
#df_date_sample.info()

#df_date_sample.isnull().sum().sort_values(ascending=False)/len(df_date_sample)*100

query = '''SELECT * FROM "Dim_Date" LIMIT 200;'''
df_date_sample = get_data_from_db(query)
df_date_sample

Unnamed: 0,id,Timetable,Year,Month,Day,DayOfWeek,WeekOfYear,Hour,Quarter,Half
0,1,2017-01-01 00:00:00,2017,1,1,6,52,0,1,1
1,2,2017-01-01 01:00:00,2017,1,1,6,52,1,1,1
2,3,2017-01-01 02:00:00,2017,1,1,6,52,2,1,1
3,4,2017-01-01 03:00:00,2017,1,1,6,52,3,1,1
4,5,2017-01-01 04:00:00,2017,1,1,6,52,4,1,1
...,...,...,...,...,...,...,...,...,...,...
195,196,2017-01-09 03:00:00,2017,1,9,0,2,3,1,1
196,197,2017-01-09 04:00:00,2017,1,9,0,2,4,1,1
197,198,2017-01-09 05:00:00,2017,1,9,0,2,5,1,1
198,199,2017-01-09 06:00:00,2017,1,9,0,2,6,1,1


In [79]:
#query = '''SELECT * FROM "Fact_Info";'''
#df_measurement = get_data_from_db(query)
#df_measurement_sample = df_measurement.dropna().sample(frac=0.05)
#df_measurement_sample.info()

#df_measurement_sample.isnull().sum().sort_values(ascending=False)/len(df_date_sample)*100

query = '''SELECT * FROM "Fact_Info" LIMIT 1000;'''
df_measurement_sample = get_data_from_db(query)
df_measurement_sample

Unnamed: 0,id,Measurement_Date,Station_Code,Item_Code,Average_Value
0,1,2017-01-01 00:00:00,101,1,0.004
1,2,2017-01-01 00:00:00,101,3,0.059
2,3,2017-01-01 00:00:00,101,5,1.200
3,4,2017-01-01 00:00:00,101,6,0.002
4,5,2017-01-01 00:00:00,101,8,73.000
...,...,...,...,...,...
995,996,2017-01-01 06:00:00,116,9,43.000
996,997,2017-01-01 06:00:00,117,1,0.006
997,998,2017-01-01 06:00:00,117,3,0.032
998,999,2017-01-01 06:00:00,117,5,1.100


-----------------------------------------
# Transform

### Pivot table

In [80]:
def pivot_and_rename(df):
    '''
    Fungsi ini mempivot tabel berdasarkan 'Measurement date' dan 'Station code' dengan 'Item code' sebagai kolom baru.

    Parameters:
        df : DataFrame yang akan di pivot

    Returns:
        df_pivoted : DataFrame yang telah dipivot dan diubah nama kolomnya

    contoh penggunaan:
        measurement_pivoted = pivot_and_rename(measurement_2)
    '''
    # Pivot table
    df_pivoted = pd.pivot_table(df,
                                index=['Measurement_Date', 'Station_Code'],
                                columns='Item_Code',
                                values='Average_Value')
    # Reset index agar kolom sejajar
    df_pivoted.columns = [f'{a}' for a in df_pivoted.columns]
    df_pivoted = df_pivoted.reset_index()
    # Rename Item Code berdasarkan Measurement Item
    rename_dict = {
        '1': 'SO2',
        '3': 'NO2',
        '5': 'CO',
        '6': 'O3',
        '8': 'PM10',
        '9': 'PM2.5'
    }
    df_pivoted = df_pivoted.rename(columns=rename_dict)

    return df_pivoted

#df_measurement_sample_piv = pivot_and_rename(df_measurement_sample)

In [81]:
df_measurement_sample_piv = pivot_and_rename(df_measurement_sample)
df_measurement_sample_piv

Unnamed: 0,Measurement_Date,Station_Code,SO2,NO2,CO,O3,PM10,PM2.5
0,2017-01-01 00:00:00,101,0.004,0.059,1.2,0.002,73.0,57.0
1,2017-01-01 00:00:00,102,0.006,0.068,1.3,0.002,77.0,63.0
2,2017-01-01 00:00:00,103,0.005,0.039,1.4,0.002,70.0,68.0
3,2017-01-01 00:00:00,104,0.005,0.045,0.6,0.003,73.0,46.0
4,2017-01-01 00:00:00,105,0.005,0.044,1.0,0.004,81.0,44.0
...,...,...,...,...,...,...,...,...
162,2017-01-01 06:00:00,113,0.004,0.047,0.9,0.002,54.0,28.0
163,2017-01-01 06:00:00,114,0.007,0.045,1.3,0.002,74.0,58.0
164,2017-01-01 06:00:00,115,0.005,0.040,1.2,0.002,73.0,48.0
165,2017-01-01 06:00:00,116,0.006,0.049,1.0,0.001,71.0,43.0


### Merge pivot

In [82]:
def merge_pivot(df_pivoted, df_station):
    '''
    Fungsi ini menggabungkan table pivot dan table station

    Parameters:
        df_pivoted : DataFrame (measurement) yang sudah di pivot
        df_station : DataFrame yang berisi informasi stasiun

    Returns:
        summary    : DataFrame hasil merge keduanya

    contoh penggunaan:
        measurement_pivoted = pivot_and_rename(measurement_2)
    '''
    # Merge dengan station 
    summary = pd.merge(df_pivoted, df_station, on='Station_Code', how='left')

    # Memperbaiki ukuran tabel
    summary = summary[['Measurement_Date','Station_Code','Address','Latitude','Longitude','SO2','NO2','O3','CO','PM10','PM2.5']]
    
    return summary

#df_merged = merge_pivot(df_measurement_sample_piv, df_station)

In [83]:
df_merged = merge_pivot(df_measurement_sample_piv, df_station)
df_merged

Unnamed: 0,Measurement_Date,Station_Code,Address,Latitude,Longitude,SO2,NO2,O3,CO,PM10,PM2.5
0,2017-01-01 00:00:00,101,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ...",37.572016,127.005008,0.004,0.059,0.002,1.2,73.0,57.0
1,2017-01-01 00:00:00,102,"15, Deoksugung-gil, Jung-gu, Seoul, Republic o...",37.564263,126.974676,0.006,0.068,0.002,1.3,77.0,63.0
2,2017-01-01 00:00:00,103,"136, Hannam-daero, Yongsan-gu, Seoul, Republic...",37.540033,127.004850,0.005,0.039,0.002,1.4,70.0,68.0
3,2017-01-01 00:00:00,104,"215, Jinheung-ro, Eunpyeong-gu, Seoul, Republi...",37.609823,126.934848,0.005,0.045,0.003,0.6,73.0,46.0
4,2017-01-01 00:00:00,105,"32, Segeomjeong-ro 4-gil, Seodaemun-gu, Seoul,...",37.593742,126.949679,0.005,0.044,0.004,1.0,81.0,44.0
...,...,...,...,...,...,...,...,...,...,...,...
162,2017-01-01 06:00:00,113,"34, Sirubong-ro 2-gil, Dobong-gu, Seoul, Repub...",37.654192,127.029088,0.004,0.047,0.002,0.9,54.0,28.0
163,2017-01-01 06:00:00,114,"17, Sanggye-ro 23-gil, Nowon-gu, Seoul, Republ...",37.658774,127.068505,0.007,0.045,0.002,1.3,74.0,58.0
164,2017-01-01 06:00:00,115,"56, Jungang-ro 52-gil, Yangcheon-gu, Seoul, Re...",37.525939,126.856603,0.005,0.040,0.002,1.2,73.0,48.0
165,2017-01-01 06:00:00,116,"71, Gangseo-ro 45da-gil, Gangseo-gu, Seoul, Re...",37.544640,126.835151,0.006,0.049,0.001,1.0,71.0,43.0


### Process_summary

In [84]:
def process_summary(df):
    df['Measurement_Date'] = pd.to_datetime(df['Measurement_Date'])
    df['Measurement_Time'] = df['Measurement_Date'].dt.time

    def extract_district(address):
        try:
            parts = address.split(', ')
            return parts[2]
        except:
            return None

    df['District'] = df['Address'].apply(extract_district)

    # Reorder kolom
    new_cols = ['Measurement_Date', 'Measurement_Time', 'Station_Code', 'Address', 'District',
                'Latitude', 'Longitude', 'SO2', 'NO2', 'O3', 'CO', 'PM10', 'PM2.5']
    
    return df[new_cols]

#df_summary = process_summary(df_merged)

In [85]:
df_summary = process_summary(df_merged)
df_summary

Unnamed: 0,Measurement_Date,Measurement_Time,Station_Code,Address,District,Latitude,Longitude,SO2,NO2,O3,CO,PM10,PM2.5
0,2017-01-01 00:00:00,00:00:00,101,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ...",Jongno-gu,37.572016,127.005008,0.004,0.059,0.002,1.2,73.0,57.0
1,2017-01-01 00:00:00,00:00:00,102,"15, Deoksugung-gil, Jung-gu, Seoul, Republic o...",Jung-gu,37.564263,126.974676,0.006,0.068,0.002,1.3,77.0,63.0
2,2017-01-01 00:00:00,00:00:00,103,"136, Hannam-daero, Yongsan-gu, Seoul, Republic...",Yongsan-gu,37.540033,127.004850,0.005,0.039,0.002,1.4,70.0,68.0
3,2017-01-01 00:00:00,00:00:00,104,"215, Jinheung-ro, Eunpyeong-gu, Seoul, Republi...",Eunpyeong-gu,37.609823,126.934848,0.005,0.045,0.003,0.6,73.0,46.0
4,2017-01-01 00:00:00,00:00:00,105,"32, Segeomjeong-ro 4-gil, Seodaemun-gu, Seoul,...",Seodaemun-gu,37.593742,126.949679,0.005,0.044,0.004,1.0,81.0,44.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
162,2017-01-01 06:00:00,06:00:00,113,"34, Sirubong-ro 2-gil, Dobong-gu, Seoul, Repub...",Dobong-gu,37.654192,127.029088,0.004,0.047,0.002,0.9,54.0,28.0
163,2017-01-01 06:00:00,06:00:00,114,"17, Sanggye-ro 23-gil, Nowon-gu, Seoul, Republ...",Nowon-gu,37.658774,127.068505,0.007,0.045,0.002,1.3,74.0,58.0
164,2017-01-01 06:00:00,06:00:00,115,"56, Jungang-ro 52-gil, Yangcheon-gu, Seoul, Re...",Yangcheon-gu,37.525939,126.856603,0.005,0.040,0.002,1.2,73.0,48.0
165,2017-01-01 06:00:00,06:00:00,116,"71, Gangseo-ro 45da-gil, Gangseo-gu, Seoul, Re...",Gangseo-gu,37.544640,126.835151,0.006,0.049,0.001,1.0,71.0,43.0


### AQI_index

In [86]:
def AQI_parameter(df):
    def calculate_aqi_pm25(pm25):
        if pm25 <= 30:
            return round((50 / 30) * pm25, 1)
        elif pm25 <= 60:
            return round(((100 - 51) / (60 - 31)) * (pm25 - 31) + 51, 1)
        elif pm25 <= 90:
            return round(((200 - 101) / (90 - 61)) * (pm25 - 61) + 101, 1)
        elif pm25 <= 120:
            return round(((300 - 201) / (120 - 91)) * (pm25 - 91) + 201, 1)
        elif pm25 <= 250:
            return round(((400 - 301) / (250 - 121)) * (pm25 - 121) + 301, 1)
        else:
            return round(((500 - 401) / (350 - 251)) * (pm25 - 251) + 401, 1)

    df['AQI_index'] = df['PM2.5'].apply(calculate_aqi_pm25)

    def categorize_aqi(aqi):
        if aqi <= 50:
            return "Good"
        elif aqi <= 100:
            return "Satisfactory"
        elif aqi <= 200:
            return "Moderate"
        elif aqi <= 300:
            return "Poor"
        elif aqi <= 400:
            return "Very Poor"
        else:
            return "Severe"

    df['AQI_category'] = df['AQI_index'].apply(categorize_aqi)
    return df 
    #df.to_csv('measurement_datamart.csv', index=False) 

In [87]:
datamart_2 = AQI_parameter(df_summary)
datamart_2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['AQI_index'] = df['PM2.5'].apply(calculate_aqi_pm25)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['AQI_category'] = df['AQI_index'].apply(categorize_aqi)


Unnamed: 0,Measurement_Date,Measurement_Time,Station_Code,Address,District,Latitude,Longitude,SO2,NO2,O3,CO,PM10,PM2.5,AQI_index,AQI_category
0,2017-01-01 00:00:00,00:00:00,101,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ...",Jongno-gu,37.572016,127.005008,0.004,0.059,0.002,1.2,73.0,57.0,94.9,Satisfactory
1,2017-01-01 00:00:00,00:00:00,102,"15, Deoksugung-gil, Jung-gu, Seoul, Republic o...",Jung-gu,37.564263,126.974676,0.006,0.068,0.002,1.3,77.0,63.0,107.8,Moderate
2,2017-01-01 00:00:00,00:00:00,103,"136, Hannam-daero, Yongsan-gu, Seoul, Republic...",Yongsan-gu,37.540033,127.004850,0.005,0.039,0.002,1.4,70.0,68.0,124.9,Moderate
3,2017-01-01 00:00:00,00:00:00,104,"215, Jinheung-ro, Eunpyeong-gu, Seoul, Republi...",Eunpyeong-gu,37.609823,126.934848,0.005,0.045,0.003,0.6,73.0,46.0,76.3,Satisfactory
4,2017-01-01 00:00:00,00:00:00,105,"32, Segeomjeong-ro 4-gil, Seodaemun-gu, Seoul,...",Seodaemun-gu,37.593742,126.949679,0.005,0.044,0.004,1.0,81.0,44.0,73.0,Satisfactory
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162,2017-01-01 06:00:00,06:00:00,113,"34, Sirubong-ro 2-gil, Dobong-gu, Seoul, Repub...",Dobong-gu,37.654192,127.029088,0.004,0.047,0.002,0.9,54.0,28.0,46.7,Good
163,2017-01-01 06:00:00,06:00:00,114,"17, Sanggye-ro 23-gil, Nowon-gu, Seoul, Republ...",Nowon-gu,37.658774,127.068505,0.007,0.045,0.002,1.3,74.0,58.0,96.6,Satisfactory
164,2017-01-01 06:00:00,06:00:00,115,"56, Jungang-ro 52-gil, Yangcheon-gu, Seoul, Re...",Yangcheon-gu,37.525939,126.856603,0.005,0.040,0.002,1.2,73.0,48.0,79.7,Satisfactory
165,2017-01-01 06:00:00,06:00:00,116,"71, Gangseo-ro 45da-gil, Gangseo-gu, Seoul, Re...",Gangseo-gu,37.544640,126.835151,0.006,0.049,0.001,1.0,71.0,43.0,71.3,Satisfactory


---------------------------------
### AQI

In [None]:
def AQI_who(df):
    def classify_who_air_quality(row):
        violations = 0
        if row['SO2'] > 40:
            violations += 1
        if row['NO2'] > 25:
            violations += 1
        if row['CO'] > 4:  # CO dalam mg/m³
            violations += 1
        if row['O3'] > 100:
            violations += 1
        if row['PM10'] > 45:
            violations += 1
        if row['PM2.5'] > 15:
            violations += 1

        if violations == 0:
            return 'Baik'
        elif violations <= 2:
            return 'Sedang'
        elif violations <= 4:
            return 'Tidak Sehat'
        else:
            return 'Berbahaya'


    df['WHO_Air_Quality'] = df.apply(classify_who_air_quality, axis=1)
    return df

#datamart = AQI_who(df_summary)

In [67]:
datamart = AQI_who(df_summary)
datamart

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['WHO_Air_Quality'] = df.apply(classify_who_air_quality, axis=1)


Unnamed: 0,Measurement_Date,Measurement_Time,Station_Code,Address,District,Latitude,Longitude,SO2,NO2,O3,CO,PM10,PM2.5,WHO_Air_Quality
0,2017-01-01 00:00:00,00:00:00,101,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ...",Jongno-gu,37.572016,127.005008,0.004,0.059,0.002,1.2,73.0,57.0,Sedang
1,2017-01-01 00:00:00,00:00:00,102,"15, Deoksugung-gil, Jung-gu, Seoul, Republic o...",Jung-gu,37.564263,126.974676,0.006,0.068,0.002,1.3,77.0,63.0,Sedang
2,2017-01-01 00:00:00,00:00:00,103,"136, Hannam-daero, Yongsan-gu, Seoul, Republic...",Yongsan-gu,37.540033,127.004850,0.005,0.039,0.002,1.4,70.0,68.0,Sedang
3,2017-01-01 00:00:00,00:00:00,104,"215, Jinheung-ro, Eunpyeong-gu, Seoul, Republi...",Eunpyeong-gu,37.609823,126.934848,0.005,0.045,0.003,0.6,73.0,46.0,Sedang
4,2017-01-01 00:00:00,00:00:00,105,"32, Segeomjeong-ro 4-gil, Seodaemun-gu, Seoul,...",Seodaemun-gu,37.593742,126.949679,0.005,0.044,0.004,1.0,81.0,44.0,Sedang
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162,2017-01-01 06:00:00,06:00:00,113,"34, Sirubong-ro 2-gil, Dobong-gu, Seoul, Repub...",Dobong-gu,37.654192,127.029088,0.004,0.047,0.002,0.9,54.0,28.0,Sedang
163,2017-01-01 06:00:00,06:00:00,114,"17, Sanggye-ro 23-gil, Nowon-gu, Seoul, Republ...",Nowon-gu,37.658774,127.068505,0.007,0.045,0.002,1.3,74.0,58.0,Sedang
164,2017-01-01 06:00:00,06:00:00,115,"56, Jungang-ro 52-gil, Yangcheon-gu, Seoul, Re...",Yangcheon-gu,37.525939,126.856603,0.005,0.040,0.002,1.2,73.0,48.0,Sedang
165,2017-01-01 06:00:00,06:00:00,116,"71, Gangseo-ro 45da-gil, Gangseo-gu, Seoul, Re...",Gangseo-gu,37.544640,126.835151,0.006,0.049,0.001,1.0,71.0,43.0,Sedang


------------------------
### Split datetime

In [34]:
def split_measurement_datetime(df):
    df['Measurement_Date'] = pd.to_datetime(df['Measurement_Date'])
    #df['Measurement_Date'] = df['Measurement_Date'].dt.date
    df['Measurement_Time'] = df['Measurement_Date'].dt.time
    return df

In [37]:
df_split = split_measurement_datetime(df_summary)
df_split

Unnamed: 0,Measurement_Date,Station_Code,Address,Latitude,Longitude,SO2,NO2,O3,CO,PM10,PM2.5,Measurement_Time
0,2017-01-01 00:00:00,101,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ...",37.572016,127.005008,0.004,0.059,0.002,1.2,73.0,57.0,00:00:00
1,2017-01-01 00:00:00,102,"15, Deoksugung-gil, Jung-gu, Seoul, Republic o...",37.564263,126.974676,0.006,0.068,0.002,1.3,77.0,63.0,00:00:00
2,2017-01-01 00:00:00,103,"136, Hannam-daero, Yongsan-gu, Seoul, Republic...",37.540033,127.004850,0.005,0.039,0.002,1.4,70.0,68.0,00:00:00
3,2017-01-01 00:00:00,104,"215, Jinheung-ro, Eunpyeong-gu, Seoul, Republi...",37.609823,126.934848,0.005,0.045,0.003,0.6,73.0,46.0,00:00:00
4,2017-01-01 00:00:00,105,"32, Segeomjeong-ro 4-gil, Seodaemun-gu, Seoul,...",37.593742,126.949679,0.005,0.044,0.004,1.0,81.0,44.0,00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...
162,2017-01-01 06:00:00,113,"34, Sirubong-ro 2-gil, Dobong-gu, Seoul, Repub...",37.654192,127.029088,0.004,0.047,0.002,0.9,54.0,28.0,06:00:00
163,2017-01-01 06:00:00,114,"17, Sanggye-ro 23-gil, Nowon-gu, Seoul, Republ...",37.658774,127.068505,0.007,0.045,0.002,1.3,74.0,58.0,06:00:00
164,2017-01-01 06:00:00,115,"56, Jungang-ro 52-gil, Yangcheon-gu, Seoul, Re...",37.525939,126.856603,0.005,0.040,0.002,1.2,73.0,48.0,06:00:00
165,2017-01-01 06:00:00,116,"71, Gangseo-ro 45da-gil, Gangseo-gu, Seoul, Re...",37.544640,126.835151,0.006,0.049,0.001,1.0,71.0,43.0,06:00:00


In [38]:
def extract_district(address):
    try:
        parts = address.split(', ')
        district = parts[2]  
        return district
    except:
        return None

In [40]:
df_split['District'] = df_split['Address'].apply(extract_district)

new_cols = ['Measurement_Date', 'Measurement_Time', 'Station_Code', 'Address', 'District',
            'Latitude', 'Longitude', 'SO2', 'NO2', 'O3', 'CO', 'PM10', 'PM2.5']
summary = df_split[new_cols]

In [41]:
summary

Unnamed: 0,Measurement_Date,Measurement_Time,Station_Code,Address,District,Latitude,Longitude,SO2,NO2,O3,CO,PM10,PM2.5
0,2017-01-01 00:00:00,00:00:00,101,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ...",Jongno-gu,37.572016,127.005008,0.004,0.059,0.002,1.2,73.0,57.0
1,2017-01-01 00:00:00,00:00:00,102,"15, Deoksugung-gil, Jung-gu, Seoul, Republic o...",Jung-gu,37.564263,126.974676,0.006,0.068,0.002,1.3,77.0,63.0
2,2017-01-01 00:00:00,00:00:00,103,"136, Hannam-daero, Yongsan-gu, Seoul, Republic...",Yongsan-gu,37.540033,127.004850,0.005,0.039,0.002,1.4,70.0,68.0
3,2017-01-01 00:00:00,00:00:00,104,"215, Jinheung-ro, Eunpyeong-gu, Seoul, Republi...",Eunpyeong-gu,37.609823,126.934848,0.005,0.045,0.003,0.6,73.0,46.0
4,2017-01-01 00:00:00,00:00:00,105,"32, Segeomjeong-ro 4-gil, Seodaemun-gu, Seoul,...",Seodaemun-gu,37.593742,126.949679,0.005,0.044,0.004,1.0,81.0,44.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
162,2017-01-01 06:00:00,06:00:00,113,"34, Sirubong-ro 2-gil, Dobong-gu, Seoul, Repub...",Dobong-gu,37.654192,127.029088,0.004,0.047,0.002,0.9,54.0,28.0
163,2017-01-01 06:00:00,06:00:00,114,"17, Sanggye-ro 23-gil, Nowon-gu, Seoul, Republ...",Nowon-gu,37.658774,127.068505,0.007,0.045,0.002,1.3,74.0,58.0
164,2017-01-01 06:00:00,06:00:00,115,"56, Jungang-ro 52-gil, Yangcheon-gu, Seoul, Re...",Yangcheon-gu,37.525939,126.856603,0.005,0.040,0.002,1.2,73.0,48.0
165,2017-01-01 06:00:00,06:00:00,116,"71, Gangseo-ro 45da-gil, Gangseo-gu, Seoul, Re...",Gangseo-gu,37.544640,126.835151,0.006,0.049,0.001,1.0,71.0,43.0


--------------------------------------------
# Load

In [None]:
# LOAD.PY

import os
import psycopg2
from sqlalchemy import create_engine
from dotenv import load_dotenv

def load_data(datamart):
    '''
    Fungsi ini ditujukan untuk melakukan load data ke NeonDB    
    '''
    load_dotenv()

    engine = create_engine(os.getenv('DATABASE_URL'))
    load_data_to_db(datamart, "Datamart_1",engine)    



def get_data_from_db(query, engine):
    try:
        df = pd.read_sql(query, engine)
        return df
    except Exception as e:
        print(e)

def load_data_to_db(df, table_name, engine):
    try:
        df.to_sql(table_name, engine, if_exists="append", index=False)
    except Exception as e:
        print(e)

------------------------------------------------------------------
# Staging (raw)

In [None]:
conn = psycopg2.connect(
    "postgresql://neondb_owner:npg_f1uYCZ8nRWzQ@ep-floral-lake-a533zre9-pooler.us-east-2.aws.neon.tech/Air-Pollution-Seoul?sslmode=require"
)

cur = conn.cursor()
cur.execute('''SELECT * FROM "Dim_Item";''')
rows = cur.fetchall()
print(rows)

[(1, 'SO2', 'ppm', Decimal('0.02'), Decimal('0.05'), Decimal('0.15'), Decimal('1.0')), (3, 'NO2', 'ppm', Decimal('0.03'), Decimal('0.06'), Decimal('0.2'), Decimal('2.0')), (5, 'CO', 'ppm', Decimal('2.0'), Decimal('9.0'), Decimal('15.0'), Decimal('50.0')), (6, 'O3', 'ppm', Decimal('0.03'), Decimal('0.09'), Decimal('0.15'), Decimal('0.5')), (8, 'PM10', 'Mircrogram/m3', Decimal('30.0'), Decimal('80.0'), Decimal('150.0'), Decimal('600.0')), (9, 'PM2.5', 'Mircrogram/m3', Decimal('15.0'), Decimal('35.0'), Decimal('75.0'), Decimal('500.0'))]


In [None]:
#rows

In [None]:
conn = psycopg2.connect("postgresql://neondb_owner:npg_f1uYCZ8nRWzQ@ep-floral-lake-a533zre9-pooler.us-east-2.aws.neon.tech/Air-Pollution-Seoul?sslmode=require")
cur = conn.cursor()
cur.execute(
    """
    SELECT table_name 
    FROM information_schema.tables 
    WHERE table_schema = 'public';
    """)
tables = cur.fetchall()
print(tables)


[('Dim_Date',), ('Fact_Info',), ('Dim_Item',), ('Dim_Station',)]


In [None]:
tables

[('Dim_Date',), ('Fact_Info',), ('Dim_Item',), ('Dim_Station',)]

In [None]:
import os
print(os.path.exists(r"C:\Users\fahri\Downloads\postgresql-42.6.0.jar"))

True
