### 1. Install Library

In [1]:
# %pip install pandas-gbq google-cloud-bigquery
# %pip install pandas-gbq google-auth google-auth-oauthlib

### 2. Import Library

In [2]:
import pandas as pd
import numpy as np

import mysql.connector

from google.oauth2 import service_account
from google.cloud import bigquery
import pandas_gbq
from pandas_gbq import to_gbq

import os
from dotenv import load_dotenv

### 3. Database Configuration

In [3]:
# Load konfigurasi dari .env
load_dotenv()

# MySQL configurations
rds_host = os.getenv('RDS_HOST')
rds_dbname = os.getenv('RDS_DBNAME')
rds_user = os.getenv('RDS_USER')
rds_password = os.getenv('RDS_PASSWORD')

# Google Cloud configurations
project_id = os.getenv('GOOGLE_CLOUD_PROJECT_ID')
dataset_id = os.getenv('GOOGLE_CLOUD_DATASET_ID')
google_application_credentials = os.getenv('GOOGLE_APPLICATION_CREDENTIALS')

# Set GOOGLE_APPLICATION_CREDENTIALS environment variable
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = google_application_credentials

# Koneksi ke MySQL di Amazon RDS
conn_rds = mysql.connector.connect(
    host=rds_host,
    database=rds_dbname,
    user=rds_user,
    password=rds_password
)

### 4. Extract

#### table_to_df function below is used to extract data from a table and convert it into a dataframe

In [4]:
def table_to_df(table_name):
   query = f"SELECT * FROM {table_name}"
   df = pd.read_sql_query(query, conn_rds)
   return df

#### 4.1 Converting data from tables to dataframes

In [5]:
destinations_df = table_to_df('destinations')
routes_df = table_to_df('routes')
route_details_df = table_to_df('route_details')
users_df = table_to_df('users')

  df = pd.read_sql_query(query, conn_rds)


#### 4.2 destinations dataframe

In [6]:
destinations_df

Unnamed: 0,id,category_id,name,description,open_time,close_time,entry_price,longitude,latitude,visit_count,created_at,updated_at,deleted_at
0,306d305e-3359-4884-8d38-89c04e8adea6,eb77b590-b255-4ea1-b11a-d445a259ac61,Kawah Ijen,Kawah Ijen adalah sebuah kompleks gunung berap...,08:00,17:00,100000.0,114.2423,-8.0582,23,2024-06-07 07:47:59,2024-06-10 08:05:22,
1,306d305e-3359-4884-8d38-89c04e8adec1,eb77b590-b255-4ea1-b11a-d445a259ac62,Candi Borobudur,Candi Borobudur adalah sebuah candi Buddha yan...,08:00,17:00,50000.0,110.2038,-7.6079,4,2024-06-07 07:47:59,2024-06-10 08:05:24,
2,306d305e-3359-4884-8d38-89c04e8adec2,eb77b590-b255-4ea1-b11a-d445a259ac61,Pantai Kuta,Pantai Kuta adalah salah satu pantai yang terk...,08:00,17:00,0.0,115.1675,-8.7174,1,2024-06-07 07:47:59,2024-06-08 15:09:44,
3,306d305e-3359-4884-8d38-89c04e8adec3,eb77b590-b255-4ea1-b11a-d445a259ac61,Danau Toba,Danau Toba adalah danau terbesar di Indonesia ...,08:00,17:00,0.0,99.0852,2.6696,0,2024-06-07 07:47:59,2024-06-07 07:47:59,
4,306d305e-3359-4884-8d38-89c04e8adec4,eb77b590-b255-4ea1-b11a-d445a259ac61,Taman Mini Indonesia Indah,Taman Mini Indonesia Indah adalah sebuah taman...,08:00,17:00,20000.0,106.8956,-6.3027,2,2024-06-07 07:47:59,2024-06-08 17:36:51,
5,306d305e-3359-4884-8d38-89c04e8adec5,eb77b590-b255-4ea1-b11a-d445a259ac62,Gunung Bromo,Gunung Bromo adalah sebuah gunung berapi aktif...,08:00,17:00,30000.0,112.9528,-7.9425,1,2024-06-07 07:47:59,2024-06-10 08:05:30,
6,306d305e-3359-4884-8d38-89c04e8adec6,eb77b590-b255-4ea1-b11a-d445a259ac61,Goa Pindul,Gua tempat Joko terbentur tersebut dinamai Gua...,08:00,17:00,25000.0,123.456,456.789,1,2024-06-07 07:47:59,2024-06-08 15:14:05,
7,306d305e-3359-4884-8d38-89c04e8adec7,eb77b590-b255-4ea1-b11a-d445a259ac62,Pulau Komodo,Pulau Komodo adalah sebuah pulau yang terletak...,08:00,17:00,150000.0,119.4986,-8.5833,0,2024-06-07 07:47:59,2024-06-07 07:47:59,
8,306d305e-3359-4884-8d38-89c04e8adec8,eb77b590-b255-4ea1-b11a-d445a259ac61,Raja Ampat,Raja Ampat adalah kepulauan yang terletak di b...,08:00,17:00,500000.0,130.5036,-1.0562,1,2024-06-07 07:47:59,2024-06-08 16:02:56,
9,306d305e-3359-4884-8d38-89c04e8adec9,eb77b590-b255-4ea1-b11a-d445a259ac61,Tanah Lot,Tanah Lot adalah sebuah formasi batuan di lepa...,08:00,17:00,20000.0,115.0865,-8.6211,0,2024-06-07 07:47:59,2024-06-07 07:47:59,


#### 4.3 routes dataframe

In [7]:
routes_df

Unnamed: 0,id,user_id,city_id,name,start_longitude,start_latitude,price,created_at,updated_at,deleted_at
0,4baa63f1-c447-4cd8-bd86-9bb7b178ef54,79b411cd-a0c4-4f63-892c-ee8643707551,1101,Rute 1,3.105625,97.394489,50000.0,2024-06-09 08:04:36,2024-06-09 08:04:36,
1,e315db1d-f065-4a5d-8219-806a809564ec,79b411cd-a0c4-4f63-892c-ee8643707551,1101,Rute 2,3.105625,97.394489,50000.0,2024-06-09 08:04:36,2024-06-09 08:04:36,


#### 4.4 route_details dataframe

In [8]:
route_details_df

Unnamed: 0,id,destination_id,route_id,longitude,latitude,duration,order,visit_start,visit_end,created_at,updated_at,deleted_at
0,179665db-dfba-4d10-8371-29d2ce169cc2,306d305e-3359-4884-8d38-89c04e8adec1,4baa63f1-c447-4cd8-bd86-9bb7b178ef54,114.2423,-8.0582,3600,1,0 days 09:00:00,0 days 11:00:00,2024-06-09 08:11:40,2024-06-09 08:11:40,
1,bfbe5a9e-8a54-402f-bbc0-9ddce0e78c93,306d305e-3359-4884-8d38-89c04e8adea6,4baa63f1-c447-4cd8-bd86-9bb7b178ef54,114.2423,-8.0582,3600,1,0 days 09:00:00,0 days 11:00:00,2024-06-09 08:11:40,2024-06-09 08:11:40,
2,dc9b287e-5b99-44e1-ba10-f269474d8d4b,306d305e-3359-4884-8d38-89c04e8adea6,e315db1d-f065-4a5d-8219-806a809564ec,114.2423,-8.0582,3600,1,0 days 09:00:00,0 days 11:00:00,2024-06-09 08:11:40,2024-06-09 08:11:40,


#### 4.5 users dataframe

In [9]:
users_df

Unnamed: 0,id,email,password,username,fullname,bio,phone_number,profile_image_url,gender,city,email_verified_at,created_at,updated_at,deleted_at,province,refresh_token
0,00862788-5ded-4065-8275-2569748f64aa,xifihi49190@huleos.com,$2a$10$X4GUlWtLjn/8M2cT7mOm/eD2nicI91W3mqm3mEI...,testverify10,John Doe,,81234567891.0,,,,NaT,2024-05-27 15:26:01.704,2024-05-27 15:26:01.704,NaT,,
1,00bddbb8-8e7d-46ab-acfc-aab981c1ada1,irsyadyazidsyafiq@gmail.com,$2a$10$HcMceoyuqQLLmjNKny2peOLz.pxxT563Mua2u1n...,yazid.syafiq,Yazid Syafiq Irsyad,,81393984849.0,,,,2024-05-31 11:59:19.353,2024-05-31 11:58:43.404,2024-06-10 06:12:29.102,NaT,,
2,03d7240b-3e9b-4105-ac93-a926b752bb81,shuuuuuuu@example.com,$2a$10$NvsLJLtnk.IW7gdbkOwK8Ob7uuAADw4o53ctlZW...,paimon,nama_lengkap,bio,81234567890.0,img.jpg,pria,Jaksel,NaT,2024-05-28 06:43:29.804,2024-06-05 14:44:35.107,NaT,Jakarta,
3,0483c5ae-046a-403a-a656-db920635396b,johndoe2345@example.com,$2a$10$Dxg27rgrHy1bZPiD0jNI8uInxr/duxnf3QEH8IS...,johndoe2443,John Doe,,81234567891.0,,,,NaT,2024-05-25 15:12:44.106,2024-05-25 15:12:44.106,2024-06-05 14:46:44.405,,
4,083cb941-9a91-4ea4-af6e-5b25384ff00b,johndoe100@example.com,$2a$10$voavmVlJHnNfrfDRqaQ4xuRc7CDj.amd3bShTYS...,johndoe100,John Doe,,81234567891.0,,,,NaT,2024-05-27 16:06:57.710,2024-05-27 16:06:57.710,NaT,,
5,0d26d129-ea9d-4a5a-9f0d-ffc968d11dfc,demex95935@cgbird.com,$2a$10$jZ0CV.Ps4fulmU/2rLMkBeZnVGEfRAw0JGMPPtG...,testverify21,John Doe,,8123456790.0,,,,2024-05-27 16:49:52.892,2024-05-27 16:49:17.606,2024-05-27 16:49:52.894,NaT,,
6,0ea8bb14-365d-4fd5-b774-868185c67fc5,bikiko364222@huleos.com,$2a$10$01f0ROVxW7QZtJQzH8frv./t5mP2LnzSQwlnlf0...,testverify1,John Doe,,81234567891.0,,,,NaT,2024-05-27 14:05:18.804,2024-05-27 14:05:18.804,NaT,,
7,10cdec36-6ed8-45ad-9a47-88438ec1a8ee,xifihi4990@huleos.com,$2a$10$Nym.xZSC.SrOt9tSQumjG.P69qwz/LW.OS7Xljq...,testverify9,John Doe,,81234567891.0,,,,2024-05-27 15:21:38.251,2024-05-27 15:19:33.409,2024-05-27 15:21:38.254,NaT,,
8,116c2659-e2c3-4346-adfd-fbf81d90b42f,johndoe16@gmail.com,$2a$10$YGTtxDW2k0QzF3yz7CcNr.S/kWzPx2ztqkYJ0Po...,johndoe16,John Doe,,8129391092.0,,,,NaT,2024-05-26 18:34:06.604,2024-05-26 18:34:06.604,2024-06-05 14:19:35.472,,
9,12d34b07-9c0f-4287-92cc-01364f7ac703,,$2a$10$0146zqADfSrhec8TVd.L8O3J5TzuzLSIviiIC7F...,,,,,,,,NaT,2024-06-05 14:02:02.608,2024-06-05 14:39:57.616,NaT,,


#### 4.6 Closing connection

In [10]:
conn_rds.close()

### 5. Transformation

#### 5.1 Selecting subset of each dataframe

In [11]:
dim_destinations = destinations_df[['name', 'description', 'open_time', 'close_time', 'entry_price', 'longitude', 'latitude', 'visit_count']]
dim_routes = routes_df[['name', 'start_longitude', 'start_latitude', 'price']]
dim_route_details = route_details_df[['longitude', 'latitude', 'duration', 'order', 'visit_start', 'visit_end']]
dim_users = users_df[['email', 'username', 'fullname', 'phone_number', 'gender', 'city', 'province']]

#### 5.2 destinations dimension

In [12]:
dim_destinations

Unnamed: 0,name,description,open_time,close_time,entry_price,longitude,latitude,visit_count
0,Kawah Ijen,Kawah Ijen adalah sebuah kompleks gunung berap...,08:00,17:00,100000.0,114.2423,-8.0582,23
1,Candi Borobudur,Candi Borobudur adalah sebuah candi Buddha yan...,08:00,17:00,50000.0,110.2038,-7.6079,4
2,Pantai Kuta,Pantai Kuta adalah salah satu pantai yang terk...,08:00,17:00,0.0,115.1675,-8.7174,1
3,Danau Toba,Danau Toba adalah danau terbesar di Indonesia ...,08:00,17:00,0.0,99.0852,2.6696,0
4,Taman Mini Indonesia Indah,Taman Mini Indonesia Indah adalah sebuah taman...,08:00,17:00,20000.0,106.8956,-6.3027,2
5,Gunung Bromo,Gunung Bromo adalah sebuah gunung berapi aktif...,08:00,17:00,30000.0,112.9528,-7.9425,1
6,Goa Pindul,Gua tempat Joko terbentur tersebut dinamai Gua...,08:00,17:00,25000.0,123.456,456.789,1
7,Pulau Komodo,Pulau Komodo adalah sebuah pulau yang terletak...,08:00,17:00,150000.0,119.4986,-8.5833,0
8,Raja Ampat,Raja Ampat adalah kepulauan yang terletak di b...,08:00,17:00,500000.0,130.5036,-1.0562,1
9,Tanah Lot,Tanah Lot adalah sebuah formasi batuan di lepa...,08:00,17:00,20000.0,115.0865,-8.6211,0


#### 5.3 routes dimension

In [13]:
dim_routes

Unnamed: 0,name,start_longitude,start_latitude,price
0,Rute 1,3.105625,97.394489,50000.0
1,Rute 2,3.105625,97.394489,50000.0


#### 5.4 route_details dimension

In [14]:
dim_route_details['visit_start'] = dim_route_details['visit_start'].astype(str)
dim_route_details['visit_end'] = dim_route_details['visit_end'].astype(str)
dim_route_details

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dim_route_details['visit_start'] = dim_route_details['visit_start'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dim_route_details['visit_end'] = dim_route_details['visit_end'].astype(str)


Unnamed: 0,longitude,latitude,duration,order,visit_start,visit_end
0,114.2423,-8.0582,3600,1,0 days 09:00:00,0 days 11:00:00
1,114.2423,-8.0582,3600,1,0 days 09:00:00,0 days 11:00:00
2,114.2423,-8.0582,3600,1,0 days 09:00:00,0 days 11:00:00


#### 5.5 users dimension

In [15]:
dim_users

Unnamed: 0,email,username,fullname,phone_number,gender,city,province
0,xifihi49190@huleos.com,testverify10,John Doe,81234567891.0,,,
1,irsyadyazidsyafiq@gmail.com,yazid.syafiq,Yazid Syafiq Irsyad,81393984849.0,,,
2,shuuuuuuu@example.com,paimon,nama_lengkap,81234567890.0,pria,Jaksel,Jakarta
3,johndoe2345@example.com,johndoe2443,John Doe,81234567891.0,,,
4,johndoe100@example.com,johndoe100,John Doe,81234567891.0,,,
5,demex95935@cgbird.com,testverify21,John Doe,8123456790.0,,,
6,bikiko364222@huleos.com,testverify1,John Doe,81234567891.0,,,
7,xifihi4990@huleos.com,testverify9,John Doe,81234567891.0,,,
8,johndoe16@gmail.com,johndoe16,John Doe,8129391092.0,,,
9,,,,,,,


#### 5.6 dim_times

In [16]:
# set seed for reproducibility
np.random.seed(42)

# generate random dates within the specified range
n_samples = 50
start_date = pd.Timestamp('2023-01-01')
end_date = pd.Timestamp('2025-12-31')
date_range = pd.date_range(start_date, end_date)

# randomly sample dates
random_dates = np.random.choice(date_range, n_samples)
random_dates = pd.to_datetime(random_dates)

dim_times = pd.DataFrame({
   'time_id': range(1, n_samples + 1),
   'tahun': random_dates.year.astype(str),
   'bulan': random_dates.month.astype(str).str.zfill(2),
   'tanggal': random_dates.day.astype(str).str.zfill(2),
   'tanggallengkap': random_dates
})

dim_times.head()

Unnamed: 0,time_id,tahun,bulan,tanggal,tanggallengkap
0,1,2025,5,10,2025-05-10
1,2,2025,12,31,2025-12-31
2,3,2025,11,10,2025-11-10
3,4,2023,5,2,2023-05-02
4,5,2024,4,11,2024-04-11


### 6. Load to Big Query

In [17]:
def load_to_gbq(credentials, project_id, dataset_id, table_names, dataframes):
   for df, table_name in zip(dataframes, table_names):
      table_full_id = f'{project_id}.{dataset_id}.{table_name}'
      to_gbq(df, table_full_id, project_id=project_id, if_exists='replace', credentials=credentials)
      print(f'Table {table_name} loaded successfully!')

In [18]:
credentials = service_account.Credentials.from_service_account_file(google_application_credentials)

dfs = [dim_destinations, dim_routes, dim_route_details, dim_users, dim_times]
tables = ['dim_destinations', 'dim_routes', 'dim_route_details', 'dim_users', 'dim_times']

load_to_gbq(credentials, project_id, dataset_id, tables, dfs)

100%|██████████| 1/1 [00:00<?, ?it/s]


Table dim_destinations loaded successfully!


100%|██████████| 1/1 [00:00<?, ?it/s]


Table dim_routes loaded successfully!


100%|██████████| 1/1 [00:00<?, ?it/s]


Table dim_route_details loaded successfully!


100%|██████████| 1/1 [00:00<?, ?it/s]


Table dim_users loaded successfully!


100%|██████████| 1/1 [00:00<?, ?it/s]

Table dim_times loaded successfully!



