In [222]:
import pandas as pd
import logging
import numpy as np

logging.basicConfig(level=logging.INFO)

In [223]:
url = "https://id.wikipedia.org/wiki/Daftar_miliarder_Forbes"

In [224]:
def scrape(url):
    logging.info(f"Scraping website with url:'{url}'...")
    return pd.read_html(url, header=None)

In [225]:
dfs=scrape(url)[3]

INFO:root:Scraping website with url:'https://id.wikipedia.org/wiki/Daftar_miliarder_Forbes'...


In [226]:
dfs

Unnamed: 0,No.,Nama,Kekayaan bersih (USD),Usia,Kebangsaan,Sumber kekayaan
0,,Jeff Bezos,$131 miliar,55,Amerika Serikat,Amazon
1,,Bill Gates,$96.5 miliar,63,Amerika Serikat,Microsoft
2,,Warren Buffett,$82.5 miliar,88,Amerika Serikat,Berkshire Hathaway
3,,Bernard Arnault,$76 miliar,70,Prancis,LVMH
4,,Carlos Slim,$64 miliar,79,Meksiko,"América Móvil, Grupo Carso"
5,,Amancio Ortega,$62.7 miliar,82,Spanyol,"Inditex, Zara"
6,,Larry Ellison,$62.5 miliar,74,Amerika Serikat,Oracle Corporation
7,,Mark Zuckerberg,$62.3 miliar,34,Amerika Serikat,"Facebook, Inc."
8,,Michael Bloomberg,$55.5 miliar,77,Amerika Serikat,Bloomberg L.P.
9,,Larry Page,$50.8 miliar,45,Amerika Serikat,Alphabet Inc.


# Cleaning

In [227]:
import re

In [228]:
def is_money_miliar(string_money):
    return string_money.lower().endswith("miliar")

In [229]:
def transform_money_format(string_money):
    half_clean_string = string_money.lower().replace(",",".").replace(" ","")
    return re.sub(r"[?\[M\]miliar|", "", half_clean_string)

In [230]:
def transform(df,tahun):
    logging.info("Transforming DataFrame ...")
    
    columns_mapping={
        "Nama":"nama",
        "No." : "nomor_urut",
        "Kekayaan bersih (USD)":"kekayaan_bersih_usd_juta",
        "Usia" : "usia",
        "Kebangsaan" : "kebangsaan",
        "Sumber kekayaan":"sumber_kekayaan"
    }
    renamed_df = df.rename(columns=columns_mapping)
    
    #renamed_df["kekayaan_bersih_usd_juta"]=renamed_df["kekayaan_bersih_usd"].apply(
     #   lambda value: float(transform_money_format(value))*1000 if is_money_miliar(value) else float(transform_money_format(value))
    #)
    
    renamed_df["tahun"]=tahun
    
    return renamed_df[["nomor_urut","nama","kekayaan_bersih_usd_juta","usia","kebangsaan","sumber_kekayaan","tahun"]]

In [231]:
df_2021=transform(dfs,2021)

INFO:root:Transforming DataFrame ...


In [232]:
df_2021

Unnamed: 0,nomor_urut,nama,kekayaan_bersih_usd_juta,usia,kebangsaan,sumber_kekayaan,tahun
0,,Jeff Bezos,$131 miliar,55,Amerika Serikat,Amazon,2021
1,,Bill Gates,$96.5 miliar,63,Amerika Serikat,Microsoft,2021
2,,Warren Buffett,$82.5 miliar,88,Amerika Serikat,Berkshire Hathaway,2021
3,,Bernard Arnault,$76 miliar,70,Prancis,LVMH,2021
4,,Carlos Slim,$64 miliar,79,Meksiko,"América Móvil, Grupo Carso",2021
5,,Amancio Ortega,$62.7 miliar,82,Spanyol,"Inditex, Zara",2021
6,,Larry Ellison,$62.5 miliar,74,Amerika Serikat,Oracle Corporation,2021
7,,Mark Zuckerberg,$62.3 miliar,34,Amerika Serikat,"Facebook, Inc.",2021
8,,Michael Bloomberg,$55.5 miliar,77,Amerika Serikat,Bloomberg L.P.,2021
9,,Larry Page,$50.8 miliar,45,Amerika Serikat,Alphabet Inc.,2021


In [233]:
df_2021['nomor_urut'] = np.arange(df_2021.shape[0])
df_2021['nomor_urut']=df_2021['nomor_urut']+1
df_2021['kekayaan_bersih_usd_juta'] = df_2021['kekayaan_bersih_usd_juta'].str.replace('miliar', '')
df_2021['kekayaan_bersih_usd_juta'] = df_2021['kekayaan_bersih_usd_juta'].replace(' ', '')
df_2021['kekayaan_bersih_usd_juta'] = df_2021['kekayaan_bersih_usd_juta'].str.replace('$', '')
df_2021

Unnamed: 0,nomor_urut,nama,kekayaan_bersih_usd_juta,usia,kebangsaan,sumber_kekayaan,tahun
0,1,Jeff Bezos,131.0,55,Amerika Serikat,Amazon,2021
1,2,Bill Gates,96.5,63,Amerika Serikat,Microsoft,2021
2,3,Warren Buffett,82.5,88,Amerika Serikat,Berkshire Hathaway,2021
3,4,Bernard Arnault,76.0,70,Prancis,LVMH,2021
4,5,Carlos Slim,64.0,79,Meksiko,"América Móvil, Grupo Carso",2021
5,6,Amancio Ortega,62.7,82,Spanyol,"Inditex, Zara",2021
6,7,Larry Ellison,62.5,74,Amerika Serikat,Oracle Corporation,2021
7,8,Mark Zuckerberg,62.3,34,Amerika Serikat,"Facebook, Inc.",2021
8,9,Michael Bloomberg,55.5,77,Amerika Serikat,Bloomberg L.P.,2021
9,10,Larry Page,50.8,45,Amerika Serikat,Alphabet Inc.,2021


In [234]:
df_2021.dtypes

nomor_urut                   int64
nama                        object
kekayaan_bersih_usd_juta    object
usia                         int64
kebangsaan                  object
sumber_kekayaan             object
tahun                        int64
dtype: object

In [235]:
df_2021['kekayaan_bersih_usd_juta']=df_2021.kekayaan_bersih_usd_juta.astype(float)*1000
df_2021

Unnamed: 0,nomor_urut,nama,kekayaan_bersih_usd_juta,usia,kebangsaan,sumber_kekayaan,tahun
0,1,Jeff Bezos,131000.0,55,Amerika Serikat,Amazon,2021
1,2,Bill Gates,96500.0,63,Amerika Serikat,Microsoft,2021
2,3,Warren Buffett,82500.0,88,Amerika Serikat,Berkshire Hathaway,2021
3,4,Bernard Arnault,76000.0,70,Prancis,LVMH,2021
4,5,Carlos Slim,64000.0,79,Meksiko,"América Móvil, Grupo Carso",2021
5,6,Amancio Ortega,62700.0,82,Spanyol,"Inditex, Zara",2021
6,7,Larry Ellison,62500.0,74,Amerika Serikat,Oracle Corporation,2021
7,8,Mark Zuckerberg,62300.0,34,Amerika Serikat,"Facebook, Inc.",2021
8,9,Michael Bloomberg,55500.0,77,Amerika Serikat,Bloomberg L.P.,2021
9,10,Larry Page,50800.0,45,Amerika Serikat,Alphabet Inc.,2021


In [236]:
df_2021['kekayaan_bersih_usd_juta'] = df_2021['kekayaan_bersih_usd_juta'].apply(lambda x: "${:.2f}".format((x/1)))
df_2021

Unnamed: 0,nomor_urut,nama,kekayaan_bersih_usd_juta,usia,kebangsaan,sumber_kekayaan,tahun
0,1,Jeff Bezos,$131000.00,55,Amerika Serikat,Amazon,2021
1,2,Bill Gates,$96500.00,63,Amerika Serikat,Microsoft,2021
2,3,Warren Buffett,$82500.00,88,Amerika Serikat,Berkshire Hathaway,2021
3,4,Bernard Arnault,$76000.00,70,Prancis,LVMH,2021
4,5,Carlos Slim,$64000.00,79,Meksiko,"América Móvil, Grupo Carso",2021
5,6,Amancio Ortega,$62700.00,82,Spanyol,"Inditex, Zara",2021
6,7,Larry Ellison,$62500.00,74,Amerika Serikat,Oracle Corporation,2021
7,8,Mark Zuckerberg,$62300.00,34,Amerika Serikat,"Facebook, Inc.",2021
8,9,Michael Bloomberg,$55500.00,77,Amerika Serikat,Bloomberg L.P.,2021
9,10,Larry Page,$50800.00,45,Amerika Serikat,Alphabet Inc.,2021


# Loading (Storing to Database)

In [237]:
pip install psycopg2-binary==2.8.6

Note: you may need to restart the kernel to use updated packages.


In [238]:
pip install Flask-SQLAlchemy

Note: you may need to restart the kernel to use updated packages.


In [240]:
from sqlalchemy import create_engine

In [241]:
DB_NAME = "postgres"
DB_USER = "user1"
DB_PASSWORD = "user1"
DB_HOST = "104.197.148.144"
DB_PORT = "5432"
CONNECTION_STRING = f"postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
TABLE_NAME = "dwineta-peserta_orang_terkaya_forbes"

In [242]:
CONNECTION_STRING

'postgresql://user1:user1@104.197.148.144:5432/postgres'

In [243]:
def write_to_postgres(df,db_name,table_name,connection_string):
    engine = create_engine(connection_string)
    logging.info(f"Writing dataframe to database: '{db_name}',table: '{table_name}' ...")
    df.to_sql(name=table_name, con=engine, if_exists="replace",index=False)

In [245]:
write_to_postgres(df=df_2021,db_name=DB_NAME,table_name=TABLE_NAME, connection_string=CONNECTION_STRING)

INFO:root:Writing dataframe to database: 'postgres',table: 'dwineta-peserta_orang_terkaya_forbes' ...


In [246]:
def read_from_postgres(db_name,table_name,connection_string):
    engine=create_engine(connection_string)
    
    logging.info(f"reading postgres database:'{db_name}',table: '{table_name}' ...")
    return pd.read_sql_table(table_name,con=engine)

In [247]:
result_df = read_from_postgres(db_name=DB_NAME, table_name=TABLE_NAME, connection_string=CONNECTION_STRING)

INFO:root:reading postgres database:'postgres',table: 'dwineta-peserta_orang_terkaya_forbes' ...


In [248]:
print("Daftar Orang Terkaya di Indonesia:")
print(result_df.to_string())

Daftar Orang Terkaya di Indonesia:
   nomor_urut               nama kekayaan_bersih_usd_juta  usia       kebangsaan             sumber_kekayaan  tahun
0           1         Jeff Bezos               $131000.00    55  Amerika Serikat                      Amazon   2021
1           2         Bill Gates                $96500.00    63  Amerika Serikat                   Microsoft   2021
2           3     Warren Buffett                $82500.00    88  Amerika Serikat          Berkshire Hathaway   2021
3           4    Bernard Arnault                $76000.00    70          Prancis                        LVMH   2021
4           5        Carlos Slim                $64000.00    79          Meksiko  América Móvil, Grupo Carso   2021
5           6     Amancio Ortega                $62700.00    82          Spanyol               Inditex, Zara   2021
6           7      Larry Ellison                $62500.00    74  Amerika Serikat          Oracle Corporation   2021
7           8    Mark Zuckerberg     