## Importing Library
Import library yang akan digunakan dalam project

In [1]:
from io import BytesIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

## Scrape Data
Melakukan scraping data dari web, kemudian menyimpannya ke dalam bentuk dataframe

In [2]:
# Page whose HTML tables hold the per-TLD prices scraped below.
url = 'https://www.dewaweb.com/domain'

# A timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page.
response.raise_for_status()

response_content = response.content

# read_html returns a list with one DataFrame per <table> found in the page.
# The bytes are wrapped in BytesIO because passing literal HTML to read_html
# is deprecated since pandas 2.1.
output = pd.read_html(BytesIO(response_content))
output

[              TLD     New Domain Transfer / Renewal
 0             .ac    Rp. 900,000        Rp. 900,000
 1          .ac.id     Rp. 50,000         Rp. 50,000
 2        .academy    Rp. 450,000        Rp. 450,000
 3     .accountant    Rp. 420,000        Rp. 420,000
 4    .accountants  Rp. 1,350,000      Rp. 1,350,000
 ..            ...            ...                ...
 142      .guitars  Rp. 1,850,000      Rp. 1,850,000
 143         .guru    Rp. 420,000        Rp. 420,000
 144         .haus    Rp. 450,000        Rp. 450,000
 145   .healthcare    Rp. 700,000        Rp. 700,000
 146         .help    Rp. 400,000        Rp. 400,000
 
 [147 rows x 3 columns],
            TLD     New Domain Transfer / Renewal
 0         .hiv  Rp. 3,500,000      Rp. 3,500,000
 1    .holdings    Rp. 700,000        Rp. 700,000
 2     .holiday    Rp. 700,000        Rp. 700,000
 3        .host  Rp. 1,300,000      Rp. 1,300,000
 4     .hosting  Rp. 5,600,000      Rp. 5,600,000
 ..         ...            ...       

## Checking
Melakukan pengecekan pada hasil scraping

In [3]:
# Preview the first rows of the first scraped table.
output[0].head()

Unnamed: 0,TLD,New Domain,Transfer / Renewal
0,.ac,"Rp. 900,000","Rp. 900,000"
1,.ac.id,"Rp. 50,000","Rp. 50,000"
2,.academy,"Rp. 450,000","Rp. 450,000"
3,.accountant,"Rp. 420,000","Rp. 420,000"
4,.accountants,"Rp. 1,350,000","Rp. 1,350,000"


In [4]:
# Preview the last rows of the first scraped table to see where it stops.
output[0].tail()

Unnamed: 0,TLD,New Domain,Transfer / Renewal
142,.guitars,"Rp. 1,850,000","Rp. 1,850,000"
143,.guru,"Rp. 420,000","Rp. 420,000"
144,.haus,"Rp. 450,000","Rp. 450,000"
145,.healthcare,"Rp. 700,000","Rp. 700,000"
146,.help,"Rp. 400,000","Rp. 400,000"


In [5]:
# Preview the first rows of the second scraped table.
output[1].head()

Unnamed: 0,TLD,New Domain,Transfer / Renewal
0,.hiv,"Rp. 3,500,000","Rp. 3,500,000"
1,.holdings,"Rp. 700,000","Rp. 700,000"
2,.holiday,"Rp. 700,000","Rp. 700,000"
3,.host,"Rp. 1,300,000","Rp. 1,300,000"
4,.hosting,"Rp. 5,600,000","Rp. 5,600,000"


In [6]:
# Preview the last rows of the second scraped table.
output[1].tail()

Unnamed: 0,TLD,New Domain,Transfer / Renewal
142,.works,"Rp. 450,000","Rp. 450,000"
143,.world,"Rp. 450,000","Rp. 450,000"
144,.xyz Hot,"Rp. 25,000","Rp. 160,000"
145,.yoga,"Rp. 420,000","Rp. 420,000"
146,.zone,"Rp. 450,000","Rp. 450,000"


## Menggabungkan Hasil
Dari hasil scraping web yang telah dilakukan, didapatkan dua buah tabel terpisah. Untuk memudahkan proses selanjutnya, kedua tabel tersebut perlu digabungkan menjadi satu.

In [7]:
# Stack the two scraped tables (same three columns) into one frame and
# renumber the rows from 0 so the index stays unique.
# pd.concat is used because DataFrame.append is deprecated and was removed
# in pandas 2.0.
dewaWeb_pricing = pd.concat([output[0], output[1]], ignore_index=True)
dewaWeb_pricing.shape

  dewaWeb_pricing = output[0].append(output[1], ignore_index=True)


(294, 3)

In [8]:
# Check the first rows of the combined table.
dewaWeb_pricing.head()

Unnamed: 0,TLD,New Domain,Transfer / Renewal
0,.ac,"Rp. 900,000","Rp. 900,000"
1,.ac.id,"Rp. 50,000","Rp. 50,000"
2,.academy,"Rp. 450,000","Rp. 450,000"
3,.accountant,"Rp. 420,000","Rp. 420,000"
4,.accountants,"Rp. 1,350,000","Rp. 1,350,000"


In [9]:
# Check the last rows of the combined table and its renumbered index.
dewaWeb_pricing.tail()

Unnamed: 0,TLD,New Domain,Transfer / Renewal
289,.works,"Rp. 450,000","Rp. 450,000"
290,.world,"Rp. 450,000","Rp. 450,000"
291,.xyz Hot,"Rp. 25,000","Rp. 160,000"
292,.yoga,"Rp. 420,000","Rp. 420,000"
293,.zone,"Rp. 450,000","Rp. 450,000"


In [10]:
# Report, per column, whether it contains at least one missing value.
dewaWeb_pricing.isnull().any()

TLD                   False
New Domain            False
Transfer / Renewal    False
dtype: bool

In [11]:
# Report whether any row is an exact duplicate of an earlier row.
dewaWeb_pricing.duplicated().any()

False

## Data Cleaning
Membersihkan data: mengganti nama kolom menjadi nama yang lebih baik, membersihkan nilai pada kolom New Domain dan Transfer / Renewal agar dapat diolah lebih lanjut, serta mengubah tipe datanya menjadi integer.

In [12]:
# Give the two price columns snake_case names that are easier to reference.
column_renames = {
    'New Domain': 'new_domain',
    'Transfer / Renewal': 'transfer_or_renewal',
}
dewaWeb_pricing = dewaWeb_pricing.rename(columns=column_renames)

In [13]:
# Drop the comma thousands separators from both price columns.
for price_column in ('new_domain', 'transfer_or_renewal'):
    dewaWeb_pricing[price_column] = dewaWeb_pricing[price_column].str.replace(',', '')
dewaWeb_pricing.head()

Unnamed: 0,TLD,new_domain,transfer_or_renewal
0,.ac,Rp. 900000,Rp. 900000
1,.ac.id,Rp. 50000,Rp. 50000
2,.academy,Rp. 450000,Rp. 450000
3,.accountant,Rp. 420000,Rp. 420000
4,.accountants,Rp. 1350000,Rp. 1350000


In [14]:
# Remove the spaces inside both price columns (e.g. the gap after "Rp.").
dewaWeb_pricing = dewaWeb_pricing.assign(
    new_domain=dewaWeb_pricing['new_domain'].str.replace(' ', ''),
    transfer_or_renewal=dewaWeb_pricing['transfer_or_renewal'].str.replace(' ', ''),
)
dewaWeb_pricing.head()

Unnamed: 0,TLD,new_domain,transfer_or_renewal
0,.ac,Rp.900000,Rp.900000
1,.ac.id,Rp.50000,Rp.50000
2,.academy,Rp.450000,Rp.450000
3,.accountant,Rp.420000,Rp.420000
4,.accountants,Rp.1350000,Rp.1350000


In [15]:
# Some TLD cells carry a trailing " Hot" label (e.g. ".xyz Hot");
# strip it so only the TLD itself remains.
tld_labels = dewaWeb_pricing['TLD']
dewaWeb_pricing['TLD'] = tld_labels.str.replace(' Hot', '')
dewaWeb_pricing.head()

Unnamed: 0,TLD,new_domain,transfer_or_renewal
0,.ac,Rp.900000,Rp.900000
1,.ac.id,Rp.50000,Rp.50000
2,.academy,Rp.450000,Rp.450000
3,.accountant,Rp.420000,Rp.420000
4,.accountants,Rp.1350000,Rp.1350000


Menggunakan regular expression untuk menghilangkan mata uang di depan angka

In [16]:
import re

def remove_currency(text):
    """Return ``text`` with every currency marker removed.

    The markers recognised are "Rp", "Rp." and "IDR"; each occurrence is
    deleted and the rest of the string is returned unchanged.

    Args:
        text: Price string such as "Rp.900000".

    Returns:
        The string without the currency markers, e.g. "900000".
    """
    currency_marker = re.compile(r'(Rp\.?|IDR)')
    stripped_price = currency_marker.sub('', text)
    return stripped_price

In [17]:
# Strip the currency marker from every value in both price columns.
for price_column in ('new_domain', 'transfer_or_renewal'):
    dewaWeb_pricing[price_column] = dewaWeb_pricing[price_column].map(remove_currency)
dewaWeb_pricing.head()

Unnamed: 0,TLD,new_domain,transfer_or_renewal
0,.ac,900000,900000
1,.ac.id,50000,50000
2,.academy,450000,450000
3,.accountant,420000,420000
4,.accountants,1350000,1350000


In [18]:
# Convert both price columns from strings to integers in a single cast,
# then show the resulting dtypes.
price_dtypes = {'new_domain': int, 'transfer_or_renewal': int}
dewaWeb_pricing = dewaWeb_pricing.astype(price_dtypes)
dewaWeb_pricing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   TLD                  294 non-null    object
 1   new_domain           294 non-null    int32 
 2   transfer_or_renewal  294 non-null    int32 
dtypes: int32(2), object(1)
memory usage: 4.7+ KB


In [19]:
# Show the rows ordered from the highest to the lowest new_domain price.
# Display only: the sorted result is not assigned back to dewaWeb_pricing.
dewaWeb_pricing.sort_values('new_domain', ascending=False)

Unnamed: 0,TLD,new_domain,transfer_or_renewal
16,.auto,37500000,37500000
36,.cars,36000000,36000000
234,.security,34000000,34000000
151,.hosting,5600000,5600000
188,.movie,4500000,4500000
...,...,...,...
23,.biz.id,50000,50000
1,.ac.id,50000,50000
211,.ponpes.id,50000,50000
291,.xyz,25000,160000


## Export Data
Melakukan export data yang telah dibersihkan menjadi data csv

In [20]:
# Write the cleaned table to DewaWeb_Pricing.csv in the working directory;
# index=False leaves the row index out of the file.
dewaWeb_pricing.to_csv('DewaWeb_Pricing.csv',index=False)