Data Source: SMERU

Link: http://www.smeru.or.id/en/content/ngo-database   



In [1]:
# load dependencies
from bs4 import BeautifulSoup
from selenium import webdriver
import math
import pandas as pd
import re
import numpy as np
# start the PhantomJS browser session used for every page interaction below
phantomjs_path = r'../../Webscraping tools/phantomjs-2.1.1-macosx/bin/phantomjs'
browser = webdriver.PhantomJS(executable_path=phantomjs_path)



In [2]:
# open the NGO list page and submit the search form as-is (an empty search)
url = 'http://www.smeru.or.id/ngo/ngolist.php'
browser.get(url)
submit_button = browser.find_element_by_css_selector('input[type=\"submit\"]')
submit_button.click()

In [3]:
# parse the current results page and pull out the number of NGOs listed:
# it is the second space-separated token of the second <p> element
init_page_soup = BeautifulSoup(browser.page_source, "html.parser")
summary_paragraph = init_page_soup.find_all('p')[1]
summary_tokens = summary_paragraph.text.split(' ')
charity_count = summary_tokens[1]
print(charity_count)

2910


In [4]:
# work out how many 'next page' clicks are needed at 50 entries per step;
# the first step is already on screen, hence the "- 1"
steps_needed = int(charity_count) / 50
additional_clicks_required = math.ceil(steps_needed - 1)
print(additional_clicks_required)

58


In [6]:
# follow the 'next' link once per remaining step, logging every fifth click
next_link_xpath = '//a[@href="ngolist.php?next"]'
for click_number in range(additional_clicks_required):
    # look the link up again each time, as the original loop does
    next_link = browser.find_element_by_xpath(next_link_xpath)
    next_link.click()
    if not click_number % 5:
        print(click_number)

0
5
10
15
20
25
30
35
40
45
50
55


In [7]:
# parse the browser's current page source once all 'next' clicks are done
final_page_html = browser.page_source
final_page_soup = BeautifulSoup(final_page_html, "html.parser")

In [9]:
# the listing is the body of the third table nested inside the first table
outer_table = final_page_soup.find('table')
nested_tables = outer_table.find_all('table')
table_body = nested_tables[2].find('tbody')
# only rows marked valign="top" are collected
rows = table_body.find_all('tr', valign="top")
print(len(rows))

2910


In [10]:
# start with three separate empty lists, one per scraped table column
indices, ngos, descriptions = [], [], []

In [11]:
# split every table row into its listing number, NGO name and description parts
for table_row in rows:
    cells = table_row.find_all('td')
    indices.append(cells[0].text)
    ngos.append(cells[1].text)
    # keep the children of the third cell, dropping <br> and <b> elements
    description_parts = [node for node in cells[2].contents if node.name not in ('br', 'b')]
    descriptions.append(description_parts)
print(len(indices), len(ngos), len(descriptions))

2910 2910 2910


In [12]:
# assemble the dataframe: scraped columns plus blank placeholders that are
# filled in later; 'country' is the constant 'Indonesia' for every row
column_values = {
    'name': ngos,
    'description': descriptions,
    'email': '',
    'website': '',
    'contact_number': '',
    'address': '',
    'country': 'Indonesia',
    'city': '',
}
df = pd.DataFrame(column_values, index=indices)

In [13]:
# define regex expressions
# rest of the text following the literal "Email: "
re_email = r"(?<=Email: ).*$"
# rest of the text following the literal "Web: "
re_website = r"(?<=Web: ).*$"
# run of digits, hyphens and spaces following "Ph" + one character + space
# (the "." is unescaped, so it matches any character, not only a full stop)
re_phone = r"(?<=Ph. )[0-9- ]+"
# text that precedes " Ph" + one character (greedy, so up to the last such marker)
re_city = r".*(?= Ph.)"
# a letter, hyphen, apostrophe or space followed by one or more non-digit characters
re_no_digits = r"[A-Za-z-' ]\D+"

In [14]:
# populate columns
def _append_address_line(frame, label, text):
    """Add `text` to the 'address' cell of row `label` in `frame`.

    A newline separator is inserted only when the cell already holds text,
    so an address never begins with a stray newline. Previously the city
    branch always prepended a newline, which produced a leading newline
    whenever the city line was the first address part seen for a row.
    """
    existing = frame.at[label, 'address']
    frame.at[label, 'address'] = (existing + '\n' + text) if existing else text


for index in df.index:
    for line in df['description'][index]:
        # contact details: each pattern captures the text after its label
        email = re.search(re_email, line)
        if email:
            df.at[index, 'email'] = email.group(0).strip()
        website = re.search(re_website, line)
        if website:
            df.at[index, 'website'] = website.group(0).strip()
        phone = re.search(re_phone, line)
        if phone:
            df.at[index, 'contact_number'] = phone.group(0).strip()

        # the text before the phone label is taken as the city; only its
        # first non-digit run is kept
        city = re.search(re_city, line)
        if city:
            city_mixed = re.search(re_no_digits, city.group(0))
            if city_mixed:
                city_string = city_mixed.group(0).strip()
                df.at[index, 'city'] = city_string
                _append_address_line(df, index, city_string)

        # lines without any contact detail are treated as address lines
        if not (email or website or phone):
            _append_address_line(df, index, line)


In [15]:
# preview the first five parsed rows
df.head(n=5)

Unnamed: 0,address,city,contact_number,country,description,email,name,website
1,Komplek Golden Plaza Blok J-36\nJl. Fatmawati ...,Jakarta,021-75915814,Indonesia,[Komplek Golden Plaza Blok J-36 Jl. Fatmawati ...,clo@id.missions-acf.org,Action Against Hunger,www.actionagainsthunger.org/countries/asia/in
2,Komplek Perpustakaan Nasional\nJl. Medan Merde...,Jakarta,021- 3521910,Indonesia,[Komplek Perpustakaan Nasional Jl. Medan Merde...,info@aipi.or.id,Akademi Ilmu Pengetahuan Indonesia(AIPI),www.aipi.or.id
3,"Menara 165, lantai 11,\nJl. TB. Simatupang Kav...",Jakarta,021-29406565,Indonesia,"[Menara 165, lantai 11, Jl. TB. Simatupang Kav...",,Aksi Cepat Tanggap(ACT),www.act.id
4,Jl. Temanggung Tilung V Nomor 25\nPalangkaraya,Palangkaraya,0852 48 02 8480,Indonesia,"[Jl. Temanggung Tilung V Nomor 25, Palangkaray...",www.ampuh-kalteng.or.id,Asosiasi Masyarakat Peduli Hukum(Ampuh),
5,Jl. Letjen. Suprapto No.33\nSemarang,Semarang,024-3563173,Indonesia,"[Jl. Letjen. Suprapto No.33, Semarang Ph. 024-...",,Bumi Samudra Sejahtera(BSS),


In [16]:
# write the parsed table to disk with a header row and without the index labels
output_path = "smeru.csv"
df.to_csv(output_path, index=False, header=True)