In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import urllib.request
from requests.exceptions import HTTPError

Scraper uses two functions, the first get the names of the charities, the second script connects to the organisations individual page on NGOhub and scrapes the organisations data.

In [2]:
def get_names(url):
    page =  urllib.request.urlopen(url)
    soup = BeautifulSoup(page, "lxml")
    names = soup.find_all('h4')
    df = pd.DataFrame(names)
    df[0]= df[0].astype(str)
    df[0] = df[0].str.split('<h4><a href="/organizations/').str.get(1)
    df[0] = df[0].str.split('">').str.get(0)
    return df

In [3]:
def get_details(name):
    url =  "https://www.ngohub.asia/organizations/"+name
    page =  urllib.request.urlopen(url) 
    soup = BeautifulSoup(page, "lxml")
    details = [item.get_text().strip() for item in soup.find_all('div', "col-md-11")]
    df = pd.DataFrame(details)
    df = df.transpose()
    df = df.astype(str)
    df['Name'] = name
    return df

We iterate through the 52 pages of NGOhub to get the charity names, and append them to a data frame called all_names

In [4]:
all_names = pd.DataFrame()
for i in range(1,52):
    url = "https://www.ngohub.asia/all_organizations?page="+str(i)
    all_names = all_names.append(get_names(url), ignore_index=True)

We then create a dataframe called all_detials and append the details of the individual charity. Some of the links are dead, so we return a 404 and keep going. Note that the script usually fails at the end of the list

In [None]:
all_details = pd.DataFrame()
for org in all_names[0]:
    try:
        all_details = all_details.append(get_details(org), ignore_index=True)
    except:
        print('failed 404 '+org)
all_details = all_details.rename(columns={8: "URL", 9: "Email", 10: "Phone Number", 11: "Address", 12: "City"})    

We now clean up the data and rename the columns

In [25]:
all_details = all_details.rename(columns={0: "URL", 1: "Email", 2: "Phone Number", 3: "Address", 4: "City"})    

In [35]:
all_details['Name'] = all_details['Name'].replace('-', ' ', inplace=True)

In [36]:
all_details.head()

Unnamed: 0,URL,Email,Phone Number,Address,City,Name,5,6,7
0,abimsabah.blogspot.com,jjaafarmahmud@yahoo.com,0168401894,WDT 501 PEJABAT POS BESAR SABAH 88100 KOTA KIN...,kota kinabalu,,,,
1,www.agathians.org,admin@agathians.org,012-2968014,"No 22 Jalan Kelah 8/6, Seksyen 8, 46050 Petali...",PETALING JAYA,,,,
2,www.aidhome.org,intan@emagineers.com.my,6 012-3082.092,"No 6-3, Jalan USJ 9/5R47620 Subang JayaSelangor","Subang Jaya, Selangor",,,,
3,www.aiesec.my,youth@aiesec.my,03-78871624,"Block E-51-2, Zenith Corporate Park, Jalan SS7...",Petaling Jaya,,,,
4,www.ajarajarmalaysia.org,phyee1057@hotmail.com,010-9008470,"12, Jalan Utara Section 52",Petaling Jaya,,Facebook Profile,,


Save the dataframe to a CSV file

In [37]:
all_details.to_csv('ngohub_output.csv')