Data Source: the NGO List

Link: https://www.thengolist.com/cambodia.html

# Setup

In [1]:
# load dependencies
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd

# Variables

In [2]:
# WebDriver instance used to load pages. The executable path is relative to
# the working directory and points at a macOS PhantomJS build.
# NOTE(review): webdriver.PhantomJS is not provided by recent Selenium
# releases — confirm the installed Selenium version still ships it.
browser = webdriver.PhantomJS(executable_path=r'../../Webscraping tools/phantomjs-2.1.1-macosx/bin/phantomjs')
# Page to scrape: the Cambodia listing on thengolist.com.
thengolist_URL = 'https://www.thengolist.com/cambodia.html'
# File the scraped table is written to (relative to the working directory).
csv_dump_path = 'thengolist_cambodia.csv'



In [3]:
def get_page_soup(browser, url):
    """Load ``url`` in ``browser`` and parse the resulting page source.

    Args:
        browser: WebDriver-like object; only ``get(url)`` and the
            ``page_source`` attribute are used.
        url: Address of the page to load.

    Returns:
        A ``BeautifulSoup`` tree built from ``browser.page_source`` using
        the ``"html.parser"`` backend.
    """
    browser.get(url)
    rendered_html = browser.page_source
    return BeautifulSoup(rendered_html, "html.parser")

In [4]:
def extract_charities(soup):
    """Collect the ``<td>`` cells of every table after the first two.

    Args:
        soup: Parsed page; must support ``find_all``.

    Returns:
        A flat list of the ``td`` elements, table by table, in the order
        ``find_all`` returns them.
    """
    # The first two tables are skipped — presumably page chrome rather than
    # listings; confirm against the live page.
    listing_tables = soup.find_all('table')[2:]
    return [cell for table in listing_tables for cell in table.find_all('td')]

In [5]:
def build_df(charities):
    """Build a DataFrame with one row per charity cell.

    A cell is treated as a charity when its first ``<div>`` contains a
    ``<strong>`` element. Cells with no ``<div>`` at all, or whose first
    ``<div>`` has no ``<strong>``, are skipped.

    Args:
        charities: Sequence of ``td`` elements, e.g. the output of
            ``extract_charities``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``name``, ``description``,
        ``website``, ``cause_area``, ``country``, ``city`` and ``email``.
        Each row's index label is the 1-based position of its cell in
        ``charities``, so labels have gaps where cells were skipped.
    """
    column_names = ['name', 'description', 'website', 'cause_area', 'country', 'city', 'email']
    rows = []
    labels = []
    # start=1 keeps the 1-based index labels while still visiting the final
    # cell of the sequence.
    for position, charity in enumerate(charities, start=1):
        divs = charity.find_all('div')
        if not divs or divs[0].find('strong') is None:
            continue
        name = extract_charity_name(charity)
        web = extract_website(charity)
        email = extract_emails(charity)
        cause_area = extract_sector(charity)
        country = 'Cambodia'
        city = extract_city(charity)
        description = extract_description(charity)
        rows.append([name, description, web, cause_area, country, city, email])
        labels.append(position)
    # Build the frame once instead of growing it row by row.
    return pd.DataFrame(rows, columns=column_names, index=labels)

In [6]:
def extract_charity_name(charity):
    """Return the text of the ``<strong>`` inside the cell's first ``<div>``.

    Args:
        charity: Element supporting ``find_all``; its first ``<div>`` must
            contain a ``<strong>`` element.

    Returns:
        The ``text`` attribute of that ``<strong>`` element.
    """
    first_div = charity.find_all('div')[0]
    name_tag = first_div.find('strong')
    return name_tag.text

In [7]:
def extract_website(charity):
    """Return the ``href`` of the first link in the cell's last ``<div>``.

    Args:
        charity: Element supporting ``find_all``.

    Returns:
        The link target as a string, or ``None`` when the cell has no
        ``<div>``, the last ``<div>`` has no ``<a>``, or the ``<a>`` has no
        ``href`` attribute.
    """
    divs = charity.find_all('div')
    if not divs:
        return None
    anchor = divs[-1].find('a')
    if anchor is None:
        # A listing without a link should not abort the whole scrape.
        return None
    return anchor.get('href')

In [8]:
def extract_emails(charity):
    """Return the first de-obfuscated e-mail address in the last ``<div>``.

    The first text fragment containing ``(at)`` is taken, and the spaced
    markers `` (at) `` and `` (dot) `` are replaced with ``@`` and ``.``.
    Markers written without the surrounding spaces are left untouched.

    Args:
        charity: Element supporting ``find_all``; the last ``<div>`` must
            expose ``stripped_strings``.

    Returns:
        The converted address, or ``None`` when no fragment contains
        ``(at)``.
    """
    contact_div = charity.find_all('div')[-1]
    obfuscated = next(
        (fragment for fragment in contact_div.stripped_strings if '(at)' in fragment),
        None,
    )
    if obfuscated is None:
        return None
    return obfuscated.replace(' (at) ', '@').replace(' (dot) ', '.')

In [9]:
def extract_sector(charity):
    """Return the text before the first ``(`` in the cell's header line.

    All text fragments of the first ``<div>`` except the first one are
    concatenated without a separator; everything up to the first ``(`` is
    returned. No whitespace trimming is applied to the result.

    Args:
        charity: Element supporting ``find_all``; the first ``<div>`` must
            expose ``stripped_strings``.

    Returns:
        The leading part of the concatenated text (the whole text when it
        contains no ``(``).
    """
    header_fragments = list(charity.find_all('div')[0].stripped_strings)
    trailing_text = ''.join(header_fragments[1:])
    sector, _, _ = trailing_text.partition('(')
    return sector

In [10]:
def extract_city(charity):
    """Return the parenthesised text from the cell's header line.

    All text fragments of the first ``<div>`` except the first one are
    concatenated and split on ``(``. The segment following the first ``(``
    is returned with its final character removed (presumably the closing
    ``)`` — the character itself is not checked).

    Args:
        charity: Element supporting ``find_all``; the first ``<div>`` must
            expose ``stripped_strings``.

    Returns:
        The extracted text, or ``''`` when the header contains no ``(``.
    """
    header_fragments = list(charity.find_all('div')[0].stripped_strings)
    pieces = ''.join(header_fragments[1:]).split('(')
    if len(pieces) <= 1:
        return ''
    return pieces[1][:-1]

In [11]:
def extract_description(charity):
    """Return the second text fragment of the cell's sixth ``<div>``.

    Args:
        charity: Element supporting ``find_all``; it must contain at least
            six ``<div>`` elements, and the sixth must yield at least two
            text fragments through ``stripped_strings``.

    Returns:
        The fragment at index 1 of the sixth ``<div>``.
    """
    detail_div = charity.find_all('div')[5]
    fragments = list(detail_div.stripped_strings)
    return fragments[1]

In [12]:
def extract_program_type(charity):
    """Return the last text fragment of the cell's sixth ``<div>``.

    Going by the function name, that fragment is presumably the program
    type label — confirm against the page markup.

    Args:
        charity: Element supporting ``find_all``; it must contain at least
            six ``<div>`` elements, and the sixth must yield at least one
            text fragment through ``stripped_strings``.

    Returns:
        The final fragment of the sixth ``<div>``.
    """
    fragments = list(charity.find_all('div')[5].stripped_strings)
    # Return the value explicitly; a bare expression here would be discarded.
    return fragments[-1]

In [14]:
def save_df(df, csv_dump_path):
    """Write ``df`` to ``csv_dump_path`` as CSV.

    The column names are written as a header row and the index is left out.

    Args:
        df: DataFrame to write.
        csv_dump_path: Destination file path.
    """
    df.to_csv(csv_dump_path, index=False, header=True)

# Run script

In [None]:
def do_scrape(browser, thengolist_URL, csv_dump_path):
    """Run the scrape end to end: fetch, extract, tabulate and save.

    Args:
        browser: WebDriver-like object passed through to ``get_page_soup``.
        thengolist_URL: Address of the listing page to scrape.
        csv_dump_path: Path of the CSV file to write.
    """
    page = get_page_soup(browser, thengolist_URL)
    cells = extract_charities(page)
    save_df(build_df(cells), csv_dump_path)

# Run script

In [None]:
# Scrape the listing page and write the result to csv_dump_path.
# NOTE(review): nothing in the visible code quits `browser` afterwards.
do_scrape(browser, thengolist_URL, csv_dump_path)

# Check CSV

In [None]:
# Read the written file back for inspection; the first line is used as the
# header and no column is used as the index.
pd.read_csv(csv_dump_path, index_col=False, header=0)