Data Source: p2pfoundation

Link: http://wiki.p2pfoundation.net/NGOs_in_Thailand

# Set up

In [1]:
# load dependencies
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd

# Variables

In [2]:
# Page to scrape and the CSV file the parsed table is written to.
p2pfoundation_URL = 'http://wiki.p2pfoundation.net/NGOs_in_Thailand'
csv_dump_path = 'p2pfoundation_thailand.csv'

# Headless browser used to fetch the page. The PhantomJS binary path is
# relative to the notebook's working directory.
# NOTE(review): the PhantomJS driver appears to be deprecated/removed in newer
# Selenium releases -- confirm the installed selenium version still ships it.
browser = webdriver.PhantomJS(
    executable_path=r'../../Webscraping tools/phantomjs-2.1.1-macosx/bin/phantomjs'
)



In [3]:
def do_scrape(browser, p2pfoundation_URL, csv_dump_path):
    """Scrape the table at ``p2pfoundation_URL`` and dump it to ``csv_dump_path``.

    Steps, in order: load the page with ``browser``, locate the table body,
    collect its rows, build a DataFrame from them, and write that frame to CSV.
    Returns nothing; the CSV file is the only result.
    """
    page = get_page_soup(browser, p2pfoundation_URL)
    table_rows = get_rows_from_table(get_table_from_soup(page))
    save_df(build_df(table_rows), csv_dump_path)

In [4]:
def get_page_soup(browser, url):
    """Navigate ``browser`` to ``url`` and return its page source parsed as soup.

    The markup is parsed with BeautifulSoup's "html.parser" backend.
    """
    browser.get(url)
    return BeautifulSoup(browser.page_source, "html.parser")

In [5]:
def get_table_from_soup(soup):
    """Return the result of looking up the first ``tbody`` element in ``soup``."""
    return soup.find('tbody')

In [6]:
def get_rows_from_table(table):
    """Return every ``tr`` element found under ``table``."""
    return table.find_all('tr')

In [7]:
def get_column_names(rows):
    """Return the stripped text of each ``th`` cell in the first row of ``rows``."""
    header_row = rows[0]
    return [header.text.strip() for header in header_row.find_all('th')]

In [8]:
def build_df(rows):
    """Convert scraped table rows into a DataFrame, one record per data row.

    Parameters
    ----------
    rows : sequence of row elements
        ``rows[0]`` is treated as the header row and skipped. Every other
        element must support ``find_all('td')``; each returned cell must
        expose ``.text`` and ``.find('a')``.

    Returns
    -------
    pandas.DataFrame
        Columns ``cause_area``, ``name``, ``description``, ``website``,
        ``email``. The trailing ``drop`` column of the source table is
        discarded. Index labels are the positions of the rows in ``rows``.

    Raises
    ------
    ValueError
        If a row has more cells than there are expected columns.

    Notes
    -----
    * The website value is the ``href`` of the first link inside the website
      cell itself, so a link in another cell of the same row can no longer be
      picked up by mistake. It is ``''`` when that cell has no link or the
      link has no ``href``.
    * Rows without any ``td`` cells are skipped, and rows with fewer cells
      than expected are padded with empty strings instead of failing.
    """
    column_names = ['cause_area', 'name', 'description', 'website', 'email', 'drop']
    website_idx = column_names.index('website')
    expected_width = len(column_names)

    records = []
    labels = []
    # Position 0 holds the header row, so data rows start at position 1.
    for i, row in enumerate(rows[1:], start=1):
        cells = row.find_all('td')
        if not cells:
            # No data cells in this row (e.g. a header-only row): nothing to record.
            continue
        if len(cells) > expected_width:
            raise ValueError(
                'row %d has %d cells, expected at most %d'
                % (i, len(cells), expected_width)
            )

        record = []
        for j, cell in enumerate(cells):
            if j == website_idx:
                # Look for the link inside the website cell only.
                a_tag = cell.find('a')
                record.append(a_tag.get('href', '') if a_tag is not None else '')
            else:
                record.append(cell.text.strip())

        # Pad short rows so every record lines up with column_names.
        record.extend([''] * (expected_width - len(record)))
        records.append(record)
        labels.append(i)

    # Build the frame once from all records rather than growing it row by row.
    df = pd.DataFrame(records, columns=column_names, index=labels)
    return df.drop(columns='drop')

In [9]:
def save_df(df, csv_dump_path):
    """Write ``df`` to ``csv_dump_path`` as CSV, with a header row and no index column."""
    df.to_csv(
        csv_dump_path,
        index=False,
        header=True,
    )

# Run script

In [10]:
# Run the whole pipeline with the browser, URL and output path configured above.
do_scrape(
    browser=browser,
    p2pfoundation_URL=p2pfoundation_URL,
    csv_dump_path=csv_dump_path,
)

# Check CSV

In [11]:
# Read the dump back for a visual check: the first line is the header and no
# column is used as the index.
pd.read_csv(
    csv_dump_path,
    header=0,
    index_col=False,
)

Unnamed: 0,cause_area,name,description,website,email
0,Animals,Animal Sanctuary,Love Animal House is an animal sanctuary and e...,http://animal-sanctuary.chiangmai-chiangrai.com/,loveanimalhouse at yahoo.com
1,Animals,Animals in Zoos,Zoos of Thailand,,
2,Animals,Strays in Bangkok,Pic-A-Pet Home : nurture stray dogs back to he...,http://thailand4life.net/icare/animals/picapet...,
3,Animals,Strays in Hua Hin,Hua Hin Dog Resue Centre : rescues stray dogs ...,,info at dogrescuecenter.org
4,Animals,Thai Elephant Conservation,"A ""Mobile Elephant Clinic"" with experienced ve...",http://www.phuket.com/conservation/elephants.htm,ecearth at samart.co.th
5,Assorted,Nabuur.com,Nabuur is not a Thai but an virtual organizati...,http://Nabuur.com,info at nabuur.com
6,Assorted,Smiling Albino,Highland Farm & Gibbon Sanctuary Phayathai Ba...,http://www.smilingalbino.com/community/,info at smilingalbino.com
7,Assorted,Thai Population Association,The Thai Population Association (TPA) is an or...,http://www.thaipopulation.org/,
8,Assorted,UNESCO Bangkok,Division of Emerging Social Issues,,escap-esid at un.org
9,Children and families,International Humanity Foundation,IHF Thailand center in Chiang Rai is a home fo...,http://www.ihfonline.org/thailand.php/,volunteerrecruitment.open@ihfonline.org
