## Load required libraries

In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

## Useful functions

In [2]:
def start_browser(browser=None,driver_binary=None):
    """Start a Selenium WebDriver session for the requested browser.

    Parameters
    ----------
    browser : str
        Name of the browser to launch: 'Safari', 'Firefox' or 'Chrome'.
    driver_binary : str, optional
        Path to the driver executable, forwarded as ``executable_path``.
        When omitted, the argument is not passed at all, so Selenium uses
        its own default driver location.

    Returns
    -------
    The WebDriver instance created for the requested browser.

    Raises
    ------
    ValueError
        If `browser` is not one of the supported names. Previously the
        input value was handed back unchanged, which only failed later
        (e.g. on ``browser.get``) with a much less helpful error.
    """
    # Only forward the driver path when the caller supplied one.
    driver_kwargs = {} if driver_binary is None else {'executable_path': driver_binary}

    if browser == 'Safari':
        return webdriver.Safari(**driver_kwargs)
    if browser == 'Firefox':
        return webdriver.Firefox(**driver_kwargs)
    if browser == 'Chrome':
        return webdriver.Chrome(**driver_kwargs)

    raise ValueError(
        "Unsupported browser {!r}; expected 'Safari', 'Firefox' or 'Chrome'".format(browser)
    )

def get_project_categories(projects_url):
    """Return the category names listed in the search page's category dropdown.

    Loads `projects_url` in the module-level `browser`, reads the text of
    every ``<li>`` inside the ``cf-select__dropdown`` element, replaces
    spaces with '+' and drops the first entry.
    """
    browser.get(projects_url)

    # The dropdown trigger is looked up but deliberately not clicked;
    # the lookup itself is kept so a missing element still fails here.
    browser.find_element_by_class_name('cf-select__trigger')

    dropdown = browser.find_element_by_class_name('cf-select__dropdown')

    labels = []
    for entry in dropdown.find_elements_by_tag_name('li'):
        labels.append(entry.text.replace(' ', '+'))

    # The first entry is skipped (presumably a placeholder rather than a
    # real category -- confirm against the live page).
    return labels[1:]

def get_category_url(category):
    """Build the project-search URL filtered by `category`.

    Spaces in the category name are replaced with '+'; names that already
    use '+' are left as they are.
    """
    search_prefix = 'https://www.crowdfunder.co.uk/search/projects?filter[c]='
    encoded_name = '+'.join(category.split(' '))
    return search_prefix + encoded_name

def get_category_page_url(category,page):
    """Return the URL of results page number `page` for `category`.

    The page is selected by appending ``&page=<page>`` to the category
    search URL. The previous version built this string but discarded it
    and returned an undefined name, so every call raised NameError.
    """
    return get_category_url(category)+'&page='+str(page)

def pages_in_category(category):
    """Return how many search-result pages `category` has.

    Loads the category search page in the module-level `browser` and reads
    the labels of its pagination links.

    Returns
    -------
    int
        The highest page number shown in the pagination links, or 1 when
        no numbered link is found (a category whose results fit on one
        page previously raised IndexError).
    """
    browser.get(get_category_url(category))
    pagination = browser.find_elements_by_css_selector('a.cf-button.cf-button--pagination')

    # Keep only purely numeric labels so that a textual link cannot break
    # the int() conversion.
    labels = (link.text.strip() for link in pagination)
    page_numbers = [int(label) for label in labels if label.isdigit()]

    return max(page_numbers, default=1)


## Get list of URLs to be scraped

### List of all project categories

In [3]:
# Load the Safari web driver
browser = start_browser('Safari','/usr/bin/safaridriver')

# Here we get:
# - The list of all the available project categories in www.crowdfunder.co.uk,
# - The urls of each category.
try:
    categories = get_project_categories('https://www.crowdfunder.co.uk/search/projects')
finally:
    # Always close the session, even when scraping fails, so that a
    # half-finished run does not leave a browser window/driver behind.
    browser.quit()

categories_urls = [get_category_url(category) for category in categories]

In [4]:
# Print up to the first twenty category names on a single line,
# each one followed by a comma-and-tab separator
print(''.join(f'{name},  \t' for name in categories[0:20]), end='')

# Display the matching search URLs as the cell output
categories_urls

Community,  	Business,  	Charities,  	Arts,  	Film+and+Theatre,  	Social+Enterprise,  	Music,  	Politics,  	Schools,  	Sports,  	Personal+Causes,  	Technology,  	Food+and+Drink,  	Environment,  	University,  	Publishing,  	Community+shares,  	Heritage,  	

['https://www.crowdfunder.co.uk/search/projects?filter[c]=Community',
 'https://www.crowdfunder.co.uk/search/projects?filter[c]=Business',
 'https://www.crowdfunder.co.uk/search/projects?filter[c]=Charities',
 'https://www.crowdfunder.co.uk/search/projects?filter[c]=Arts',
 'https://www.crowdfunder.co.uk/search/projects?filter[c]=Film+and+Theatre',
 'https://www.crowdfunder.co.uk/search/projects?filter[c]=Social+Enterprise',
 'https://www.crowdfunder.co.uk/search/projects?filter[c]=Music',
 'https://www.crowdfunder.co.uk/search/projects?filter[c]=Politics',
 'https://www.crowdfunder.co.uk/search/projects?filter[c]=Schools',
 'https://www.crowdfunder.co.uk/search/projects?filter[c]=Sports',
 'https://www.crowdfunder.co.uk/search/projects?filter[c]=Personal+Causes',
 'https://www.crowdfunder.co.uk/search/projects?filter[c]=Technology',
 'https://www.crowdfunder.co.uk/search/projects?filter[c]=Food+and+Drink',
 'https://www.crowdfunder.co.uk/search/projects?filter[c]=Environment',
 'https

### List of all Project Search results pages

In [5]:
# Load the Safari web driver
browser = start_browser('Safari','/usr/bin/safaridriver')

# One list of search-page URLs per category, in the same order as `categories`
all_cats_pages_urls = []

try:
    for category in categories:
        # Look the page count up only once: every call to pages_in_category
        # loads the category page in the browser, and the previous version
        # did so twice per category (once for the range, once for the print).
        n_pages = pages_in_category(category)

        cat_pages_urls = [get_category_url(category)+'&page='+str(page)
                          for page in range(1,n_pages+1)]

        print(category,'has',n_pages,'pages')

        # A fresh list is built for each category, so it can be stored
        # directly without the copy/clear round trip.
        all_cats_pages_urls.append(cat_pages_urls)
finally:
    # Close the browser even if a page fails to load part-way through.
    browser.quit()

Community has 42 pages
Business has 42 pages
Charities has 42 pages
Arts has 42 pages
Film+and+Theatre has 42 pages
Social+Enterprise has 42 pages
Music has 42 pages
Politics has 42 pages
Schools has 42 pages
Sports has 42 pages
Personal+Causes has 42 pages
Technology has 42 pages
Food+and+Drink has 40 pages
Environment has 32 pages
University has 28 pages
Publishing has 19 pages
Community+shares has 10 pages
Heritage has 4 pages


In [6]:
# Preview of the per-category search pages: the first three pages of each
# of the first three categories ...
for cat_index in (0, 1, 2):
    preview = all_cats_pages_urls[cat_index][:3]
    for page_link in preview:
        print(page_link)
    print('...')

# ... followed by the last two pages of the last category
print('...')
last_category_pages = all_cats_pages_urls[-1]
print(last_category_pages[-2])
print(last_category_pages[-1])

https://www.crowdfunder.co.uk/search/projects?filter[c]=Community&page=1
https://www.crowdfunder.co.uk/search/projects?filter[c]=Community&page=2
https://www.crowdfunder.co.uk/search/projects?filter[c]=Community&page=3
...
https://www.crowdfunder.co.uk/search/projects?filter[c]=Business&page=1
https://www.crowdfunder.co.uk/search/projects?filter[c]=Business&page=2
https://www.crowdfunder.co.uk/search/projects?filter[c]=Business&page=3
...
https://www.crowdfunder.co.uk/search/projects?filter[c]=Charities&page=1
https://www.crowdfunder.co.uk/search/projects?filter[c]=Charities&page=2
https://www.crowdfunder.co.uk/search/projects?filter[c]=Charities&page=3
...
...
https://www.crowdfunder.co.uk/search/projects?filter[c]=Heritage&page=3
https://www.crowdfunder.co.uk/search/projects?filter[c]=Heritage&page=4


### Focusing on "Community" projects: list of all campaign URLs

In [7]:
# We use the previous list of search pages to get the URLs of
# the campaigns listed on them.

# IMPORTANT NOTE: to get started let's focus on the *Community* campaigns
# only (the first entry of all_cats_pages_urls).
community_pages_urls = all_cats_pages_urls[0]

browser = start_browser('Safari','/usr/bin/safaridriver')

# Initialize list to save all community project URLs
community_campaigns_urls = []
try:
    for page_number, page_url in enumerate(community_pages_urls):

        # Only log the first three pages to keep the cell output short
        if page_number < 3:
            print('Getting all URLs from',page_url)

        browser.get(page_url)
        urls_obj = browser.find_elements_by_css_selector('a.cf-pod__image')
        community_campaigns_urls.extend(item.get_attribute('href') for item in urls_obj)

        # Pause between page loads
        time.sleep(3)
finally:
    # Close the browser even if one of the pages fails to load
    browser.quit()

print('\n...')

Getting all URLs from https://www.crowdfunder.co.uk/search/projects?filter[c]=Community&page=1
Getting all URLs from https://www.crowdfunder.co.uk/search/projects?filter[c]=Community&page=2
Getting all URLs from https://www.crowdfunder.co.uk/search/projects?filter[c]=Community&page=3

...


In [8]:
# Here's the list of URLs of the first ten Community projects,
# followed by the total number of campaign URLs collected
print('Here are the first ten Community projects ...\n')
for campaign in community_campaigns_urls[0:10]:
    print(campaign)

print('...\n...\nThere are',len(community_campaigns_urls),'Community campaigns in total')

Here are the first then Community projects ...

https://www.crowdfunder.co.uk/lady-astor-statue-100-campaign
https://www.crowdfunder.co.uk/barra-distillery-share-offer
https://www.crowdfunder.co.uk/punl-cgp
https://www.crowdfunder.co.uk/highland-bagpipe-centre
https://www.crowdfunder.co.uk/northfieldartsforum
https://www.crowdfunder.co.uk/somerford-youth-and-community-centre
https://www.crowdfunder.co.uk/lets-build-a-broch
https://www.crowdfunder.co.uk/cardinal-fm
https://www.crowdfunder.co.uk/midsteeple-quarter
https://www.crowdfunder.co.uk/stop-funding-hate-lock-in-the-change
...
...
There are 1000 Community campaigns in total


## Scraping Community Campaigns info

In [9]:
# Accumulators for the scraped data: one entry per campaign, holding that
# campaign's pledges, pledge dates and supporter names respectively.
total_pledges, total_dates, total_names = [], [], []

# Running count of the campaigns saved so far
Ncampaigns = 0

In [10]:
browser = start_browser('Safari','/usr/bin/safaridriver')

try:
    # Scrape the backers pages of the first ten Community campaigns
    for campaign_url in community_campaigns_urls[0:10]:

        backers_url = campaign_url+'/backers'
        browser.get(backers_url)

        pledges = []
        dates = []
        names = []

        # Walk through the paginated backers list until no "next" button is left
        while True:
            print('Scraping:',browser.current_url)
            # Wait before reading the page source (presumably to let the
            # page finish loading and to space out the requests)
            time.sleep(3)
            soup = BeautifulSoup(browser.page_source,'html.parser')

            tags = soup.find_all('article',{'class':'cf-well', 'data-well':'plain', 'data-well-spacing':'vertical'})

            # A single pass per backer entry keeps the three lists aligned
            for tag in tags:
                # The amount is the last whitespace-separated token; its first
                # character is dropped (presumably the currency symbol) and
                # thousands separators are removed before converting.
                amount = tag.find("span", class_="cf-text--light").string.split()[-1]
                pledges.append(float(amount[1:].replace(',','')))

                dates.append(tag.find("p", class_="cf-text").string)

                # Entries without a link are recorded as anonymous
                supporter = tag.find("a")
                names.append('Anonymous' if supporter is None else supporter.string)

            button = soup.find('a',{'class':'cf-button cf-button--small cf-button--hollow', 'data-icon-button':'next'})

            if button is None:
                break
            # The button's href is appended to the backers URL to reach the next page
            browser.get(backers_url+button['href'])

        total_pledges.append(pledges)
        total_dates.append(dates)
        total_names.append(names)
        Ncampaigns += 1
finally:
    # Close the browser even when a page fails to parse or load
    browser.quit()

Scraping: https://www.crowdfunder.co.uk/lady-astor-statue-100-campaign/backers
Scraping: https://www.crowdfunder.co.uk/lady-astor-statue-100-campaign/backers?page=2#start
Scraping: https://www.crowdfunder.co.uk/lady-astor-statue-100-campaign/backers?page=3#start
Scraping: https://www.crowdfunder.co.uk/lady-astor-statue-100-campaign/backers?page=4#start
Scraping: https://www.crowdfunder.co.uk/lady-astor-statue-100-campaign/backers?page=5#start
Scraping: https://www.crowdfunder.co.uk/lady-astor-statue-100-campaign/backers?page=6#start
Scraping: https://www.crowdfunder.co.uk/lady-astor-statue-100-campaign/backers?page=7#start
Scraping: https://www.crowdfunder.co.uk/lady-astor-statue-100-campaign/backers?page=8#start
Scraping: https://www.crowdfunder.co.uk/lady-astor-statue-100-campaign/backers?page=9#start
Scraping: https://www.crowdfunder.co.uk/lady-astor-statue-100-campaign/backers?page=10#start
Scraping: https://www.crowdfunder.co.uk/lady-astor-statue-100-campaign/backers?page=11#start

In [11]:
# Number of campaigns stored in each of the three result lists
# (pledges, dates, names)
tuple(map(len, (total_pledges, total_dates, total_names)))

(10, 10, 10)

In [12]:
# Work on shallow copies of the scraped lists
nlist = list(total_names)
dlist = list(total_dates)
plist = list(total_pledges)

# First ten pledges, supporter names and dates of the campaign at index 1
(plist[1][:10], nlist[1][:10], dlist[1][:10])

([750.0, 1500.0, 1000.0, 750.0, 750.0, 1000.0, 300.0, 250.0, 250.0, 250.0],
 ['Anonymous',
  'Kevin MacNeil',
  'Werner Henssen',
  'David Cott',
  'Juergen Roemer',
  'Roland Wirth',
  'johannes hoyme',
  'Stefan Dahlmann',
  'FUMIKAZU SAWA',
  'Patrick Sackermann'],
 ['29th January 2019 00:22',
  '26th January 2019 17:43',
  '15th January 2019 17:54',
  '15th January 2019 14:13',
  '14th January 2019 08:53',
  '13th January 2019 16:22',
  '13th January 2019 10:32',
  '10th January 2019 10:40',
  '10th January 2019 04:44',
  '9th January 2019 07:06'])

In [13]:
# One DataFrame per campaign, keyed 'campaign0000', 'campaign0001', ...
df_dict = {}

for i in range(len(plist)):
    campaign_key = 'campaign' + format(i, '0=4d')

    # Each scraped list is reversed before it is stored
    columns = {
        'date': list(reversed(dlist[i])),
        'supporter name': list(reversed(nlist[i])),
        'pledge': list(reversed(plist[i])),
    }
    df_temp = pd.DataFrame(columns)

    df_dict[campaign_key] = df_temp
    # Also save each campaign to its own CSV file in the working directory
    df_temp.to_csv(campaign_key + '.csv')

df_dict['campaign0001']

Unnamed: 0,date,supporter name,pledge
0,21st December 2018 15:07,Anonymous,750.0
1,21st December 2018 15:14,Alexander Clark,888.0
2,21st December 2018 19:12,Anonymous,500.0
3,24th December 2018 09:13,Peter Jeffs,2500.0
4,24th December 2018 09:34,Dirk Tinbergen,750.0
5,30th December 2018 13:18,John,750.0
6,30th December 2018 21:38,Anonymous,800.0
7,2nd January 2019 14:24,Peter Scheurer,750.0
8,4th January 2019 14:52,Arnfinn Stake,250.0
9,7th January 2019 12:27,Duncan Tait,750.0
