## Load required libraries

In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

In [2]:
import sys, os
# Put the parent directory of this notebook at the front of the module search
# path so that the `cwdprophet` package imported below can be found there
# (presumably the repository root -- adjust the relative path if the notebook moves).
sys.path.insert(0, os.path.abspath('../'))
from cwdprophet.campaign import *

## Useful functions

In [3]:
def get_category_url(category,campaign_state=None):
    """Build the crowdfunder.co.uk project-search URL for one category.

    Parameters
    ----------
    category : str
        Category name. Spaces are replaced by '+' before the name is put
        in the query string; names that already use '+' are left as they are.
    campaign_state : str, optional
        One of 'recent', 'pending', 'ending', 'successful' or 'overfunding'.
        When None (the default) no state filter is added to the URL.

    Returns
    -------
    str
        The search URL, with a ``filter[c]`` parameter and, when a state is
        given, an additional ``filter[t]`` parameter.

    Raises
    ------
    ValueError
        If ``campaign_state`` is neither None nor one of the known states.
    """
    projects_url = 'https://www.crowdfunder.co.uk/search/projects'
    states = ('recent', 'pending', 'ending', 'successful', 'overfunding')

    # The category filter is common to both URL variants, so build it once.
    category_url = projects_url + '?filter[c]=' + category.replace(' ', '+')

    if campaign_state is None:
        return category_url

    if campaign_state not in states:
        # Raise instead of print + exit(): exit() would stop the whole
        # interpreter/kernel rather than report an error the caller can handle.
        raise ValueError(
            'Not a valid option: ' + repr(campaign_state)
            + ' (expected one of ' + ', '.join(states) + ')'
        )

    return category_url + '&filter[t]=' + campaign_state


def pages_in_category(category,campaign_state,driver=None):
    """Return how many search-result pages a category has.

    Loads the search page for ``category`` / ``campaign_state`` and reads the
    labels of the pagination links.

    Parameters
    ----------
    category : str
        Category name, passed on to ``get_category_url``.
    campaign_state : str or None
        Campaign state filter, passed on to ``get_category_url``.
    driver : selenium WebDriver, optional
        Browser used to load the page. When None, the notebook-level
        ``browser`` variable is used, so existing two-argument calls keep
        working.

    Returns
    -------
    int
        The highest page number shown in the pagination links, or 1 when no
        numbered pagination link is found.
    """
    if driver is None:
        # Fall back to the web driver created at notebook level.
        driver = browser

    driver.get(get_category_url(category,campaign_state))

    # The generic find_elements(by, value) call is used because the
    # find_elements_by_* helpers were removed in Selenium 4;
    # 'css selector' is the value of By.CSS_SELECTOR.
    pagination = driver.find_elements('css selector', 'a.cf-button.cf-button--pagination')

    # Keep only purely numeric labels so that a non-numeric link
    # (an arrow, "Next", an empty label) cannot break the int() conversion.
    labels = (link.text.strip() for link in pagination)
    page_numbers = [int(label) for label in labels if label.isdigit()]

    return max(page_numbers) if page_numbers else 1


## Get list of URLs to be scraped

### 1. Get the list of all project categories

In [4]:
# Load a web driver with your favorite browser
# to get the list of project categories in www.crowdfunder.co.uk
browser = webdriver.Chrome()
try:
    browser.get('https://www.crowdfunder.co.uk/search/projects')
    soup = BeautifulSoup(browser.page_source,'html.parser')
finally:
    # Always close the browser, even when loading the page fails,
    # so that no orphaned Chrome process is left behind.
    browser.quit()

# The categories are the <option> entries of the <select id="filter_c"> element.
# The first option is skipped (presumably the "all categories" placeholder -- confirm).
# Spaces are replaced by '+' so the names can go straight into a search URL.
category_options = soup.find('select',{"id":"filter_c"}).find_all('option')
categories = [item.text.replace(' ','+') for item in category_options][1:]

In [5]:
# Here is the list of all categories scraped from the search page
# (spaces have already been replaced by '+')
categories

['Community',
 'Business',
 'Charities',
 'Arts',
 'Film+and+Theatre',
 'Social+Enterprise',
 'Music',
 'Politics',
 'Schools',
 'Sports',
 'Personal+Causes',
 'Technology',
 'Food+and+Drink',
 'Environment',
 'University',
 'Publishing',
 'Community+shares',
 'Heritage']

### 2. Number of pages per category shown in the project search results

In [6]:
# Search settings: the category of interest and the campaign state to filter on
category, campaign_state = 'Community', 'successful'

In [7]:
# Load a web driver (Chrome in this case)
browser = webdriver.Chrome()

# One entry per category; each entry is the list of that category's search-page URLs
all_cats_pages_urls = []

try:
    for category in categories:
        # Ask the site how many result pages this category has ...
        n_pages = pages_in_category(category,campaign_state)

        # ... and build one URL per page; the base URL is the same for every page.
        base_url = get_category_url(category,campaign_state)
        cat_pages_urls = [base_url+'&page='+str(page) for page in range(1,n_pages+1)]

        print(category,'has',n_pages,'pages')

        all_cats_pages_urls.append(cat_pages_urls)
finally:
    # Close the browser even if one of the categories fails to load.
    browser.quit()

Community has 42 pages
Business has 42 pages
Charities has 42 pages
Arts has 42 pages
Film+and+Theatre has 42 pages
Social+Enterprise has 31 pages
Music has 40 pages
Politics has 42 pages
Schools has 27 pages
Sports has 27 pages
Personal+Causes has 21 pages
Technology has 9 pages
Food+and+Drink has 14 pages
Environment has 14 pages
University has 11 pages
Publishing has 7 pages
Community+shares has 5 pages
Heritage has 3 pages


In [8]:
# Preview of the collected search pages:
# the first three page URLs of each of the first two categories ...
for cat_index in (0, 1):
    first_pages = all_cats_pages_urls[cat_index][:3]
    for page_link in first_pages:
        print(page_link)
    print('...')

# ... followed by the last page URL of the last category
print('...')
last_page_link = all_cats_pages_urls[-1][-1]
print(last_page_link)

https://www.crowdfunder.co.uk/search/projects?filter[c]=Community&filter[t]=successful&page=1
https://www.crowdfunder.co.uk/search/projects?filter[c]=Community&filter[t]=successful&page=2
https://www.crowdfunder.co.uk/search/projects?filter[c]=Community&filter[t]=successful&page=3
...
https://www.crowdfunder.co.uk/search/projects?filter[c]=Business&filter[t]=successful&page=1
https://www.crowdfunder.co.uk/search/projects?filter[c]=Business&filter[t]=successful&page=2
https://www.crowdfunder.co.uk/search/projects?filter[c]=Business&filter[t]=successful&page=3
...
...
https://www.crowdfunder.co.uk/search/projects?filter[c]=Heritage&filter[t]=successful&page=3


### Focusing on "Community" projects.
For each campaign we will collect:
- URL
- Title
- Total raised
- Percentage of the original goal

In [9]:
# We use the previous list of search pages to get the URLS of
# all the campaigns in one of the categories (Community for example)

# IMPORTANT NOTE: to get started let's focus on the *Community* campaigns
community_pages_urls = all_cats_pages_urls[0]

browser = webdriver.Chrome()

# Initialize lists to save campaigns info (kept aligned: one entry per campaign in each)
campaigns_titles = []
campaigns_urls = []
campaigns_goal_pct = []
campaigns_raised = []

# Class attributes of the two <div> elements holding the figures of a campaign card
pct_class = 'cf-text cf-text--light cf-text--fixed14'
raised_class = 'cf-text cf-text--light cf-text--fixed14 cf-text--thick'

try:
    for page_number, page_url in enumerate(community_pages_urls, start=1):

        # Only announce the first three pages to keep the output short
        if page_number <= 3:
            print('Getting all URLs from',page_url)

        browser.get(page_url)
        soup = BeautifulSoup(browser.page_source,'html.parser')
        articles = soup.find_all('article',{'class':'cf-pod'})

        for article in articles:

            # Parse every field before appending anything, so that a card that
            # fails to parse cannot leave the four lists with different lengths.
            title = article.find('h5').text
            campaign_url = article.find('a',{'class':'cf-pod__image'})['href']

            # First token of each text block: e.g. '170%' and '£2,544,210'
            pct_text = article.find('div',{'class':pct_class}).text.split()[0]
            raised_text = article.find('div',{'class':raised_class}).text.split()[0]

            # Drop the trailing '%' / leading currency sign, and the thousands
            # separators in BOTH figures (a percentage such as '1,200%' would
            # otherwise make float() fail).
            goal_pct = float(pct_text[:-1].replace(',', ''))
            raised = int(raised_text[1:].replace(',', ''))

            campaigns_titles.append(title)
            campaigns_urls.append(campaign_url)
            campaigns_goal_pct.append(goal_pct)
            campaigns_raised.append(raised)

        # Pause between pages to avoid hammering the site
        time.sleep(2)

    print('\n...')
finally:
    # Close the browser even if a page fails to load or parse.
    browser.quit()

Getting all URLs from https://www.crowdfunder.co.uk/search/projects?filter[c]=Community&filter[t]=successful&page=1
Getting all URLs from https://www.crowdfunder.co.uk/search/projects?filter[c]=Community&filter[t]=successful&page=2
Getting all URLs from https://www.crowdfunder.co.uk/search/projects?filter[c]=Community&filter[t]=successful&page=3

...


In [10]:
# Here are the details of the first projects
#
# Number of campaigns to preview; the heading is built from it so that the
# text always matches what is actually printed (it used to say "ten" while
# only three campaigns were shown).
n_preview = 3

print('Here are the details of the first',n_preview,'Community projects ...\n')
for title,campaign,goal_pct,raised in zip(campaigns_titles[:n_preview],campaigns_urls[:n_preview],
                                          campaigns_goal_pct[:n_preview],campaigns_raised[:n_preview]):
    print(title)
    print(campaign)
    print(format(goal_pct,",")+'% of goal raised')
    print('£'+str(raised),'total\n')


print('...\n...\nThere are',len(campaigns_urls),'Community campaigns in total')

Here are the details of the first ten Community projects ...

GlenWyvis Distillery
https://www.crowdfunder.co.uk/glenwyvis-distillery
170.0% of goal raised
£2544210 total

Aberdeen Community Energy - Donside Hydro
https://www.crowdfunder.co.uk/ace
100.0% of goal raised
£500000 total

Save The New Inn, Norton Lindsey
https://www.crowdfunder.co.uk/save-the-new-inn-norton-lindsey
114.0% of goal raised
£342000 total

...
...
There are 1000 Community campaigns in total


In [11]:
# Gather the scraped lists into one table, one row per campaign
campaign_columns = {
    'title': campaigns_titles,
    'URL': campaigns_urls,
    'Total raised': campaigns_raised,
    'Pct Original Goal': campaigns_goal_pct,
}
df = pd.DataFrame(campaign_columns)

# A percentage of exactly 0 would make the division below divide by zero,
# so it is swapped for a small non-zero value first.
# NOTE(review): 0.17105 is an unexplained constant -- confirm where it comes from.
pct_of_goal = df['Pct Original Goal'].replace(0, 0.17105)
df = df.assign(**{'Pct Original Goal': pct_of_goal})

# Estimate the original goal from the amount raised and the percentage
# of the goal that amount represents
df = df.assign(Goal=df['Total raised'] / df['Pct Original Goal'] * 100)

# A campaign counts as successful once it reached at least 100% of its goal
df = df.assign(successful=df['Pct Original Goal'] >= 100)

# Show floats without decimals from here on
pd.set_option('display.float_format', '{:.0f}'.format)
df.head()

Unnamed: 0,title,URL,Total raised,Pct Original Goal,Goal,successful
0,GlenWyvis Distillery,https://www.crowdfunder.co.uk/glenwyvis-distil...,2544210,170,1496594,True
1,Aberdeen Community Energy - Donside Hydro,https://www.crowdfunder.co.uk/ace,500000,100,500000,True
2,"Save The New Inn, Norton Lindsey",https://www.crowdfunder.co.uk/save-the-new-inn...,342000,114,300000,True
3,Perranporth Community Pool,https://www.crowdfunder.co.uk/perranporth-comm...,250322,100,250322,True
4,Save The Bromley Cross,https://www.crowdfunder.co.uk/save-the-bromley...,212502,101,210398,True


Let's now save the campaign info to a CSV file

In [17]:
# Tag every row with its category, put the columns in their final order,
# then write the table to disk and show the first rows
export_columns = ['title', 'URL', 'Category', 'Goal',
                  'Total raised', 'Pct Original Goal', 'successful']
df = df.assign(Category='Community')[export_columns]
df.to_csv('Campaigns.csv')
df.head(50)

Unnamed: 0,title,URL,Category,Goal,Total raised,Pct Original Goal,successful
0,GlenWyvis Distillery,https://www.crowdfunder.co.uk/glenwyvis-distil...,Community,1496594,2544210,170,True
1,Aberdeen Community Energy - Donside Hydro,https://www.crowdfunder.co.uk/ace,Community,500000,500000,100,True
2,"Save The New Inn, Norton Lindsey",https://www.crowdfunder.co.uk/save-the-new-inn...,Community,300000,342000,114,True
3,Perranporth Community Pool,https://www.crowdfunder.co.uk/perranporth-comm...,Community,250322,250322,100,True
4,Save The Bromley Cross,https://www.crowdfunder.co.uk/save-the-bromley...,Community,210398,212502,101,True
