## Scrape Kickstarter project descriptions

In [None]:
# Import packages
import csv
import json
import os
import re
import time
import urllib.error
import urllib.request

import certifi
import numpy as np
import pandas as pd
import requests
import urllib3
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter

In [3]:
# Load the Kickstarter export into a DataFrame; lines=True tells pandas
# to parse the file as one JSON object per line.
df = pd.read_json(
    'input_data_files/Kickstarter_2019-07-18T03_20_05_009Z.json',
    lines=True,
)

In [4]:
df.head()

Unnamed: 0,table_id,robot_id,run_id,data
0,Kickstarter,Kickstarter,Kickstarter_2019-07-18T03_20_05_009Z,"{'id': 1456501169, 'photo': {'key': 'assets/01..."
1,Kickstarter,Kickstarter,Kickstarter_2019-07-18T03_20_05_009Z,"{'id': 1259045207, 'photo': {'key': 'assets/01..."
2,Kickstarter,Kickstarter,Kickstarter_2019-07-18T03_20_05_009Z,"{'id': 577497800, 'photo': {'key': 'assets/012..."
3,Kickstarter,Kickstarter,Kickstarter_2019-07-18T03_20_05_009Z,"{'id': 521645557, 'photo': {'key': 'assets/012..."
4,Kickstarter,Kickstarter,Kickstarter_2019-07-18T03_20_05_009Z,"{'id': 504193797, 'photo': {'key': 'assets/011..."


In [5]:
df.shape

(212378, 4)

In [6]:
df['data']

0         {'id': 1456501169, 'photo': {'key': 'assets/01...
1         {'id': 1259045207, 'photo': {'key': 'assets/01...
2         {'id': 577497800, 'photo': {'key': 'assets/012...
3         {'id': 521645557, 'photo': {'key': 'assets/012...
4         {'id': 504193797, 'photo': {'key': 'assets/011...
                                ...                        
212373    {'id': 713996337, 'photo': {'key': 'assets/018...
212374    {'id': 1202344260, 'photo': {'key': 'assets/02...
212375    {'id': 545419145, 'photo': {'key': 'assets/011...
212376    {'id': 1547451445, 'photo': {'key': 'assets/02...
212377    {'id': 763194451, 'photo': {'key': 'assets/015...
Name: data, Length: 212378, dtype: object

In [7]:
print(type(df.at[0, 'data']))

<class 'dict'>


# Limit df to US completed projects

In [7]:
# Helper used to pull the country out of each project record in df['data']
def get_country(d):
    """Return the value stored under the 'country' key of project record ``d``.

    Raises KeyError if ``d`` has no 'country' key.
    """
    country = d['country']
    return country

# Add a 'country' column by applying get_country to every project record
df['country'] = df['data'].apply(get_country)

In [8]:
def get_state(d):
    """Return the value stored under the 'state' key of project record ``d``.

    Raises KeyError if ``d`` has no 'state' key.
    """
    state = d['state']
    return state

# Add a 'state' column by applying get_state to every project record
df['state'] = df['data'].apply(get_state)

In [9]:
df.head()

Unnamed: 0,table_id,robot_id,run_id,data,country,state
0,Kickstarter,Kickstarter,Kickstarter_2019-07-18T03_20_05_009Z,"{'id': 1456501169, 'photo': {'key': 'assets/01...",US,canceled
1,Kickstarter,Kickstarter,Kickstarter_2019-07-18T03_20_05_009Z,"{'id': 1259045207, 'photo': {'key': 'assets/01...",ES,failed
2,Kickstarter,Kickstarter,Kickstarter_2019-07-18T03_20_05_009Z,"{'id': 577497800, 'photo': {'key': 'assets/012...",US,canceled
3,Kickstarter,Kickstarter,Kickstarter_2019-07-18T03_20_05_009Z,"{'id': 521645557, 'photo': {'key': 'assets/012...",US,successful
4,Kickstarter,Kickstarter,Kickstarter_2019-07-18T03_20_05_009Z,"{'id': 504193797, 'photo': {'key': 'assets/011...",US,failed


In [197]:
df.shape

(212378, 6)

In [10]:
# Keep only US projects whose state is 'failed' or 'successful'.
# Filter first and copy the result: the original copied the entire frame
# and only then discarded the rows that did not match. The trailing
# .copy() keeps df_pared independent of df.
df_pared = df.loc[
    (df['country'] == 'US') & (df['state'].isin(['failed', 'successful']))
].copy()

df_pared.shape

(139246, 6)

In [11]:
df_pared.sample(5)

Unnamed: 0,table_id,robot_id,run_id,data,country,state
156472,Kickstarter,Kickstarter,Kickstarter_2019-07-18T03_20_05_009Z,"{'id': 89334771, 'photo': {'key': 'assets/012/...",US,successful
15615,Kickstarter,Kickstarter,Kickstarter_2019-07-18T03_20_05_009Z,"{'id': 490365111, 'photo': {'key': 'assets/012...",US,failed
55832,Kickstarter,Kickstarter,Kickstarter_2019-07-18T03_20_05_009Z,"{'id': 1423041017, 'photo': {'key': 'assets/01...",US,successful
177592,Kickstarter,Kickstarter,Kickstarter_2019-07-18T03_20_05_009Z,"{'id': 1788260771, 'photo': {'key': 'assets/01...",US,successful
55783,Kickstarter,Kickstarter,Kickstarter_2019-07-18T03_20_05_009Z,"{'id': 1337435684, 'photo': {'key': 'assets/01...",US,failed


In [200]:
df_pared['state'].unique()

array(['successful', 'failed'], dtype=object)

In [12]:
# Collect the Kickstarter project-page URL of every retained record
projectUrls = [record['urls']['web']['project'] for record in df_pared['data']]

# Getting to Correct Scraped data
- Using BeautifulSoup, work with a single URL to get the parsing right
- Build an iterative process for scraping a list of URLs
- Implement the working code (refactored for performance)

In [20]:
projectUrls[0]

'https://www.kickstarter.com/projects/1498616534/simbin?ref=discovery_category_newest'

In [21]:
# urllib3 connection pool that requires certificate verification, using
# the CA bundle located by certifi
http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())

In [22]:
# Use the first collected project URL as the single-page test case
testUrl = projectUrls[0]

In [23]:
# Download the raw HTML of the test page once.
# The original also issued a second GET through the urllib3 pool whose
# result was never used, and re-imported urlopen even though
# urllib.request is already imported at the top of the notebook.
# The context manager closes the response after the body has been read.
with urllib.request.urlopen(testUrl) as response:
    page = response.read()

In [30]:
print(page)



In [24]:
# Parse the downloaded HTML into a BeautifulSoup object
soup = BeautifulSoup(page, 'html.parser')

In [25]:
# Using "inspect elements" - identify and parse the description from the url
soup.find('div', {'class': 'rte__content js-full-description responsive-media'})

<div class="rte__content js-full-description responsive-media">
<p>En SIMBIN estamos desarrollando un buscador global sostenible, es decir, un lugar donde encontrar todo lo que buscas, pero desde un punto de vista responsable y respetuoso con el medio ambiente y la gente.</p><p>Evitamos la super-producción en masa, potenciamos la artesanía y los proyectos ecológicos para cambiar los hábitos de consumo actuales, en los que se compra o crean productos y proyectos, que la mayoría de casos no benefician ni al planeta ni a la gente, solo a las grandes empresas y bancos.</p><p>Con esto no pretendemos cambiar la manera de comprar, sino conducir a nuestro usuario a través de una red global sostenible donde podrá encontrar lo que desee en cualquier momento.</p>
</div>

In [26]:
# Keep the description <div> for further parsing.
# Despite the name, `text` holds the matched element, not a plain string.
text = soup.find('div', {'class': 'rte__content js-full-description responsive-media'})

In [40]:
type(text)

bs4.element.Tag

In [41]:
text

<div class="rte__content js-full-description responsive-media">
<h1 class="page-anchor" id="h:at-citizen-carpentry">
<b>At Citizen Carpentry, we build community around a shared woodworking shop.</b>
</h1><p>Our workshop is almost complete! We've been putting up walls and getting our storage built out over the last 5 months, and this final push will allow us to operate on our own hours and offer shop time and classes in the evenings.</p><div class="template asset" contenteditable="false" data-alt-text="" data-caption="" data-id="15122354">
<figure>
<img alt="" class="fit" src="https://ksr-ugc.imgix.net/assets/015/122/354/584622c2f8a4f554310c064e30fa80a1_original.jpg?ixlib=rb-2.1.0&amp;w=680&amp;fit=max&amp;v=1484068162&amp;auto=format&amp;gif-q=50&amp;q=92&amp;s=a4491b60030f69ea1ed6c8031934b775"/>
</figure>
</div>
<h1 class="page-anchor" id="h:were-dedicated-to-in">We're dedicated to increasing public access to tools and space to work.</h1><p><b>Teaching and sharing skills.</b></p><ul>


In [None]:
# from pattern.web import plaintext

In [27]:
# Since <p> tags are paragraphs, collect every <p> inside the description div.
# find_all is the current name of the method; findAll is the deprecated
# BeautifulSoup 3 spelling.
ptags = text.find_all('p')

In [28]:
type(ptags)

bs4.element.ResultSet

In [29]:
ptags

[<p>En SIMBIN estamos desarrollando un buscador global sostenible, es decir, un lugar donde encontrar todo lo que buscas, pero desde un punto de vista responsable y respetuoso con el medio ambiente y la gente.</p>,
 <p>Evitamos la super-producción en masa, potenciamos la artesanía y los proyectos ecológicos para cambiar los hábitos de consumo actuales, en los que se compra o crean productos y proyectos, que la mayoría de casos no benefician ni al planeta ni a la gente, solo a las grandes empresas y bancos.</p>,
 <p>Con esto no pretendemos cambiar la manera de comprar, sino conducir a nuestro usuario a través de una red global sostenible donde podrá encontrar lo que desee en cualquier momento.</p>]

In [30]:
ptags[0]

<p>En SIMBIN estamos desarrollando un buscador global sostenible, es decir, un lugar donde encontrar todo lo que buscas, pero desde un punto de vista responsable y respetuoso con el medio ambiente y la gente.</p>

In [77]:
# Append the stripped text of every paragraph to output.txt, one per line.
# The context manager closes the file even if a write fails, and the
# explicit UTF-8 encoding keeps accented characters intact regardless of
# the platform's default encoding.
with open("output.txt", "a", encoding="utf-8") as f:
    for p in ptags:
        print(p.text.strip(), file=f)

# Testing Iterative Scraping


In [213]:
# Build code to iterate scraping over a list of urls.
#
# For each URL in a small test slice: download the page, join the stripped
# text of every <p> in the description div (each paragraph prefixed by a
# space), and record the result in testlist, testdict and output7.csv.

testlist = []
testdict = {}

# newline="" is what the csv module expects for files it writes to; the
# context manager guarantees the rows are flushed and the file is closed.
with open("output7.csv", "w", newline="", encoding="utf-8") as csv_file:
    w = csv.writer(csv_file)

    # i counts positions within the slice, NOT positions in projectUrls, so
    # rows are keyed by `url` directly instead of by projectUrls[i].
    for i, url in enumerate(projectUrls[5:7]):
        with urllib.request.urlopen(url) as response:
            page = response.read()
        soup = BeautifulSoup(page, 'html.parser')
        text = soup.find('div', {'class': 'rte__content js-full-description responsive-media'})
        if text is None:
            # Page has no description block; nothing to record for this URL.
            print(i, url, "no description found")
            continue

        ptags = text.find_all('p')
        # Strip each paragraph down to clean text and join them with spaces.
        urlchunk = "".join(" " + p.text.strip() for p in ptags)

        testlist.append(urlchunk)
        testdict[url] = urlchunk
        w.writerow([url, urlchunk])
        print(i, url, urlchunk)




0 https://www.kickstarter.com/projects/904448565/donald-trump-kiss-leggings-inspired-by-hp-lovecraf?ref=discovery_category_newest  I'm crazy enough to make these Donald Trump Kiss leggings no matter what happens. Putting up this Kickstarter is a way for me to raise funds to offset the startup cost of putting these babies into production. The kiss leggings are being sold at roughly the cost it will take to produce them, I just want to spread some joy. If you would like a pair, please pledge to support the project! The original image is by David Becker for Getty Images.
1 https://www.kickstarter.com/projects/clbayda/we-wish-you-a-merry-rockin-christmas?ref=discovery_category_newest  Rockin' Around the Town Square: THE BIRTH OF THE PROJECT Since the inaugural performance in 2008, I've been heavily involved with the "Rockin' Around the Town Square" Christmas concerts, serving as the Musical Director and also performing lead and rhythm guitars at all of the concerts. What began as an excuse

In [209]:
# Load the scraped (url, description) rows back into a dict keyed by url.
# newline='' lets the csv module handle line endings itself, including
# newlines embedded in quoted fields.
with open('output6.csv', newline='') as csv_file:
    reader = csv.reader(csv_file)
    # Blank lines come back from csv.reader as empty lists, which dict()
    # cannot unpack; skip them. Every remaining row must still have
    # exactly two fields.
    mydict = dict(row for row in reader if row)

In [210]:
mydict

{'https://www.kickstarter.com/projects/184955320/how-to-breathe-underwater-in-the-black-box-debut-2?ref=discovery_category_newest': ' My name is T.S. Woodward and I\'m a self-taught songwriter, pianist and performer from Athens, GA. In 2015 I completed production of a 56 minute double album of original, experimental pop music entitled How to Breathe Underwater/In the Black Box. Collectively, these sixteen tracks flow into one another seamlessly from start to finish, forming a revolving tableau of phantasmagoric imagery, characters, and landscapes. Separately, they are explorations on seeking; hazy recollections of places which can only exist in the twilight between sleep and sleeplessness; a swirling, shifting no man\'s land populated by the specters of memory and ambition; a surrealist\'s guide to navigating the dark descent into and out of a dream. I\'m here now to raise money to release this, my artistic debut, in a limited edition double cassette box set which includes a 24 page, l

In [None]:
projectUrls[0]

# Working Code

In [None]:
## Scrape Kickstarter project pages for project descriptions (refactored 9/23/19)
#
# For every project URL: download the page, take the text of the
# full-description div, normalise its whitespace and write a
# (url, description) row to improvedwk3.csv. No header row is written.
# Pages that fail to download or have no description div are skipped, so
# one bad URL does not abort the whole run.

start = time.time()

delays = [0.5,1,1.5,2,2.5,3]

# newline="" is what the csv module expects for files it writes to; the
# context manager flushes and closes the file even if the loop is
# interrupted, instead of relying on garbage collection of the writer.
with open("improvedwk3.csv", "w", newline="", encoding="utf-8") as csv_file:
    w = csv.writer(csv_file)

    for i, url in enumerate(projectUrls):
        try:
            with urllib.request.urlopen(url) as response:
                page = response.read()
        except urllib.error.URLError as err:
            # HTTPError (e.g. a removed project page) is a subclass of URLError.
            print(i, url, "skipped:", err)
        else:
            soup = BeautifulSoup(page, 'html.parser')
            text = soup.find('div', {'class': 'rte__content js-full-description responsive-media'})
            if text is not None:
                # Collapse every run of whitespace (newlines included) into a
                # single space and trim the ends. Replacing runs with ''
                # glued the neighbouring words together.
                urlchunk = re.sub(r'\s+', ' ', text.text).strip()
                print(i,url)
                w.writerow([url, urlchunk])

        # Draw a fresh delay for every request (the original drew a single
        # value before the loop) and pause after every URL, including
        # skipped ones.
        time.sleep(np.random.choice(delays))

end = time.time()
print(end - start)
        


0 https://www.kickstarter.com/projects/184955320/how-to-breathe-underwater-in-the-black-box-debut-2?ref=discovery_category_newest
1 https://www.kickstarter.com/projects/760226815/backpackable-high-resolution-uav-habitat-mapping-s?ref=discovery_category_newest
2 https://www.kickstarter.com/projects/1430060595/love-story?ref=discovery_category_newest
3 https://www.kickstarter.com/projects/533614462/detroit-revue-concert?ref=discovery_category_newest
4 https://www.kickstarter.com/projects/1319958753/lets-commit-to-a-murder-she-wrote-album-of-pop-son?ref=discovery_category_newest
5 https://www.kickstarter.com/projects/904448565/donald-trump-kiss-leggings-inspired-by-hp-lovecraf?ref=discovery_category_newest
6 https://www.kickstarter.com/projects/clbayda/we-wish-you-a-merry-rockin-christmas?ref=discovery_category_newest
7 https://www.kickstarter.com/projects/1961724545/gulcher-records-hat-trick?ref=discovery_category_newest
8 https://www.kickstarter.com/projects/loftland/loftlands-new-album

In [None]:
# improvedwk3.csv is written as bare (url, description) rows with no header
# line, so name the columns here. With the default header handling pandas
# would consume the first scraped row as the column names.
test = pd.read_csv('improvedwk3.csv', header=None, names=['url', 'description'])

In [None]:
test.head(10)