In [33]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [34]:
# defining the url and getting the response
url = 'https://boston.craigslist.org/search/jjj?'

# Bind `response` before the request so a failure cannot leave a value from an
# earlier run (or from another cell) sitting in the kernel namespace.
response = None

try:
    # requests applies no timeout by default; without one a stalled server
    # would block this cell indefinitely.
    response = requests.get(url, timeout=30)
except requests.exceptions.RequestException as e:
    # connection errors, timeouts, malformed URLs, ... : report and carry on
    print(e)

In [35]:
# save the content of the response if status code is 200
# getattr keeps this cell from raising when `response` carries no status code
# (for example when it is None instead of a real response object)
status_code = getattr(response, 'status_code', None)
if status_code == 200:
    data = response.text
else:
    print(status_code)
    # Always bind `data`: otherwise the parsing cell would either fail with a
    # NameError or silently reuse HTML left over from a previous run.
    data = ''

In [None]:
# Parse the downloaded HTML, then show the href of every anchor element
soup = BeautifulSoup(data, features='html.parser')
tags = soup.find_all(name='a')

# collect the link targets first (None for anchors without an href), then print
hrefs = [anchor.get('href') for anchor in tags]
for href in hrefs:
    print(href)

In [None]:
# Narrow the search with an attribute filter: only anchors whose class is
# 'result-title' are returned, and the text of each one is printed
results = soup.find_all('a', attrs={'class': 'result-title'})
for title_link in results:
    print(title_link.get_text())

In [None]:
# The address information is read from the <span class="result-hood"> elements
hood_filter = {'class': 'result-hood'}
addresses = soup.find_all('span', hood_filter)
for hood in addresses:
    print(hood.get_text())

In [None]:
# the address and the titles belong to a div tag parent, we call it "wrapper"
details = soup.find_all('div', {'class':'result-info'})

# columns of the exported table, in the order each row is assembled below
column_names = ['name','city','date','link','description','compensation','employment_type','posting_info']

# Rows are gathered in a plain list and converted to a DataFrame once, after
# the loop; growing a DataFrame one row at a time gets slower with every row.
job_rows = []

for detail in details:
    # the title anchor provides both the job name and the link to its page
    title_tag = detail.find('a', {'class': 'result-title'})
    name = title_tag.text
    link = title_tag.get('href')
    # in case a location doesn't appear we fall back to "N/A"
    city_tag = detail.find('span', {'class':'result-hood'})
    # [2:-1] drops the first two characters and the last one
    # (presumably the " (" and ")" around the neighbourhood - TODO confirm)
    city = city_tag.text[2:-1] if city_tag else "N/A"
    date = detail.find('time', {'class':'result-date'}).text

    # Reset the detail fields for every posting. Without this, a posting whose
    # attribute group lacks a compensation or employment-type entry would
    # inherit the value of the previous posting (or raise NameError on the
    # very first one).
    description = ''
    compensation = ''
    employment_type = ''
    posting_info = ''

    # getting some detail info about the jobs
    try:
        # timeout so that one unresponsive posting cannot stall the whole run
        job_detail_response = requests.get(link, timeout=30)
        job_soup = BeautifulSoup(job_detail_response.text, 'html.parser')
        # getting section of interest
        job_detail = job_soup.find('section', {'class':'userbody'})
        description = job_detail.find('section', {'id':'postingbody'}).text
        compensation_type_tag = job_detail.find('p', {'class':'attrgroup'})
        if compensation_type_tag:
            for span_tag in compensation_type_tag.find_all('span'):
                label = span_tag.text.lower()
                # split(':', 1) cuts at the first colon only, so values that
                # themselves contain a colon are kept whole
                if 'compensation' in label:
                    compensation = span_tag.text.split(':', 1)[1].strip()
                elif 'employment type' in label:
                    employment_type = span_tag.text.split(':', 1)[1].strip()

        # second 'postinginfo' paragraph; keep everything after the first
        # colon (a time such as 10:15 would otherwise be cut at its own colon)
        posting_info = job_detail.find_all('p', {'class':'postinginfo'})[1].text.split(':', 1)[1].strip()

    except Exception as e:
        # best effort: any network or parsing problem blanks all four detail
        # fields for this posting, the listing-level fields are still kept
        print("Error getting job details!")
        print(e)
        description = ''
        compensation = ''
        employment_type = ''
        posting_info = ''

    job_rows.append([name, city, date, link, description,
                     compensation, employment_type, posting_info])

# creating dataframe to export results
df = pd.DataFrame(job_rows, columns=column_names)

In [39]:
# export result to current folder
# A forward slash is accepted on Windows, macOS and Linux. The previous raw
# string r'.\\Results.csv' kept both backslashes, which on non-Windows systems
# became part of the file name instead of acting as a path separator.
df.to_csv('./Results.csv')


# Assignment

Based on what you have learned in this course, scrape the API listings on one of the pages below and export your results to a CSV file.

https://www.programmableweb.com/category/all/apis

OR

https://www.programmableweb.com/category/tools/api



Your Python code should scrape the following details from each table row:

• API Name  

• API (absolute) URL  

• API Category  

• API Description  



Your Python code should crawl through all the available "next" pages. Your final result should have approx. 20,400 rows for the first link, or approx. 1,533 rows for the second link.



Write the code on your machine and test it to make sure it works. Then copy and paste your final code here.



All the best!

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# site root, prepended to the relative links found in the listing table
url = 'https://www.programmableweb.com'

# listing URL; the page counter below is appended to it
base_url = 'https://www.programmableweb.com/category/tools/api?pw_view_display_id=apis_all&page='

# seconds to wait for any single HTTP response (requests has no default timeout)
REQUEST_TIMEOUT = 30

# counter to concatenate with url
count_url = 0

# counter to keep track of each result
count = 0

# dictionary where results will be saved
results = {}
column_names = ['Name', 'Absolute URL', 'Category', 'Description']

while True:
    specific_url = base_url + str(count_url)
    print('Getting data from ' + specific_url)
    response = requests.get(specific_url, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', {'class':'views-table'})
    if table is None:
        print("No more results!")
        break
    body = table.find('tbody')
    rows = body.find_all('tr') if body is not None else []
    # a table without rows is also the end of the listing; stopping here
    # keeps the loop from requesting empty pages forever
    if not rows:
        print("No more results!")
        break
    for row in rows:
        data = row.find_all('td')
        # the columns read below are 0 (name), 2 (category) and 4 (version
        # links); rows with fewer cells cannot be interpreted and are skipped
        if len(data) < 5:
            continue
        name = data[0].text if data[0] else "N/A"

        # Resolve the API link afresh for every row. Leaving it unset until a
        # link is actually found prevents a row from reusing the previous
        # row's URL when no usable link is present.
        api_url = None
        version_link = data[4].find('a')
        partial_api_url = version_link.get('href') if version_link is not None else None
        # a href equals to # means a dropdown is present
        if partial_api_url == "#":
            # when the version field has a dropdown we check for li tags
            for version in data[4].find_all('li'):
                # checking and selecting the last recommended version
                if 'recommended' in version.text.lower():
                    recommended_link = version.find('a')
                    if recommended_link is not None and recommended_link.get('href'):
                        api_url = url + recommended_link.get('href')
        elif partial_api_url:
            api_url = url + partial_api_url

        category = data[2].text if data[2] else "N/A"

        if api_url is None:
            # no link to follow: keep the row, mark the missing fields
            api_url = "N/A"
            detail = "N/A"
        else:
            try:
                detail_response = requests.get(api_url, timeout=REQUEST_TIMEOUT)
                detail_soup = BeautifulSoup(detail_response.text, 'html.parser')
                description_tag = detail_soup.find('div', {'class':'api_description'})
                # some detail pages may lack the description block
                detail = description_tag.text if description_tag is not None else "N/A"
            except requests.exceptions.RequestException as e:
                # one unreachable detail page should not abort the whole crawl
                print(e)
                detail = "N/A"

        results[count] = [name, api_url, category, detail]
        count += 1
    count_url += 1

In [22]:
# Turn the scraped records into a table: every dictionary key becomes a row
# label and its list value supplies the cells for the named columns
df = pd.DataFrame.from_dict(data=results, orient='index', columns=column_names)

# write the table (row labels included) to the current working directory
output_path = './API_Results.csv'
df.to_csv(output_path)