In [65]:
# Scrape a website searching for books related to the subject data
from bs4 import BeautifulSoup
import requests
import re
import csv
from time import sleep
import matplotlib.pyplot as plt
from collections import Counter


#Parse out the data from the site
def book_info(article):
    """Given a BeautifulSoup <article> Tag extract the books 
        details and return in a dict"""
    title = article.find('p', 'title').a.text
    author_name = article.find('p', 'note').text
    authors = [x.strip() for x in re.sub("^By ", "", author_name).split(",")]
    published = article.find('p', 'note date2').text
    # Publishing date sanitation
    cleaned_date = re.sub(' +', '', published)
    remove_newline = re.sub('\n', '', cleaned_date)
    extracted_date = [x.strip() for x in re.sub("^ReleaseDate:", "", cleaned_date).split(",")]
    
    return {
        "title" : title,
        "authors" : authors,
        "date" : extracted_date
    }



#Plot the number of books being published each year
def get_year(book):
    """book["date"] looks like 'May2019' so we need to extract the number 
    from the string"""
    return [int(s) for s in book["date"].split() if s.isdigit()]


base_url = "https://ssearch.oreilly.com/?i=1;m_Sort=searchDate;"
#headers={"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}

csv_columns = ['title', 'authors', 'date']

books = []

NUM_PAGES = 100 #Total number of pages to scrape

for page_num in range(1, NUM_PAGES + 1):
    print "Scraping page", page_num, ",", len(books), " found so far"
    if page_num == 1:
        url = base_url + "q=data;q1=Books;x1=t1&act=pg_1"
    else:
        url = base_url + "page=%s;q=data;q1=Books;x1=t1&act=pg_%s" % (page_num, page_num)

    soup = BeautifulSoup(requests.get(url).text, 'html5lib')
    
    for article in soup('article', 'product-result'):
        books.append(book_info(article))
    '''    
    # Sleep for 10 seconds
    print(books[page_num])
    
    csvData = [books[page_num]]
    
    with open('data_books2.csv', 'a') as csvFile:
        writer = csv.DictWriter(csvFile, fieldnames=csv_columns)
        writer.writeheader()
        for data in csvData:
            writer.writerow(data)
            
    csvFile.close()
    '''
    #Plot the number of books related to data by year
    year_counts = Counter(get_year(book) for book in books 
                         if get_year(book) <= 2019)
    
    years = sorted(year_counts)
    book_counts = [year_counts[year] for year in years]
    plt.plot(years, book_counts)
    plt.ylabel("# of data books")
    plt.title("Data is Big")
    plt.show()
    
                                                
    
    sleep(10)


#Structure layout of website
'''
<article class="result">
            

            <a class="learn-more"
            href="https://www.oreilly.com/pub/e/3289">Learn more</a>
            <p class="title">
              
                <a href="https://www.oreilly.com/pub/e/3289"> 2015 Data
                Preview: Spark, Data Visualization, YARN, and More - O'Reilly
                Media... </a>
              
            </p>
              <p class="note">By Alistair Croll</p>
  
                <p class="note date2">Publish Date: 

                        July 29, 2016
                   
</p>
              <p class="description"> In February, Big Data's biggest event
              comes to the Bay Area. Get a sneak peek with this free online
              conference, featuring many of Strata's most sought-after
              speakers and hottest topics. About Alistair Croll Alistair has
              been an entrepreneur... </p>

            </article>

'''
            





Scraping page 1 , 0  found so far


AttributeError: 'list' object has no attribute 'split'