### Web scraping script

#### This notebook scrapes information from the https://quotes.toscrape.com/ website

#### It works through several tasks, each of which is described in its own section of the notebook

In [89]:
# import necessary packages

import bs4
import requests
import lxml
import pandas as pd

### Obtain unique names of all the authors on the first page

In [90]:
# Download the first page of the site once and parse it; `soup` holds the
# parsed document and `ask` the raw HTTP response.

url_first = "https://quotes.toscrape.com/"

# A timeout keeps the cell from hanging indefinitely if the site is unreachable.
ask = requests.get(url_first, timeout=10)
# Stop here on an HTTP error status instead of silently parsing an error page.
ask.raise_for_status()

soup = bs4.BeautifulSoup(ask.text, 'lxml')

In [91]:
# Collect every element in the parsed first page that carries the CSS class "author"

author_class = soup.select(".author")

In [92]:
# Extract the author names from the selected elements

author_name = [element.text for element in author_class]

# A set keeps each name once, however many times it was selected
unique_name = set(author_name)

### Create a list of all the quotes on the first page

In [93]:
# Select the elements with the class "text" and keep only their text content
words = soup.select(".text")
quote = [word.text for word in words]

quote

['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”',
 '“It is our choices, Harry, that show what we truly are, far more than our abilities.”',
 '“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”',
 '“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”',
 "“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”",
 '“Try not to become a man of success. Rather become a man of value.”',
 '“It is better to be hated for what you are than to be loved for what you are not.”',
 "“I have not failed. I've just found 10,000 ways that won't work.”",
 "“A woman is like a tea bag; you never know how strong it is until it's in hot water.”",
 '“A day without sunshine is like, you know, night.”']

### Obtain the top ten tags as mentioned on the page

In [94]:
# Each ".tag-item" element wraps a link; the link text is the tag name
tag = soup.select(".tag-item")

for item in tag:
    print(item.a.text)

love
inspirational
life
humor
books
reading
friendship
friends
truth
simile


### Loop through all the pages on the website and collect the unique authors

#### From inspecting the site: every page that exists and contains quotes has elements with the class `quote`

In [95]:
# Count the elements with the class "quote" in the currently parsed page
len(soup.select('.quote'))

10

In [96]:


# Walk the numbered listing pages (/page/1/, /page/2/, ...) and collect the
# author of every quote until a page comes back without any quote blocks.

# page counter used to build each page URL
page_num = 1

authors = []

while True:

    base_url = f"https://quotes.toscrape.com/page/{page_num}/"

    # Use names of their own here: rebinding `ask`/`soup` would replace the
    # first-page objects, so re-running an earlier cell afterwards would
    # query the last (empty) page instead of the first one.
    page_ask = requests.get(base_url)
    page_soup = bs4.BeautifulSoup(page_ask.text, 'lxml')

    # a page without any ".quote" element marks the end of the listing
    if not page_soup.select('.quote'):
        break

    authors.extend(element.text for element in page_soup.select(".author"))
    page_num += 1

# one entry per author, however many quotes they have
author_set = set(authors)



'\n\n#number variable for looping through the pages\npage_num = 1\n\nexist = True\n\nauthors = []\n\n#do a while loop to loop through all pages while quote is true\nwhile exist:\n    \n    # create a base url \n    base_url = f"https://quotes.toscrape.com/page/{page_num}/"\n\n    ask = requests.get(base_url)\n    soup = bs4.BeautifulSoup(ask.text, \'lxml\')\n\n    #check if quote class exists or not\n    if len(soup.select(\'.quote\')) != 0:\n\n        authors_class = soup.select(".author")\n        for name in authors_class:\n            authors.append(name.text)\n\n        page_num+=1\n    \n    #condition to break out of loop\n    else:\n        exist = False\n\nauthor_set = set(authors)\n\n'

In [97]:
# Display the unique author names collected across all pages
author_set

{'Albert Einstein',
 'Alexandre Dumas fils',
 'Alfred Tennyson',
 'Allen Saunders',
 'André Gide',
 'Ayn Rand',
 'Bob Marley',
 'C.S. Lewis',
 'Charles Bukowski',
 'Charles M. Schulz',
 'Douglas Adams',
 'Dr. Seuss',
 'E.E. Cummings',
 'Eleanor Roosevelt',
 'Elie Wiesel',
 'Ernest Hemingway',
 'Friedrich Nietzsche',
 'Garrison Keillor',
 'George Bernard Shaw',
 'George Carlin',
 'George Eliot',
 'George R.R. Martin',
 'Harper Lee',
 'Haruki Murakami',
 'Helen Keller',
 'J.D. Salinger',
 'J.K. Rowling',
 'J.M. Barrie',
 'J.R.R. Tolkien',
 'James Baldwin',
 'Jane Austen',
 'Jim Henson',
 'Jimi Hendrix',
 'John Lennon',
 'Jorge Luis Borges',
 'Khaled Hosseini',
 "Madeleine L'Engle",
 'Marilyn Monroe',
 'Mark Twain',
 'Martin Luther King Jr.',
 'Mother Teresa',
 'Pablo Neruda',
 'Ralph Waldo Emerson',
 'Stephenie Meyer',
 'Steve Martin',
 'Suzanne Collins',
 'Terry Pratchett',
 'Thomas A. Edison',
 'W.C. Fields',
 'William Nicholson'}

In [98]:
# Most recently appended author, i.e. the last one collected before the loop stopped
authors[-1]

'George R.R. Martin'

In [135]:
# Try the birthplace lookup on a single author page first
x_url = "https://quotes.toscrape.com/author/Albert-Einstein/"
x_ask = requests.get(x_url)
x_soup = bs4.BeautifulSoup(x_ask.text, 'lxml')
x_location = x_soup.select('.author-born-location')[0].text.strip()
# Remove only the leading "in " prefix. str.strip('in ') treats its argument as
# a set of characters and strips them from BOTH ends, so a place ending in
# 'n' or 'i' would be truncated (e.g. "in Kyoto, Japan" -> "Kyoto, Japa").
x_location[len('in '):] if x_location.startswith('in ') else x_location

'Ulm, Germany'

### Writing a function to obtain the date of birth of a particular author


#### First write a helper that reads an author's date and place of birth from their bio page, then a function that builds a dictionary of all authors and their bio details

In [136]:
def author_bio(page, timeout=10):
    """Fetch an author's date and place of birth from their bio page.

    Parameters
    ----------
    page : str
        Path of the bio page relative to the site root; it is appended to
        "https://quotes.toscrape.com" as-is.
    timeout : float, optional
        Seconds to wait for the server before giving up (passed to
        ``requests.get``).

    Returns
    -------
    list
        ``[date_of_birth, birthplace]`` as strings, with the leading "in "
        removed from the birthplace.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    IndexError
        If the page has no ".author-born-date" or ".author-born-location"
        element.
    """
    base = "https://quotes.toscrape.com"

    bio_request = requests.get(base + page, timeout=timeout)
    # Fail with the HTTP status rather than an IndexError from the selectors below.
    bio_request.raise_for_status()
    bio_soup = bs4.BeautifulSoup(bio_request.text, 'lxml')

    bio_date = bio_soup.select('.author-born-date')[0].text
    bio_location = bio_soup.select('.author-born-location')[0].text.strip()

    # Remove only the leading "in " prefix. str.strip('in ') strips the
    # characters 'i', 'n' and ' ' from both ends, which truncates places
    # ending in 'n' or 'i' (e.g. "in Kyoto, Japan" -> "Kyoto, Japa").
    if bio_location.startswith('in '):
        bio_location = bio_location[len('in '):]

    return [bio_date, bio_location]



In [137]:
def author_info():
    """Scrape every listing page and look up the bio details of each author.

    Pages are requested in order (/page/1/, /page/2/, ...) until one comes
    back without any ".quote" element. For each quote block the author name
    is read from its ".author" element and the bio link from the first link
    inside the block; the bio page is fetched with ``author_bio``.

    Returns
    -------
    dict
        Maps author name to the list returned by ``author_bio``, in order of
        first appearance.
    """
    author_dict = {}
    page_num = 1

    while True:

        base_url = f"https://quotes.toscrape.com/page/{page_num}/"

        ask = requests.get(base_url)
        soup = bs4.BeautifulSoup(ask.text, 'lxml')

        quote_blocks = soup.select('.quote')
        # a page without quote blocks ends the scrape
        if not quote_blocks:
            break

        # Read name and link from the same quote block so the two can never
        # get out of step (no index lookup across two separate lists).
        for block in quote_blocks:
            author = block.select_one('.author').text

            # An author with several quotes needs the bio page only once.
            if author in author_dict:
                continue

            author_dict[author] = author_bio(block.a['href'])

        page_num += 1

    return author_dict

In [123]:
# Run the full scrape and display the author -> [date of birth, birthplace] mapping.
# NOTE(review): this cell's execution count (123) is lower than that of the
# function definitions above it and the saved output still shows the "in "
# prefix, so the output looks stale — re-run to confirm.
author_info()

{'Albert Einstein': ['March 14, 1879', 'in Ulm, Germany'],
 'J.K. Rowling': ['July 31, 1965',
  'in Yate, South Gloucestershire, England, The United Kingdom'],
 'Jane Austen': ['December 16, 1775',
  'in Steventon Rectory, Hampshire, The United Kingdom'],
 'Marilyn Monroe': ['June 01, 1926', 'in The United States'],
 'André Gide': ['November 22, 1869', 'in Paris, France'],
 'Thomas A. Edison': ['February 11, 1847',
  'in Milan, Ohio, The United States'],
 'Eleanor Roosevelt': ['October 11, 1884', 'in The United States'],
 'Steve Martin': ['August 14, 1945', 'in Waco, Texas, The United States'],
 'Bob Marley': ['February 06, 1945', 'in Nine Mile, Saint Ann, Jamaica'],
 'Dr. Seuss': ['March 02, 1904', 'in Springfield, MA, The United States'],
 'Douglas Adams': ['March 11, 1952',
  'in Cambridge, England, The United Kingdom'],
 'Elie Wiesel': ['September 30, 1928', 'in Sighet, Romania'],
 'Friedrich Nietzsche': ['October 15, 1844',
  'in Röcken bei Lützen, Prussian Province of Saxony, Ger

In [138]:
# Keep the scraped author -> bio details mapping under a name for further use
quote_authors = author_info()

In [139]:
# One row per author: the dict keys become the index, which reset_index then
# turns into an ordinary column
author_df = pd.DataFrame.from_dict(quote_authors, orient='index').reset_index()

In [140]:
# Replace the column labels with readable names
author_df.columns = ['Author', 'Date of Birth', 'Location']

In [144]:
# Preview the first rows of the table
author_df.head()

Unnamed: 0,Author,Date of Birth,Location
0,Albert Einstein,"March 14, 1879","Ulm, Germany"
1,J.K. Rowling,"July 31, 1965","Yate, South Gloucestershire, England, The Unit..."
2,Jane Austen,"December 16, 1775","Steventon Rectory, Hampshire, The United Kingdom"
3,Marilyn Monroe,"June 01, 1926",The United States
4,André Gide,"November 22, 1869","Paris, France"
