In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
response = requests.get('https://www.goodreads.com/list/show/6934.Science_Fiction_Books_by_Female_Authors')
doc = BeautifulSoup(response.text)

In [3]:
books = doc.find_all('tr')

len(books)

100

In [4]:
rows = []

for book in books:
    row = {}
    row['Rank'] = book.find(class_="number").text
    row['Title'] = book.find(class_="bookTitle").text.strip()
    row['Author'] = book.find(class_="authorName").text.strip()
    row['Score'] = book.find(href="#").text.strip()
    row['Votes'] = book.find(href="#").find_next_sibling('a').text
    row['Rating'] = book.find(class_="minirating").text

    
    rows.append(row) 

In [5]:
df = pd.DataFrame(rows)

df

Unnamed: 0,Author,Rank,Rating,Score,Title,Votes
0,Margaret Atwood,1,"4.09 avg rating — 1,102,318 ratings","score: 30,733",The Handmaid's Tale,314 people voted
1,Suzanne Collins,2,"4.33 avg rating — 5,742,147 ratings","score: 28,553","The Hunger Games (The Hunger Games, #1)",292 people voted
2,Mary Wollstonecraft Shelley,3,"3.78 avg rating — 1,023,439 ratings","score: 21,909",Frankenstein,224 people voted
3,Madeleine L'Engle,4,"4.01 avg rating — 903,270 ratings","score: 18,720","A Wrinkle in Time (Time Quintet, #1)",196 people voted
4,Ursula K. Le Guin,5,"4.06 avg rating — 98,786 ratings","score: 17,920",The Left Hand of Darkness (Hainish Cycle #4),184 people voted
5,Veronica Roth,6,"4.21 avg rating — 2,602,763 ratings","score: 13,326","Divergent (Divergent, #1)",138 people voted
6,Suzanne Collins,7,"4.29 avg rating — 2,200,701 ratings","score: 12,749","Catching Fire (The Hunger Games, #2)",133 people voted
7,Lois Lowry,8,"4.12 avg rating — 1,535,105 ratings","score: 12,399","The Giver (The Giver, #1)",129 people voted
8,Octavia E. Butler,9,"4.23 avg rating — 72,335 ratings","score: 11,070",Kindred,116 people voted
9,Ursula K. Le Guin,10,"4.21 avg rating — 71,641 ratings","score: 10,731",The Dispossessed (Hainish Cycle #6),112 people voted


In [6]:
df.to_csv("Goodreads", index=False)

In [7]:
df = pd.read_csv("Goodreads", encoding='latin-1', na_values=[0, "UNKNOWN"])
df.head(2)

Unnamed: 0,Author,Rank,Rating,Score,Title,Votes
0,Margaret Atwood,1,"4.09 avg rating â 1,102,318 ratings","score: 30,733",The Handmaid's Tale,314 people voted
1,Suzanne Collins,2,"4.33 avg rating â 5,742,147 ratings","score: 28,553","The Hunger Games (The Hunger Games, #1)",292 people voted


In [8]:
df.dtypes

Author    object
Rank       int64
Rating    object
Score     object
Title     object
Votes     object
dtype: object

In [9]:
import re

In [10]:
df['Score'] = df['Score'].str.extract(r'score:(.*)')

In [11]:
df.head()

Unnamed: 0,Author,Rank,Rating,Score,Title,Votes
0,Margaret Atwood,1,"4.09 avg rating â 1,102,318 ratings",30733,The Handmaid's Tale,314 people voted
1,Suzanne Collins,2,"4.33 avg rating â 5,742,147 ratings",28553,"The Hunger Games (The Hunger Games, #1)",292 people voted
2,Mary Wollstonecraft Shelley,3,"3.78 avg rating â 1,023,439 ratings",21909,Frankenstein,224 people voted
3,Madeleine L'Engle,4,"4.01 avg rating â 903,270 ratings",18720,"A Wrinkle in Time (Time Quintet, #1)",196 people voted
4,Ursula K. Le Guin,5,"4.06 avg rating â 98,786 ratings",17920,The Left Hand of Darkness (Hainish Cycle #4),184 people voted


In [16]:
df[['Rating','Number of ratings']] = df.Rating.str.split("â", expand=True)

In [17]:
df.head()

Unnamed: 0,Author,Rank,Rating,Score,Title,Votes,Number of ratings
0,Margaret Atwood,1,4.09 avg rating,30733,The Handmaid's Tale,314 people voted," 1,102,318 ratings"
1,Suzanne Collins,2,4.33 avg rating,28553,"The Hunger Games (The Hunger Games, #1)",292 people voted," 5,742,147 ratings"
2,Mary Wollstonecraft Shelley,3,3.78 avg rating,21909,Frankenstein,224 people voted," 1,023,439 ratings"
3,Madeleine L'Engle,4,4.01 avg rating,18720,"A Wrinkle in Time (Time Quintet, #1)",196 people voted," 903,270 ratings"
4,Ursula K. Le Guin,5,4.06 avg rating,17920,The Left Hand of Darkness (Hainish Cycle #4),184 people voted," 98,786 ratings"


In [20]:
df[['Title','Series']] = df.Title.str.split("\(", expand=True)

In [21]:
df.head()

Unnamed: 0,Author,Rank,Rating,Score,Title,Votes,Number of ratings,Series
0,Margaret Atwood,1,4.09 avg rating,30733,The Handmaid's Tale,314 people voted," 1,102,318 ratings",
1,Suzanne Collins,2,4.33 avg rating,28553,The Hunger Games,292 people voted," 5,742,147 ratings","The Hunger Games, #1)"
2,Mary Wollstonecraft Shelley,3,3.78 avg rating,21909,Frankenstein,224 people voted," 1,023,439 ratings",
3,Madeleine L'Engle,4,4.01 avg rating,18720,A Wrinkle in Time,196 people voted," 903,270 ratings","Time Quintet, #1)"
4,Ursula K. Le Guin,5,4.06 avg rating,17920,The Left Hand of Darkness,184 people voted," 98,786 ratings",Hainish Cycle #4)


In [24]:
df[['Series','Number in Series']] = df.Series.str.split("\#", expand=True)

In [25]:
df.head(2)

Unnamed: 0,Author,Rank,Rating,Score,Title,Votes,Number of ratings,Series,Number in Series
0,Margaret Atwood,1,4.09 avg rating,30733,The Handmaid's Tale,314 people voted," 1,102,318 ratings",,
1,Suzanne Collins,2,4.33 avg rating,28553,The Hunger Games,292 people voted," 5,742,147 ratings","The Hunger Games,",1)
