In [1]:
from splinter import Browser
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

# Obtain the ChromeDriver location from webdriver_manager and open a visible
# (non-headless) Chrome session; the result of install() is handed to splinter
# as the driver's executable path.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

In [2]:
# Open the article in the automated browser, then hand the page source the
# browser reports to BeautifulSoup for parsing with the built-in HTML parser.
url = "https://www.rd.com/list/hit-movies-that-were-books-first/"
browser.visit(url)
html = browser.html
soup = BeautifulSoup(markup=html, features='html.parser')

In [3]:
# Locate the section that wraps the article content, then collect every
# "listicle-card" <div> inside it (one card per book entry).
main_area = soup.find("section", class_="content-wrapper pure-g")
if main_area is None:
    # find() returns None when nothing matches; fail here with a clear message
    # instead of an AttributeError on the find_all() call below.
    raise RuntimeError(
        'Could not find <section class="content-wrapper pure-g"> in the page; '
        "the page may not have loaded completely or its layout may have changed."
    )
books = main_area.find_all('div', class_='listicle-card')

In [4]:
# Accumulator for the scraped records: the scraping loop appends one dict per
# book with the keys "Title", "Author", "Book Published" and "Movie Released".
# NOTE: re-run this cell before re-running the loop, otherwise records are duplicated.
book_list: list = []

In [5]:
# Loop through each listicle card and extract the book title, author,
# publication year and movie release year(s) into one dict per book.

# Maps the bold label that starts a detail paragraph to the record field it fills.
year_fields = {
    "Book published:": "Book Published",
    "Movie released:": "Movie Released",
}

for book in books:
    # Pick the first <h2> that contains the standalone word "by".
    # Whitespace is collapsed first so a line break or non-breaking space next
    # to "by" does not hide the separator, and requiring the surrounding spaces
    # avoids false matches on words that merely contain "by" (e.g. "Baby").
    title_author_str = None
    for h2 in book.find_all('h2'):
        heading_text = ' '.join(h2.get_text().split())
        if ' by ' in heading_text:
            title_author_str = heading_text
            break

    # skip this card if no "<title> by <author>" heading was found
    if title_author_str is None:
        continue

    # Split on the LAST " by " so titles that themselves contain the word
    # (e.g. "Call Me by Your Name by ...") stay intact.
    title, separator, author = title_author_str.rpartition(' by ')

    # Both year fields default to an empty string when the card has no
    # matching paragraph.
    summary_dict = {
        "Title": title,
        "Author": author,
        "Book Published": "",
        "Movie Released": "",
    }

    # A detail paragraph looks like "<strong>Label:</strong> value"; the value
    # is the paragraph text with the label removed. If a label occurs more
    # than once in a card, the last occurrence wins.
    for p in book.find_all('p'):
        strong_tag = p.find('strong')
        if strong_tag is None:
            continue
        strong_text = strong_tag.get_text().strip()
        field = year_fields.get(strong_text)
        if field is not None:
            summary_dict[field] = p.get_text().replace(strong_text, '').strip()

    # add the book dictionary to the book list
    book_list.append(summary_dict)

In [6]:
# The page has already been parsed into `soup`, so the browser session is no
# longer needed; close it.
browser.quit()

In [36]:
# Convert book movie data to DataFrame.
# The columns are listed explicitly so that an empty scrape still produces a
# frame with the expected columns instead of a KeyError on df["Title"] later.
df = pd.DataFrame(
    book_list,
    columns=["Title", "Author", "Book Published", "Movie Released"],
)

In [37]:
# Remove the list number (e.g. "12. ") from the beginning of the "Title" column.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the numbers in place.
df['Title'] = df["Title"].str.replace(r'^\d+\.\s+', '', regex=True)
df

  


Unnamed: 0,Title,Author,Book Published,Movie Released
0,The Invention of Hugo Cabret,Brian Selznick,2007,2011
1,My Abandonment,Peter Rock,2009,2018
2,The Help,Kathryn Stockett,2009,2011
3,The Cider House Rules,John Irving,1985,1999
4,David Copperfield,Charles Dickens,1850,2019
5,Charlie and the Chocolate Factory,Roald Dahl,1964,
6,One Flew Over The Cuckoo’s Nest,Ken Kesey,1963,1975
7,The Orchid Thief: A True Story of Beauty and O...,Susan Orlean,1998,2002
8,Jurassic Park,Michael Crichton,1990,1993
9,The Hate U Give,Angie Thomas,2017,2018


In [38]:
# Check for rows with more than one movie release year: count the four-digit
# numbers in "Movie Released" and keep the rows that have at least two.
# The pattern is a raw string so that "\d" is not an invalid string escape.
mult = df['Movie Released'].str.count(r'\d{4}') > 1
rows_with_two_years = df[mult].copy()
rows_with_two_years

Unnamed: 0,Title,Author,Book Published,Movie Released
25,Pride and Prejudice,Jane Austen,1813,1995 and 2005
29,The Picture of Dorian Gray,Oscar Wilde,1891,1945 and 2009


In [39]:
# Split rows that list several movie release years into one row per year.
# Only the four-digit years are extracted, so connector words such as "and"
# never become rows of their own. The selection is recomputed from df here,
# which makes the cell safe to re-run (a second run finds nothing to split).
year_lists = df['Movie Released'].str.findall(r'\d{4}')
has_multiple_years = year_lists.str.len() > 1

if has_multiple_years.any():
    # Replace the combined value by its list of years, then emit one row per year
    split_rows = (
        df[has_multiple_years]
        .assign(**{'Movie Released': year_lists[has_multiple_years]})
        .explode('Movie Released')
    )

    # Drop the original combined rows and append the per-year rows at the end
    df = pd.concat([df[~has_multiple_years], split_rows])

# reset the index
df = df.reset_index(drop=True)

df

Unnamed: 0,Title,Author,Book Published,Movie Released
0,The Invention of Hugo Cabret,Brian Selznick,2007,2011
1,My Abandonment,Peter Rock,2009,2018
2,The Help,Kathryn Stockett,2009,2011
3,The Cider House Rules,John Irving,1985,1999
4,David Copperfield,Charles Dickens,1850,2019
5,Charlie and the Chocolate Factory,Roald Dahl,1964,
6,One Flew Over The Cuckoo’s Nest,Ken Kesey,1963,1975
7,The Orchid Thief: A True Story of Beauty and O...,Susan Orlean,1998,2002
8,Jurassic Park,Michael Crichton,1990,1993
9,The Hate U Give,Angie Thomas,2017,2018


In [40]:
# Flag every row whose "Movie Released" value contains the text "and"
mask = df['Movie Released'].str.contains('and')

# Keep only the unflagged rows and renumber the index from zero
df = df.loc[~mask].reset_index(drop=True)

# Show the first rows of the filtered DataFrame
df.head()

Unnamed: 0,Title,Author,Book Published,Movie Released
0,The Invention of Hugo Cabret,Brian Selznick,2007,2011
1,My Abandonment,Peter Rock,2009,2018
2,The Help,Kathryn Stockett,2009,2011
3,The Cider House Rules,John Irving,1985,1999
4,David Copperfield,Charles Dickens,1850,2019


In [41]:
# Rename the book published and movie released columns so their names contain no spaces
column_renames = {
    "Book Published": "Book_Published",
    "Movie Released": "Movie_Released",
}
df = df.rename(columns=column_renames)
df.head()

Unnamed: 0,Title,Author,Book_Published,Movie_Released
0,The Invention of Hugo Cabret,Brian Selznick,2007,2011
1,My Abandonment,Peter Rock,2009,2018
2,The Help,Kathryn Stockett,2009,2011
3,The Cider House Rules,John Irving,1985,1999
4,David Copperfield,Charles Dickens,1850,2019


In [42]:
# Write the final DataFrame to a CSV file (path is relative to the working
# directory). The index is not disabled, so pandas writes it as the first column.
output_path = "Resources/books_to_movies.csv"
df.to_csv(output_path)