# Set-Up

In [1]:
# Import necessary packages
import requests
from bs4 import BeautifulSoup

In [2]:
# URL of the Rotten Tomatoes editorial guide page that the rest of the notebook scrapes
base_site = "https://editorial.rottentomatoes.com/guide/140-essential-action-movies-to-watch-now/"

In [3]:
# Sending a request to the webpage.
# An explicit timeout (seconds) is passed because requests waits indefinitely by default;
# without it this cell can hang forever if the server stops responding.
response = requests.get(base_site, timeout=30)
# Displayed so the reader can confirm the request succeeded (200) before parsing
response.status_code

200

In [4]:
# Get the HTML from the webpage (the raw response body, handed to BeautifulSoup in the next cell)
html = response.content

In [5]:
# Parse the downloaded HTML into a navigable tree using the lxml parser
soup = BeautifulSoup(html, features="lxml")

In [6]:
# Write the prettified parse tree to disk so parsing issues can be inspected by eye
with open('Rotten-Tomatoes-LXML-Parser.html', mode='wb') as file:
    pretty_html = soup.prettify(encoding='utf-8')
    file.write(pretty_html)

## Obtaining the elements containing relevant data

In [7]:
# One container div per movie, matched on its exact class string
divs = soup.find_all('div', class_="col-sm-18 col-full-xs countdown-item-content")

In [8]:
print(divs[0].prettify()) # Each of these divs contains all the relevant information for one movie

<div class="col-sm-18 col-full-xs countdown-item-content">
 <div class="row countdown-item-title-bar">
  <div class="col-sm-20 col-full-xs" style="height: 100%;">
   <div class="article_movie_title" style="float: left;">
    <div>
     <h2>
      <a href="https://www.rottentomatoes.com/m/1018009-running_scared">
       Running Scared
      </a>
      <span class="subtle start-year">
       (1986)
      </span>
      <span class="icon tiny rotten" title="Rotten">
      </span>
      <span class="tMeterScore">
       57%
      </span>
     </h2>
    </div>
   </div>
  </div>
  <div class="col-sm-4 col-full-xs" style="height: 100%;">
   <div class="countdown-index">
    #140
   </div>
  </div>
 </div>
 <div class="row countdown-item-details">
  <div class="col-sm-24">
   <div class="info countdown-adjusted-score">
    <span class="descriptor">
     Adjusted Score:
    </span>
    58276%
    <span class="glyphicon glyphicon-question-sign" data-html="true" data-original-title="The Adjusted 

# Extracting the Title, Year, and Score of each movie

In [9]:
# The first <h2> inside each movie div; it holds the title link, year and score spans
headings = [movie_div.find('h2') for movie_div in divs]

In [10]:
headings[0]

<h2><a href="https://www.rottentomatoes.com/m/1018009-running_scared">Running Scared</a> <span class="subtle start-year">(1986)</span> <span class="icon tiny rotten" title="Rotten"></span> <span class="tMeterScore">57%</span></h2>

## Title

In [11]:
# The movie title is the text of the link inside each heading
movie_names = [h2.find('a').get_text() for h2 in headings]

In [12]:
len(movie_names) # 140 movies --> 140 names

140

In [13]:
movie_names[:10]

['Running Scared',
 'Equilibrium',
 'Hero',
 'Road House',
 'Unstoppable',
 'Shaft',
 'The Villainess',
 'Highlander',
 'Die Hard 2',
 'National Treasure']

In [14]:
#movie_names

## Year

In [15]:
# Raw year text from each heading, still wrapped in parentheses at this point
movie_years = [h2.find('span', class_="subtle start-year").get_text() for h2 in headings]

In [16]:
len(movie_years) # Correct number of movie years

140

In [17]:
movie_years[:10] # We need to remove the surrounding parentheses and change the dtypes to int

['(1986)',
 '(2002)',
 '(2002)',
 '(1989)',
 '(2010)',
 '(1971)',
 '(2017)',
 '(1986)',
 '(1990)',
 '(2004)']

In [18]:
# Removing the surrounding parentheses, e.g. '(1986)' -> '1986'

movie_years = [year_text.strip('()') for year_text in movie_years]
movie_years[:10]

['1986',
 '2002',
 '2002',
 '1989',
 '2010',
 '1971',
 '2017',
 '1986',
 '1990',
 '2004']

In [19]:
# Converting every year string to an int

movie_years = list(map(int, movie_years))
movie_years[:10]

[1986, 2002, 2002, 1989, 2010, 1971, 2017, 1986, 1990, 2004]

In [20]:
#movie_years

## Score

In [21]:
# Raw Tomatometer text from each heading, still carrying the '%' sign at this point
movie_scores = [h2.find('span', class_="tMeterScore").get_text() for h2 in headings]

In [22]:
len(movie_scores) # Correct number of movie scores

140

In [23]:
movie_scores[:10] # We should remove the '%' signs and change the dtypes to int again

['57%', '41%', '94%', '40%', '87%', '88%', '85%', '70%', '69%', '46%']

In [24]:
# Dropping the '%' sign, e.g. '57%' -> '57'

movie_scores = [score_text.strip('%') for score_text in movie_scores]
movie_scores[:10]

['57', '41', '94', '40', '87', '88', '85', '70', '69', '46']

In [25]:
# Converting every score string to an int

movie_scores = list(map(int, movie_scores))
movie_scores[:10]

[57, 41, 94, 40, 87, 88, 85, 70, 69, 46]

In [26]:
#movie_scores

# Extracting the rest of the information

## Critics Consensus

In [27]:
# The critics-consensus div of each movie (text is extracted in later cells)
consensus = [movie_div.find('div', class_="info critics-consensus") for movie_div in divs]

In [28]:
len(consensus) # Correct number of movie consensus

140

In [29]:
consensus[0] # We need to extract only the text from these div elements

<div class="info critics-consensus"><span class="descriptor">Critics Consensus:</span> <em>Running Scared</em> struggles to strike a consistent balance between violent action and humor, but the chemistry between its well-matched leads keeps things entertaining.</div>

In [30]:
consensus[0].text # We don't need "Critics Consensus: " in front of the text

'Critics Consensus: Running Scared struggles to strike a consistent balance between violent action and humor, but the chemistry between its well-matched leads keeps things entertaining.'

In [31]:
# Label that prefixes the consensus text; its length is the slice offset used to cut it off
common_phrase = 'Critics Consensus: '
common_len = len(common_phrase)

In [32]:
# Drop the leading label where present; leave any text without it unchanged
consensus_text = [
    blurb[common_len:] if blurb.startswith(common_phrase) else blurb
    for blurb in (con.text for con in consensus)
]

In [33]:
consensus_text[0] # Looks good

'Running Scared struggles to strike a consistent balance between violent action and humor, but the chemistry between its well-matched leads keeps things entertaining.'

In [34]:
#consensus_text

## Directors

In [35]:
# The director div of each movie
directors_info = [movie_div.find('div', class_="info director") for movie_div in divs]

In [36]:
len(directors_info) # Correct number of director divs

140

In [37]:
directors_info[0] # We need to extract only the text again

<div class="info director">
<span class="descriptor">Directed By:</span> <a class="" href="//www.rottentomatoes.com/celebrity/peter_hyams">Peter Hyams</a></div>

In [38]:
director_links = directors_info[0].find_all('a') # find_all (not find) because some movies have multiple directors; this is the first movie only
director_links

[<a class="" href="//www.rottentomatoes.com/celebrity/peter_hyams">Peter Hyams</a>]

In [39]:
# Keep only the visible name of each director link
director_names = [anchor.get_text() for anchor in director_links]
director_names

['Peter Hyams']

In [40]:
# We will combine all directors of the same movie into one comma-separated string.
# NOTE: here `directors` is a single string for the first movie only; a later cell
# rebinds the same name to a list with one such string per movie.
directors = ", ".join(director_names) 
directors

'Peter Hyams'

In [41]:
# Building the list of directors with an explicit for-loop:
# one comma-separated string of director names per movie

directors = []
for d in directors_info:
    director_links = d.find_all('a')
    director_names = [link.text for link in director_links]
    directors.append(", ".join(director_names))

In [42]:
len(directors)

140

In [43]:
# Same list as the for-loop version, written as a single comprehension

directors = [
    ", ".join(anchor.text for anchor in director_div.find_all('a'))
    for director_div in directors_info
]

In [44]:
#directors

## Cast Info

In [45]:
# The cast div of each movie
cast_info = [movie_div.find('div', class_="info cast") for movie_div in divs]

In [46]:
len(cast_info) # Correct number of cast divs

140

In [47]:
cast_info[0] # We should follow the same steps we used for creating the list of directors

<div class="info cast">
<span class="descriptor">Starring:</span> <a class="" href="//www.rottentomatoes.com/celebrity/gregory_hines">Gregory Hines</a>, <a class="" href="//www.rottentomatoes.com/celebrity/billy_crystal">Billy Crystal</a>, <a class="" href="//www.rottentomatoes.com/celebrity/jimmy_smits">Jimmy Smits</a>, <a class="" href="//www.rottentomatoes.com/celebrity/steven_bauer">Steven Bauer</a></div>

In [48]:
# Every <a> inside the first movie's cast div; each link is one cast member
cast_links = cast_info[0].find_all('a')
cast_links[0]

<a class="" href="//www.rottentomatoes.com/celebrity/gregory_hines">Gregory Hines</a>

In [49]:
# Keep only the visible name of each cast link
cast_names = [anchor.get_text() for anchor in cast_links]
cast_names[0]

'Gregory Hines'

In [50]:
# NOTE: here `cast` is a single comma-separated string for the first movie only;
# a later cell rebinds the same name to a list with one such string per movie.
cast = ", ".join(cast_names)
cast

'Gregory Hines, Billy Crystal, Jimmy Smits, Steven Bauer'

In [51]:
# Building the list of cast members with an explicit for-loop:
# one comma-separated string of names per movie

cast = []
for c in cast_info:
    cast_links = c.find_all('a')
    cast_names = [link.text for link in cast_links]
    cast.append(", ".join(cast_names))

In [52]:
#cast

In [53]:
# Same list as the for-loop version, written as a single comprehension

cast = [
    ", ".join(anchor.text for anchor in cast_div.find_all('a'))
    for cast_div in cast_info
]

In [54]:
#cast

## Adjusted Score

In [55]:
# The adjusted-score div of each movie
adjusted_score_info = [movie_div.find('div', class_="info countdown-adjusted-score") for movie_div in divs]

In [56]:
len(adjusted_score_info) # Correct number of adjusted scores

140

In [57]:
adjusted_score_info[0] # Once again we only want the text from these divs

<div class="info countdown-adjusted-score"><span class="descriptor">Adjusted Score: </span>58276% <span class="glyphicon glyphicon-question-sign" data-html="true" data-original-title="The Adjusted Score comes from a weighted formula (Bayesian) that we use that accounts for variation in the number of reviews per movie." data-placement="top" data-toggle="tooltip" rel="tooltip" title=""></span></div>

In [58]:
adjusted_text = adjusted_score_info[0].text # Still carries the 'Adjusted Score: ' label and a trailing '% '; both must go before converting to a number
adjusted_text

'Adjusted Score: 58276% '

In [59]:
# Label that prefixes the adjusted-score text; its length is the slice offset used to cut it off
common_adjusted_phrase = 'Adjusted Score: '
common_adjusted_len = len(common_adjusted_phrase)
common_adjusted_len

16

In [60]:
# Slice off the label, strip the trailing '% ', then divide by 1000: '58276% ' -> 58.276
# NOTE(review): the /1000 assumes the page always drops the decimal point before three fractional digits — confirm
adjusted_score = adjusted_text[common_adjusted_len:]
int(adjusted_score.strip('% '))/1000

58.276

In [61]:
# Using a for-loop to create the list of adjusted scores (one float per movie)

adjusted_scores = []
common_adjusted_phrase = 'Adjusted Score: '
common_adjusted_len = len(common_adjusted_phrase)
for a in adjusted_score_info:
    adjusted_text = a.text
    # Drop the leading label when present; otherwise parse the text as-is.
    # The else-branch matters: without it `adjusted_score` would keep the value
    # left over from the previous movie (or from an earlier cell) and a wrong
    # score would be recorded silently.
    if adjusted_text.startswith(common_adjusted_phrase):
        adjusted_score = adjusted_text[common_adjusted_len:]
    else:
        adjusted_score = adjusted_text
    # The remaining text looks like '58276% ': strip the trailing '% ' and
    # divide by 1000 to get 58.276
    result = int(adjusted_score.strip('% '))/1000

    adjusted_scores.append(result)

In [62]:
adjusted_scores[:10]

[58.276,
 42.451,
 101.761,
 43.372,
 93.228,
 91.987,
 90.83,
 73.009,
 72.566,
 51.397]

In [63]:
#adjusted_scores

## Synopsis

In [64]:
# The synopsis div of each movie
synopsis_info = [movie_div.find('div', class_="info synopsis") for movie_div in divs]

In [65]:
len(synopsis_info) # Correct number of synopses

140

In [66]:
synopsis_info[0] # We only need the text

<div class="info synopsis"><span class="descriptor">Synopsis:</span> Ray and Danny (Gregory Hines, Billy Crystal) are two Chicago police detectives hot on the trail of drug kingpin Julio...<a class="" data-pageheader="" href="https://www.rottentomatoes.com/m/1018009-running_scared" target="_top"> [More]</a></div>

In [67]:
synopsis_info[0].text[len('Synopsis: '):] # Looks good

'Ray and Danny (Gregory Hines, Billy Crystal) are two Chicago police detectives hot on the trail of drug kingpin Julio... [More]'

In [68]:
# Label that prefixes the synopsis text; its length is the slice offset used to cut it off
common_synopsis_phrase = 'Synopsis: '
common_synopsis_len = len(common_synopsis_phrase)
common_synopsis_len

10

In [69]:
# Drop the leading 'Synopsis: ' label where present; leave any text without it unchanged

synopsis = [
    blurb[common_synopsis_len:] if blurb.startswith(common_synopsis_phrase) else blurb
    for blurb in (s.text for s in synopsis_info)
]

In [70]:
synopsis[:5]

['Ray and Danny (Gregory Hines, Billy Crystal) are two Chicago police detectives hot on the trail of drug kingpin Julio... [More]',
 'In a futuristic world, a regime has eliminated war by suppressing emotions: books, art and music are strictly forbidden and... [More]',
 'In this visually arresting martial arts epic set in ancient China, an unnamed fighter (Jet Li) is being honored for... [More]',
 'The Double Deuce is the meanest, loudest and rowdiest bar south of the Mason-Dixon Line, and Dalton (Patrick Swayze) has... [More]',
 'When a massive, unmanned locomotive roars out of control, the threat is more ominous than just a derailment. The train... [More]']

In [71]:
#synopsis

# Creating the DataFrame

In [72]:
# pandas is imported here, at first use; start from an empty frame that the next cell fills column by column
import pandas as pd
rotten_tomato_scrape = pd.DataFrame()

In [73]:
# Column header -> scraped list; insertion order is the column order of the final table
scraped_columns = {
    "Movie Name": movie_names,
    "Year": movie_years,
    "Score": movie_scores,
    "Critics Consensus": consensus_text,
    "Directors": directors,
    "Cast": cast,
    "Adjusted Score": adjusted_scores,
    "Synopsis": synopsis,
}
for column_name, column_values in scraped_columns.items():
    rotten_tomato_scrape[column_name] = column_values

In [74]:
rotten_tomato_scrape

Unnamed: 0,Movie Name,Year,Score,Critics Consensus,Directors,Cast,Adjusted Score,Synopsis
0,Running Scared,1986,57,Running Scared struggles to strike a consisten...,Peter Hyams,"Gregory Hines, Billy Crystal, Jimmy Smits, Ste...",58.276,"Ray and Danny (Gregory Hines, Billy Crystal) a..."
1,Equilibrium,2002,41,Equilibrium is a reheated mishmash of other sc...,Kurt Wimmer,"Christian Bale, Emily Watson, Taye Diggs, Angu...",42.451,"In a futuristic world, a regime has eliminated..."
2,Hero,2002,94,With death-defying action sequences and epic h...,Zhang Yimou,"Jet Li, Tony Leung Chiu Wai, Maggie Cheung Man...",101.761,In this visually arresting martial arts epic s...
3,Road House,1989,40,Whether Road House is simply bad or so bad it'...,Rowdy Herrington,"Patrick Swayze, Kelly Lynch, Sam Elliott, Ben ...",43.372,"The Double Deuce is the meanest, loudest and r..."
4,Unstoppable,2010,87,"As fast, loud, and relentless as the train at ...",Tony Scott,"Denzel Washington, Chris Pine, Rosario Dawson,...",93.228,"When a massive, unmanned locomotive roars out ..."
...,...,...,...,...,...,...,...,...
135,Hard-Boiled,1992,94,Boasting impactful action as well as surprisin...,John Woo,"Chow Yun-Fat, Bowie Lam, Philip Chan, Tony Leu...",96.488,A cop who loses his partner in a shoot-out wit...
136,The Matrix,1999,88,"Thanks to the Wachowskis' imaginative vision, ...","Andy Wachowski, Larry Wachowski","Keanu Reeves, Laurence Fishburne, Carrie-Anne ...",94.059,Neo (Keanu Reeves) believes that Morpheus (Lau...
137,Terminator 2: Judgment Day,1991,93,T2 features thrilling action sequences and eye...,James Cameron,"Arnold Schwarzenegger, Linda Hamilton, Edward ...",98.529,"In this sequel set eleven years after ""The Ter..."
138,Die Hard,1988,94,Its many imitators (and sequels) have never co...,John McTiernan,"Bruce Willis, Alan Rickman, Bonnie Bedelia, Re...",99.270,New York City policeman John McClane (Bruce Wi...


In [75]:
# Write the finished table to CSV as UTF-8, leaving out the row-index column
rotten_tomato_scrape.to_csv(
    'Scraped_RottenTomatoes_Data.csv',
    encoding='utf-8',
    index=False,
)