# Scraping movies from box_office_mojo website 

## Import libraries and connect API

In [153]:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import pandas as pd

In [101]:
obj_mov=webdriver.Chrome()

In [102]:
obj_mov.get('https://www.boxofficemojo.com/chart/top_lifetime_gross/?area=XWW')

### get each movie on this page with all characteristics belonging to it

In [103]:
movies=obj_mov.find_elements("xpath","//tbody//tr")

In [104]:
for i in movies:
    print(i.text)


1 Avatar $2,923,706,026 2009
2 Avengers: Endgame $2,799,439,100 2019
3 Avatar: The Way of Water $2,320,250,281 2022
4 Titanic $2,264,750,694 1997
5 Star Wars: Episode VII - The Force Awakens $2,071,310,218 2015
6 Avengers: Infinity War $2,052,415,039 2018
7 Spider-Man: No Way Home $1,952,723,719 2021
8 Inside Out 2 $1,698,778,437 2024
9 Jurassic World $1,671,537,444 2015
10 The Lion King $1,662,020,819 2019
11 The Avengers $1,520,538,536 2012
12 Furious 7 $1,515,342,457 2015
13 Top Gun: Maverick $1,495,696,292 2022
14 Frozen II $1,453,683,476 2019
15 Barbie $1,446,938,421 2023
16 Avengers: Age of Ultron $1,405,018,048 2015
17 The Super Mario Bros. Movie $1,360,847,665 2023
18 Black Panther $1,349,926,083 2018
19 Harry Potter and the Deathly Hallows: Part 2 $1,342,499,744 2011
20 Deadpool & Wolverine $1,338,073,645 2024
21 Star Wars: Episode VIII - The Last Jedi $1,334,407,706 2017
22 Jurassic World: Fallen Kingdom $1,310,469,037 2018
23 Frozen $1,306,450,154 2013
24 Iron Man 3 $1,266,

186 The Wandering Earth II $615,023,132 2023
187 The Passion of the Christ $612,060,372 2004
188 Mamma Mia! $611,452,132 2008
189 Life of Pi $609,016,565 2012
190 Ready Player One $607,874,422 2018
191 Transformers: The Last Knight $605,425,157 2017
192 Madagascar: Escape 2 Africa $603,900,354 2008
193 War of the Worlds $603,873,504 2005
194 Tangled $591,806,017 2010
195 Quantum of Solace $589,593,688 2008
196 Men in Black $589,390,539 1997
197 The Croods $587,266,745 2013
198 The Hangover Part II $586,764,305 2011
199 Iron Man $585,796,247 2008
200 I Am Legend $585,410,052 2007


## make list for each characteristic belonging to each movie

In [146]:
ranks=[]
titles=[]
moneys=[]
year_movies=[]


In [147]:
import time

In [148]:
time.sleep(5)
for movie in movies:   
    try:
        rank=movie.find_element("xpath", ".//td[@class='a-text-right mojo-header-column mojo-truncate mojo-field-type-rank']").text
        ranks.append(rank)
    except NoSuchElementException:
        rank = "no rank"
        
    try:
        title=movie.find_element("xpath", ".//td[@class='a-text-left mojo-field-type-title']//A[@class='a-link-normal']").text
        titles.append(title)
    except NoSuchElementException:
        title = "no title"

    try:
        money=movie.find_element("xpath", ".//td[@class='a-text-right mojo-field-type-money']").text
        moneys.append(money)
    except NoSuchElementException:
        money = "no money"
    
    try:
        year = movie.find_element("xpath", ".//td[@class='a-text-left mojo-field-type-year']").text
        year_movies.append(year)
    except NoSuchElementException:
        year = "no year"

In [150]:
print(titles[:10])
print("-----------------------------------------------------------------------------------")
print(ranks[:10])
print("-----------------------------------------------------------------------------------")
print(moneys[:10])
print("-----------------------------------------------------------------------------------")
print(year_movies[:10])

['Avatar', 'Avengers: Endgame', 'Avatar: The Way of Water', 'Titanic', 'Star Wars: Episode VII - The Force Awakens', 'Avengers: Infinity War', 'Spider-Man: No Way Home', 'Inside Out 2', 'Jurassic World', 'The Lion King']
-----------------------------------------------------------------------------------
['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
-----------------------------------------------------------------------------------
['$2,923,706,026', '$2,799,439,100', '$2,320,250,281', '$2,264,750,694', '$2,071,310,218', '$2,052,415,039', '$1,952,723,719', '$1,698,778,437', '$1,671,537,444', '$1,662,020,819']
-----------------------------------------------------------------------------------
['2009', '2019', '2022', '1997', '2015', '2018', '2021', '2024', '2015', '2019']


### check length for each list

In [151]:
print(len(titles))
print("-----------------------------------------------------------------------------------")
print(len(ranks))
print("-----------------------------------------------------------------------------------")
print(len(moneys))
print("-----------------------------------------------------------------------------------")
print(len(year_movies))

200
-----------------------------------------------------------------------------------
200
-----------------------------------------------------------------------------------
200
-----------------------------------------------------------------------------------
200


### Create dataframe from Lists

In [154]:
df_movies = pd.DataFrame(zip(ranks,titles,moneys,year_movies), columns=['movie_rank','movie_title','movie_lifetime','movie_year_release'])
df_movies

Unnamed: 0,movie_rank,movie_title,movie_lifetime,movie_year_release
0,1,Avatar,"$2,923,706,026",2009
1,2,Avengers: Endgame,"$2,799,439,100",2019
2,3,Avatar: The Way of Water,"$2,320,250,281",2022
3,4,Titanic,"$2,264,750,694",1997
4,5,Star Wars: Episode VII - The Force Awakens,"$2,071,310,218",2015
...,...,...,...,...
195,196,Men in Black,"$589,390,539",1997
196,197,The Croods,"$587,266,745",2013
197,198,The Hangover Part II,"$586,764,305",2011
198,199,Iron Man,"$585,796,247",2008


### Save the data in csv file

In [155]:
df_movies.to_csv("movies.csv")