The Act of Web Scraping Using Python Libraries

In [None]:
import numpy as np
import pandas as pd
import requests

In [None]:
#uncomment below to install plotly
!pip install plotly

import plotly.express as px



In [None]:
# Wikipedia page that lists the highest-grossing films
url = "https://en.wikipedia.org/wiki/List_of_highest-grossing_films"

# Download the page and parse every HTML table on it;
# read_html returns a list with one DataFrame per table found.
tables_list = pd.read_html(io=url)

# Bare last expression so the notebook renders what was parsed
tables_list

[    Rank  Peak                                          Title  \
 0      1     1                                         Avatar   
 1      2     1                              Avengers: Endgame   
 2      3     3                       Avatar: The Way of Water   
 3      4     1                                        Titanic   
 4      5     3                   Star Wars: The Force Awakens   
 5      6     4                         Avengers: Infinity War   
 6      7     6                        Spider-Man: No Way Home   
 7      8     3                                 Jurassic World   
 8      9     7                                  The Lion King   
 9     10     3                                   The Avengers   
 10    11     4                                      Furious 7   
 11    12    11                              Top Gun: Maverick   
 12    13    10                                      Frozen II   
 13    14    14                                       Barbie †   
 14    15 

In [None]:
# How many tables were returned?
# tables_list holds one entry per table fetched from the page above.
len(tables_list)

89

In [None]:
# We're only interested in the first table (index 0 of tables_list).
# Preview its first rows to confirm it is the film ranking we want.
df = tables_list[0]
df.head()

Unnamed: 0,Rank,Peak,Title,Worldwide gross,Year,Reference(s)
0,1,1,Avatar,"$2,923,706,026",2009,[# 1][# 2]
1,2,1,Avengers: Endgame,"$2,797,501,328",2019,[# 3][# 4]
2,3,3,Avatar: The Way of Water,"$2,320,250,281",2022,[# 5][# 6]
3,4,1,Titanic,"T$2,257,844,554",1997,[# 7][# 8]
4,5,3,Star Wars: The Force Awakens,"$2,068,223,624",2015,[# 9][# 10]


In [None]:
# Let's remove the "Reference(s)" column since we don't need it.
# The result is derived from the scraped table itself rather than from df, so this
# cell can be re-run safely: dropping from df a second time would raise KeyError
# because the column is already gone.
df = tables_list[0].drop(columns=["Reference(s)"])
df

Unnamed: 0,Rank,Peak,Title,Worldwide gross,Year
0,1,1,Avatar,"$2,923,706,026",2009
1,2,1,Avengers: Endgame,"$2,797,501,328",2019
2,3,3,Avatar: The Way of Water,"$2,320,250,281",2022
3,4,1,Titanic,"T$2,257,844,554",1997
4,5,3,Star Wars: The Force Awakens,"$2,068,223,624",2015
5,6,4,Avengers: Infinity War,"$2,048,359,754",2018
6,7,6,Spider-Man: No Way Home,"$1,921,847,111",2021
7,8,3,Jurassic World,"$1,671,537,444",2015
8,9,7,The Lion King,"$1,656,943,394",2019
9,10,3,The Avengers,"$1,518,815,515",2012


In [None]:
# Let's perform data cleaning.
# Start by inspecting the column dtypes and non-null counts to see which
# columns were read as text (object) and therefore need converting to numbers.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Rank             50 non-null     int64 
 1   Peak             50 non-null     object
 2   Title            50 non-null     object
 3   Worldwide gross  50 non-null     object
 4   Year             50 non-null     int64 
dtypes: int64(2), object(3)
memory usage: 2.1+ KB


In [None]:
# Clean the two text columns that carry film markers next to the number.
#
# Some cells hold a marker as well as the value, e.g. a suffix such as "RK" in Peak
# or a prefix before the dollar sign such as "T$2,257,844,554" in Worldwide gross.
# A marker can itself contain digits, so gluing together every digit run found in
# the cell silently corrupts the number. Each extraction below is therefore
# anchored on the real value instead.

# Peak: keep only the leading run of digits and ignore any marker that follows it.
df['Peak'] = (
    df['Peak']
    .astype('str')
    .str.extract(r'^\s*(\d+)', expand=False)
    .astype('int64')
)

# Worldwide gross: keep the text after the last "$" (the whole text when there is no
# "$", so re-running this cell on already-cleaned values is harmless), remove the
# "," thousands separators, and store as int64 because the amounts do not fit in a
# 32-bit integer. Anything unexpected left in the text makes astype fail loudly
# rather than produce a wrong number.
df['Worldwide gross'] = (
    df['Worldwide gross']
    .astype('str')
    .str.rsplit('$', n=1)
    .str[-1]
    .str.replace(',', '', regex=False)
    .str.strip()
    .astype('int64')
)

In [None]:
# Set the final dtypes for the cleaned columns:
# Peak as a 64-bit integer and Year as a categorical label.
df = df.astype({"Peak": "int64", "Year": "category"})

df

Unnamed: 0,Rank,Peak,Title,Worldwide gross,Year
0,1,1,Avatar,2923706026,2009
1,2,1,Avengers: Endgame,2797501328,2019
2,3,3,Avatar: The Way of Water,2320250281,2022
3,4,1,Titanic,2257844554,1997
4,5,3,Star Wars: The Force Awakens,2068223624,2015
5,6,4,Avengers: Infinity War,2048359754,2018
6,7,6,Spider-Man: No Way Home,1921847111,2021
7,8,3,Jurassic World,1671537444,2015
8,9,7,The Lion King,1656943394,2019
9,10,3,The Avengers,1518815515,2012


In [None]:
# Additional data about each movie: one genre label per film.
# The list is positional - entry i belongs to row i of the table (rank i + 1) -
# so it must contain exactly one value per row, in rank order.
genre = [
    # ranks 1-5
    "Science Fiction", "Superhero", "Science Fiction", "Romance", "Science Fiction",
    # ranks 6-10
    "Superhero", "Superhero", "Action", "Musical Drama", "Superhero",
    # ranks 11-15
    "Action", "Action", "Animation", "Animation", "Superhero",
    # ranks 16-20
    "Animation", "Superhero", "Thriller", "Science Fiction", "Action",
    # ranks 21-25
    "Animation", "Animation", "Animation", "Action", "Superhero",
    # ranks 26-30
    "Animation", "Superhero", "Superhero", "Adventure", "Superhero",
    # ranks 31-35
    "Superhero", "Science Fiction", "Action", "Science Fiction", "Superhero",
    # ranks 36-40
    "Thriller", "Science Fiction", "Animation", "Animation", "Adventure",
    # ranks 41-45
    "Science Fiction", "Action", "Animation", "Animation", "Action",
    # ranks 46-50
    "Action", "Science Fiction", "Fantasy", "Animation", "Adventure",
]

In [None]:

# Attach the genre labels as a new "Genre" column; values are matched to rows by position.
df = df.assign(Genre=genre)
df

Unnamed: 0,Rank,Peak,Title,Worldwide gross,Year,Genre
0,1,1,Avatar,2923706026,2009,Science Fiction
1,2,1,Avengers: Endgame,2797501328,2019,Superhero
2,3,3,Avatar: The Way of Water,2320250281,2022,Science Fiction
3,4,1,Titanic,2257844554,1997,Romance
4,5,3,Star Wars: The Force Awakens,2068223624,2015,Science Fiction
5,6,4,Avengers: Infinity War,2048359754,2018,Superhero
6,7,6,Spider-Man: No Way Home,1921847111,2021,Superhero
7,8,3,Jurassic World,1671537444,2015,Action
8,9,7,The Lion King,1656943394,2019,Musical Drama
9,10,3,The Avengers,1518815515,2012,Superhero


## Now, the data has been cleaned. Let's perform some analyses

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# Use the min/max rows as a sanity check that the cleaned values are plausible.
df.describe()

Unnamed: 0,Rank,Peak,Worldwide gross
count,50.0,50.0,50.0
mean,25.5,11.62,2968463000.0
std,14.57738,10.157715,11303900000.0
min,1.0,1.0,1017031000.0
25%,13.25,4.0,1073582000.0
50%,25.5,9.0,1187128000.0
75%,37.75,15.0,1484279000.0
max,50.0,43.0,81238760000.0


In [None]:
# Which movie grossed the highest revenue?
# nlargest returns the whole row for the top value, so the title, year and
# genre are shown alongside the amount.
df.nlargest(n=1, columns="Worldwide gross")

Unnamed: 0,Rank,Peak,Title,Worldwide gross,Year,Genre
23,24,11,The Fate of the Furious,81238764765,2017,Action


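Observation: The output above names 'The Fate of the Furious', but this is a data-cleaning artifact rather than a real finding. That film is ranked 24th, so it cannot out-gross 'Avatar' (rank 1, $2,923,706,026); its value of 81,238,764,765 most likely picked up an extra leading digit when the digit groups of the scraped 'Worldwide gross' text were concatenated. 'Avatar' is the highest-grossing film in the table.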

In [None]:
# Worldwide gross by genre
# Sum the gross per genre first so every bar is a single genre total (instead of one
# stacked segment per film), order the bars from highest to lowest, and give the
# figure a title so it can be read on its own.
px.bar(
    data_frame=(
        df.groupby("Genre", as_index=False)["Worldwide gross"]
        .sum()
        .sort_values("Worldwide gross", ascending=False)
    ),
    x="Genre",
    y="Worldwide gross",
    title="Total worldwide gross by genre",
)