# ETL ( Extraction, Transformation, Loading) part

# Extraction

In [1]:
import pandas as pd

pd.set_option('display.max_colwidth', None)


import requests
from bs4 import BeautifulSoup

url = 'https://www.imdb.com/search/title/?count=100&groups=top_1000&sort=user_rating'
response = requests.get(url)

soup = BeautifulSoup(response.text, 'html.parser')

# Find the total number of pages
num_pages = 10

# Loop through each page and extract movie information
for i in range(1, num_pages+1):
    page_url = f'{url}?page={i}'
    response = requests.get(page_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    

In [2]:
# Find all the movie items on the page
movie_items = soup.find_all('div', {'class': 'lister-item-content'})
data = []
# Loop through each movie item and extract the required information
for item in movie_items:
    total = {}
    total['title'] = item.h3.a.text
    total['year'] = item.find('span', {'class': 'lister-item-year'}).text.strip('()')
    total['rating'] = item.find('div', {'class': 'inline-block ratings-imdb-rating'}).text.strip()
    total['runtime'] = item.find('span', {'class': 'runtime'}).text         
    total['description'] = item.find_all('p')[1].text.strip()
    total['genre'] = item.find('span', {'class': 'genre'}).text.strip()

    data.append(total)
        

In [3]:
data

[{'title': 'John Wick: Chapter 4',
  'year': '2023',
  'rating': '8.3',
  'runtime': '169 min',
  'description': 'John Wick uncovers a path to defeating The High Table. But before he can earn his freedom, Wick must face off against a new enemy with powerful alliances across the globe and forces that turn old friends into foes.',
  'genre': 'Action, Crime, Thriller'},
 {'title': 'Avatar: The Way of Water',
  'year': '2022',
  'rating': '7.7',
  'runtime': '192 min',
  'description': "Jake Sully lives with his newfound family formed on the extrasolar moon Pandora. Once a familiar threat returns to finish what was previously started, Jake must work with Neytiri and the army of the Na'vi race to protect their home.",
  'genre': 'Action, Adventure, Fantasy'},
 {'title': 'Everything Everywhere All at Once',
  'year': '2022',
  'rating': '7.9',
  'runtime': '139 min',
  'description': 'A middle-aged Chinese immigrant is swept up into an insane adventure in which she alone can save existence b

## Transformation

In [4]:
imdb = pd.DataFrame(data)
imdb

Unnamed: 0,title,year,rating,runtime,description,genre
0,John Wick: Chapter 4,2023,8.3,169 min,"John Wick uncovers a path to defeating The High Table. But before he can earn his freedom, Wick must face off against a new enemy with powerful alliances across the globe and forces that turn old friends into foes.","Action, Crime, Thriller"
1,Avatar: The Way of Water,2022,7.7,192 min,"Jake Sully lives with his newfound family formed on the extrasolar moon Pandora. Once a familiar threat returns to finish what was previously started, Jake must work with Neytiri and the army of the Na'vi race to protect their home.","Action, Adventure, Fantasy"
2,Everything Everywhere All at Once,2022,7.9,139 min,A middle-aged Chinese immigrant is swept up into an insane adventure in which she alone can save existence by exploring other universes and connecting with the lives she could have led.,"Action, Adventure, Comedy"
3,The Whale,2022,7.7,117 min,"A reclusive, morbidly obese English teacher attempts to reconnect with his estranged teenage daughter.",Drama
4,Top Gun: Maverick,2022,8.3,130 min,"After thirty years, Maverick is still pushing the envelope as a top naval aviator, but must confront ghosts of his past when he leads TOP GUN's elite graduates on a mission that demands the ultimate sacrifice from those chosen to fly it.","Action, Drama"
...,...,...,...,...,...,...
95,The Big Short,2015,7.8,130 min,"In 2006-2007 a group of investors bet against the United States mortgage market. In their research, they discover how flawed and corrupt the market is.","Biography, Comedy, Drama"
96,There Will Be Blood,2007,8.2,158 min,"A story of family, religion, hatred, oil and madness, focusing on a turn-of-the-century prospector in the early days of the business.",Drama
97,Zack Snyder's Justice League,2021,8.0,242 min,"Determined to ensure that Superman's ultimate sacrifice wasn't in vain, Bruce Wayne recruits a team of metahumans to protect the world from an approaching threat of catastrophic proportions.","Action, Adventure, Fantasy"
98,A Clockwork Orange,1971,8.3,136 min,"In the future, a sadistic gang leader is imprisoned and volunteers for a conduct-aversion experiment, but it doesn't go as planned.","Crime, Sci-Fi"


In [5]:
imdb['year'] = imdb['year'].str.replace('[^a-zA-Z0-9\w\s]', '', regex=True)

In [6]:
import re

imdb['year'] = imdb['year'].apply(lambda x: re.findall('\d+', str(x))[0]).astype(int)

In [7]:
imdb['runtime_in_min'] = imdb['runtime'].apply(lambda x: re.findall('\d+', str(x))[0]).astype(float)

In [8]:
imdb['rating'] = imdb['rating'].astype(float)

In [9]:
imdb['year'] = imdb['year'].astype(int)

In [10]:
imdb['rating'] = imdb['rating'].astype(float)

In [11]:
unique_values = imdb['year'].unique()
unique_values

array([2023, 2022, 1972, 1994, 2014, 2019, 2011, 2009, 1967, 2021, 2001,
       2008, 1997, 2013, 1999, 2010, 1993, 1990, 2017, 1985, 2006, 2000,
       1957, 1995, 2005, 1976, 1991, 2015, 2012, 1975, 2003, 1979, 1977,
       2004, 1974, 1981, 2016, 1960, 1980, 2007, 1986, 1987, 2018, 1971])

In [12]:
imdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           100 non-null    object 
 1   year            100 non-null    int32  
 2   rating          100 non-null    float64
 3   runtime         100 non-null    object 
 4   description     100 non-null    object 
 5   genre           100 non-null    object 
 6   runtime_in_min  100 non-null    float64
dtypes: float64(2), int32(1), object(4)
memory usage: 5.2+ KB


In [13]:
imdb.head()

Unnamed: 0,title,year,rating,runtime,description,genre,runtime_in_min
0,John Wick: Chapter 4,2023,8.3,169 min,"John Wick uncovers a path to defeating The High Table. But before he can earn his freedom, Wick must face off against a new enemy with powerful alliances across the globe and forces that turn old friends into foes.","Action, Crime, Thriller",169.0
1,Avatar: The Way of Water,2022,7.7,192 min,"Jake Sully lives with his newfound family formed on the extrasolar moon Pandora. Once a familiar threat returns to finish what was previously started, Jake must work with Neytiri and the army of the Na'vi race to protect their home.","Action, Adventure, Fantasy",192.0
2,Everything Everywhere All at Once,2022,7.9,139 min,A middle-aged Chinese immigrant is swept up into an insane adventure in which she alone can save existence by exploring other universes and connecting with the lives she could have led.,"Action, Adventure, Comedy",139.0
3,The Whale,2022,7.7,117 min,"A reclusive, morbidly obese English teacher attempts to reconnect with his estranged teenage daughter.",Drama,117.0
4,Top Gun: Maverick,2022,8.3,130 min,"After thirty years, Maverick is still pushing the envelope as a top naval aviator, but must confront ghosts of his past when he leads TOP GUN's elite graduates on a mission that demands the ultimate sacrifice from those chosen to fly it.","Action, Drama",130.0


In [14]:
imdb = imdb.drop('runtime', axis=1)
imdb.head()

Unnamed: 0,title,year,rating,description,genre,runtime_in_min
0,John Wick: Chapter 4,2023,8.3,"John Wick uncovers a path to defeating The High Table. But before he can earn his freedom, Wick must face off against a new enemy with powerful alliances across the globe and forces that turn old friends into foes.","Action, Crime, Thriller",169.0
1,Avatar: The Way of Water,2022,7.7,"Jake Sully lives with his newfound family formed on the extrasolar moon Pandora. Once a familiar threat returns to finish what was previously started, Jake must work with Neytiri and the army of the Na'vi race to protect their home.","Action, Adventure, Fantasy",192.0
2,Everything Everywhere All at Once,2022,7.9,A middle-aged Chinese immigrant is swept up into an insane adventure in which she alone can save existence by exploring other universes and connecting with the lives she could have led.,"Action, Adventure, Comedy",139.0
3,The Whale,2022,7.7,"A reclusive, morbidly obese English teacher attempts to reconnect with his estranged teenage daughter.",Drama,117.0
4,Top Gun: Maverick,2022,8.3,"After thirty years, Maverick is still pushing the envelope as a top naval aviator, but must confront ghosts of his past when he leads TOP GUN's elite graduates on a mission that demands the ultimate sacrifice from those chosen to fly it.","Action, Drama",130.0


In [15]:
imdb

Unnamed: 0,title,year,rating,description,genre,runtime_in_min
0,John Wick: Chapter 4,2023,8.3,"John Wick uncovers a path to defeating The High Table. But before he can earn his freedom, Wick must face off against a new enemy with powerful alliances across the globe and forces that turn old friends into foes.","Action, Crime, Thriller",169.0
1,Avatar: The Way of Water,2022,7.7,"Jake Sully lives with his newfound family formed on the extrasolar moon Pandora. Once a familiar threat returns to finish what was previously started, Jake must work with Neytiri and the army of the Na'vi race to protect their home.","Action, Adventure, Fantasy",192.0
2,Everything Everywhere All at Once,2022,7.9,A middle-aged Chinese immigrant is swept up into an insane adventure in which she alone can save existence by exploring other universes and connecting with the lives she could have led.,"Action, Adventure, Comedy",139.0
3,The Whale,2022,7.7,"A reclusive, morbidly obese English teacher attempts to reconnect with his estranged teenage daughter.",Drama,117.0
4,Top Gun: Maverick,2022,8.3,"After thirty years, Maverick is still pushing the envelope as a top naval aviator, but must confront ghosts of his past when he leads TOP GUN's elite graduates on a mission that demands the ultimate sacrifice from those chosen to fly it.","Action, Drama",130.0
...,...,...,...,...,...,...
95,The Big Short,2015,7.8,"In 2006-2007 a group of investors bet against the United States mortgage market. In their research, they discover how flawed and corrupt the market is.","Biography, Comedy, Drama",130.0
96,There Will Be Blood,2007,8.2,"A story of family, religion, hatred, oil and madness, focusing on a turn-of-the-century prospector in the early days of the business.",Drama,158.0
97,Zack Snyder's Justice League,2021,8.0,"Determined to ensure that Superman's ultimate sacrifice wasn't in vain, Bruce Wayne recruits a team of metahumans to protect the world from an approaching threat of catastrophic proportions.","Action, Adventure, Fantasy",242.0
98,A Clockwork Orange,1971,8.3,"In the future, a sadistic gang leader is imprisoned and volunteers for a conduct-aversion experiment, but it doesn't go as planned.","Crime, Sci-Fi",136.0


# Loading 

In [16]:
IMDB_descriptive_data = imdb

In [17]:
output_file = r'C:\Users\ashut\Downloads\IMDB_descriptive_data.csv'

# write the dataframe to a CSV file at the desired location
IMDB_descriptive_data.to_csv(output_file, index=False)

In [18]:
file_path = r'C:\Users\ashut\Downloads\IMDB_descriptive_data.json'
IMDB_descriptive_data.to_json(file_path, orient='records')