# ETL ( Extraction, Transformation, Loading) part

# Extraction

In [19]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

base_url = 'https://www.imdb.com/search/title/?count=100&groups=top_1000&sort=user_rating'

num_pages = 11

movies = []

for i in range(1, num_pages+1):
    url = base_url + f'&start={i*100}'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    movie_containers = soup.find_all('div', class_='lister-item mode-advanced')
    
    for movie in movie_containers:
        title = movie.h3.a.text
        year = movie.h3.find('span', class_='lister-item-year').text.strip('()')
        rating = movie.strong.text
        runtime = movie.find('span', class_='runtime').text
        genre = movie.find('span', class_='genre').text.strip()
        director = movie.find_all('p')[2].a.text
        stars = [star.text for star in movie.find_all('p')[2].find_all('a')[1:]]
        gross = movie.find_all('span', attrs={'name': 'nv'})[0].text
        
        movie_data = {
            'title': title,
            'year': year,
            'rating': rating,
            'runtime': runtime,
            'genre': genre,
            'director': director,
            'stars': stars,
            'gross': gross
        }
        
        movies.append(movie_data)
    
    # pause the loop for a few seconds to avoid overwhelming the website with requests
    time.sleep(2)

# Transformation

In [20]:
# create a pandas dataframe from the list of movies
imdb_dataset = pd.DataFrame(movies)

In [21]:
imdb_dataset.head()

Unnamed: 0,title,year,rating,runtime,genre,director,stars,gross
0,2001: A Space Odyssey,1968,8.3,149 min,"Adventure, Sci-Fi",Stanley Kubrick,"[Keir Dullea, Gary Lockwood, William Sylvester...",680413
1,Reservoir Dogs,1992,8.3,99 min,"Crime, Thriller",Quentin Tarantino,"[Harvey Keitel, Tim Roth, Michael Madsen, Chri...",1035752
2,Scarface,1983,8.3,170 min,"Crime, Drama",Brian De Palma,"[Al Pacino, Michelle Pfeiffer, Steven Bauer, M...",858124
3,Full Metal Jacket,1987,8.3,116 min,"Drama, War",Stanley Kubrick,"[Matthew Modine, R. Lee Ermey, Vincent D'Onofr...",753636
4,Hamilton,2020,8.3,160 min,"Biography, Drama, History",Thomas Kail,"[Lin-Manuel Miranda, Phillipa Soo, Leslie Odom...",99582


In [22]:
imdb_dataset['year'] = imdb_dataset['year'].str.replace('[^a-zA-Z0-9\w\s]', '', regex=True)

In [23]:
import re
imdb_dataset['year'] = imdb_dataset['year'].apply(lambda x: re.findall('\d+', str(x))[0]).astype(int)

In [24]:
imdb_dataset['runtime_in_min'] = imdb_dataset['runtime'].apply(lambda x: re.findall('\d+', str(x))[0]).astype(float)

In [25]:
imdb_dataset['rating'] = imdb_dataset['rating'].astype(float)

In [26]:
imdb_dataset['gross'] = pd.to_numeric(imdb_dataset['gross'].str.replace(',', ''), errors='coerce').astype(int)

In [27]:
imdb_dataset = imdb_dataset.drop('runtime', axis=1)

In [28]:
imdb_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 901 entries, 0 to 900
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           901 non-null    object 
 1   year            901 non-null    int32  
 2   rating          901 non-null    float64
 3   genre           901 non-null    object 
 4   director        901 non-null    object 
 5   stars           901 non-null    object 
 6   gross           901 non-null    int32  
 7   runtime_in_min  901 non-null    float64
dtypes: float64(2), int32(2), object(4)
memory usage: 49.4+ KB


In [29]:
imdb_dataset

Unnamed: 0,title,year,rating,genre,director,stars,gross,runtime_in_min
0,2001: A Space Odyssey,1968,8.3,"Adventure, Sci-Fi",Stanley Kubrick,"[Keir Dullea, Gary Lockwood, William Sylvester...",680413,149.0
1,Reservoir Dogs,1992,8.3,"Crime, Thriller",Quentin Tarantino,"[Harvey Keitel, Tim Roth, Michael Madsen, Chri...",1035752,99.0
2,Scarface,1983,8.3,"Crime, Drama",Brian De Palma,"[Al Pacino, Michelle Pfeiffer, Steven Bauer, M...",858124,170.0
3,Full Metal Jacket,1987,8.3,"Drama, War",Stanley Kubrick,"[Matthew Modine, R. Lee Ermey, Vincent D'Onofr...",753636,116.0
4,Hamilton,2020,8.3,"Biography, Drama, History",Thomas Kail,"[Lin-Manuel Miranda, Phillipa Soo, Leslie Odom...",99582,160.0
...,...,...,...,...,...,...,...,...
896,Les yeux sans visage,1960,7.6,"Drama, Horror",Georges Franju,"[Pierre Brasseur, Alida Valli, Juliette Maynie...",32977,90.0
897,Shine,1996,7.6,"Biography, Drama, Music",Scott Hicks,"[Geoffrey Rush, Armin Mueller-Stahl, Justin Br...",55114,105.0
898,The Invisible Man,1933,7.6,"Horror, Sci-Fi",James Whale,"[Claude Rains, Gloria Stuart, William Harrigan...",37154,71.0
899,The Odd Couple,1968,7.6,Comedy,Gene Saks,"[Jack Lemmon, Walter Matthau, John Fiedler, He...",35987,105.0


# Loading

In [30]:
file_path = r'C:\Users\ashut\Downloads\imdb_dataset.csv'

# write the dataframe to a CSV file at the desired location
imdb_dataset.to_csv(file_path, index=False)

In [31]:
file_path = r'C:\Users\ashut\Downloads\imdb_dataset.json'
imdb_dataset.to_json(file_path, orient='records')