In [1]:
from bs4 import BeautifulSoup
import requests 
import pandas as pd
import re

# Scraping the data from IMDB with beautiful soup.

In [2]:
# Fetch the IMDb Top 250 listing in a single request and parse it with Beautiful Soup.
url2 = "https://www.imdb.com/search/title/?count=250&groups=top_250&sort=user_rating"
# A timeout keeps the cell from hanging forever on a stalled connection.
response2 = requests.get(url2, timeout=30)
# Fail here on 4xx/5xx instead of parsing an error page into empty lists.
response2.raise_for_status()
soup2 = BeautifulSoup(response2.content, 'lxml')

# One <div class="lister-item mode-advanced"> is expected per movie.
movie = soup2.find_all("div", attrs={"class": "lister-item mode-advanced"})
if not movie:
    # Without this check an empty result only surfaces later as `KeyError: 0`.
    raise RuntimeError(
        "No 'lister-item mode-advanced' entries found; "
        "the page markup may have changed or the request was blocked."
    )

# Heading text of each entry; the frame displayed below shows it as
# "<rank>.\n<title>\n(<year>)".
movie_name = [item.find("h3").text.strip() for item in movie]
movie_rating = [item.find("strong").text.strip() for item in movie]
# Kept as Tag-or-None (no .text here): find() returns None when an entry has no
# certificate span, and the classification cell tests for that.
movie_cert = [item.find("span", attrs={"class": "certificate"}) for item in movie]
# Genre text is comma separated; strip the commas and split on whitespace to get
# a list of genre names per movie.
movie_genre = [
    item.find("span", attrs={"class": "genre"}).text.replace(",", "").split()
    for item in movie
]
# Single-column frame (column label 0) holding the raw heading strings.
imdb = pd.DataFrame(movie_name)

In [3]:
# Show the one-column frame of raw scraped heading strings.
imdb

Unnamed: 0,0
0,1.\nThe Shawshank Redemption\n(1994)
1,2.\nThe Godfather\n(1972)
2,3.\nThe Dark Knight\n(2008)
3,4.\nThe Godfather: Part II\n(1974)
4,5.\nThe Lord of the Rings: The Return of the K...
...,...
245,246.\nThe General\n(1926)
246,247.\nBefore Sunset\n(2004)
247,"248.\nMonsters, Inc.\n(2001)"
248,249.\nAladdin\n(1992)


# Setting the relevant data into arrays with regex groups

In [4]:
# Split every scraped heading ("<rank>.\n<title>\n(<year>)") into three parallel lists.
# All values stay strings here; numeric conversion happens in the cleaning cell.
rank = []
title = []
year = []
# Raw strings so \d, \. and \( reach the regex engine without invalid-escape warnings.
# Groups: 1 = rank digits before the dot, 2 = the title line, 3 = the line after it.
regex = r"(\d+)\.\n(.*)\n(.*)"
# The year is the first parenthesised four-digit number on that last line. Searching
# for it separately skips any leading parenthesis that holds no digits (presumably a
# "(I)"-style disambiguator - TODO confirm), which used to yield an empty year.
year_regex = r"\((\d{4})"
# Iterate the scraped strings directly (imdb[0] is built from movie_name) so this
# cell can be re-run after column 0 has been renamed to 'Title'.
for heading in movie_name:
    parsed = re.search(regex, heading)
    if parsed is None:
        # Name the offending text instead of failing with AttributeError on None.
        raise ValueError(f"Unexpected title format: {heading!r}")
    year_found = re.search(year_regex, parsed.group(3))
    rank.append(parsed.group(1))
    title.append(parsed.group(2))
    # Keep "" as the missing-year marker; the cleaning cell maps "" to "0".
    year.append(year_found.group(1) if year_found else "")

In [5]:
# Build the working frame from the parsed lists in one step.
# Re-creating the frame (rather than overwriting column 0 and then renaming the
# columns) keeps this cell idempotent: it can be re-run without adding a stray
# column or hitting a column-count mismatch on the rename.
imdb = pd.DataFrame({
    'Title': title,
    'IMDB Rating': movie_rating,
    'IMDB Rank': rank,
    'Year': year,
})

In [6]:
# Show the frame after the title/rank/year split and the column rename.
imdb

Unnamed: 0,Title,IMDB Rating,IMDB Rank,Year
0,The Shawshank Redemption,9.3,1,1994
1,The Godfather,9.2,2,1972
2,The Dark Knight,9.0,3,2008
3,The Godfather: Part II,9.0,4,1974
4,The Lord of the Rings: The Return of the King,8.9,5,2003
...,...,...,...,...
245,The General,8.1,246,1926
246,Before Sunset,8.0,247,2004
247,"Monsters, Inc.",8.0,248,2001
248,Aladdin,8.0,249,1992


# classification of two groups - young, adult

In [7]:
# Map each scraped certificate onto one of two audience groups.
young = ["PG", "PG-13", "G"]
adult = ["16", "R", "18"]
cert_class = []
for cert_tag in movie_cert:
    # movie_cert holds Tag-or-None; None means the entry had no certificate span.
    label = cert_tag.text.strip() if cert_tag else None
    if label in young:
        cert_class.append("Young")
    elif label in adult:
        cert_class.append("Adult")
    else:
        # Missing certificate, or one that is in neither list: leave unclassified
        # instead of defaulting every unknown value to "Adult".
        cert_class.append(None)
imdb['Classification'] = cert_class

# Ignoring the genres Adventure and Crime when they are listed first

In [8]:
# Choose one genre per movie: normally the first listed, but when the first is
# 'Adventure' or 'Crime' take the second one instead.
genre = []
for genres in movie_genre:
    if len(genres) > 1 and genres[0] in ('Adventure', 'Crime'):
        genre.append(genres[1])
    elif genres:
        # Either the first genre is not skipped, or it is the only one available
        # (previously an IndexError for a lone 'Adventure'/'Crime').
        genre.append(genres[0])
    else:
        # No genre text was scraped for this entry.
        genre.append(None)
imdb['Genre'] = genre

In [9]:
# Show the frame with the new Classification and Genre columns.
imdb

Unnamed: 0,Title,IMDB Rating,IMDB Rank,Year,Classification,Genre
0,The Shawshank Redemption,9.3,1,1994,Young,Drama
1,The Godfather,9.2,2,1972,Young,Crime
2,The Dark Knight,9.0,3,2008,Young,Action
3,The Godfather: Part II,9.0,4,1974,Young,Crime
4,The Lord of the Rings: The Return of the King,8.9,5,2003,Young,Drama
...,...,...,...,...,...,...
245,The General,8.1,246,1926,,Action
246,Before Sunset,8.0,247,2004,Adult,Drama
247,"Monsters, Inc.",8.0,248,2001,Young,Animation
248,Aladdin,8.0,249,1992,,Animation


# Data Cleaning & Conversions

In [10]:
# Cast the parsed string columns to numeric dtypes.
numeric_dtypes = {'IMDB Rank': 'int64', 'IMDB Rating': 'float64'}
for column_name, target_dtype in numeric_dtypes.items():
    imdb[column_name] = imdb[column_name].astype(target_dtype)
# An empty year string cannot be cast to int, so it is replaced with "0" first.
imdb['Year'] = imdb['Year'].replace("", "0").astype('int64')
# Print the column dtypes and non-null counts to verify the conversion.
imdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Title           250 non-null    object 
 1   IMDB Rating     250 non-null    float64
 2   IMDB Rank       250 non-null    int64  
 3   Year            250 non-null    int64  
 4   Classification  153 non-null    object 
 5   Genre           250 non-null    object 
dtypes: float64(1), int64(2), object(3)
memory usage: 11.8+ KB


In [11]:
# Show the frame after the dtype conversions.
imdb

Unnamed: 0,Title,IMDB Rating,IMDB Rank,Year,Classification,Genre
0,The Shawshank Redemption,9.3,1,1994,Young,Drama
1,The Godfather,9.2,2,1972,Young,Crime
2,The Dark Knight,9.0,3,2008,Young,Action
3,The Godfather: Part II,9.0,4,1974,Young,Crime
4,The Lord of the Rings: The Return of the King,8.9,5,2003,Young,Drama
...,...,...,...,...,...,...
245,The General,8.1,246,1926,,Action
246,Before Sunset,8.0,247,2004,Adult,Drama
247,"Monsters, Inc.",8.0,248,2001,Young,Animation
248,Aladdin,8.0,249,1992,,Animation


# Saving the dataframe we made into a csv file to load

In [12]:
# Write the cleaned frame to disk. index=True is the pandas default, spelled out
# here because it means the row index is saved as the first, unnamed CSV column.
imdb.to_csv('imdbData.csv', index=True)