In [7]:
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup

# Fetch the IMDb Top 250 chart page; the ranking cell below parses `soup`.
# The timeout keeps the cell from hanging forever on a stalled connection,
# and raise_for_status() reports an HTTP error here rather than as an
# AttributeError later, when the parser fails to find the chart table.
res  = requests.get('https://www.imdb.com/chart/top/', timeout = 30)
res.raise_for_status()
soup = BeautifulSoup(res.content, 'html.parser')

### 1. Scraping the Data

In [2]:
# Collect one [rank, id, name, year] record per chart row.
data = []

# Every <tr> inside the 'lister-list' body is one movie. Rows are visited in
# document order, so enumerate() supplies the rank without a manual counter.
chart_rows = soup.find('tbody', class_ = 'lister-list').find_all('tr')

for ranking, row in enumerate(chart_rows, start = 1):

    # The title cell carries both the link (id + name) and the year span;
    # look it up once instead of once per extracted field.
    title_cell = row.find('td', class_ = 'titleColumn')
    title_link = title_cell.find('a')

    # Second-to-last '/'-separated piece of the href, e.g. 'tt0111161'.
    movie_id     = title_link.get('href').split('/')[-2].strip()
    movie_name   = title_link.text.strip()
    # [1:-1] drops the first and last character of the span text
    # (presumably the parentheses around the year) before parsing.
    release_year = int(title_cell.find('span').text.strip()[1:-1])

    data.append([ranking , movie_id ,  movie_name , release_year])

### 2. Creating DataFrame

In [3]:
# Turn the scraped records into a table. The labels follow the order in
# which each record was assembled: rank, id, name, year.
chart_columns = ['rank','id','name','year']
df = pd.DataFrame(data, columns = chart_columns)
df.head()

Unnamed: 0,rank,id,name,year
0,1,tt0111161,The Shawshank Redemption,1994
1,2,tt0068646,The Godfather,1972
2,3,tt0468569,The Dark Knight,2008
3,4,tt0071562,The Godfather Part II,1974
4,5,tt0050083,12 Angry Men,1957


### 3. Save the CSV - Checkpoint

In [4]:
# Checkpoint: write the rank/id/name/year table to disk.
# index = False keeps the positional row index out of the CSV.
df.to_csv('movies.csv' , index = False)

### 4. Scraping Movies Data - Individual

In [5]:
# Prototype of the per-title extraction, run against a single movie
# (tt0111161) before it is applied to every id in the next section.
# NOTE(review): the 'sc-...' class fragments look auto-generated by the site's
# front-end build and may change without notice - confirm the selectors against
# the live page before trusting the results.

# Vote count lives on the separate /ratings/ page; keep the first line of the
# 'allText' block.
res = requests.get('https://www.imdb.com/title/tt0111161/ratings/', timeout = 30)
res.raise_for_status()      # fail here, not with an AttributeError on a missing <div>
imbd_votes    = BeautifulSoup(res.content,'html.parser').find('div', class_ = 'allText').text.strip().split('\n')[0]

res = requests.get('https://www.imdb.com/title/tt0111161', timeout = 30)
res.raise_for_status()
soup = BeautifulSoup(res.content,'html.parser')

# Certificate and duration are the 2nd and 3rd items of the same inline list,
# so the list is located once and indexed twice.
hero_items    = soup.find('ul', class_ = 'ipc-inline-list ipc-inline-list--show-dividers sc-8c396aa2-0 kqWovI baseAlt').find_all('li')
certificate   = hero_items[1].find('span').text
duration      = hero_items[2].text
genre         = [tag.text for tag in soup.find('div', class_ = 'ipc-chip-list sc-16ede01-5 ggbGKe').find_all('li')]
imdb_rating   = float(soup.find('span', class_ = 'sc-7ab21ed2-1 jGRxWM').text.strip())

# One card per cast member; ids and names are read from the same anchors.
cast_cards = soup.find('div', class_ = 'ipc-sub-grid ipc-sub-grid--page-span-2 ipc-sub-grid--wraps-at-above-l ipc-shoveler__grid').find_all('div', class_ = 'sc-18baf029-7 eVsQmt')
cast_id   = [card.find('a').get('href').split('?')[0].split('/')[-1] for card in cast_cards]
cast_name = [card.find('a').get('aria-label') for card in cast_cards]

# Writers are read from the second item ([1]) of the credits metadata list.
writer_items  = soup.find('ul', class_ = 'ipc-metadata-list ipc-metadata-list--dividers-all sc-18baf029-10 jIsryf ipc-metadata-list--base').find_all('li', class_ = 'ipc-metadata-list__item')[1].find_all('li')
writter_name  = [item.find('a').text for item in writer_items]
writter_id    = [item.find('a').get('href').split('?')[0].split('/')[-1] for item in writer_items]

### 5. Scraping Movies Data - All

In [8]:
# Per-column accumulators. Each loop iteration appends exactly one value to
# every list, so all lists end up aligned with the rows of `df`.
imbd_votes_    = []
imdb_rating_   = []
certificate_   = []
duration_      = []
genre_         = []
cast_id_       = []
cast_name_     = []
director_id_   = []
director_name_ = []
writter_name_  = []
writter_id_    = []

# Selectors used below, named once so each appears in a single place.
# NOTE(review): the 'sc-...' fragments look auto-generated and may change when
# the site is redeployed - confirm against the live markup.
HERO_LIST_CLASS   = 'ipc-inline-list ipc-inline-list--show-dividers sc-8c396aa2-0 kqWovI baseAlt'
GENRE_CHIPS_CLASS = 'ipc-chip-list sc-16ede01-5 ggbGKe'
RATING_CLASS      = 'sc-7ab21ed2-1 jGRxWM'
CAST_GRID_CLASS   = 'ipc-sub-grid ipc-sub-grid--page-span-2 ipc-sub-grid--wraps-at-above-l ipc-shoveler__grid'
CAST_CARD_CLASS   = 'sc-18baf029-7 eVsQmt'
CREDITS_CLASS     = 'ipc-metadata-list ipc-metadata-list--dividers-all sc-18baf029-10 jIsryf ipc-metadata-list--base'
CREDIT_ITEM_CLASS = 'ipc-metadata-list__item'
DIRECTOR_UL_CLASS = 'ipc-inline-list ipc-inline-list--show-dividers ipc-inline-list--inline ipc-metadata-list-item__list-content base'


def _person_id(anchor):
    """Return the last path segment of the anchor's href, with any '?query' part removed."""
    return anchor.get('href').split('?')[0].split('/')[-1]


# One Session re-uses the underlying connection across the two requests made
# per title instead of opening a new one every time.
with requests.Session() as session:

    for i in tqdm(df['id']):

        # Title page. If the request itself fails, parse an empty document so
        # every field below falls back to NaN and the rows already collected
        # are not lost.
        try:
            res  = session.get('https://www.imdb.com/title/' + i, timeout = 30)
            soup = BeautifulSoup(res.content,'html.parser')
        except requests.RequestException:
            soup = BeautifulSoup('', 'html.parser')

        # `except Exception` (not a bare `except:`) keeps the best-effort
        # behaviour while still letting Ctrl-C / kernel interrupt stop the loop.
        try:
            res = session.get('https://www.imdb.com/title/' + i + '/ratings/', timeout = 30)
            imbd_votes    = BeautifulSoup(res.content,'html.parser').find('div', class_ = 'allText').text.strip().split('\n')[0]
        except Exception:
            imbd_votes    = np.nan

        # Certificate and duration are items [1] and [2] of the same list.
        # A missing list (None) raises inside the try blocks and becomes NaN.
        hero_list = soup.find('ul', class_ = HERO_LIST_CLASS)

        try:
            certificate   = hero_list.find_all('li')[1].find('span').text
        except Exception:
            certificate = np.nan

        try:
            duration      = hero_list.find_all('li')[2].text
        except Exception:
            duration = np.nan

        try:
            genre         = ','.join([tag.text for tag in soup.find('div', class_ = GENRE_CHIPS_CLASS).find_all('li')])
        except Exception:
            genre = np.nan

        # Guarded like every other field: one title without a parsable rating
        # must not abort the whole run.
        try:
            imdb_rating   = float(soup.find('span', class_ = RATING_CLASS).text.strip())
        except Exception:
            imdb_rating = np.nan

        # Ids and names come from the same anchors; if either fails, both are NaN.
        try:
            cast_links    = [card.find('a') for card in soup.find('div', class_ = CAST_GRID_CLASS).find_all('div', class_ = CAST_CARD_CLASS)]
            cast_id       = ','.join([_person_id(link) for link in cast_links])
            cast_name     = ','.join([link.get('aria-label') for link in cast_links])
        except Exception:
            cast_id = np.nan
            cast_name = np.nan

        # Directors are read from item [0] of the credits list, writers from item [1].
        credit_list = soup.find('ul', class_ = CREDITS_CLASS)

        try:
            director_links = [entry.find('a') for entry in credit_list.find_all('li', class_ = CREDIT_ITEM_CLASS)[0].find('ul', class_ = DIRECTOR_UL_CLASS).find_all('li')]
            director_id   = ','.join([_person_id(link) for link in director_links])
            director_name = ','.join([link.text for link in director_links])
        except Exception:
            director_id = np.nan
            director_name = np.nan

        try:
            writer_links  = [entry.find('a') for entry in credit_list.find_all('li', class_ = CREDIT_ITEM_CLASS)[1].find_all('li')]
            writter_name  = ','.join([link.text for link in writer_links])
            writter_id    = ','.join([_person_id(link) for link in writer_links])
        except Exception:
            writter_id = np.nan
            writter_name = np.nan

        imbd_votes_.append(imbd_votes)
        imdb_rating_.append(imdb_rating)
        certificate_.append(certificate)
        duration_.append(duration)
        genre_.append(genre)
        cast_id_.append(cast_id)
        cast_name_.append(cast_name)
        director_id_.append(director_id)
        director_name_.append(director_name)
        writter_name_.append(writter_name)
        writter_id_.append(writter_id)

100%|█████████████████████████████████████████| 250/250 [15:41<00:00,  3.76s/it]


### 6. Adding Data to the DataFrame

In [9]:
# Attach every scraped list to the frame as a new column. The dict preserves
# insertion order, so the columns are added in the order listed here.
scraped_columns = {
    'imbd_votes'    : imbd_votes_,
    'imdb_rating'   : imdb_rating_,
    'certificate'   : certificate_,
    'duration'      : duration_,
    'genre'         : genre_,
    'cast_id'       : cast_id_,
    'cast_name'     : cast_name_,
    'director_id'   : director_id_,
    'director_name' : director_name_,
    'writter_name'  : writter_name_,
    'writter_id'    : writter_id_,
}

for column_name, column_values in scraped_columns.items():
    df[column_name] = column_values

### 7. Checking Null Values

In [10]:
# Count missing values per column; any field whose extraction fell back to
# np.nan during the scrape is counted here.
df.isnull().sum()

rank             0
id               0
name             0
year             0
imbd_votes       0
imdb_rating      0
certificate      1
duration         1
genre            0
cast_id          0
cast_name        0
director_id      0
director_name    0
writter_name     0
writter_id       0
dtype: int64

### 8. Save Dataset - Checkpoint

In [11]:
# Checkpoint: write the frame, now including the per-title columns, to
# 'movies.csv' - the same path the earlier checkpoint wrote, so that file is overwritten.
df.to_csv('movies.csv', index = False)

### 9. Preprocessing the Dataset

#### 9.1 Preprocessing the Duration Column

In [41]:
def _duration_to_minutes(raw):
    """Convert a scraped runtime such as '3h 39m', '3h' or '39m' to whole minutes.

    Parameters
    ----------
    raw : str, number, or missing value
        One cell of the 'duration' column.

    Returns
    -------
    int or pd.NA
        Total minutes, or pd.NA when the value is missing or not in one of
        the recognised shapes.
    """
    if not isinstance(raw, str):
        # Either missing (NaN from the scrape) or already converted because
        # this cell was run before; pass numbers through so a re-run does not
        # wipe the column.
        return pd.NA if pd.isna(raw) else int(raw)

    parts = raw.split()
    if not parts:
        return pd.NA

    hours = minutes = 0
    for part in parts:
        number, unit = part[:-1], part[-1]
        if unit == 'h' and number.isdigit():        # '3h'
            hours = int(number)
        elif unit == 'm' and number.isdigit():      # '39m'
            minutes = int(number)
        else:                                       # unrecognised token
            return pd.NA

    return hours * 60 + minutes


duration = [_duration_to_minutes(d) for d in df['duration']]

# Nullable 'Int64' keeps the minutes as integers while representing gaps as a
# real missing value (detected by isnull()) rather than the string 'Nan'.
df['duration'] = pd.array(duration, dtype = 'Int64')

df.head()

Unnamed: 0,rank,id,name,year,imbd_votes,imdb_rating,certificate,duration,genre,cast_id,cast_name,director_id,director_name,writter_name,writter_id
0,1,tt0111161,The Shawshank Redemption,1994,2601152,9.3,A,142,Drama,"nm0000209,nm0000151,nm0348409,nm0006669,nm0000...","Tim Robbins,Morgan Freeman,Bob Gunton,William ...",nm0001104,Frank Darabont,"Stephen King,Frank Darabont","nm0000175,nm0001104"
1,2,tt0068646,The Godfather,1972,1796656,9.2,A,175,"Crime,Drama","nm0000008,nm0000199,nm0001001,nm0000473,nm0144...","Marlon Brando,Al Pacino,James Caan,Diane Keato...",nm0000338,Francis Ford Coppola,"Mario Puzo,Francis Ford Coppola","nm0701374,nm0000338"
2,3,tt0468569,The Dark Knight,2008,2572662,9.0,UA,152,"Action,Crime,Drama","nm0000288,nm0005132,nm0001173,nm0000323,nm0350...","Christian Bale,Heath Ledger,Aaron Eckhart,Mich...",nm0634240,Christopher Nolan,"Jonathan Nolan,Christopher Nolan,David S. Goyer","nm0634300,nm0634240,nm0275286"
3,4,tt0071562,The Godfather Part II,1974,1237934,9.0,A,202,"Crime,Drama","nm0000199,nm0000134,nm0000380,nm0000473,nm0001...","Al Pacino,Robert De Niro,Robert Duvall,Diane K...",nm0000338,Francis Ford Coppola,"Francis Ford Coppola,Mario Puzo","nm0000338,nm0701374"
4,5,tt0050083,12 Angry Men,1957,768548,9.0,U,96,"Crime,Drama","nm0000020,nm0002011,nm0000842,nm0275835,nm0550...","Henry Fonda,Lee J. Cobb,Martin Balsam,John Fie...",nm0001486,Sidney Lumet,Reginald Rose,nm0741627


#### 9.2 Preprocessing IMDB Votes Column

In [46]:
# Convert vote counts like '2,601,152' to integers.
# - astype(str) lets the same expression handle scraped strings, NaN left by a
#   failed scrape, and integers from a previous run of this cell.
# - errors = 'coerce' turns anything unparsable into a missing value instead of
#   raising; re-run the null check afterwards to see how many rows that was.
# - 'Int64' is the nullable integer dtype, so missing entries do not force the
#   column to float.
votes = pd.to_numeric(
    df['imbd_votes'].astype(str).str.replace(',', '', regex = False),
    errors = 'coerce',
).astype('Int64')

df['imbd_votes'] = votes

df.head()

Unnamed: 0,rank,id,name,year,imbd_votes,imdb_rating,certificate,duration,genre,cast_id,cast_name,director_id,director_name,writter_name,writter_id
0,1,tt0111161,The Shawshank Redemption,1994,2601152,9.3,A,142,Drama,"nm0000209,nm0000151,nm0348409,nm0006669,nm0000...","Tim Robbins,Morgan Freeman,Bob Gunton,William ...",nm0001104,Frank Darabont,"Stephen King,Frank Darabont","nm0000175,nm0001104"
1,2,tt0068646,The Godfather,1972,1796656,9.2,A,175,"Crime,Drama","nm0000008,nm0000199,nm0001001,nm0000473,nm0144...","Marlon Brando,Al Pacino,James Caan,Diane Keato...",nm0000338,Francis Ford Coppola,"Mario Puzo,Francis Ford Coppola","nm0701374,nm0000338"
2,3,tt0468569,The Dark Knight,2008,2572662,9.0,UA,152,"Action,Crime,Drama","nm0000288,nm0005132,nm0001173,nm0000323,nm0350...","Christian Bale,Heath Ledger,Aaron Eckhart,Mich...",nm0634240,Christopher Nolan,"Jonathan Nolan,Christopher Nolan,David S. Goyer","nm0634300,nm0634240,nm0275286"
3,4,tt0071562,The Godfather Part II,1974,1237934,9.0,A,202,"Crime,Drama","nm0000199,nm0000134,nm0000380,nm0000473,nm0001...","Al Pacino,Robert De Niro,Robert Duvall,Diane K...",nm0000338,Francis Ford Coppola,"Francis Ford Coppola,Mario Puzo","nm0000338,nm0701374"
4,5,tt0050083,12 Angry Men,1957,768548,9.0,U,96,"Crime,Drama","nm0000020,nm0002011,nm0000842,nm0275835,nm0550...","Henry Fonda,Lee J. Cobb,Martin Balsam,John Fie...",nm0001486,Sidney Lumet,Reginald Rose,nm0741627


### 10. Saving Dataset - Checkpoint

In [47]:
# Final checkpoint: overwrite 'movies.csv' with the frame whose duration and
# vote columns have been converted above.
df.to_csv('movies.csv', index = False)