In [46]:
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup

### 1. Sending Request to First Page

In [47]:
# Starting URL of the crawl; the scraping loop overwrites `link` with each
# page's "next" URL as it advances.
link = 'https://www.imdb.com/search/name/?match_all=true'

# Accumulator for the scraped rows (one list of fields per celebrity).
data = list()

### 2. Sending Request to all pages

In [48]:
# Crawl up to MAX_PAGES result pages, following each page's "next" link, and
# append one row per celebrity card to `data`.
MAX_PAGES       = 2000
REQUEST_TIMEOUT = 30   # seconds; keeps one stalled connection from hanging the crawl

# Errors raised when an expected tag or attribute is absent from a card:
# `find`/`get` returned None (AttributeError on the next call) or a split
# result was too short (IndexError). Anything else, including
# KeyboardInterrupt, is allowed to propagate.
MISSING_FIELD = (AttributeError, IndexError)

for pages in tqdm(range(MAX_PAGES)):

    response = requests.get(link, timeout = REQUEST_TIMEOUT)
    soup     = BeautifulSoup(response.content, 'html.parser')

    listing = soup.find('div', class_ = 'lister-list')
    if listing is None:
        # No result list on this page: stop cleanly and keep the rows
        # gathered so far instead of failing on None.
        tqdm.write('Stopping: no result list found at {}'.format(link))
        break

    for celeb in listing.find_all('div', class_ = 'lister-item mode-detail'):

        try:
            id_        = celeb.find('a').get('href').split('/')[-1].strip()
        except MISSING_FIELD:
            id_        = np.nan

        try:
            name       = celeb.find('h3').find('a').text.strip()
        except MISSING_FIELD:
            name       = np.nan

        try:
            img_link   = celeb.find('img').get('src').strip()
        except MISSING_FIELD:
            img_link   = np.nan

        # The first <p> of the card holds both the profession and the
        # "known for" link, so look it up once.
        details = celeb.find('p')

        try:
            profession = details.text.strip().split('|')[0].strip()
        except MISSING_FIELD:
            profession = np.nan

        try:
            known_for_link = details.find('a')
            exp            = known_for_link.text
            exp_id         = known_for_link.get('href').split('/')[-2]
        except MISSING_FIELD:
            # Title and id are reset together so a row never carries one
            # without the other.
            exp        = np.nan
            exp_id     = np.nan

        data.append([id_, name , profession, exp_id, exp, img_link])

    next_page = soup.find('a', class_ = 'lister-page-next next-page')
    next_href = next_page.get('href') if next_page is not None else None
    if not next_href:
        # No "next" link: the last result page has been processed.
        tqdm.write('Stopping: no next-page link found at {}'.format(link))
        break

    link = 'https://www.imdb.com' + next_href

100%|█████████████████████████████████████| 2000/2000 [1:55:37<00:00,  3.47s/it]


### 3. Saving the Dataset

In [81]:
# Build the table from the scraped rows; the column names follow the order in
# which the fields were appended for each celebrity.
column_names = ['id', 'name', 'profession', 'known_for_id', 'known_for', 'img_link']
df = pd.DataFrame(data = data, columns = column_names)

# Save the scrape to disk without the positional index.
df.to_csv('celebs.csv', index = False)

### 4. Dataset Walkthrough

In [82]:
# Preview the first five rows of the scraped table.
df.head(5)

Unnamed: 0,id,name,profession,known_for_id,known_for,img_link
0,nm2581521,Austin Butler,Actor,tt3704428,Elvis\n,https://m.media-amazon.com/images/M/MV5BYzE2Yj...
1,nm1886602,Miles Teller,Actor,tt1714206,The Spectacular Now\n,https://m.media-amazon.com/images/M/MV5BMTY2Mz...
2,nm5611121,Millie Bobby Brown,Actress,tt7846844,Enola Holmes\n,https://m.media-amazon.com/images/M/MV5BMjA5Nz...
3,nm1869101,Ana de Armas,Actress,tt1856101,Blade Runner 2049\n,https://m.media-amazon.com/images/M/MV5BMWM3MD...
4,nm1270009,Genesis Rodriguez,Actress,tt2245084,Big Hero 6\n,https://m.media-amazon.com/images/M/MV5BMzBjNz...


##### 4.1 Preprocessing known_for Column

In [86]:
# Missing "known for" titles become the literal string 'None'.
df['known_for'] = df['known_for'].fillna('None')

# Keep only the first line of each title and trim the surrounding whitespace
# (the scraped text carries a trailing newline, e.g. 'Elvis\n'). The
# vectorized string methods replace the per-row loop, whose `try:` had no
# `except` clause and therefore did not parse.
df['known_for'] = df['known_for'].str.split('\n').str[0].str.strip()

df.head()

Unnamed: 0,id,name,profession,known_for_id,known_for,img_link
0,nm2581521,Austin Butler,Actor,tt3704428,Elvis,https://m.media-amazon.com/images/M/MV5BYzE2Yj...
1,nm1886602,Miles Teller,Actor,tt1714206,The Spectacular Now,https://m.media-amazon.com/images/M/MV5BMTY2Mz...
2,nm5611121,Millie Bobby Brown,Actress,tt7846844,Enola Holmes,https://m.media-amazon.com/images/M/MV5BMjA5Nz...
3,nm1869101,Ana de Armas,Actress,tt1856101,Blade Runner 2049,https://m.media-amazon.com/images/M/MV5BMWM3MD...
4,nm1270009,Genesis Rodriguez,Actress,tt2245084,Big Hero 6,https://m.media-amazon.com/images/M/MV5BMzBjNz...


##### 4.2 Dealing with Null Values of known_for_id Column

In [92]:
# Replace missing title ids with the literal string 'None'.
missing_title_id = df['known_for_id'].isna()
df['known_for_id'] = df['known_for_id'].mask(missing_title_id, 'None')

In [93]:
# Count the missing values in each column.
df.isna().sum()

id                0
name              0
profession      269
known_for_id      0
known_for         0
img_link          0
dtype: int64

##### 4.3 Dealing with Null values for profession Column

In [95]:
# Rows without a scraped profession get the literal string 'None'.
no_profession = df['profession'].isna()
df['profession'] = df['profession'].mask(no_profession, 'None')

# Re-count the missing values in each column.
df.isna().sum()

id              0
name            0
profession      0
known_for_id    0
known_for       0
img_link        0
dtype: int64

##### 4.4 Finding Correct Professions

In [129]:
# How many of the most frequent professions are treated as "main" ones.
TOP_PROFESSIONS = 33

# Count the rows per profession in one pass instead of scanning the whole
# frame once per unique value. `sort = False` keeps the professions in order
# of first appearance, the same order `unique()` yields.
df_ = (
    df.groupby('profession', sort = False)
      .size()
      .reset_index(name = 'freq')
)

# Names of the most frequent professions, most frequent first.
main_professions = (
    df_.sort_values(by = 'freq', ascending = False)
       .head(TOP_PROFESSIONS)['profession']
       .values
)

##### 4.5 Keep only the rows whose profession is one of the main professions

In [145]:
# Gather the rows of every main profession and stack them with a single
# concat call rather than re-copying a growing frame on each iteration.
# The professions are walked in reverse (least frequent of the main ones
# first) because the previous loop prepended each new group, so the row order
# is unchanged.
# NOTE(review): missing professions were filled with the string 'None'
# earlier in the notebook; if 'None' ranks among the main professions those
# rows are kept here. Confirm that this is intended.
profession_groups = [
    df[df['profession'] == profession]
    for profession in reversed(main_professions)
]

# `pd.concat` rejects an empty list, so fall back to an empty frame.
final_df = pd.concat(profession_groups) if profession_groups else pd.DataFrame()


final_df.head()

Unnamed: 0,id,name,profession,known_for_id,known_for,img_link
25393,nm0002337,Roger Christian,Set Decorator,tt0076759,Star Wars,https://m.media-amazon.com/images/M/MV5BMTYwMD...
41913,nm0089193,Leslie Bloom,Set Decorator,tt0093818,Radio Days,https://m.media-amazon.com/images/S/sash/9FayP...
46864,nm0816542,Victoria Spader,Set Decorator,tt0098724,"Sex, Lies, and Videotape",https://m.media-amazon.com/images/S/sash/9FayP...
60972,nm3914730,Gabrielle Rosenberg,Set Decorator,tt15038118,Blue's Big City Adventure,https://m.media-amazon.com/images/M/MV5BNGM0MT...
76297,nm0896580,Shane Vieau,Set Decorator,tt7740496,Nightmare Alley,https://m.media-amazon.com/images/M/MV5BMTc4N2...


In [148]:
# Write the filtered table to disk without the index column. This is the same
# path the unfiltered table was saved to, so that file is replaced.
final_df.to_csv(path_or_buf = 'celebs.csv', index = False)