In [1]:
import requests
from bs4 import BeautifulSoup

# Sample pages used while exploring the markup; the second assignment wins.
url = 'https://www.imdb.com/title/tt3581652/fullcredits'
url = 'https://www.imdb.com/title/tt9777666/'
response = requests.get(url)
# Name the parser explicitly (as the helper functions below do) so parsing
# does not depend on which parsers happen to be installed.
soup = BeautifulSoup(response.text, features="lxml")

In [7]:
from functools import lru_cache

In [39]:
!pip install nest-asyncio
import nest_asyncio
nest_asyncio.apply()



In [1]:
# Define helper functions if needed
# and put them in the `imdb_helper_functions` module.
# You can then import them and use them here like this:

from bs4 import BeautifulSoup
import lxml
import cchardet
from imdb_helper_functions import get_url
from functools import lru_cache

def get_actors_by_movie_soup(cast_page_soup: BeautifulSoup, num_of_actors_limit=None):
    """Extract ``(actor_name, actor_href)`` pairs from a parsed cast page.

    The page is expected to contain a ``<table class="cast_list">``. Only rows
    with exactly four ``<td>`` cells are treated as actor rows; the actor link
    is taken from the second cell.

    Args:
        cast_page_soup: Parsed cast page.
        num_of_actors_limit: Stop after this many actors have been collected.
            ``None`` (the default) collects every actor row.

    Returns:
        A list of ``(name, href)`` tuples in page order. The list is empty when
        the page has no cast table.
    """
    cast_list = cast_page_soup.find('table', attrs={'class':'cast_list'})
    if cast_list is None:
        # No cast table on this page: report "no actors" instead of crashing.
        return []

    actors = []
    for row in cast_list.find_all('tr'):
        columns = row.find_all('td')
        if len(columns) != 4:
            # Rows with another cell count are not actor rows.
            continue

        # Search only inside the name cell. `find_next` would fall through to
        # links that appear later in the document when the cell has none.
        actor = columns[1].find('a')
        if actor is None or 'href' not in actor.attrs:
            continue

        actors.append((actor.get_text().strip(), actor.attrs['href']))

        if len(actors) == num_of_actors_limit:
            break

    return actors


def get_movies_by_actor_soup(actor_page_soup: BeautifulSoup, num_of_movies_limit=None):
    """Extract ``(movie_title, movie_href)`` pairs from a parsed actor page.

    Only the first ``<div class="filmo-category-section">`` on the page is
    inspected. A ``filmo-row`` inside it is kept when it contains exactly one
    link and the node right after that link's parent holds no visible text.
    NOTE(review): presumably this filters out rows annotated with a type such
    as a series or short, leaving feature films — confirm against the markup.

    Args:
        actor_page_soup: Parsed actor page.
        num_of_movies_limit: Stop after this many movies have been collected.
            ``None`` (the default) collects every matching row.

    Returns:
        A list of ``(title, href)`` tuples in page order. The list is empty
        when the page has no filmography section.
    """
    filmography = actor_page_soup.find('div', attrs={'class':'filmo-category-section'})
    if filmography is None:
        # No filmography on this page: report "no movies" instead of crashing.
        return []

    movies = []
    for row in filmography.find_all('div', attrs={'class':'filmo-row'}):
        refs = row.find_all('a')
        if len(refs) != 1:
            continue

        link = refs[0]
        trailing = link.parent.next_sibling
        # A missing sibling means nothing follows the title, which counts the
        # same as blank text.
        if trailing is not None and trailing.text.strip() != '':
            continue

        movies.append((link.get_text(), link.attrs['href']))

        if len(movies) == num_of_movies_limit:
            break

    return movies


@lru_cache(maxsize=None)
def get_movies_by_actor_url(url, num_of_movies_limit=None):
    """Return the movies listed on the actor page at ``url``.

    The page text is obtained through ``get_url``, parsed with the lxml
    parser, and handed to ``get_movies_by_actor_soup``. Results are memoised
    by ``lru_cache`` with no size bound, so repeated calls with the same
    arguments return the same cached list object.
    """
    actor_page = BeautifulSoup(get_url(url), features="lxml")
    movies = get_movies_by_actor_soup(actor_page, num_of_movies_limit)
    return movies

@lru_cache(maxsize=None)
def get_actors_by_movie_url(url, num_of_actors_limit=None):
    """Return the actors listed on the cast page at ``url``.

    The page text is obtained through ``get_url``, parsed with the lxml
    parser, and handed to ``get_actors_by_movie_soup``. Results are memoised
    by ``lru_cache`` with no size bound, so repeated calls with the same
    arguments return the same cached list object.
    """
    cast_page = BeautifulSoup(get_url(url), features="lxml")
    actors = get_actors_by_movie_soup(cast_page, num_of_actors_limit)
    return actors


In [36]:
import urllib
import collections
def _normalize_imdb_path(url):
    """Reduce a URL or href to its path with exactly one trailing slash."""
    # Imported locally: a bare `import urllib` does not guarantee that the
    # `urllib.parse` submodule is loaded.
    from urllib.parse import urlparse

    return urlparse(url).path.rstrip('/') + '/'


def get_movie_distance(actor_start_url, actor_end_url,
        num_of_actors_limit=None, num_of_movies_limit=None):
    """Breadth-first search for the number of movies linking two actors.

    Actors and movies are identified by their normalized URL path (query
    string dropped, exactly one trailing slash), so it does not matter
    whether an input URL or a scraped href carries ``?ref_=...`` or ends in a
    slash.

    Args:
        actor_start_url: Profile URL of the actor to start from.
        actor_end_url: Profile URL of the actor to reach.
        num_of_actors_limit: Passed to ``get_actors_by_movie_url`` to cap the
            number of actors read per movie (``None`` = no cap).
        num_of_movies_limit: Passed to ``get_movies_by_actor_url`` to cap the
            number of movies read per actor (``None`` = no cap).

    Returns:
        ``0`` if both URLs name the same actor, ``1`` if the actors share a
        movie, ``2`` if they are linked through one intermediate actor, and so
        on. ``None`` if the search runs out of pages without reaching the
        target.
    """
    from urllib.parse import urljoin

    base_url = 'https://www.imdb.com/'
    start_path = _normalize_imdb_path(actor_start_url)
    end_path = _normalize_imdb_path(actor_end_url)

    if start_path == end_path:
        return 0

    # Paths are recorded as seen when they are enqueued, so each actor and
    # each movie is expanded at most once.
    seen = {start_path}
    actors_frontier = collections.deque([start_path])
    distance = 0

    while actors_frontier:
        # Expand every actor of the current level into unseen movies.
        movies_frontier = collections.deque()
        while actors_frontier:
            actor_path = actors_frontier.popleft()
            actor_page = urljoin(base_url, actor_path)
            for _title, movie_href in get_movies_by_actor_url(actor_page, num_of_movies_limit):
                movie_path = _normalize_imdb_path(movie_href)
                if movie_path not in seen:
                    seen.add(movie_path)
                    movies_frontier.append(movie_path)

        # Every movie collected above is one more hop away from the start.
        distance += 1

        # Expand those movies into unseen actors: the next level.
        while movies_frontier:
            movie_path = movies_frontier.popleft()
            credits_page = urljoin(base_url, movie_path) + 'fullcredits'
            for _name, actor_href in get_actors_by_movie_url(credits_page, num_of_actors_limit):
                actor_path = _normalize_imdb_path(actor_href)
                if actor_path == end_path:
                    return distance
                if actor_path not in seen:
                    seen.add(actor_path)
                    actors_frontier.append(actor_path)

    # Every reachable page was visited without meeting the target.
    return None


In [None]:
# Example query: distance between two actors given by their profile URLs.
# The cell has no recorded output (execution count is None).
actor_start_url = 'https://www.imdb.com/name/nm0695435?ref_=tt_cl_t_1'
actor_end_url = 'https://www.imdb.com/name/nm0366389/?ref_=tt_cl_t_3'
get_movie_distance(actor_start_url, actor_end_url)

In [23]:
# Example query with a different target actor; the recorded result below is 1.
actor_start_url = 'https://www.imdb.com/name/nm0695435?ref_=tt_cl_t_1'
actor_end_url = 'https://www.imdb.com/name/nm2088803?ref_=tt_cl_t_2'
get_movie_distance(actor_start_url, actor_end_url)

1

In [4]:
# (display name, profile URL) for each actor in the comparison set.
# All URLs use the same shape (no slash before the query string) so that they
# are treated uniformly wherever the path is extracted.
highest_paid_actors = [
    ('Dwayne Johnson', 'https://www.imdb.com/name/nm0425005?ref_=nmls_hd'),
    ('Chris Hemsworth', 'https://www.imdb.com/name/nm1165110?ref_=nmls_hd'),
    ('Robert Downey Jr.', 'https://www.imdb.com/name/nm0000375?ref_=nmls_hd'),
    ('Akshay Kumar', 'https://www.imdb.com/name/nm0474774?ref_=nmls_hd'),
    ('Jackie Chan', 'https://www.imdb.com/name/nm0000329?ref_=nmls_hd'),
    ('Bradley Cooper', 'https://www.imdb.com/name/nm0177896?ref_=nmls_hd'),
    ('Adam Sandler', 'https://www.imdb.com/name/nm0001191?ref_=nmls_hd'),
    ('Scarlett Johansson', 'https://www.imdb.com/name/nm0424060?ref_=nv_sr_srsg_0'),
    ('Sofia Vergara', 'https://www.imdb.com/name/nm0005527?ref_=nv_sr_srsg_0'),
    ('Chris Evans', 'https://www.imdb.com/name/nm0262635?ref_=nmls_hd')
]

In [38]:
# Every unordered pair of entries from `highest_paid_actors`, earlier entry first.
actors_pairs = [
    (first, second)
    for position, first in enumerate(highest_paid_actors)
    for second in highest_paid_actors[position + 1:]
]
actors_pairs

[(('Dwayne Johnson', 'https://www.imdb.com/name/nm0425005?ref_=nmls_hd'),
  ('Chris Hemsworth', 'https://www.imdb.com/name/nm1165110?ref_=nmls_hd')),
 (('Dwayne Johnson', 'https://www.imdb.com/name/nm0425005?ref_=nmls_hd'),
  ('Robert Downey Jr.', 'https://www.imdb.com/name/nm0000375?ref_=nmls_hd')),
 (('Dwayne Johnson', 'https://www.imdb.com/name/nm0425005?ref_=nmls_hd'),
  ('Akshay Kumar', 'https://www.imdb.com/name/nm0474774?ref_=nmls_hd')),
 (('Dwayne Johnson', 'https://www.imdb.com/name/nm0425005?ref_=nmls_hd'),
  ('Jackie Chan', 'https://www.imdb.com/name/nm0000329?ref_=nmls_hd')),
 (('Dwayne Johnson', 'https://www.imdb.com/name/nm0425005?ref_=nmls_hd'),
  ('Bradley Cooper', 'https://www.imdb.com/name/nm0177896?ref_=nmls_hd')),
 (('Dwayne Johnson', 'https://www.imdb.com/name/nm0425005?ref_=nmls_hd'),
  ('Adam Sandler', 'https://www.imdb.com/name/nm0001191?ref_=nmls_hd')),
 (('Dwayne Johnson', 'https://www.imdb.com/name/nm0425005?ref_=nmls_hd'),
  ('Scarlett Johansson',
   'https:

In [None]:
# Compute the distance for every unordered pair of highest-paid actors.
# The accumulators are created only when they do not exist yet: the cell then
# runs on a fresh kernel, and re-running it after an interruption resumes
# where it stopped instead of discarding finished pairs.
if 'highes_paid_distance' not in globals():
    highes_paid_distance = []
if 'distances' not in globals():
    distances = {(a, b): d for a, b, d in highes_paid_distance}

for i in range(len(highest_paid_actors)):
    for j in range(i+1, len(highest_paid_actors)):
        actor1, actor1_url = highest_paid_actors[i]
        actor2, actor2_url = highest_paid_actors[j]
        # A pair may have been stored in either order.
        if (actor1, actor2) in distances or (actor2, actor1) in distances:
            print('skip')
            continue
        distance = get_movie_distance(actor1_url, actor2_url)
        highes_paid_distance.append((actor1, actor2, distance))
        distances[(actor1, actor2)] = distance


In [33]:
# Show the (actor, actor, distance) triples collected so far.
highes_paid_distance

[('Dwayne Johnson', 'Chris Hemsworth', 2),
 ('Dwayne Johnson', 'Robert Downey Jr.', 2),
 ('Dwayne Johnson', 'Akshay Kumar', 2),
 ('Dwayne Johnson', 'Jackie Chan', 2),
 ('Dwayne Johnson', 'Bradley Cooper', 2),
 ('Dwayne Johnson', 'Adam Sandler', 2)]

In [18]:
# Index the collected triples by their (actor, actor) pair for quick lookup.
distances = {
    (first_actor, second_actor): hops
    for first_actor, second_actor, hops in highes_paid_distance
}
distances

{('Dwayne Johnson', 'Chris Hemsworth'): 2,
 ('Dwayne Johnson', 'Robert Downey Jr.'): 2,
 ('Dwayne Johnson', 'Akshay Kumar'): 2,
 ('Dwayne Johnson', 'Jackie Chan'): 2,
 ('Dwayne Johnson', 'Bradley Cooper'): 2,
 ('Dwayne Johnson', 'Adam Sandler'): 2}

In [8]:
# Scratch check of how urlparse splits a profile URL; only `.path` is displayed.
base_url = 'https://www.imdb.com/'
url = 'https://www.imdb.com/name/nm0695435/'
end_url = urllib.parse.urlparse(url)
end_url.path

'/name/nm0695435'

In [15]:
# Scratch probe: from the 'ellipsis' cell, step back to the previous <td> and
# read the text of the first link found from there.
# NOTE(review): `data_row` is not defined in any cell visible here — confirm where it comes from.
data_row.find('td', attrs={'class':'ellipsis'}).find_previous_sibling('td').find_next('a').get_text().strip()

'Ansel Elgort'

In [32]:
# Fetch one actor page for interactive exploration of the filmography markup.
url = 'https://www.imdb.com/name/nm5052065'
response = requests.get(url)
# Name the parser explicitly (as the helper functions do) so parsing does not
# depend on which parsers happen to be installed.
soup = BeautifulSoup(response.text, features="lxml")

In [33]:
# First filmography section of the page held in `soup`.
filmography = soup.find('div', attrs={'class':'filmo-category-section'})

# Keep (title, href) for every row that contains exactly one link.
films = []
for entry in filmography.find_all('div', attrs={'class':'filmo-row'}):
    links = entry.find_all('a')
    if len(links) != 1:
        continue
    only_link = links[0]
    films.append((only_link.get_text(), only_link.attrs['href']))
films

[('Вестсайдская история', '/title/tt3581652/'),
 ('Щегол', '/title/tt3864056/'),
 ('J.I.D.: Off Da Zoinkys', '/title/tt10301894/'),
 ('Клуб миллиардеров', '/title/tt5179598/'),
 ('Дубликат', '/title/tt5639446/'),
 ('Ansel Elgort: Supernova', '/title/tt8246022/'),
 ('Ноябрьские преступники', '/title/tt3266284/'),
 ('Малыш на драйве', '/title/tt3890160/'),
 ('Ansel Elgort: Thief', '/title/tt8246020/'),
 ('Dua Lipa: Be the One (Version 2)', '/title/tt7307962/'),
 ('Allegiant: VR Experience', '/title/tt5573116/'),
 ('Дивергент, глава 3: За стеной', '/title/tt3410834/'),
 ('Бумажные города', '/title/tt3622592/'),
 ('Дивергент, глава 2: Инсургент', '/title/tt2908446/'),
 ('Мужчины, женщины и дети', '/title/tt3179568/'),
 ('Виноваты звезды', '/title/tt2582846/'),
 ('Дивергент', '/title/tt1840309/'),
 ('Телекинез', '/title/tt1939659/')]

In [34]:
# All filmography rows of the section, kept for index-based inspection.
divs = filmography.find_all('div', attrs={'class':'filmo-row'})

In [60]:
# Inspect the tag name of the node two siblings after the first link's parent
# in the eighth filmography row. Uses `next_sibling`, the current spelling of
# the deprecated `nextSibling` alias.
a = divs[7].find('a')
a.parent.next_sibling.next_sibling.name

'br'