In [12]:
import requests 
from bs4 import BeautifulSoup
import re # regex expression
import tqdm.notebook as tq # time loop in notebook
import re

In [5]:
def Get_hrefs(url, timeout=10):
    """Collect the links listed in the "more like this" section of an IMDb title page.

    Args:
        url: Address of the IMDb page of a movie/show.
        timeout: Seconds to wait for the HTTP response before requests gives up.

    Returns:
        A list of unique hrefs, in the order they first appear in the page.
        Only the part of each href before "/?" is kept, because it carries the
        title id and the remaining query string is not needed.

    Errors raised by ``requests.get`` are not caught here and propagate to the caller.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    poster_class = 'ipc-poster-card ipc-poster-card--base ipc-poster-card--dynamic-width ipc-sub-grid-item ipc-sub-grid-item--span-2'
    link_class = "ipc-lockup-overlay ipc-focusable"

    hrefs = []
    seen = set()  # truncated hrefs already collected, for O(1) duplicate checks

    for poster in soup.find_all("div", attrs={'class': poster_class}):
        for link in poster.find_all("a", attrs={"class": link_class}):
            href = link.get("href")
            if not href:
                continue
            # Deduplicate on the truncated value: two links to the same title
            # usually differ only in the query string that is cut off here.
            movie_id = href.split("/?")[0]
            if movie_id not in seen:
                seen.add(movie_id)
                hrefs.append(movie_id)
    return hrefs

In [36]:
def get_recommendations(url, depth):
    """Walk the IMDb "more like this" links starting from one title.

    Args:
        url: IMDb link to a movie/show used as the starting point.
        depth: Number of iterations. The first iteration reads the
            recommendations of ``url``; each further iteration reads the
            recommendations of every title found so far that has not been
            visited yet.

    Returns:
        A list of unique IMDb links in discovery order. An empty list is
        returned when ``depth`` is zero or negative.
    """
    if depth <= 0:
        # Nothing to crawl; without this guard the result list was never created.
        return []

    base_url = "https://www.imdb.com"
    hrefs = []            # recommendations in discovery order
    known_hrefs = set()   # same content as hrefs, for O(1) membership tests
    urls_checked = set()  # pages already requested

    def _add_new(candidates):
        # Append the candidates not seen before, keeping their order.
        for candidate in candidates:
            if candidate not in known_hrefs:
                known_hrefs.add(candidate)
                hrefs.append(candidate)

    for i in tq.tqdm(range(depth)):
        if i == 0:
            _add_new(Get_hrefs(url))
            urls_checked.add(url)
        else:
            # Collect first and merge afterwards so the list being iterated
            # is not modified during the loop.
            discovered = []
            for href in tq.tqdm(hrefs):
                new_url = base_url + href
                if new_url not in urls_checked:
                    discovered.extend(Get_hrefs(new_url))
                    urls_checked.add(new_url)
            _add_new(discovered)

    return [base_url + href for href in hrefs]

In [47]:
# Sample movie record. Every numeric field is stored as a number (int/float),
# not as text, so the values can be placed directly in a feature vector.
# NOTE(review): hand-typed sample values; spellings and figures have not been
# verified against IMDb.
movie1 = {
    # identification
    "name": "The Matrix",
    "imdb_id": "tt0133093",
    # numeric features
    "year_released": 1997,
    "runtime": 136,
    "imdb_reviews": 60,
    "external_reviews": 70,
    "imdb_rating": 8.7,
    "metacritic_punctuation": 73,
    "budget": 63000000,
    "earning_worldwide": 467222728,
    "earning_US&CA": 172076928,
    # categorical features
    "genres": ["action", "sci-fi"],
    "directors": ["Lilly Wachows", "kiLana Wachowski"],
    "cast": ["keanu reevs", "carrie anne moss", "Laurence Fishburne"],
    # recommendation containers, empty for now
    "imdb_recommendations": {},
    "new_recommendations": {},
}

# Second sample movie record, with the same keys as movie1.
# NOTE(review): hand-typed sample values; spellings and figures have not been
# verified against IMDb.
movie2 = {
    # identification
    "name": "The Matrix Reloaded",
    "imdb_id": "tt0234215",
    # numeric features
    "year_released": 2003,
    "runtime": 138,
    "imdb_reviews": 2900,
    "external_reviews": 184,
    "imdb_rating": 7.2,
    "metacritic_punctuation": 62,
    "budget": 150000000,
    "earning_worldwide": 741847937,
    "earning_US&CA": 281576461,
    # categorical features
    "genres": ["action", "sci-fi"],
    "directors": ["Lilly Wachows", "kiLana Wachowski"],
    "cast": ["keanu reevs", "carrie anne moss", "Helmut Bakaitis"],
    # recommendation containers, empty for now
    "imdb_recommendations": {},
    "new_recommendations": {},
}

In [41]:
def compare_lists(list1, list2):
    """Estimate how much of ``list1`` is also present in ``list2``.

    Duplicates are ignored: both inputs are reduced to their distinct elements
    before comparing, so the elements must be hashable.

    Args:
        list1: Reference collection; the result is relative to its size.
        list2: Collection compared against the reference.

    Returns:
        The fraction of distinct elements of ``list1`` found in ``list2``,
        between 0 and 1. Returns 0.0 when ``list1`` is empty, since there is
        nothing to match and the ratio would otherwise divide by zero.
    """
    reference = set(list1)
    if not reference:
        return 0.0

    shared = reference & set(list2)
    return len(shared) / len(reference)

In [48]:
# Feature vector of movie1: its nine numeric fields followed by three entries
# fixed to 1, in the positions where movie2_vector stores its compare_lists rates.
movie1_vector = [
    movie1[field]
    for field in (
        "year_released",
        "runtime",
        "imdb_reviews",
        "external_reviews",
        "imdb_rating",
        "metacritic_punctuation",
        "budget",
        "earning_worldwide",
        "earning_US&CA",
    )
] + [1, 1, 1]

In [50]:
# Numeric part of the vector, in the same field order used for movie1_vector.
movie2_vector = [
    movie2[field]
    for field in (
        "year_released",
        "runtime",
        "imdb_reviews",
        "external_reviews",
        "imdb_rating",
        "metacritic_punctuation",
        "budget",
        "earning_worldwide",
        "earning_US&CA",
    )
]

# Share of movie1's genres / directors / cast that movie2 also has.
common_genres_rate = compare_lists(movie1["genres"], movie2["genres"])
common_directors_rate = compare_lists(movie1["directors"], movie2["directors"])
common_cast_rate = compare_lists(movie1["cast"], movie2["cast"])

# Append the three rates in the order genres, directors, cast.
movie2_vector.extend([common_genres_rate, common_directors_rate, common_cast_rate])

print(movie1_vector)
print(movie2_vector)

[1997, 136, 60, 70, 8.7, 73, 63000000, 467222728, 172076928, 1, 1, 1]
[2003, 138, 2900, 184, 7.2, 62, 150000000, 741847937, 281576461, 0.6666666666666666, 1.0, 0.6666666666666666]


In [53]:
from math import dist

# Euclidean distance between the two feature vectors. The entries are not
# rescaled, so the budget and earnings fields (which differ by amounts in the
# order of 1e8) account for almost all of the distance.
vector_distance = dist(movie1_vector, movie2_vector)
print(vector_distance)

308185257.84597576


In [31]:
def get_year(movie_id, timeout=10):
    """Read the release year of a title from its IMDb release-info page.

    Args:
        movie_id: IMDb title id used to build the page address, e.g. "tt0133093".
        timeout: Seconds to wait for the HTTP response before requests gives up.

    Returns:
        The year as an int, or 0 when the page cannot be fetched or no year
        can be found in it (a message is printed in that case).
    """
    url = "https://www.imdb.com/title/" + movie_id + "/releaseinfo?ref_=tt_ov_rdat"

    print(url)
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')
        date_text = soup.find("td", attrs={"class": "release-date-item__date"}).getText()

        # Look for a four-digit number instead of taking the third
        # whitespace-separated token, so dates without a day ("March 1999")
        # or with only a year are handled as well.
        match = re.search(r"\b(\d{4})\b", date_text)
        if match is None:
            raise ValueError("no year in release date: %r" % date_text)
        return int(match.group(1))
    except Exception:
        # Best effort: any fetch/parse problem yields the 0 placeholder.
        # Exception (not a bare except) lets KeyboardInterrupt stop a long crawl.
        print("Date not available")
        return 0

In [40]:
def get_runtime(movie_id, timeout=10):
    """Read the runtime of a title from its IMDb technical-specs page.

    Args:
        movie_id: IMDb title id used to build the page address.
        timeout: Seconds to wait for the HTTP response before requests gives up.

    Returns:
        The number that follows the first "(" of the runtime cell, as an int,
        or 0 when the page cannot be fetched or parsed (a message is printed
        in that case). The value is always an int so the success and failure
        results have the same type.
    """
    url = "https://www.imdb.com/title/" + movie_id + "/technical?ref_=tt_spec_sm"

    print(url)
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')

        table = soup.find("table", attrs={"class": "dataTable labelValueTable"}).find("tbody")

        # The runtime is taken from fixed positions: second child of the table
        # body, fourth child of that row.
        # NOTE(review): presumably the odd indices skip whitespace text nodes
        # between tags - confirm against the live page.
        runtime_text = table.contents[1].contents[3].getText()

        # The value of interest is the number right after the opening parenthesis.
        match = re.search(r"\(\s*(\d+)", runtime_text)
        if match is None:
            raise ValueError("no runtime in: %r" % runtime_text)
        return int(match.group(1))

    except Exception:
        # Best effort: any fetch/parse problem yields the 0 placeholder.
        # Exception (not a bare except) lets KeyboardInterrupt stop a long crawl.
        print("Runtime not available")
        return 0

In [16]:
def get_votes(movie_id, timeout=10):
    """Read the number of votes of a title from its IMDb ratings page.

    Args:
        movie_id: IMDb title id used to build the page address.
        timeout: Seconds to wait for the HTTP response before requests gives up.

    Returns:
        The vote count as an int, or 0 when the page cannot be fetched or
        parsed (a message is printed in that case).
    """
    url = "https://www.imdb.com/title/" + movie_id + "/ratings/"

    print(url)
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')

        selected_cell = soup.find("td", attrs={"class": "ratingTable Selected"})
        votes_text = selected_cell.find("div", attrs={"class": "smallcell"}).getText()

        # Keep only the digits (drops separators and surrounding whitespace).
        return int(re.sub("[^0-9]", "", votes_text))

    except Exception:
        # Best effort: any fetch/parse problem yields the 0 placeholder.
        # Exception (not a bare except) lets KeyboardInterrupt stop a long crawl.
        print("Votes not available")
        return 0

In [41]:
def get_metascore(movie_id, timeout=10):
    """Read the metascore of a title from its IMDb critic-reviews page.

    Args:
        movie_id: IMDb title id used to build the page address.
        timeout: Seconds to wait for the HTTP response before requests gives up.

    Returns:
        The metascore as an int, or 0 when the page cannot be fetched or
        parsed (a message is printed in that case).
    """
    url = "https://www.imdb.com/title/" + movie_id + "/criticreviews/"

    print(url)
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')

        score_block = soup.find("div", attrs={"class": "metascore_block"})
        score_wrap = score_block.find("div", attrs={"class": "metascore_wrap"})
        score_text = score_wrap.find("span").getText()

        return int(score_text)

    except Exception:
        # Best effort: any fetch/parse problem yields the 0 placeholder.
        # Exception (not a bare except) lets KeyboardInterrupt stop a long crawl.
        print("Metascore not available")
        return 0

In [72]:
def get_num_reviews(movie_id, timeout=10):
    """Read the number of user reviews of a title from its IMDb reviews page.

    Args:
        movie_id: IMDb title id used to build the page address.
        timeout: Seconds to wait for the HTTP response before requests gives up.

    Returns:
        The review count as an int, or 0 when the page cannot be fetched or
        parsed (a message is printed in that case).
    """
    url = "https://www.imdb.com/title/" + movie_id + "/reviews/"

    print(url)
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')

        header = soup.find("div", attrs={"class": "lister"}).find("div", attrs={"class": "header"})
        count_text = header.find("span").getText()

        # Keep only the digits (drops separators and any accompanying words).
        return int(re.sub("[^0-9]", "", count_text))

    except Exception:
        # Best effort: any fetch/parse problem yields the 0 placeholder.
        # Exception (not a bare except) lets KeyboardInterrupt stop a long crawl.
        print("Number of reviews not available")
        return 0

In [76]:
def get_num_exernal_reviews(movie_id, timeout=10):
    """Read the number of external reviews of a title from its IMDb external-reviews page.

    Args:
        movie_id: IMDb title id used to build the page address.
        timeout: Seconds to wait for the HTTP response before requests gives up.

    Returns:
        The external review count as an int, or 0 when the page cannot be
        fetched or parsed (a message is printed in that case).
    """
    url = "https://www.imdb.com/title/" + movie_id + "/externalreviews/"

    print(url)
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')

        content = soup.find("div", attrs={"id": "external_reviews_content"})
        nav_text = content.find("div", attrs={"class": "nav"}).getText()

        # All digits of the navigation text are joined into one number.
        # NOTE(review): this assumes the text contains a single number - confirm.
        return int(re.sub("[^0-9]", "", nav_text))

    except Exception:
        # Best effort: any fetch/parse problem yields the 0 placeholder.
        # Exception (not a bare except) lets KeyboardInterrupt stop a long crawl.
        print("Number of external reviews not available")
        return 0


# Correctly spelled alias; the original (misspelled) name is kept for existing callers.
get_num_external_reviews = get_num_exernal_reviews

In [77]:
# Manual check: fetch the external-review count of one title and display it.
e = get_num_exernal_reviews("tt1707386")
e

https://www.imdb.com/title/tt1707386/externalreviews/


498

In [28]:
# Display the current value of `e`.
# NOTE(review): the saved output of this cell was recorded at execution count 28,
# i.e. before the cell with count 77 assigned the value it displays - re-run to refresh.
e

0