In [None]:
import os
import random
import json
from itertools import cycle

import pandas as pd
import requests

# Download Necessary Files

In [None]:
required_files = ["title.ratings.tsv.gz", "title.basics.tsv.gz"]

for file in required_files:
    # Anything already on disk is reused so re-running the notebook is cheap.
    if os.path.isfile(file):
        continue

    # Download into a temporary name and rename only once the transfer has
    # finished: an interrupted or failed download must never leave behind a
    # file that the isfile() check above would accept on the next run.
    partial_file = f"{file}.part"
    with requests.get(
        f"https://datasets.imdbws.com/{file}",
        allow_redirects=True,
        stream=True,
        timeout=60,
    ) as file_downloaded:
        # Fail loudly on 4xx/5xx instead of saving an error page as a .gz file.
        file_downloaded.raise_for_status()
        with open(partial_file, 'wb') as new_file:
            # Stream in 1 MiB chunks rather than holding the whole dump in memory.
            for chunk in file_downloaded.iter_content(chunk_size=1 << 20):
                new_file.write(chunk)
    os.replace(partial_file, file)

# Read Movie Ratings and Title Metadata

In [None]:
# Load both IMDb dumps (tab-separated, gzip-compressed) in full.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
movies_ = pd.read_csv("title.ratings.tsv.gz", sep="\t", low_memory=False)
extra_data_ = pd.read_csv("title.basics.tsv.gz", sep="\t", low_memory=False)

In [None]:
# Keep the frames loaded from disk untouched so this cell can be re-run
# without re-reading the (large) files.
movies = movies_.copy()
# set_index() already returns a new frame, so an extra .copy() of the full
# basics table beforehand would only duplicate it in memory.
extra_data = extra_data_.set_index('tconst')

# Auxiliary Functions

In [None]:
def filter_movies(movies, votes, average_rate):
    """Select the titles that reach both rating thresholds, keyed by ``tconst``.

    Parameters
    ----------
    movies : DataFrame with ``tconst``, ``numVotes`` and ``averageRating`` columns.
    votes : minimum ``numVotes`` (inclusive).
    average_rate : minimum ``averageRating`` (inclusive).

    Returns
    -------
    A new DataFrame indexed by ``tconst`` with ``numVotes`` and
    ``averageRating`` removed. The input frame is left unchanged.
    """
    has_enough_votes = movies["numVotes"] >= votes
    is_well_rated = movies["averageRating"] >= average_rate
    return (
        movies.loc[has_enough_votes & is_well_rated]
        .drop(columns=["numVotes", "averageRating"])
        .set_index('tconst')
    )

In [None]:
def remove_unpopular(movies_):
    """Keep non-adult titles of type ``movie`` and the columns used downstream.

    Note that, despite the name, no popularity measure is consulted here: rows
    are kept when ``isAdult`` is zero and ``titleType`` is ``'movie'``.

    Parameters
    ----------
    movies_ : DataFrame with at least the columns ``isAdult``, ``titleType``,
        ``startYear``, ``runtimeMinutes`` and ``primaryTitle``.

    Returns
    -------
    A new DataFrame with only ``startYear``, ``runtimeMinutes`` and
    ``primaryTitle``; the input frame is not modified.
    """
    movies = movies_.copy()
    # Compare on the string form: depending on how the file was parsed,
    # isAdult may hold the text "0", the integer 0, or a mix of both. A plain
    # `== "0"` silently matches nothing when the values are integers.
    movies = movies[movies["isAdult"].astype(str) == "0"]
    movies = movies[movies["titleType"] == 'movie']
    movies = movies[["startYear", "runtimeMinutes", "primaryTitle"]]
    return movies

In [None]:
def clean_movies(movies_):
    """Drop incomplete, short, or old titles and expose the index as ``movieID``.

    Rows are removed when any column holds the placeholder ``\\N``, when
    ``runtimeMinutes`` is 70 or less, or when ``startYear`` is 1995 or earlier.

    Parameters
    ----------
    movies_ : DataFrame with ``startYear`` and ``runtimeMinutes`` columns whose
        values convert to ``int`` once the ``\\N`` rows are gone.

    Returns
    -------
    A new DataFrame without ``runtimeMinutes`` and with an added ``movieID``
    column that copies the index; the input frame is not modified.
    """
    # Placeholder rows must go first, otherwise the int conversions below fail.
    is_complete = movies_.ne('\\N').all(axis=1)
    complete = movies_[is_complete]

    is_long_enough = complete["runtimeMinutes"].astype(int).gt(70)
    feature_length = complete[is_long_enough].drop(columns="runtimeMinutes")

    is_recent = feature_length["startYear"].astype(int).gt(1995)
    recent = feature_length[is_recent]

    return recent.assign(movieID=recent.index)

In [None]:
def create_level(movies_, extra_data_, votes, average_rate):
    """Build one level dataset from the ratings and basics tables.

    Parameters
    ----------
    movies_ : ratings DataFrame, passed to ``filter_movies``.
    extra_data_ : basics DataFrame indexed by ``tconst``.
    votes : minimum number of votes (forwarded to ``filter_movies``).
    average_rate : minimum average rating (forwarded to ``filter_movies``).

    Returns
    -------
    The DataFrame produced by running ``filter_movies``, the lookup in
    ``extra_data_``, ``remove_unpopular`` and ``clean_movies`` in that order.
    Neither input frame is modified.
    """
    # No defensive copies of the inputs: every step below returns a new frame,
    # and copying the full basics table on each call is expensive.
    level = filter_movies(movies_, votes=votes, average_rate=average_rate)

    # .loc with a list of labels raises KeyError if any label is absent, so
    # only look up the rated titles that actually exist in the basics table
    # (order of the rated titles is preserved).
    known_ids = level.index[level.index.isin(extra_data_.index)]
    level = extra_data_.loc[known_ids]

    level = remove_unpopular(level)
    level = clean_movies(level)

    return level

# Create Level Datasets

In [None]:
# One dataset per level; the (votes, average_rate) thresholds loosen from
# level 0 to level 4.
(
    level0_movies,
    level1_movies,
    level2_movies,
    level3_movies,
    level4_movies,
) = (
    create_level(movies, extra_data, votes=min_votes, average_rate=min_rating)
    for min_votes, min_rating in [
        (250000, 8.0),
        (250000, 7.5),
        (100000, 7.5),
        (75000, 7.0),
        (25000, 7.0),
    ]
)

# Number of titles in each level.
tuple(map(len, (level0_movies, level1_movies, level2_movies, level3_movies, level4_movies)))

# Check Longest Movie Title

In [None]:
# max() finds the longest title in a single pass (first one wins on ties),
# which is all that sorting the whole column and taking element 0 achieved.
longest_title = max(level4_movies["primaryTitle"], key=len)
longest_title, len(longest_title)

# Add Poster URL with OMDB API

In [None]:
def add_posters(movies_, apikeys=None, cache_path="top_movies_level4.json", timeout=10):
    """Return a copy of ``movies_`` with a ``poster_url`` column.

    For every id in the index, a poster URL is taken from a previously
    exported dataset when available; otherwise the OMDb API is queried.
    Lookups that fail for any reason are recorded as the string ``"MISSING"``.

    Parameters
    ----------
    movies_ : DataFrame whose index holds the IMDb ids to look up.
    apikeys : optional sequence of OMDb API keys, used in rotation. When
        omitted, the comma-separated ``OMDB_API_KEYS`` environment variable
        is used, and finally the keys historically embedded in this notebook.
    cache_path : JSON file (``orient="split"`` export) used as a poster cache.
        A missing file simply means nothing is cached.
    timeout : seconds to wait for each OMDb request.

    Returns
    -------
    A new DataFrame; the input frame is not modified.
    """
    movies = movies_.copy()

    if apikeys is None:
        configured = os.environ.get("OMDB_API_KEYS", "")
        apikeys = [key.strip() for key in configured.split(",") if key.strip()]
    apikeys = list(apikeys)
    if not apikeys:
        # NOTE(review): credentials committed to the notebook. Kept only so
        # existing runs keep working; rotate them and rely on OMDB_API_KEYS.
        apikeys = ["fcbcfdd4", "354ba942"]

    # On a fresh checkout no export exists yet; treat that as an empty cache
    # instead of failing before the first poster is fetched.
    try:
        with open(cache_path, "r") as dataset_file:
            movie_database = json.load(dataset_file)["data"]
    except FileNotFoundError:
        movie_database = []

    # Rows are read positionally: element 2 is used as the movie id and
    # element 3 as the poster URL, so this depends on the exported column order.
    movie_posters = {movie[2]: movie[3] for movie in movie_database}

    # Alternate between the keys, one request per key in turn.
    key_cycle = cycle(apikeys)

    poster_urls = []
    for movie_id in movies.index:
        cached_url = movie_posters.get(movie_id, "MISSING")
        if cached_url != "MISSING":
            poster_urls.append(cached_url)
            continue

        try:
            response = requests.get(
                "https://www.omdbapi.com/",
                params={"apikey": next(key_cycle), "i": movie_id},
                timeout=timeout,
            )
            poster_url = response.json()['Poster']
            poster_urls.append(poster_url.replace("300.jpg", "500.jpg"))
        except Exception:
            # Best effort: any failed lookup becomes "MISSING". Unlike a bare
            # `except:`, this still lets Ctrl-C interrupt a long run.
            poster_urls.append("MISSING")

    movies["poster_url"] = poster_urls

    return movies

In [None]:
# Attach a poster_url column to every level, in level order.
level0_movies, level1_movies, level2_movies, level3_movies, level4_movies = (
    add_posters(level)
    for level in (level0_movies, level1_movies, level2_movies, level3_movies, level4_movies)
)

# Every title must have ended up with a real poster URL.
assert not level0_movies["poster_url"].eq("MISSING").any()
assert not level1_movies["poster_url"].eq("MISSING").any()
assert not level2_movies["poster_url"].eq("MISSING").any()
assert not level3_movies["poster_url"].eq("MISSING").any()
assert not level4_movies["poster_url"].eq("MISSING").any()

# Number of titles in each level.
tuple(map(len, (level0_movies, level1_movies, level2_movies, level3_movies, level4_movies)))

# Export Data as JSON

In [None]:
# Write each level as {"columns": [...], "data": [...]} without the index.
level_exports = {
    "top_movies_level0.json": level0_movies,
    "top_movies_level1.json": level1_movies,
    "top_movies_level2.json": level2_movies,
    "top_movies_level3.json": level3_movies,
    "top_movies_level4.json": level4_movies,
}
for export_path, level_frame in level_exports.items():
    level_frame.to_json(export_path, orient="split", index=False)

# Libraries Used

In [None]:
!pip install watermark;

In [None]:
# Load the watermark extension and print its report for this session.
# NOTE(review): the flags are expected to select date, "last updated" label,
# Python version, imported-package versions and the watermark version —
# confirm against the watermark documentation.
%load_ext watermark
%watermark -n -u -v -iv -w