In [1]:
import os
import random

import pandas as pd
import requests

# Download Necessary Files

In [2]:
required_files = ["title.ratings.tsv.gz", "title.basics.tsv.gz"]

# Fetch every required IMDb dump that is not already in the working directory.
for file in required_files:
    if os.path.isfile(file):
        continue

    # Write to a temporary name and rename only after the transfer finishes.
    # Otherwise an interrupted download would leave a truncated file that the
    # isfile() check above would accept as complete on the next run.
    partial_file = f"{file}.part"

    # stream=True avoids holding the whole archive in memory; the timeout
    # keeps a stalled connection from hanging the notebook indefinitely.
    file_downloaded = requests.get(
        f"https://datasets.imdbws.com/{file}",
        allow_redirects=True,
        stream=True,
        timeout=60,
    )
    try:
        # Fail loudly on 4xx/5xx instead of saving an error page as a .gz file.
        file_downloaded.raise_for_status()
        with open(partial_file, 'wb') as new_file:
            for chunk in file_downloaded.iter_content(chunk_size=1024 * 1024):
                new_file.write(chunk)
    finally:
        file_downloaded.close()

    os.replace(partial_file, file)

# Read Movie Ratings

In [3]:
# Load the tab-separated ratings dump into a "raw" frame; later cells work on
# derived frames so this one does not have to be re-read from disk.
movie_df_ = pd.read_csv(
    "title.ratings.tsv.gz",
    sep="\t",
    low_memory=False,
)

In [4]:
# Keep titles with at least 100000 votes and an average rating of at least
# 7.5. Only the surviving ids matter downstream, so both rating columns are
# dropped and tconst becomes the index.
movie_df = (
    movie_df_
    .loc[lambda ratings: (ratings["numVotes"] >= 100000) & (ratings["averageRating"] >= 7.5)]
    .drop(columns=["numVotes", "averageRating"])
    .set_index('tconst')
)

# Read Movie Metadata

In [5]:
# Load the tab-separated title metadata dump into a "raw" frame; later cells
# work on derived frames so this one does not have to be re-read from disk.
extra_data_ = pd.read_csv(
    "title.basics.tsv.gz",
    sep="\t",
    low_memory=False,
)

In [6]:
# Build the metadata table for the highly rated titles selected in movie_df.
# set_index returns a new frame, so the raw extra_data_ frame is left intact.
extra_data = extra_data_.set_index('tconst')

# Look up only the rated ids that actually exist in the metadata dump.
# Passing movie_df.index straight to .loc raises KeyError as soon as a single
# rated title is missing from title.basics. The intersection is taken from
# movie_df.index so rows stay in the ratings order.
rated_ids = movie_df.index.intersection(extra_data.index)
extra_data = extra_data.loc[rated_ids]

# Compare isAdult numerically: depending on what pandas inferred for the
# column it may hold the string "0" or the integer 0, and a plain == "0"
# silently matches nothing in the integer case. Unparseable values become NaN
# and are excluded.
not_adult = pd.to_numeric(extra_data["isAdult"], errors="coerce") == 0
is_movie = extra_data["titleType"] == 'movie'

# Keep non-adult feature films and only the columns used by later cells.
extra_data = extra_data.loc[
    not_adult & is_movie,
    ["startYear", "runtimeMinutes", "primaryTitle"],
]

# Filter Out Incomplete, Short, and Pre-1996 Titles

In [7]:
# Drop rows where any remaining column holds the literal "\N" string.
# This has to happen first so the integer casts below cannot fail on it.
complete_rows = extra_data[(extra_data != '\\N').all(axis=1)]

# Keep titles longer than 70 minutes whose start year is after 1995.
long_enough = complete_rows["runtimeMinutes"].astype(int) > 70
recent_enough = complete_rows["startYear"].astype(int) > 1995

# Build the result in one expression. .assign returns a new frame, which
# avoids assigning a column onto a boolean-filtered slice (the pattern that
# triggers pandas' SettingWithCopyWarning). movieID duplicates the tconst
# index as an ordinary column.
extra_data = (
    complete_rows
    .loc[long_enough & recent_enough]
    .drop(columns="runtimeMinutes")
    .assign(movieID=lambda titles: titles.index)
)

# Add Poster URL with OMDB API

In [8]:
poster_urls = []

# API keys are credentials, so they are read from the environment instead of
# being committed with the notebook. OMDB_API_KEYS holds one or more keys,
# comma-separated; a key is picked at random per request.
# NOTE(review): any key that was previously embedded in this notebook should
# be treated as leaked and rotated.
apikeys = [key.strip() for key in os.environ.get("OMDB_API_KEYS", "").split(",") if key.strip()]
if not apikeys:
    raise RuntimeError(
        "Set the OMDB_API_KEYS environment variable to one or more "
        "comma-separated OMDb API keys before running this cell."
    )

# One session reuses the connection across the per-title requests.
with requests.Session() as omdb_session:
    for movie_id in extra_data.index:
        apikey = random.choice(apikeys)
        # HTTPS keeps the API key out of plaintext traffic; params= lets
        # requests build and escape the query string.
        response = omdb_session.get(
            "https://www.omdbapi.com/",
            params={"apikey": apikey, "i": movie_id},
            timeout=30,
        )
        response.raise_for_status()
        payload = response.json()
        if "Poster" not in payload:
            # Report the API's own error text instead of a bare KeyError.
            raise RuntimeError(
                f"OMDb returned no poster for {movie_id}: "
                f"{payload.get('Error', payload)}"
            )
        # Swap the "300.jpg" suffix for "500.jpg" -- presumably this requests
        # a larger rendition of the same poster; confirm against the image host.
        poster_url = payload["Poster"].replace("300.jpg", "500.jpg")
        poster_urls.append(poster_url)

# .assign builds a new frame rather than writing a column into a frame that
# was produced by filtering.
extra_data = extra_data.assign(poster_url=poster_urls)

# Export Data as JSON

In [9]:
# Find the longest title and show it with its length. max() scans the column
# once instead of sorting it, and on ties it returns the first longest title
# in row order, the same element a stable descending sort puts first.
longest_title = max(extra_data["primaryTitle"], key=len)
longest_title, len(longest_title)

('The Assassination of Jesse James by the Coward Robert Ford', 58)

In [10]:
# Display the assembled table as a final visual check before it is exported.
extra_data

Unnamed: 0_level_0,startYear,primaryTitle,movieID,poster_url
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
tt0116282,1996,Fargo,tt0116282,https://m.media-amazon.com/images/M/MV5BNDJiZD...
tt0116922,1997,Lost Highway,tt0116922,https://m.media-amazon.com/images/M/MV5BYWUxOW...
tt0117381,1996,Primal Fear,tt0117381,https://m.media-amazon.com/images/M/MV5BZTM2NW...
tt0117665,1996,Sleepers,tt0117665,https://m.media-amazon.com/images/M/MV5BMzk1Mm...
tt0117731,1996,Star Trek: First Contact,tt0117731,https://m.media-amazon.com/images/M/MV5BYTllZj...
...,...,...,...,...
tt8110330,2020,Dil Bechara,tt8110330,https://m.media-amazon.com/images/M/MV5BNmI0MT...
tt8367814,2019,The Gentlemen,tt8367814,https://m.media-amazon.com/images/M/MV5BMTlkMm...
tt8404614,2019,The Two Popes,tt8404614,https://m.media-amazon.com/images/M/MV5BY2RiOT...
tt8579674,2019,1917,tt8579674,https://m.media-amazon.com/images/M/MV5BOTdmNT...


In [11]:
# Write the table to top_movies.json in pandas' "split" layout. index=False
# leaves the tconst index out of the file; the same ids are still exported
# through the movieID column.
extra_data.to_json("top_movies.json", orient="split", index=False)

# Libraries Used

In [12]:
!pip install watermark;



In [13]:
# Load the watermark IPython extension so the %watermark magic is available.
%load_ext watermark
# Print a reproducibility stamp: last-updated date, Python and IPython
# versions, versions of the imported packages, and watermark's own version.
%watermark -n -u -v -iv -w

Last updated: Fri Jan 15 2021

Python implementation: CPython
Python version       : 3.8.5
IPython version      : 7.19.0

pandas  : 1.1.3
requests: 2.11.1

Watermark: 2.1.0

