In [3]:
import os
import random

import pandas as pd
import requests

# Download Necessary Files

In [4]:
# IMDb dataset dumps this notebook reads; each is fetched once and reused on re-runs.
required_files = ["title.ratings.tsv.gz", "title.basics.tsv.gz"]

for file in required_files:
    # Skip anything already on disk so re-running the cell is cheap.
    if os.path.isfile(file):
        continue

    # Download to a temporary name first: if the transfer fails half-way, no
    # truncated "<file>" is left behind for the isfile() guard to accept later.
    partial_path = f"{file}.part"
    with requests.get(
        f"https://datasets.imdbws.com/{file}",
        allow_redirects=True,
        stream=True,   # write chunk by chunk instead of buffering the whole archive
        timeout=60,    # without a timeout a stalled connection hangs the cell forever
    ) as response:
        # Fail loudly on 4xx/5xx rather than saving an error page as a .gz file.
        response.raise_for_status()
        with open(partial_path, "wb") as new_file:
            for chunk in response.iter_content(chunk_size=1 << 20):
                new_file.write(chunk)

    # Only a fully written download is moved to its final name.
    os.replace(partial_path, file)

# Read Movie Ratings

In [11]:
# Load the tab-separated ratings dump; the preview below uses its
# tconst / averageRating / numVotes columns.
movie_df = pd.read_csv(
    "title.ratings.tsv.gz",
    sep="\t",
    low_memory=False,
)

In [23]:
# Preview: the 20 best-rated titles among those with at least 500,000 votes.
(
    movie_df
    .loc[lambda ratings: ratings["numVotes"] >= 500000]
    .sort_values(by="averageRating", ascending=False)
    .head(20)
)

Unnamed: 0,tconst,averageRating,numVotes
414055,tt0903747,9.5,1444369
1022193,tt7366338,9.4,528380
81859,tt0111161,9.3,2330926
423954,tt0944947,9.3,1751938
45772,tt0068646,9.2,1611141
601971,tt1475582,9.1,799630
48300,tt0071562,9.0,1124518
29728,tt0050083,9.0,686161
246260,tt0468569,9.0,2292480
79877,tt0108778,8.9,816664


In [8]:
# Keep the raw ratings table under its own name so later cells can rebuild
# `movie_df` from it without reading the file again.
movie_df_ = pd.read_csv(
    "title.ratings.tsv.gz",
    sep="\t",
    low_memory=False,
)

In [10]:
# Build the candidate list from the raw ratings table:
#   1. take the 300 titles with the most votes,
#   2. keep those rated 7.0 or higher,
#   3. drop the rating columns, leaving a frame indexed by tconst.
# Every step returns a new frame, so `movie_df_` itself is left untouched.
movie_df = (
    movie_df_
    .sort_values(by="numVotes", ascending=False)
    .head(300)
    .loc[lambda most_voted: most_voted["averageRating"] >= 7.0]
    .drop(columns=["numVotes", "averageRating"])
    .set_index("tconst")
)

# Read Movie Metadata

In [None]:
# Load the raw title metadata table once; later cells derive `extra_data`
# from it so this slow read does not have to be repeated.
extra_data_ = pd.read_csv(
    "title.basics.tsv.gz",
    sep="\t",
    low_memory=False,
)

In [72]:
# Look up metadata for the selected titles and keep only non-adult movies.
# set_index returns a new frame, so the raw `extra_data_` table is not modified.
basics_by_id = extra_data_.set_index('tconst')

# `.loc` with a list of labels raises KeyError if any label is absent. The
# ratings and metadata files are downloaded separately, so restrict the lookup
# to IDs present in both, keeping `movie_df`'s order.
known_ids = movie_df.index[movie_df.index.isin(basics_by_id.index)]
extra_data = basics_by_id.loc[known_ids]

# Compare numerically: depending on what pandas infers for the column, isAdult
# may hold the string "0" or the integer 0, and `== "0"` matches nothing in the
# latter case. Unparseable values become NaN and are filtered out.
extra_data = extra_data[pd.to_numeric(extra_data["isAdult"], errors="coerce") == 0]
extra_data = extra_data[extra_data["titleType"] == 'movie']

# Keep only the columns the later cells use.
extra_data = extra_data[["startYear", "runtimeMinutes", "primaryTitle"]]

# Filtering

In [64]:
# Narrow the metadata down to the final movie list:
#   - drop rows where any remaining column equals the literal '\N'
#     (presumably IMDb's marker for a missing value -- confirm against the dataset docs),
#   - keep titles running longer than 70 minutes, then discard the runtime column,
#   - keep titles whose start year is after 1995,
#   - expose the index (tconst) as a regular `movieID` column.
# The '\N' rows must go first so the astype(int) conversions cannot fail on them.
extra_data = (
    extra_data
    .loc[lambda titles: (titles != '\\N').all(axis=1)]
    .loc[lambda titles: titles["runtimeMinutes"].astype(int) > 70]
    .drop(columns="runtimeMinutes")
    .loc[lambda titles: titles["startYear"].astype(int) > 1995]
    .assign(movieID=lambda titles: titles.index)
)

# Add Poster URL with OMDB API

In [55]:
# OMDb API keys come from the environment instead of being committed with the
# notebook. OMDB_API_KEYS is a comma-separated list; one key is picked at
# random per request to spread the calls across keys.
apikeys = [key.strip() for key in os.environ.get("OMDB_API_KEYS", "").split(",") if key.strip()]
if not apikeys:
    raise RuntimeError(
        "Set the OMDB_API_KEYS environment variable to a comma-separated list of OMDb API keys."
    )

poster_urls = []

for movie_id in extra_data.index:
    apikey = random.choice(apikeys)
    response = requests.get(
        "https://www.omdbapi.com/",               # HTTPS keeps the key out of plain-text traffic
        params={"apikey": apikey, "i": movie_id},  # let requests build and encode the query string
        timeout=30,                                # a stalled request must not hang the cell
    )
    response.raise_for_status()
    payload = response.json()

    # A response without "Poster" would otherwise surface as a bare KeyError;
    # report which title failed and any error text the API supplied.
    if "Poster" not in payload:
        raise RuntimeError(
            f"OMDb returned no poster for {movie_id}: {payload.get('Error', payload)}"
        )

    # Rewrite the size suffix in the poster URL from 300 to 500.
    poster_urls.append(payload["Poster"].replace("300.jpg", "500.jpg"))

# One URL per row, in index order, so the list lines up with the frame.
extra_data["poster_url"] = poster_urls

KeyError: 'poster_url'

# Export Data as JSON

In [98]:
# Find the longest title in a single pass instead of sorting the whole column.
# On ties max() keeps the first title in row order, which is what the stable
# reverse sort returned before.
longest_title = max(extra_data["primaryTitle"], key=len)
longest_title, len(longest_title)

('Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb', 68)

In [73]:
# Display the current contents of `extra_data` as a sanity check before exporting.
extra_data

Unnamed: 0_level_0,startYear,runtimeMinutes,primaryTitle
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tt0111161,1994,142,The Shawshank Redemption
tt0468569,2008,152,The Dark Knight
tt1375666,2010,148,Inception
tt0137523,1999,139,Fight Club
tt0110912,1994,154,Pulp Fiction
...,...,...,...
tt0790636,2013,117,Dallas Buyers Club
tt0449059,2006,101,Little Miss Sunshine
tt2179136,2014,133,American Sniper
tt3890160,2017,113,Baby Driver


In [68]:
# Write the final movie list to top_movies.json in "split" orientation,
# leaving the index out of the file.
extra_data.to_json(
    path_or_buf="top_movies.json",
    orient="split",
    index=False,
)

# Libraries Used

In [8]:
!pip install watermark;



In [9]:
# Record the environment this notebook was run in. The report printed below
# lists the last-updated date, the Python and IPython versions, the versions of
# the imported packages, and the watermark version itself.
%load_ext watermark
%watermark -n -u -v -iv -w

Last updated: Mon Jan 11 2021

Python implementation: CPython
Python version       : 3.8.5
IPython version      : 7.19.0

pandas  : 1.1.3
requests: 2.24.0

Watermark: 2.1.0

