# Data Enrichment Project

# Imports & Data Cleaning


In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import os, time, json, math
import tmdbsimple as tmdb
from tqdm.notebook import tqdm_notebook

In [2]:
# defining function for movie rating
# CITING: Using TMDB API from LP
"""Adapted from https://github.com/celiao/tmdbsimple"""

def get_movie_with_rating(movie_id):
    """Fetch a movie's TMDB info and attach its US certification, if any.

    Args:
        movie_id: Identifier handed straight to ``tmdb.Movies`` (this
            notebook passes IMDb ``tconst`` values).

    Returns:
        The dict returned by ``movie.info()``. A ``'certification'`` key is
        added only when the release data has an entry whose ``iso_3166_1``
        is ``'US'``; when several entries match, the last one listed wins.
    """
    movie = tmdb.Movies(movie_id)
    details = movie.info()

    # Gather every US certification in listing order.
    us_certifications = [
        country['certification']
        for country in movie.releases()['countries']
        if country['iso_3166_1'] == 'US'
    ]
    if us_certifications:
        details['certification'] = us_certifications[-1]

    return details

In [3]:
# defining function for writing json
# CITING: Efficient TMDB API calls from LP
"""Adapted from https://www.geeksforgeeks.org/append-to-json-file-using-python/"""

def write_json(new_data, filename):
    """Append ``new_data`` to the JSON list stored in ``filename``.

    The file must already exist and hold valid JSON. Its content is loaded,
    updated in memory, and written back over the same file.

    Args:
        new_data: A list of records (each item is added to the stored list)
            or a single record (added as one item).
        filename: Path of the existing JSON file to update.

    Raises:
        FileNotFoundError: If ``filename`` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
        AttributeError: If the stored JSON value is not a list.
    """
    with open(filename, 'r+') as file:
        file_data = json.load(file)

        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)

        # Rewind and overwrite from the start of the file. Truncating after
        # the dump guarantees no bytes of the previous content survive past
        # the end of the new JSON text.
        file.seek(0)
        json.dump(file_data, file)
        file.truncate()

In [4]:
# Download URLs for the IMDb dataset dumps read in the next cell; each one is
# a gzipped, tab-separated file.
basic_gz = 'https://datasets.imdbws.com/title.basics.tsv.gz'
ratings_gz = 'https://datasets.imdbws.com/title.ratings.tsv.gz'
akas_gz = 'https://datasets.imdbws.com/title.akas.tsv.gz'

In [5]:
# Read each dump straight from its URL into its own DataFrame. The files are
# tab-separated; low_memory=False has pandas parse each file in one pass
# rather than in chunks, which avoids mixed-type inference per column.
basics = pd.read_csv(basic_gz, sep='\t', low_memory=False)
ratings = pd.read_csv(ratings_gz, sep='\t', low_memory=False)
akas = pd.read_csv(akas_gz, sep='\t', low_memory=False)

KeyboardInterrupt: 

**Basics DF Data Cleanse**

In [None]:
# Preview the first rows of the raw basics table.
basics.head()

In [None]:
# Column dtypes and non-null counts before any cleaning.
basics.info()

In [None]:
# The raw file marks missing values with the literal string '\N';
# replace it with np.nan so dropna() can act on it.
basics = basics.replace({'\\N':np.nan})
basics.head()

In [None]:
# Drop rows that have no runtimeMinutes value.
basics = basics.dropna(subset=['runtimeMinutes'])
basics.info()

In [None]:
# Drop rows that have no genres value.
basics = basics.dropna(subset=['genres'])
basics.info()

In [None]:
# Inspect the titleType categories before keeping only 'movie'.
basics['titleType'].value_counts()

In [None]:
# Keep only rows whose titleType is exactly 'movie', then re-check the counts.
basics = basics[basics.titleType == 'movie']
basics['titleType'].value_counts()

In [None]:
basics.head()

In [None]:
basics.info()

In [None]:
# First step of the startYear filter: drop rows with no startYear so the
# column can be converted to integers.
basics = basics.dropna(subset=['startYear'])
basics.info()

In [None]:
# Convert startYear to int so it can be compared numerically.
basics['startYear'] = basics['startYear'].astype(int)
basics.info()

In [None]:
# Keep only titles whose startYear lies in the 2000-2022 window (both ends
# inclusive). The upper bound is enforced as well as the lower one, so
# titles dated after 2022 are excluded too.
basics = basics[basics['startYear'].between(2000, 2022)]
basics.head()

In [None]:
# Row count and dtypes after the startYear filter.
basics.info()

In [None]:
# Eliminate movies that include "Documentary" in genre.
# The match is a case-insensitive substring test on the genres string; the
# resulting boolean mask is inverted to keep the non-matching rows.
documentary_pos = basics['genres'].str.contains('documentary',case=False)
basics = basics[~documentary_pos]
basics.head()

In [None]:
# Row count and dtypes after removing documentaries.
basics.info()

In [None]:
# Keep only US movies.
# The akas table has not been reduced to US rows at this point in the
# notebook, so the US title ids are selected explicitly here. Testing
# membership against every akas titleId would keep any title that has an
# akas row for any region.
keepers = basics['tconst'].isin(akas.loc[akas['region'] == 'US', 'titleId'])
keepers

In [None]:
# Apply the boolean mask computed in the previous cell.
basics = basics[keepers]
basics.head()

In [None]:
# Row count and dtypes after applying the mask.
basics.info()

**AKAs DF Data Cleanse**

In [None]:
# Preview the first rows of the raw akas table.
akas.head()

In [None]:
# Column dtypes and non-null counts before any cleaning.
akas.info()

In [None]:
# Replace the literal '\N' missing-value marker with np.nan.
akas = akas.replace({'\\N':np.nan})
akas.head()

In [None]:
# Inspect the region codes before keeping only US rows.
akas['region'].value_counts()

In [None]:
# Drop rows that have no region value.
akas = akas.dropna(subset=['region'])
akas.info()

In [None]:
# Keep only rows whose region code is exactly 'US'. An exact comparison is
# used because a case-insensitive substring test would also accept any
# other region code that merely contains the letters "us".
is_us = akas['region'] == 'US'
akas = akas[is_us]
akas.head()

In [None]:
# Row count and dtypes of akas after the region filter.
akas.info()

**Ratings DF Data Cleanse**

In [None]:
# Preview the first rows of the raw ratings table.
ratings.head()

In [None]:
# Column dtypes and non-null counts before filtering.
ratings.info()

In [None]:
# Keep only US movies: build a boolean mask marking the ratings rows whose
# tconst appears among the titleId values of the akas table (already
# filtered by region in the cells above).
keepers1 = ratings['tconst'].isin(akas['titleId'])
keepers1

In [None]:
# Apply the mask.
ratings = ratings[keepers1]
ratings.head()

In [None]:
# Row count and dtypes after the filter.
ratings.info()

**Saving csv file**

In [None]:
# Make sure the output folder exists before writing: the notebook only
# creates Data/ in a later cell, and to_csv fails when the target directory
# is missing.
os.makedirs('Data', exist_ok=True)
basics.to_csv('Data/title.basics.csv.gz',compression='gzip',index=False)

In [None]:
# Read the saved file back in and preview it.
basics = pd.read_csv('Data/title.basics.csv.gz',low_memory=False)
basics.head()

**Loading API Credentials**

In [None]:
# Load the TMDB credentials from the current user's home directory instead
# of an absolute path tied to one account, so the notebook runs unchanged
# for any user who keeps the file at ~/.secret/tmdb_api.json.
with open(os.path.expanduser('~/.secret/tmdb_api.json'),'r') as f:
    login = json.load(f)
# Display only the key names, never the secret values.
login.keys()

In [None]:
# Register the key stored under 'api-key' with tmdbsimple.
tmdb.API_KEY = login['api-key']

**Designating folder for API data**

In [None]:
# Folder used for the API result files. Create it if it is missing
# (exist_ok=True makes this a no-op when it already exists), then list its
# current contents.
FOLDER = 'Data/'
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

**Loading in dataframe**

In [None]:
# Load the cleaned basics table saved earlier in the notebook.
basics = pd.read_csv('Data/title.basics.csv.gz')
basics.head()

In [None]:
# defining lists of years to extract & errors list
# YEARS_TO_GET: years iterated by the outer loop below.
# errors: collects [movie_id, exception] pairs for failed API calls.
# YEAR / YEAR1: module-level year values matching the two entries of
# YEARS_TO_GET.
YEARS_TO_GET = [2000,2001]
errors = []
YEAR = 2000
YEAR1 = 2001

In [None]:
# Outer loop: one results file per year. Every per-year step (file setup,
# id selection, API calls) lives inside the loop body and uses the loop
# variable YEAR, so each year in YEARS_TO_GET is processed against its own
# file.
for YEAR in tqdm_notebook(YEARS_TO_GET, desc='Years', position=0):

    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'

    file_exists = os.path.isfile(JSON_FILE)

    # Seed a new results file with a placeholder record so it always holds
    # a JSON list whose records carry an 'imdb_id' field.
    if not file_exists:
        with open(JSON_FILE,'w') as f:
            json.dump([{'imdb_id':0}],f)

    # Movies released in the year currently being processed. The comparison
    # is against the plain integer YEAR, not a set literal.
    df = basics.loc[basics['startYear'] == YEAR].copy()
    movie_ids = df['tconst'].copy()

    # Skip ids already stored in the results file by an earlier run.
    previous_df = pd.read_json(JSON_FILE)
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

    # Inner loop: fetch each remaining movie and append it to the file.
    for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        try:
            temp = get_movie_with_rating(movie_id)
            write_json(temp,JSON_FILE)
            # Short pause between successful calls to avoid flooding the API.
            time.sleep(0.02)

        except Exception as e:
            # Best effort: record the failure and move on to the next id.
            errors.append([movie_id,e])