In [59]:
import json
import pandas as pd
import numpy as np

import re

from sqlalchemy import create_engine
import psycopg2

# from config import db_password
from config import db_password
import time

In [60]:
# 1. Add the clean movie function that takes in the argument, "movie".
def clean_movie(movie):
    """Return a cleaned copy of one scraped Wikipedia movie record.

    Two normalisations are applied:

    * every alternate-language title field that is present is moved into a
      single ``"Alternate Titles"`` dict (added only when at least one such
      field exists);
    * fields that describe the same thing under different names are renamed
      to one canonical column name.

    Parameters
    ----------
    movie : mapping
        One movie record (field name -> value).

    Returns
    -------
    dict
        A new dict; the ``movie`` argument itself is not modified.
    """
    movie = dict(movie)  # shallow copy so the caller's record is left untouched

    # Fields that hold the title in another language / transliteration.
    alt_lang = ['Also known as', 'Arabic', 'Cantonese', 'Chinese', 'French',
                'Hangul', 'Hebrew', 'Hepburn', 'Japanese', 'Literally',
                'Mandarin', 'McCune–Reischauer', 'Original title', 'Polish',
                'Revised Romanization', 'Romanized', 'Russian',
                'Simplified', 'Traditional', 'Yiddish']
    # Pop each language field that is present and collect it under one key.
    alt_titles = {lang: movie.pop(lang) for lang in alt_lang if lang in movie}
    if alt_titles:
        movie["Alternate Titles"] = alt_titles

    # (old name, canonical name) pairs, applied strictly in this order.
    # Order matters: some renames chain ('Produced by' -> 'Producer' ->
    # 'Producer(s)'), and when several old names map to the same canonical
    # name, the last one present in the record wins.
    # The trailing spaces in the 'Productioncompan...' keys are intentional:
    # they are reproduced exactly as in the original code.
    column_renames = (
        ('Directed by', 'Director'),
        ('Distributed by', 'Distributor'),
        ('Edited by', 'Editor(s)'),
        ('Produced by', 'Producer'),
        ('Producer', 'Producer(s)'),
        ('Productioncompanies ', 'Productioncompany '),
        ('Productioncompany ', 'Production company(s)'),
        ('Country of origin', 'Country'),
        ('Adaptation by', 'Writer(s)'),
        ('Length', 'Running time'),
        ('Music by', 'Composer(s)'),
        ('Original release', 'Release date'),
        ('Original language(s)', 'Language'),
        ('Released', 'Release date'),
        ('Screen story by', 'Writer(s)'),
        ('Screenplay by', 'Writer(s)'),
        ('Story by', 'Writer(s)'),
        ('Theme music composer', 'Composer(s)'),
        ('Written by', 'Writer(s)'),
        ('Voices of', 'Narrated by'),
    )
    for old_name, new_name in column_renames:
        if old_name in movie:
            movie[new_name] = movie.pop(old_name)

    return movie

In [61]:
# 2. Add the ETL function. It takes no arguments: it reads the Wikipedia data, the Kaggle metadata,
# and the MovieLens rating data (from Kaggle) from the module-level wiki_file, kaggle_file and ratings_file paths.

def extract_transform_load():
    """Load the three raw movie datasets and clean the Wikipedia records.

    Takes no arguments. The input locations are read from the module-level
    variables ``wiki_file`` (JSON), ``kaggle_file`` (CSV) and ``ratings_file``
    (CSV), which must hold file paths when this function is called.

    The Wikipedia records are filtered, passed through ``clean_movie``,
    de-duplicated on the extracted IMDb id, stripped of mostly-empty columns,
    and the raw "Box office", "Budget", "Release date" and "Running time"
    columns are replaced by parsed ``box_office``, ``budget``,
    ``release_date`` and ``running_time`` columns.

    Returns
    -------
    tuple
        ``(wiki_movies_df, kaggle_metadata, ratings)``: the cleaned Wikipedia
        DataFrame, then the Kaggle metadata and the ratings exactly as read
        from their CSV files.
    """
    # Read in the Kaggle metadata and MovieLens ratings CSV files as DataFrames.
    kaggle_metadata = pd.read_csv(kaggle_file, low_memory=False)
    ratings = pd.read_csv(ratings_file)

    # Read the Wikipedia JSON. The encoding is given explicitly so the result
    # does not depend on the platform's default text encoding.
    with open(wiki_file, "r", encoding="utf-8") as wiki_data:
        wiki_movies_raw = json.load(wiki_data)

    # 3. Filter out TV shows: keep records with a director and an IMDb link
    #    and without an episode count.
    wiki_movies = [movie for movie in wiki_movies_raw
                   if 'No. of episodes' not in movie
                   and ('Director' in movie or 'Directed by' in movie)
                   and 'imdb_link' in movie]

    # 4. Clean every remaining record.
    wiki_movies = [clean_movie(movie) for movie in wiki_movies]

    # 5. Read the cleaned records in as a DataFrame.
    wiki_movies_df = pd.DataFrame(wiki_movies)

    # 6. Extract the IMDb id from the link and drop duplicate ids.
    #    An error here is printed rather than raised, so later steps still run.
    try:
        exp = r"(tt\d{7})"
        wiki_movies_df['imdb_id'] = wiki_movies_df['imdb_link'].str.extract(exp, expand=False)
        wiki_movies_df.drop_duplicates("imdb_id", inplace=True)
    except Exception as e:
        print(e)

    # 7. Drop the columns that are more than 90% null. The fraction is taken
    #    over the DataFrame's own rows (after de-duplication), not over the
    #    longer pre-deduplication list.
    drop_col_list = [col for col in wiki_movies_df.columns
                     if wiki_movies_df[col].isna().mean() > 0.9]
    wiki_movies_df.drop(drop_col_list, axis="columns", inplace=True)

    def join_if_list(value):
        # Some fields are lists of string fragments; flatten them to one string.
        return " ".join(value) if isinstance(value, list) else value

    # Collapses a dollar range ("$1-$2 million") to its upper bound.
    range_pattern = r'\$.*[-–—](?![a-z])'

    # 8-9. Non-null box office values, as plain strings, with ranges collapsed.
    box_office = wiki_movies_df["Box office"].dropna().apply(join_if_list)
    box_office = box_office.str.replace(range_pattern, '$', regex=True)

    # 10. "form_one": "$123.4 million" / "$1.2 billion".
    form_one = r"\$\s*\d+\.?\d*\s*[mb]illi?on"
    # 11. "form_two": "$123,456,789" (',' or '.' as the thousands separator).
    form_two = r"\$\s*\d{1,3}(?:[\.,]\d{3})+(?!\s[mb]illion)"

    # 12. Convert one extracted dollar string to a float number of dollars.
    def parse_dollars(s):
        # Anything that is not a string (e.g. NaN from a failed extract) -> NaN.
        if not isinstance(s, str):
            return np.nan

        # "$###.# million": strip "$", whitespace and letters, then scale.
        if re.match(r"\$\s*\d+\.?\d*\s*milli?on", s, flags=re.IGNORECASE):
            return float(re.sub(r"\$|\s|[a-zA-Z]", '', s)) * 10**6

        # "$###.# billion"
        if re.match(r"\$\s*\d+\.?\d*\s*billi?on", s, flags=re.IGNORECASE):
            return float(re.sub(r"\$|\s|[a-zA-Z]", '', s)) * 10**9

        # "$###,###,###": the pattern accepts both ',' and '.' as thousands
        # separators, so both must be stripped before converting. Stripping
        # only ',' made "$1.234.567" raise and "$1.234" parse as 1.234.
        if re.match(r"\$\s*\d{1,3}(?:[\.,]\d{3})+(?!\s[mb]illion)", s, flags=re.IGNORECASE):
            return float(re.sub(r"\$|\s|[,.]", '', s))

        # Unrecognised format.
        return np.nan

    dollar_pattern = f"({form_one}|{form_two})"

    # 13. Clean the box office column.
    wiki_movies_df["box_office"] = (
        box_office.str.extract(dollar_pattern, flags=re.IGNORECASE)[0].apply(parse_dollars)
    )

    # 14. Clean the budget column (same treatment, plus citation removal).
    budget = wiki_movies_df["Budget"].dropna().apply(join_if_list)
    budget = budget.str.replace(range_pattern, '$', regex=True)
    budget = budget.str.replace(r'\[\d+\]\s*', '', regex=True)  # e.g. "[4]"
    wiki_movies_df["budget"] = (
        budget.str.extract(dollar_pattern, flags=re.IGNORECASE)[0].apply(parse_dollars)
    )

    # 15. Clean the release date column. The alternatives are tried in this
    #     order at each position, most specific first.
    release_date = wiki_movies_df["Release date"].dropna().apply(join_if_list)
    date_form_one = r"(?:January|February|March|April|May|June|July|August|September|October|November|December)\s[0123]?\d,\s*\d{4}"
    date_form_two = r"\d{4}.[01]\d.[0123]\d"
    date_form_three = r"(?:January|February|March|April|May|June|July|August|September|October|November|December)\s*\d{4}"
    date_form_four = r"\d{4}"
    date_pattern = f"({date_form_one}|{date_form_two}|{date_form_three}|{date_form_four})"
    # The deprecated infer_datetime_format flag is no longer passed.
    # NOTE(review): the extracted strings can be in several formats; newer
    # pandas releases may need an explicit format= argument here - confirm
    # against the pandas version in use.
    wiki_movies_df['release_date'] = pd.to_datetime(
        release_date.str.extract(date_pattern, flags=re.IGNORECASE)[0]
    )

    # 16. Clean the running time column.
    running_time = wiki_movies_df["Running time"].dropna().apply(join_if_list)
    time_form_one = r'(\d*)\s*m'                        # group 0: minutes
    time_form_two = r'(\d+)\s*ho?u?r?s?\s*(\d*)\s*'     # groups 1, 2: hours, minutes
    running_time_extract = (
        running_time.str.extract(f"{time_form_one}|{time_form_two}", flags=re.I)
        .apply(lambda col: pd.to_numeric(col, errors='coerce'))
        .fillna(0)
    )
    # Use the plain minutes figure when there is one, else hours * 60 + minutes.
    wiki_movies_df['running_time'] = running_time_extract.apply(
        lambda row: row[1] * 60 + row[2] if row[0] == 0 else row[0], axis=1
    )

    # Drop the raw columns that now have parsed replacements.
    wiki_movies_df.drop(["Box office", "Budget", "Release date", "Running time"],
                        axis="columns", inplace=True)

    # Return the cleaned Wikipedia data plus the two untouched CSV frames.
    return wiki_movies_df, kaggle_metadata, ratings

In [62]:
# 17. Directory holding the raw data, and the path of each of the three input files.
file_dir = r"C:\Users\dhanu\Desktop\Analysis Project\Movies ETL\Movies-ETL\Resources"
wiki_file, kaggle_file, ratings_file = (
    f'{file_dir}/{file_name}'
    for file_name in (
        'wikipedia-movies.json',  # the Wikipedia data
        'movies_metadata.csv',    # the Kaggle metadata
        'ratings.csv',            # the MovieLens rating data
    )
)

In [63]:
# 18. Set the three variables equal to the function created in D1.
# NOTE(review): this rebinds wiki_file, kaggle_file and ratings_file from file paths to the
# three objects returned by extract_transform_load(). That function reads these same names
# as paths, so this cell cannot be re-run without first re-running the path cell above.
wiki_file, kaggle_file, ratings_file = extract_transform_load()

In [64]:
# 19. Set the wiki_movies_df equal to the wiki_file variable.
# After the previous cell, wiki_file holds the first value returned by
# extract_transform_load() (the cleaned Wikipedia DataFrame), not a file path.
wiki_movies_df = wiki_file

In [65]:
# 20. Preview the first five rows of wiki_movies_df to check it against the expected output.
wiki_movies_df.head(n=5)

Unnamed: 0,url,year,imdb_link,title,Based on,Starring,Cinematography,Country,Language,Director,...,Editor(s),Producer(s),Production company(s),Composer(s),Writer(s),imdb_id,box_office,budget,release_date,running_time
0,https://en.wikipedia.org/wiki/The_Adventures_o...,1990,https://www.imdb.com/title/tt0098987/,The Adventures of Ford Fairlane,"[Characters, by Rex Weiner]","[Andrew Dice Clay, Wayne Newton, Priscilla Pre...",Oliver Wood,United States,English,Renny Harlin,...,Michael Tronick,"[Steve Perry, Joel Silver]",Silver Pictures,"[Cliff Eidelman, Yello]","[David Arnott, James Cappe]",tt0098987,21400000.0,20000000.0,1990-07-11,102.0
1,"https://en.wikipedia.org/wiki/After_Dark,_My_S...",1990,https://www.imdb.com/title/tt0098994/,"After Dark, My Sweet","[the novel, After Dark, My Sweet, by, Jim Thom...","[Jason Patric, Rachel Ward, Bruce Dern, George...",Mark Plummer,United States,English,James Foley,...,Howard E. Smith,"[Ric Kidney, Robert Redlin]",Avenue Pictures,Maurice Jarre,"[James Foley, Robert Redlin]",tt0098994,2700000.0,6000000.0,1990-05-17,114.0
2,https://en.wikipedia.org/wiki/Air_America_(film),1990,https://www.imdb.com/title/tt0099005/,Air America,"[Air America, by, Christopher Robbins]","[Mel Gibson, Robert Downey Jr., Nancy Travis, ...",Roger Deakins,United States,"[English, Lao]",Roger Spottiswoode,...,"[John Bloom, Lois Freeman-Fox]",Daniel Melnick,"[Carolco Pictures, IndieProd Company]",Charles Gross,"[John Eskow, Richard Rush]",tt0099005,57718089.0,35000000.0,1990-08-10,113.0
3,https://en.wikipedia.org/wiki/Alice_(1990_film),1990,https://www.imdb.com/title/tt0099012/,Alice,,"[Alec Baldwin, Blythe Danner, Judy Davis, Mia ...",Carlo Di Palma,United States,English,Woody Allen,...,Susan E. Morse,Robert Greenhut,,,Woody Allen,tt0099012,7331647.0,12000000.0,1990-12-25,106.0
4,https://en.wikipedia.org/wiki/Almost_an_Angel,1990,https://www.imdb.com/title/tt0099018/,Almost an Angel,,"[Paul Hogan, Elias Koteas, Linda Kozlowski]",Russell Boyd,US,English,John Cornell,...,David Stiven,John Cornell,,Maurice Jarre,Paul Hogan,tt0099018,6939946.0,25000000.0,1990-12-19,95.0


In [66]:
# 21. List the column names of wiki_movies_df to check they are correct.
wiki_movies_df.columns.tolist()

['url',
 'year',
 'imdb_link',
 'title',
 'Based on',
 'Starring',
 'Cinematography',
 'Country',
 'Language',
 'Director',
 'Distributor',
 'Editor(s)',
 'Producer(s)',
 'Production company(s)',
 'Composer(s)',
 'Writer(s)',
 'imdb_id',
 'box_office',
 'budget',
 'release_date',
 'running_time']