In [None]:
import json
import pandas as pd
import numpy as np

In [None]:
file_dir = './Data/'


In [None]:
# The records contain non-ASCII text (e.g. the 'McCune–Reischauer' key used
# later on), so decode explicitly as UTF-8 rather than relying on the
# platform's default text encoding.
with open(f'{file_dir}wikipedia-movies.json', mode='r', encoding='utf-8') as file:
    wiki_movies_raw = json.load(file)

In [None]:
len(wiki_movies_raw)

### *We should always take a look at a few individual records to make sure that the data didn't come in horribly garbled.*

In [None]:
# First 5 records
wiki_movies_raw[:5]

In [None]:
# Last 5 records (a negative start index counts from the end of the list)
wiki_movies_raw[-5:]

In [None]:
# Some records in the middle
wiki_movies_raw[3600:3605]

# Extract 

### Extract the Kaggle Data

In [None]:
# Load the Kaggle movie metadata and the ratings from CSV files under file_dir.
# low_memory=False makes pandas parse the metadata file in one pass instead of
# in chunks, so each column's dtype is inferred from all of its values.
kaggle_metadata = pd.read_csv(f'{file_dir}movies_metadata.csv', low_memory=False)
ratings = pd.read_csv(f'{file_dir}ratings.csv')

### Inspect the data

In [None]:
kaggle_metadata.head()

In [None]:
kaggle_metadata.tail()

In [None]:
kaggle_metadata.sample(5)

In [None]:
ratings.head()

In [None]:
ratings.tail()

In [None]:
ratings.sample(5)

# Transform

In [None]:
wiki_movies_raw_df = pd.DataFrame(wiki_movies_raw)
wiki_movies_raw_df.head()

#### That's a lot of columns! Even if we try `print(wiki_movies_raw_df.columns)`, they won't all print out. We'll have to convert `wiki_movies_raw_df.columns` to a list to see all of the columns.

#### Use `wiki_movies_raw_df.columns.tolist()` and run the cell to see all of the column names that were imported.

In [None]:
wiki_movies_raw_df.columns.tolist()

### Use List Comprehensions to Filter Data



In [None]:
# Keep only the records that name a director (under either the 'Director' or
# the 'Directed by' key) and that also carry an 'imdb_link' key.
wiki_movies = [
    record for record in wiki_movies_raw
    if 'imdb_link' in record
    and any(key in record for key in ('Director', 'Directed by'))
]
len(wiki_movies)

In [None]:
# create a dataframe out of wiki_movies

wiki_movies_df = pd.DataFrame(wiki_movies)
wiki_movies_df.head()

In [None]:
# Re-filter the raw records, this time also dropping TV shows (any record with
# a 'No. of episodes' key) while still requiring a director and an 'imdb_link'.
wiki_movies = [movie for movie in wiki_movies_raw
                if ('Director' in movie or 'Directed by' in movie)
                        and 'imdb_link' in movie
                        and 'No. of episodes' not in movie]

# Rebuild the DataFrame from the new list; otherwise wiki_movies_df would still
# hold the rows selected before TV shows were removed.
wiki_movies_df = pd.DataFrame(wiki_movies)
len(wiki_movies)

In [None]:
wiki_movies_df.head()

In [None]:
def clean_movie(movie):
    """Return a new dict built from ``movie`` so the caller's object is not modified.

    The copy is shallow: the new dict has its own set of keys, but the values
    are the same objects as in ``movie``.
    """
    return dict(movie)

In [None]:
wiki_movies_df[wiki_movies_df['Arabic'].notnull()]

In [None]:
# URLs of the rows whose 'Arabic' column is populated, selected in one .loc step
wiki_movies_df.loc[wiki_movies_df['Arabic'].notnull(), 'url']

In [None]:
sorted(wiki_movies_df.columns.tolist())

### Handle the Alternative Titles 


In [None]:
# Make an empty dict to hold all of the alternative titles.
# Loop through a list of all alternative title keys.
# Check if the current key exists in the movie object.
# If so, remove the key-value pair and add to the alternative titles dictionary.

def clean_movie(movie):
    """Return a copy of ``movie`` with its alternative-title keys consolidated.

    Every alternative-title key found in the record (language names,
    romanization schemes, 'Also known as', ...) is removed from the top level
    and collected into one dictionary stored under the ``'alt_titles'`` key.
    Records without any of those keys come back as a plain copy, with no
    ``'alt_titles'`` key added.

    Parameters
    ----------
    movie : dict
        A single movie record. It is not modified.

    Returns
    -------
    dict
        A shallow copy of ``movie`` with the alternative titles regrouped.
    """
    movie = dict(movie)  # create a non-destructive copy
    alt_titles = {}
    for key in ['Also known as','Arabic','Cantonese','Chinese','French',
                'Hangul','Hebrew','Hepburn','Japanese','Literally',
                'Mandarin','McCune–Reischauer','Original title','Polish',
                'Revised Romanization','Romanized','Russian',
                'Simplified','Traditional','Yiddish']:
        if key in movie:
            # pop() removes the key and hands back its value in one step
            alt_titles[key] = movie.pop(key)

    # Keep the collected titles on the record; without this they would be
    # dropped from the data entirely.
    if alt_titles:
        movie['alt_titles'] = alt_titles

    return movie

In [None]:
# list of cleaned movies with a list comprehension
clean_movies = [clean_movie(movie) for movie in wiki_movies]

In [None]:
# Set wiki_movies_df to be the DataFrame created from clean_movies, and print out a list of the columns.
wiki_movies_df = pd.DataFrame(clean_movies)
sorted(wiki_movies_df.columns.tolist())

In [90]:
   def change_column_name(old_name, new_name):
        if old_name in movie:
            movie[new_name] = movie.pop(old_name)