# Automate the Cleaning Process

In [1]:
import os
import json
import pandas as pd
import numpy as np
import re

# Dependencies required for the uploading process
# import time
# from sqlalchemy import create_engine
# import psycopg2
# from config import db_password




# Functions created during the initial cleaning
- Each one takes a string as input:
 - cleaning_movie
     - reduces the sheer variety of keys the movie object has in wiki_movie
 - find_dollars
     - finds the dollar values hidden amoung the strings and lists
 - parse_dollars
     - converts the dollar values found by find_dollars into numeric values
 - find_dates
     - finds dates hidden in strings and lists to be converted to datetime objects with pd.to_datetime
 - find_time
     - finds the movie duration in minutes from amoung the strings and lists and converts them into numeric values

### cleaning_movie

In [2]:
def cleaning_movie(movie):
    """wiki_movie_file is a json file containing a list of dictionaries with each index as a different movie 
    with different key:value pairs. This function is specially designed to clean that specific dataset. 
    Wikipedia is managed by many people and they do not all use the same words.
    """
    alternate_titles = dict()
    fixed_movie = dict(movie)
###  Start with alternate titles stored in language keys and merge them into one.
###  Language keys i could find:      
    language_keys = ['Also known as','Arabic','Cantonese','Chinese','French', 'Hangul','Hebrew','Hepburn','Japanese','Literally',
        'Mandarin','McCune–Reischauer','Original title','Polish', 'Revised Romanization','Romanized','Russian',
        'Simplified','Traditional','Yiddish']
    for key in language_keys:
        if key in fixed_movie.keys():
            alternate_titles[key] = fixed_movie[key]
            fixed_movie.pop(key)
        else:
            pass
##  if there were alternate titles, add them to the movie.       
    if len(alternate_titles) > 0:
        fixed_movie['alternate_titles'] = alternate_titles
    else:
        pass
        
###  Alternative titles are fixed, now merge columns that are similar.     
    keys_to_merge = {'Director':'Directed by', 'Country': 'Country of origin', 'Distributor(s)':'Distributed by',
                     'Editor(s)':'Edited by',  'Language':'Original language(s)', 'Producer(s)':'Produced by',
                     'Genre(s)': 'Genre', 'Composer(s)': ['Music by', 'Theme music composer'], 
                     'Release date': ['Release(s)', 'Original release'], 'Distributor(s)':['Distributed by','Distributor'],
                     'Writer(s)':['Written by', 'Story by', 'Screenplay by', 'Screen story by', 'Adaptation by'],
                     'Production Comapany': ['Production company', 'Production company(s)', 'Productioncompanies ','Productioncompany ']  }
##  item = key, from the key:value pair and the key I want; values = the movie key(s) I do not want. 
    for item in keys_to_merge: 
        if type(keys_to_merge[item]) == type(list()):
            for n in keys_to_merge[item]:
                if n in fixed_movie.keys():
                    fixed_movie[item] = fixed_movie.pop(n)
                else:
                    pass
        else: 
            if keys_to_merge[item] in fixed_movie.keys():
                fixed_movie[item] = fixed_movie.pop(keys_to_merge[item])
            else:
                pass
    
    return fixed_movie



### find_dollars

In [3]:
def find_dollars(s):
    # declared patterns
    # searching for pattern that is similar to: "$45.3 million/billion"
    p1 = r'(\$\s?\d+\.?\d*\s*[bm]illi?on)'      
    # searching for pattern that is similar to $123,456,789.0
    p2 = r'(\$\s?\d+(?:[,\.]\d{3})+\.?\d*)(?!\s*[bm]illi?on)' 
    # modified pattern #1; includes a range $43.5-45.7 Million
    p3 = r"(\$\d*\.?\d*)(?:[-—–]\d*\.?\d*)(\s[bm]illi?on)" # modified pattern #1 version 3
    # searcing for $100 to $999999 (no commas)
    p4 = r"(\$\d{3,6})" 
    if len(re.findall(p1, str(s), flags=re.IGNORECASE)) == 1:
        num = re.findall(p1, str(s), flags=re.IGNORECASE)[0]
        return num
    elif len(re.findall(p2, str(s), flags=re.IGNORECASE)) == 1:
        num = re.findall(p2, str(s), flags=re.IGNORECASE)[0]
        return num
    elif len(re.findall(p1, str(s), flags=re.IGNORECASE)) > 1: 
        num = re.findall(p1, str(s), flags=re.IGNORECASE)[1]
        return num
    elif len(re.findall(p2, str(s), flags=re.IGNORECASE)) > 1:  
        num = re.findall(p2, str(s), flags=re.IGNORECASE)[1]
        return num
    elif len(re.findall(p3, str(s), flags=re.IGNORECASE)) > 0:
        fix = re.findall(p3, str(s), flags=re.IGNORECASE)
        num = fix[0][0] + fix[0][1]
        return num
    elif len(re.findall(p4, str(s), flags=re.IGNORECASE)) > 0:
        num = re.findall(p4, str(s), flags=re.IGNORECASE)[0].replace(" ", "")
        return num
    else:
        return np.nan

### parse_dollars

In [4]:
# Created during initial cleaning for budget and revenue/box office
# parse the dollar values found with find_dollars() and durn them into numeric values
def parse_dollars(s):   
    # patterns to change into numbers
    p1 = "(\$\d+\.?\d*\s*milli?on)" # pulled from pattern #1 above
    p2 = "(\$\d+\.?\d*\s*billi?on)" # pulled from pattern #1 above
    p3 = "(\$\d+(?:[,\.]\d{3})+\.?\d*)"  # pulled from pattern #2 above
    
    # if s is string, then change it, otherwise np.nan
    if type(s) == str:
  
        # if input is of the form $###.## Million, pattern #1
        if re.match(p1, s, flags=re.IGNORECASE):
            # Remove dollar sign and "million", convert to float and multiply by 1,000,000
            num = float(re.search("(\d+\.?\d*)", s)[0])*1000000
            
            # return value
            return num
            
        # elif input is of the form $###.## Billion, formerly pattern #1 also, now pattern #2
        elif re.match(p2, s, flags=re.IGNORECASE):
        # Remove dollar sign and "billion", convert to float and multiply by 1,000,000,000
            num = float(re.search("(\d+\.?\d*)", s)[0])*1000000000
            # return value
            return num

            
        # elif input is of the form $###,###,###, formerly pattern #2, now pattern #3
        elif re.match(p3, s, flags=re.IGNORECASE):
            # Remove dollar sign and commas, convert to float
            num = float(re.findall(r"(?:\$)(\d+(?:[,\.]\d{3})+\.?\d*)", s)[0].replace(",", ""))

            # return value
            return num
        else: 
            if len(s) <= 6:
                if re.match("(\$\d{1,6}$)", s, flags=re.IGNORECASE):
                    num = float(s.replace("$", ""))
                    return num

    else: 
        return np.nan


### find_dates

In [5]:
def find_dates(s):
    # declared patterns
    p1 = r"(?:January|February|March|April|May|June|July|August|September|October|November|December)\s*[123]?\d,\s*\d{4}"
    p2 = r"[123]?\d\s*(?:January|February|March|April|May|June|July|August|September|October|November|December)\s*\d{4}"
    p3 = r"(?:January|February|March|April|May|June|July|August|September|October|November|December),?\s*\d{4}"
    p4 = r"(\d{4})(?!\S)"
    p5 = r"(\d{4}-[01]?\d-[0123]?\d)"
    if re.findall(p1, str(s), flags=re.IGNORECASE):  # Month DD, YYYY
        spam = re.findall(p1, str(s), flags=re.IGNORECASE)[0]
        return spam
    elif re.findall(p2, str(s), flags=re.IGNORECASE): # DD Month YYYY
        spam = re.findall(p2, str(s), flags=re.IGNORECASE)[0]
        return spam
    elif re.findall(p3, str(s), flags=re.IGNORECASE): # Month YYYY
        spam = re.findall(p3, str(s), flags=re.IGNORECASE)[0]
        return spam
    elif re.findall(p5, str(s), flags=re.IGNORECASE): # YYYY-MM-DD
        spam = re.findall(p5, str(s), flags=re.IGNORECASE)[0]
        return spam
    elif re.findall(p4, str(s), flags=re.IGNORECASE): # YYYY
        spam = re.findall(p4, str(s), flags=re.IGNORECASE)[0]
        return spam
    else:
        return np.nan



### find_time

In [6]:
def find_time(s):
    # Declared patterns
    p1 = r"(\d+)(?:', ')?\s*min\.?(?:utes)?" # 120 minutes, 120 min, 120 min., '120', 'min'
    p2 = r"((?:\d+\s*h)?\s*\d+m)"# 1h 48m, 70m
    p3 = r"(\d+\s*hours?)" # one record says "4 Hours"
    if type(s) == float:
        return s
    if re.findall(p1, str(s), flags=re.IGNORECASE): # 120 minutes, 120 min, 120 min., '120', 'min'
        spam = int(re.findall(p1, str(s), flags=re.IGNORECASE)[0])
        return spam
    elif re.findall(p2, str(s), flags=re.IGNORECASE): # 1h 48m, 70m
        spam = re.findall(p2, str(s), flags=re.IGNORECASE)[0]
        egg = int(re.findall(r"(\d+)m$", spam, flags=re.IGNORECASE)[0])
        try:
            hours = int(re.findall(r"(\d+)h", spam, flags=re.IGNORECASE)[0])*60 
            egg+=hours
        except:
            pass

        return egg
    elif re.findall(p3, str(s), flags=re.IGNORECASE): # 1 record says "4 hours" 
        spam = re.findall(p3, str(s), flags=re.IGNORECASE)[0]
        egg = int(re.findall(r"(\d+)\s*h", spam, flags=re.IGNORECASE)[0])*60
        return egg
    else:
        return float(np.nan)

# Automate the Process

In [88]:
# 1 Add the function that takes in three arguments;
# Wikipedia data, Kaggle metadata, and MovieLens rating data (from Kaggle)

def extract_transform_load(wiki_json, kaggle_csv, ratings_csv):
    # Read in the kaggle metadata and MovieLens ratings CSV files as Pandas DataFrames.
    kaggle_df = pd.read_csv(kaggle_csv, sep=",", header=0, low_memory=False)
    ratings_df = pd.read_csv(ratings_csv, sep=",", header=0)

    # Open the read the Wikipedia data JSON file.
    with open(wiki_json, 'r') as file:
        wiki = json.load(file)
        file.close()
    
    # Write a list comprehension to filter out TV shows.
    wiki_movies = [movie for movie in wiki if (('imdb_link' and ('Directed by' or 'Director')) in movie.keys()) and ('No. of episodes' not in movie.keys())]
    # Write a list comprehension to iterate through the cleaned wiki movies list
    # and call the clean_movie function on each movie. Read in the cleaned movies list as a DataFrame.
    
    cleaned_movies = [cleaning_movie(movie) for movie in wiki_movies]
    wiki_df = pd.DataFrame(cleaned_movies)
    
    # Write a try-except block to catch errors while extracting the IMDb ID using a regular expression string and
    # dropping any imdb_id duplicates. If there is an error, capture and print the exception.
    imdb_pattern = r"(tt\d{7})"
    try:
        wiki_df['imdb_id'] = wiki_df['imdb_link'].str.extract(imdb_pattern)
    except TypeError: 
        print("imdb_pattern: TypeError, data cell type not string")
    except KeyError: 
        print("'imdb_id' or 'imdb_link' keys not available in wiki_movies")
    except: 
        print("Differnt Error")
    
    wiki_df = pd.DataFrame(wiki_df.drop_duplicates(subset='imdb_id'))
        
    # Write a list comprehension to keep the columns that don't have null values from the wiki_movies_df DataFrame.
    columns_to_keep = [column for column in wiki_df if wiki_df[column].isnull().sum()< len(wiki_df)*0.9]
    wiki_df = wiki_df[columns_to_keep]

    # Create a variable that will hold the non-null values from the “Box office” column.
    box_office = wiki_df['Box office'].dropna()

    # Convert the box office data created in Step 8 to string values using the lambda and join functions.
    box_office = box_office.apply(lambda x: ''.join(x) if type(x) != str else x)

    # Clean the box office column in the wiki_movies_df DataFrame with the parse_dollars function.
    wiki_df['box_office'] =  box_office.apply(lambda x: parse_dollars(find_dollars(x)))
    
    # Clean the budget column in the wiki_movies_df DataFrame.
    budget = wiki_df['Budget'].dropna()
    budget = budget.apply(lambda x: ''.join(x) if type(x) != str else x)
    wiki_df['budget'] = budget.apply(lambda x : parse_dollars(find_dollars(x)))

    # Clean the release date column in the wiki_movies_df DataFrame.
    release_date = wiki_df['Release date'].dropna()
    wiki_df['release_date'] = pd.to_datetime(release_date.apply(lambda x : find_dates(x)), infer_datetime_format=True)

    # Clean the running time column in the wiki_movies_df DataFrame.
    runtime = wiki_df['Running time'].dropna()
    wiki_df['running_time'] = runtime.apply(lambda x:  find_time(x))

    # 2. Clean the Kaggle metadata.
    kaggle_df = kaggle_df[kaggle_df['adult'].isin(['True', 'False'])]
    kaggle_df['adult'] = kaggle_df['adult'] == 'True'
    kaggle_df['video'] = kaggle_df['video'] == 'True'
    
    kaggle_df['budget'] = pd.to_numeric(kaggle_df['budget'], errors='raise')
    kaggle_df['id'] = pd.to_numeric(kaggle_df['id'], errors='raise')
    kaggle_df['popularity'] = pd.to_numeric(kaggle_df['popularity'], errors='raise')
    kaggle_df['release_date'] = pd.to_datetime(kaggle_df['release_date'])

    # 3. Merged the two DataFrames into the movies DataFrame.
    wiki_kaggle = pd.merge(wiki_df, kaggle_df, left_on='imdb_id', right_on='imdb_id', how='inner', suffixes=['_wiki', '_kaggle'])
    
    # 4. Drop unnecessary columns from the merged DataFrame.
    wiki_kaggle = wiki_kaggle.drop(wiki_kaggle[wiki_kaggle['imdb_id'].isna()].index)
    wiki_kaggle = wiki_kaggle.drop(['Release date', 'title_wiki', 'Language', 'Box office', 'Running time', 'Budget',
                                'Production Comapany', 'year', 'video', "release_date_wiki"], axis=1)
    

    # 5. Add in the function to fill in the missing Kaggle data.
    def fill_missing_kaggle_data(df, kaggle_column, wiki_column):
        df[kaggle_column] = df.apply(
            lambda row: row[wiki_column] if row[kaggle_column] == 0 else row[kaggle_column]
            , axis=1)
        df.drop(columns=wiki_column, inplace=True)
        
    # 6. Call the function in Step 5 with the DataFrame and columns as the arguments.
    fill_missing_kaggle_data(wiki_kaggle, 'budget_kaggle', 'budget_wiki')
    fill_missing_kaggle_data(wiki_kaggle, 'runtime', 'running_time')
    fill_missing_kaggle_data(wiki_kaggle, 'revenue', 'box_office')


    # 7. Filter the movies DataFrame for specific columns.
    column_names=  {'url' : 'wikipedia_url', 'Based on' : 'based_on', 'Starring' : "starring",
                    'Cinematography' : 'cinematography', 'Director': 'director', 'Distributor(s)': 'distributors',
                    'Editor(s)': 'editors', 'Producer(s)':'producers', 'Composer(s)':'composers',
                    'Writer(s)':'writers', 'id': 'kaggle_id', 'title_kaggle': 'title', 'Country':'country',
                    "budget_kaggle":"budget", "release_date_kaggle":"release_date" }
    wiki_kaggle.rename(column_names, inplace=True, axis='columns')
    
    # 8. Rename the columns in the movies DataFrame.
    wiki_kaggle.loc[:, ['imdb_id', 'kaggle_id','title','original_title','belongs_to_collection','tagline','wikipedia_url',
                'imdb_link','runtime','budget','revenue','release_date','popularity','vote_count', 'vote_average',
                 'genres','original_language','overview','spoken_languages','country', 'production_countries',
                 'production_companies', 'distributors','producers','director','starring', 'cinematography', 'editors',
                 'writers','composers','based_on']]

    # 9. Transform and merge the ratings DataFrame.
    rating_df = ratings_df.groupby(['rating', 'movieId'], as_index=False).count()
#     print(rating_df)
    rating_counts = rating_df.pivot(index="movieId", columns='rating', values="userId")
    rating_counts.columns = [f"rating_{col}" for col in rating_counts.columns]
    movie_ratings = pd.merge(wiki_kaggle, rating_counts, left_on='kaggle_id', right_index=True, how='left')
    
    
    
    
    
    
    
    
    
    
    
    

    
    return wiki_kaggle, movie_ratings

In [89]:
# 17. Create the path to your file directory and variables for the three files.

# The Wikipedia data
wiki_file = os.path.join('Data', 'wikipedia-movies.json')
# The Kaggle metadata
kaggle_file = os.path.join('Data', 'movies_metadata.csv')
# The MovieLens rating data.
ratings_file = os.path.join('Data', 'ratings.csv')



In [90]:
movies, movies_with_ratings = extract_transform_load(wiki_file, kaggle_file, ratings_file)

In [91]:
# 18. Check that the wiki_movies_df DataFrame looks like this. 
movies

Unnamed: 0,wikipedia_url,imdb_link,based_on,starring,cinematography,country,director,distributors,editors,producers,...,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,https://en.wikipedia.org/wiki/The_Adventures_o...,https://www.imdb.com/title/tt0098987/,"[Characters, by Rex Weiner]","[Andrew Dice Clay, Wayne Newton, Priscilla Pre...",Oliver Wood,United States,Renny Harlin,20th Century Fox,Michael Tronick,"[Steve Perry, Joel Silver]",...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1990-07-11,20423389.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Kojak. Columbo. Dirty Harry. Wimps.,The Adventures of Ford Fairlane,6.2,72.0
1,"https://en.wikipedia.org/wiki/After_Dark,_My_S...",https://www.imdb.com/title/tt0098994/,"[the novel, After Dark, My Sweet, by, Jim Thom...","[Jason Patric, Rachel Ward, Bruce Dern, George...",Mark Plummer,United States,James Foley,Avenue Pictures,Howard E. Smith,"[Ric Kidney, Robert Redlin]",...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1990-08-24,2700000.0,114.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,All they risked was everything.,"After Dark, My Sweet",6.5,17.0
2,https://en.wikipedia.org/wiki/Air_America_(film),https://www.imdb.com/title/tt0099005/,"[Air America, by, Christopher Robbins]","[Mel Gibson, Robert Downey Jr., Nancy Travis, ...",Roger Deakins,United States,Roger Spottiswoode,TriStar Pictures,"[John Bloom, Lois Freeman-Fox]",Daniel Melnick,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1990-08-10,33461269.0,112.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,The few. The proud. The totally insane.,Air America,5.3,146.0
3,https://en.wikipedia.org/wiki/Alice_(1990_film),https://www.imdb.com/title/tt0099012/,,"[Alec Baldwin, Blythe Danner, Judy Davis, Mia ...",Carlo Di Palma,United States,Woody Allen,Orion Pictures,Susan E. Morse,Robert Greenhut,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1990-12-25,7331647.0,102.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Alice,6.3,57.0
4,https://en.wikipedia.org/wiki/Almost_an_Angel,https://www.imdb.com/title/tt0099018/,,"[Paul Hogan, Elias Koteas, Linda Kozlowski]",Russell Boyd,US,John Cornell,Paramount Pictures,David Stiven,John Cornell,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1990-12-21,6939946.0,95.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Who does he think he is?,Almost an Angel,5.6,23.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6064,https://en.wikipedia.org/wiki/A_Fantastic_Woman,https://www.imdb.com/title/tt5639354/,,"[Daniela Vega, Francisco Reyes]",Benjamín Echazarreta,"[Chile, Germany, Spain, United States, [2]]",Sebastián Lelio,"[Participant Media (Chile), Piffl Medien (Germ...",Soledad Salfate,"[Juan de Dios Larraín, Pablo Larraín]",...,"[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",2017-04-06,3700000.0,104.0,"[{'iso_639_1': 'es', 'name': 'Español'}]",Released,,A Fantastic Woman,7.2,13.0
6065,https://en.wikipedia.org/wiki/Permission_(film),https://www.imdb.com/title/tt5390066/,,"[Rebecca Hall, Dan Stevens, Morgan Spector, Fr...",Adam Bricker,United States,Brian Crano,Good Deed Entertainment,Matt Friedman,"[Brian Crano, Rebecca Hall]",...,"[{'iso_3166_1': 'US', 'name': 'United States o...",2017-04-22,,96.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Permission,0.0,1.0
6066,https://en.wikipedia.org/wiki/Loveless_(film),https://www.imdb.com/title/tt6304162/,,"[Maryana Spivak, Aleksey Rozin, Matvey Novikov...",Mikhail Krichman,"[Russia, France, Belgium, Germany, [3]]",Andrey Zvyagintsev,"[Sony Pictures Releasing, (Russia), [1]]",Anna Mass,"[Alexander Rodnyansky, Sergey Melkumov, Gleb F...",...,"[{'iso_3166_1': 'RU', 'name': 'Russia'}, {'iso...",2017-06-01,4800000.0,128.0,"[{'iso_639_1': 'ru', 'name': 'Pусский'}]",Released,,Loveless,7.8,26.0
6067,https://en.wikipedia.org/wiki/Gemini_(2017_film),https://www.imdb.com/title/tt5795086/,,"[Lola Kirke, Zoë Kravitz, Greta Lee, Michelle ...",Andrew Reed,United States,Aaron Katz,Neon,Aaron Katz,"[Mynette Louie, Sara Murphy, Adele Romanski]",...,"[{'iso_3166_1': 'US', 'name': 'United States o...",2017-03-12,200340.0,92.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Post Production,,Gemini,0.0,0.0


In [92]:
# 21. Check that wiki_movies_df DataFrame columns are correct. 
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6052 entries, 0 to 6068
Data columns (total 35 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   wikipedia_url          6052 non-null   object        
 1   imdb_link              6052 non-null   object        
 2   based_on               1972 non-null   object        
 3   starring               5940 non-null   object        
 4   cinematography         5568 non-null   object        
 5   country                5882 non-null   object        
 6   director               6052 non-null   object        
 7   distributors           5870 non-null   object        
 8   editors                5701 non-null   object        
 9   producers              5935 non-null   object        
 10  composers              5704 non-null   object        
 11  writers                5917 non-null   object        
 12  imdb_id                6052 non-null   object        
 13  adu

In [93]:
movies_with_ratings


Unnamed: 0,wikipedia_url,imdb_link,based_on,starring,cinematography,country,director,distributors,editors,producers,...,rating_0.5,rating_1.0,rating_1.5,rating_2.0,rating_2.5,rating_3.0,rating_3.5,rating_4.0,rating_4.5,rating_5.0
0,https://en.wikipedia.org/wiki/The_Adventures_o...,https://www.imdb.com/title/tt0098987/,"[Characters, by Rex Weiner]","[Andrew Dice Clay, Wayne Newton, Priscilla Pre...",Oliver Wood,United States,Renny Harlin,20th Century Fox,Michael Tronick,"[Steve Perry, Joel Silver]",...,,,,,,,,,,
1,"https://en.wikipedia.org/wiki/After_Dark,_My_S...",https://www.imdb.com/title/tt0098994/,"[the novel, After Dark, My Sweet, by, Jim Thom...","[Jason Patric, Rachel Ward, Bruce Dern, George...",Mark Plummer,United States,James Foley,Avenue Pictures,Howard E. Smith,"[Ric Kidney, Robert Redlin]",...,,,,,,,,,,
2,https://en.wikipedia.org/wiki/Air_America_(film),https://www.imdb.com/title/tt0099005/,"[Air America, by, Christopher Robbins]","[Mel Gibson, Robert Downey Jr., Nancy Travis, ...",Roger Deakins,United States,Roger Spottiswoode,TriStar Pictures,"[John Bloom, Lois Freeman-Fox]",Daniel Melnick,...,,,,,,,,,,
3,https://en.wikipedia.org/wiki/Alice_(1990_film),https://www.imdb.com/title/tt0099012/,,"[Alec Baldwin, Blythe Danner, Judy Davis, Mia ...",Carlo Di Palma,United States,Woody Allen,Orion Pictures,Susan E. Morse,Robert Greenhut,...,,,,,,,,,,
4,https://en.wikipedia.org/wiki/Almost_an_Angel,https://www.imdb.com/title/tt0099018/,,"[Paul Hogan, Elias Koteas, Linda Kozlowski]",Russell Boyd,US,John Cornell,Paramount Pictures,David Stiven,John Cornell,...,3.0,,3.0,2.0,5.0,26.0,37.0,46.0,16.0,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6064,https://en.wikipedia.org/wiki/A_Fantastic_Woman,https://www.imdb.com/title/tt5639354/,,"[Daniela Vega, Francisco Reyes]",Benjamín Echazarreta,"[Chile, Germany, Spain, United States, [2]]",Sebastián Lelio,"[Participant Media (Chile), Piffl Medien (Germ...",Soledad Salfate,"[Juan de Dios Larraín, Pablo Larraín]",...,,,,,,,,,,
6065,https://en.wikipedia.org/wiki/Permission_(film),https://www.imdb.com/title/tt5390066/,,"[Rebecca Hall, Dan Stevens, Morgan Spector, Fr...",Adam Bricker,United States,Brian Crano,Good Deed Entertainment,Matt Friedman,"[Brian Crano, Rebecca Hall]",...,,,,,,,,,,
6066,https://en.wikipedia.org/wiki/Loveless_(film),https://www.imdb.com/title/tt6304162/,,"[Maryana Spivak, Aleksey Rozin, Matvey Novikov...",Mikhail Krichman,"[Russia, France, Belgium, Germany, [3]]",Andrey Zvyagintsev,"[Sony Pictures Releasing, (Russia), [1]]",Anna Mass,"[Alexander Rodnyansky, Sergey Melkumov, Gleb F...",...,,,,,,,,,,
6067,https://en.wikipedia.org/wiki/Gemini_(2017_film),https://www.imdb.com/title/tt5795086/,,"[Lola Kirke, Zoë Kravitz, Greta Lee, Michelle ...",Andrew Reed,United States,Aaron Katz,Neon,Aaron Katz,"[Mynette Louie, Sara Murphy, Adele Romanski]",...,,,,,,,,,,


In [94]:
movies_with_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6052 entries, 0 to 6068
Data columns (total 45 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   wikipedia_url          6052 non-null   object        
 1   imdb_link              6052 non-null   object        
 2   based_on               1972 non-null   object        
 3   starring               5940 non-null   object        
 4   cinematography         5568 non-null   object        
 5   country                5882 non-null   object        
 6   director               6052 non-null   object        
 7   distributors           5870 non-null   object        
 8   editors                5701 non-null   object        
 9   producers              5935 non-null   object        
 10  composers              5704 non-null   object        
 11  writers                5917 non-null   object        
 12  imdb_id                6052 non-null   object        
 13  adu