In [1]:
from __future__ import print_function

import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
import matplotlib.pyplot as plt

import tensorflow as tf
import numpy as np
import pandas as pd
import sqlalchemy

import sklearn.metrics as metrics
from tensorflow.python.data import Dataset

In [2]:
# Load the movies table from movie_metadata.csv (file provided by Prof Zhang).
# read_csv splits on "," by default, so no explicit separator is needed.
movies_dataframe = pd.read_csv("movie_metadata.csv")

In [3]:
# Load the per-actor metrics table from actor-metrics.csv (source: BuzzFeed).
# read_csv splits on "," by default, so no explicit separator is needed.
actors_dataframe = pd.read_csv("actor-metrics.csv")

In [4]:
# Load the biopics table from biopics.csv.
# TODO: the data source was never recorded here; the column layout looks like
# FiveThirtyEight's "biopics" dataset -- confirm and note the provenance.
# The file is decoded as latin-1 rather than pandas' default UTF-8.
biopics_dataframe = pd.read_csv("biopics.csv", encoding='latin-1')

In [5]:
# Give the actor-name column the same name, 'lead_actors', in all three tables
# so that it can be used as the shared join key in the merge step.
# NOTE(review): for the movies table the key is taken from 'actor_2_name', not
# 'actor_1_name' -- confirm that this is the column meant as the lead actor.
movies_dataframe = movies_dataframe.rename(
    columns={'actor_2_name': 'lead_actors'}
)
actors_dataframe = actors_dataframe.rename(
    columns={'actor': 'lead_actors'}
)
biopics_dataframe = biopics_dataframe.rename(
    columns={'lead_actor_actress': 'lead_actors'}
)

In [6]:
#Clean Up Part 1: Remove unnecessary columns
#TODO (not implemented yet): drop the unnecessary columns from each table first and merge afterwards, so that the merged table is easier to read.

In [11]:
# Clean Up Part 2: Merge
# Inner-join the three tables on the shared 'lead_actors' key: movies first,
# then the actor metrics, then the biopics.
movies_dataframe_merged = (
    movies_dataframe
    .merge(actors_dataframe, on='lead_actors', how='inner')
    .merge(biopics_dataframe, on='lead_actors', how='inner')
)
movies_dataframe_merged
# Open problem: the right-hand part of the merged table looks wrong -- it shows
# a different director than the left-hand part, and other films.
# What the output shows: the join key is only the actor name, so every movie
# row of an actor is paired with every biopic row of that same actor (e.g. one
# Christian Bale movie appears twice, once per biopic). The right-hand columns
# ('year_release', 'box_office', 'director', 'subject', ...) appear to come
# from the biopics table and describe the biopic, not the movie on the left.
# Open questions: can the tables be merged this way at all? Or is this table
# meant to be about the actors rather than the movies? In that case most of
# the columns should be removed before merging.

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,lead_actors,actor_1_facebook_likes,gross,genres,...,year_release,box_office,director,number_of_subjects,subject,type_of_subject,race_known,subject_race,person_of_color,subject_sex
0,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2014,$65M,Ridley Scott,1,Moses,Historical,Known,Middle Eastern (White),0,Male
1,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2006,$5.48M,Werner Herzog,1,Dieter Dengler,Military,Known,White,0,Male
2,Color,Michael Mann,357.0,140.0,0.0,1000.0,Christian Bale,40000.0,97030725.0,Biography|Crime|Drama|History|Romance,...,2014,$65M,Ridley Scott,1,Moses,Historical,Known,Middle Eastern (White),0,Male
3,Color,Michael Mann,357.0,140.0,0.0,1000.0,Christian Bale,40000.0,97030725.0,Biography|Crime|Drama|History|Romance,...,2006,$5.48M,Werner Herzog,1,Dieter Dengler,Military,Known,White,0,Male
4,Color,David O. Russell,538.0,138.0,737.0,14000.0,Christian Bale,34000.0,150117807.0,Crime|Drama,...,2014,$65M,Ridley Scott,1,Moses,Historical,Known,Middle Eastern (White),0,Male
5,Color,David O. Russell,538.0,138.0,737.0,14000.0,Christian Bale,34000.0,150117807.0,Crime|Drama,...,2006,$5.48M,Werner Herzog,1,Dieter Dengler,Military,Known,White,0,Male
6,Color,Adam McKay,426.0,130.0,285.0,767.0,Christian Bale,33000.0,70235322.0,Biography|Comedy|Drama|History,...,2014,$65M,Ridley Scott,1,Moses,Historical,Known,Middle Eastern (White),0,Male
7,Color,Adam McKay,426.0,130.0,285.0,767.0,Christian Bale,33000.0,70235322.0,Biography|Comedy|Drama|History,...,2006,$5.48M,Werner Herzog,1,Dieter Dengler,Military,Known,White,0,Male
8,Color,David Ayer,118.0,116.0,453.0,2000.0,Christian Bale,24000.0,3335839.0,Action|Crime|Drama|Thriller,...,2014,$65M,Ridley Scott,1,Moses,Historical,Known,Middle Eastern (White),0,Male
9,Color,David Ayer,118.0,116.0,453.0,2000.0,Christian Bale,24000.0,3335839.0,Action|Crime|Drama|Thriller,...,2006,$5.48M,Werner Herzog,1,Dieter Dengler,Military,Known,White,0,Male


In [13]:
# Clean Up Part 3: Remove duplicates
# Keep only the first merged row for each distinct combination of the columns
# below (assumed to identify one movie -- TODO confirm). The result is a new
# frame; movies_dataframe_merged itself is left unchanged.
# NOTE(review): the biopic columns on a surviving row are simply those of the
# first match, and the original row index is kept (0, 2, 4, ...), not reset.
movies_dataframe_noDuplicates = movies_dataframe_merged.drop_duplicates(
    subset=[
        'director_name',
        'duration',
        'lead_actors',
        'gross',
        'actor_1_facebook_likes',
    ],
    keep='first',
)
movies_dataframe_noDuplicates

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,lead_actors,actor_1_facebook_likes,gross,genres,...,year_release,box_office,director,number_of_subjects,subject,type_of_subject,race_known,subject_race,person_of_color,subject_sex
0,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2014,$65M,Ridley Scott,1,Moses,Historical,Known,Middle Eastern (White),0,Male
2,Color,Michael Mann,357.0,140.0,0.0,1000.0,Christian Bale,40000.0,97030725.0,Biography|Crime|Drama|History|Romance,...,2014,$65M,Ridley Scott,1,Moses,Historical,Known,Middle Eastern (White),0,Male
4,Color,David O. Russell,538.0,138.0,737.0,14000.0,Christian Bale,34000.0,150117807.0,Crime|Drama,...,2014,$65M,Ridley Scott,1,Moses,Historical,Known,Middle Eastern (White),0,Male
6,Color,Adam McKay,426.0,130.0,285.0,767.0,Christian Bale,33000.0,70235322.0,Biography|Comedy|Drama|History,...,2014,$65M,Ridley Scott,1,Moses,Historical,Known,Middle Eastern (White),0,Male
8,Color,David Ayer,118.0,116.0,453.0,2000.0,Christian Bale,24000.0,3335839.0,Action|Crime|Drama|Thriller,...,2014,$65M,Ridley Scott,1,Moses,Historical,Known,Middle Eastern (White),0,Male
10,Color,Anthony Russo,516.0,147.0,94.0,11000.0,Scarlett Johansson,21000.0,407197282.0,Action|Adventure|Sci-Fi,...,2008,$26.8M,Justin Chadwick,2,Mary Boleyn,Historical,Known,White,0,Female
11,Color,Jon Favreau,453.0,124.0,4000.0,4000.0,Scarlett Johansson,21000.0,312057433.0,Action|Adventure|Sci-Fi,...,2008,$26.8M,Justin Chadwick,2,Mary Boleyn,Historical,Known,White,0,Female
12,Color,Justin Chadwick,169.0,115.0,56.0,19000.0,Scarlett Johansson,20000.0,26814957.0,Biography|Drama|History|Romance,...,2008,$26.8M,Justin Chadwick,2,Mary Boleyn,Historical,Known,White,0,Female
13,Color,Joseph Gordon-Levitt,364.0,90.0,23000.0,694.0,Scarlett Johansson,23000.0,24475193.0,Comedy|Drama|Romance,...,2008,$26.8M,Justin Chadwick,2,Mary Boleyn,Historical,Known,White,0,Female
14,Color,Marc Forster,654.0,123.0,395.0,1000.0,Brad Pitt,17000.0,202351611.0,Action|Adventure|Horror|Sci-Fi|Thriller,...,2011,$75.6M,Bennett Miller,1,Billy Beane,Athlete,Known,White,0,Male
