# Libraries and Helper Functions / Objects

In [39]:
import pandas as pd
import requests
import json
import numpy as np
from time import sleep
from ast import literal_eval
from imdb import IMDb

In [19]:
# json object of tmbd genre ids
genre_ids= {
  "genres": [
    {
      "id": 28,
      "name": "Action"
    },
    {
      "id": 12,
      "name": "Adventure"
    },
    {
      "id": 16,
      "name": "Animation"
    },
    {
      "id": 35,
      "name": "Comedy"
    },
    {
      "id": 80,
      "name": "Crime"
    },
    {
      "id": 99,
      "name": "Documentary"
    },
    {
      "id": 18,
      "name": "Drama"
    },
    {
      "id": 10751,
      "name": "Family"
    },
    {
      "id": 14,
      "name": "Fantasy"
    },
    {
      "id": 36,
      "name": "History"
    },
    {
      "id": 27,
      "name": "Horror"
    },
    {
      "id": 10402,
      "name": "Music"
    },
    {
      "id": 9648,
      "name": "Mystery"
    },
    {
      "id": 10749,
      "name": "Romance"
    },
    {
      "id": 878,
      "name": "Science Fiction"
    },
    {
      "id": 10770,
      "name": "TV Movie"
    },
    {
      "id": 53,
      "name": "Thriller"
    },
    {
      "id": 10752,
      "name": "War"
    },
    {
      "id": 37,
      "name": "Western"
    }
  ]
}

# Favorite Movie Information

### Get Favorite Movie ID

In [42]:
api_key = '9ec0c2e7850f575e7dcd37c195e45b69'
favorite_movie = 'Gladiator'
api_url = 'https://api.themoviedb.org/3/search/company?api_key={0}&query={1}&page=1'.format(api_key, favorite_movie)
response = requests.get(api_url)
print(response.text)

{"page":1,"results":[{"id":14559,"logo_path":null,"name":"Gladiator Productions"},{"id":69280,"logo_path":null,"name":"Gladiator Film"},{"id":74560,"logo_path":null,"name":"Films Internazionali Artistici (FIA) / Gladiator Film"}],"total_pages":1,"total_results":3}


### Get Favorite Movie Genre and Poster Path

In [46]:
favorite_movie_id = 74560
api_url = 'https://api.themoviedb.org/3/movie/{1}?api_key={0}'.format(api_key, favorite_movie_id)
response = requests.get(api_url)
tmdb_data = json.loads(response.text)
print('Genres: ' + str(tmdb_data['genres']))
print('Poster Path: ' + str(tmdb_data['poster_path']))

Genres: [{u'id': 53, u'name': u'Thriller'}, {u'id': 18, u'name': u'Drama'}, {u'id': 27, u'name': u'Horror'}]
Poster Path: /y8292lRPCc3NsnxHakY8upvciCI.jpg


### Genre from TMDB and IMDB

In [48]:
print("TMDB Genres:")
for genre in tmdb_data['genres']:
    print(genre['name'])

# get IMDB data
ia = IMDb()
imdb_data = ia.get_movie(str(tmdb_data['imdb_id'])[2:])

TMDB Genres:
Thriller
Drama
Horror


# Popular Movies from TMDB

### Get Popular Movies Data

In [13]:
# loops through each page of popular movies in TMDB and adds it to a dataframe object
df = []
total_pages = range(1,976+1)
for page in total_pages:
    current_page = requests.get('https://api.themoviedb.org/3/movie/popular?api_key=ed5609930331aab76658f32439c5e5e5&language=en-US&page='+str(page))
    sleep(0.25)
    data = json.loads(current_page.text)
    try:
        for element in data['results']:
            df.append(element)
    except:
        continue
df = pd.DataFrame(df)
df.head()

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count
0,False,/6aUWe0GSl69wMTSWWexsorMIvwU.jpg,"[14, 10402, 10749]",321612,en,Beauty and the Beast,A live-action adaptation of Disney's version o...,180.45132,/tWqifoYuwLETmmasnGHO7xBjEtt.jpg,2017-03-17,Beauty and the Beast,False,7.1,1246
1,False,/5pAGnkFYSsFJ99ZxDIYnhQbQFXs.jpg,"[28, 18, 878]",263115,en,Logan,"In the near future, a weary Logan cares for an...",117.369877,/45Y1G5FEgttPAwjTYic6czC9xCn.jpg,2017-02-28,Logan,False,7.6,2075
2,False,/fxDXp8un4qNY9b1dLd7SH6CKzC.jpg,"[16, 35, 18, 10751, 10402]",335797,en,Sing,A koala named Buster recruits his best friend ...,80.989984,/s9ye87pvq2IaDvjv9x4IOXVjvA7.jpg,2016-11-23,Sing,False,6.7,1007
3,False,/pGwChWiAY1bdoxL79sXmaFBlYJH.jpg,"[878, 28, 12, 14]",293167,en,Kong: Skull Island,Explore the mysterious and dangerous home of t...,61.933195,/aoUyphk4nwffrwlZRaOa0eijgpr.jpg,2017-03-08,Kong: Skull Island,False,6.1,876
4,False,/dkMD5qlogeRMiEixC4YNPUvax2T.jpg,"[28, 12, 878, 53]",135397,en,Jurassic World,Twenty-two years after the events of Jurassic ...,51.976724,/jjBgi2r5cRt36xF6iNUEhzscEcb.jpg,2015-06-09,Jurassic World,False,6.5,6699


In [14]:
# get shape of dataframe
df.shape

(19262, 14)

In [15]:
# put columns that cannot be saved directly to csv into list
columns_to_convert = ['overview', 'title', 'original_title']
for column in columns_to_convert:
    df[column] = df[column].apply(lambda x: [x])

### View Top 10 Movies and Genres

In [57]:
df = pd.read_csv('popular_movie_data_tmdb.csv')
df_top_10 = df.iloc[:10]

In [66]:
genre_id_map = {genre['id']:genre['name'] for genre in genre_ids['genres']}
for i in range(10):
    genres = [genre_id_map[genre_id] for genre_id in literal_eval(df_top_10.iloc[i]['genre_ids'])]
    print("{0}: {1}".format(literal_eval(df_top_10.iloc[i]['title'])[0], genres))

Beauty and the Beast: ['Fantasy', 'Music', 'Romance']
Logan: ['Action', 'Drama', 'Science Fiction']
Sing: ['Animation', 'Comedy', 'Drama', 'Family', 'Music']
Kong: Skull Island: ['Science Fiction', 'Action', 'Adventure', 'Fantasy']
Jurassic World: ['Action', 'Adventure', 'Science Fiction', 'Thriller']
Ghost in the Shell: ['Action', 'Drama', 'Science Fiction']
Fantastic Beasts and Where to Find Them: ['Adventure', 'Action', 'Fantasy']
The Boss Baby: ['Animation', 'Comedy', 'Family']
Interstellar: ['Adventure', 'Drama', 'Science Fiction']
Finding Dory: ['Adventure', 'Animation', 'Comedy', 'Family']


# Challenges for Prediction

**TBD by Tyler**

# Movie Genre Pairs

### Generate movie genre pairs

In [20]:
# get list of genre ids
col_ids = [genre['id'] for genre in genre_ids['genres']]

In [33]:
# encode genre matrix if movie contains genre
def movie_id_machine(genre_ids, columns):
    new_row = []
    for column in columns:
        if column in genre_ids:
            new_row.append(1)
        else:
            new_row.append(0)
    return new_row

In [34]:
# encode genre
df['genre_encoding'] = df['genre_ids'].apply(lambda ids: movie_id_machine(ids, col_ids))

In [35]:
# create list of genre names
col_names = [genre['name'] for genre in genre_ids['genres']]

# create separate dataframe with genre encoding
df_genres = pd.DataFrame(df['genre_encoding'].tolist(), columns = col_names)

# add TMDB id to genre encoding dataframe
df_genres['id'] = df['id']
df_genres.shape

(19262, 20)

In [36]:
df_genres.head()

Unnamed: 0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western,id
0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,321612
1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,263115
2,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,335797
3,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,293167
4,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,135397


### Genre Pairs Visualization

In [68]:
%%HTML
<div class='tableauPlaceholder' id='viz1491394078709' style='position: relative'><noscript><a href='#'><img alt='Dashboard 1 ' src='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;cs&#47;cs109b_genre_raw_total&#47;Dashboard1&#47;1_rss.png' style='border: none' /></a></noscript><object class='tableauViz'  style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='site_root' value='' /><param name='name' value='cs109b_genre_raw_total&#47;Dashboard1' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='static_image' value='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;cs&#47;cs109b_genre_raw_total&#47;Dashboard1&#47;1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /></object></div>                <script type='text/javascript'>                    var divElement = document.getElementById('viz1491394078709');                    var vizElement = divElement.getElementsByTagName('object')[0];                    vizElement.style.width='1004px';vizElement.style.height='869px';                    var scriptElement = document.createElement('script');                    scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js';                    vizElement.parentNode.insertBefore(scriptElement, vizElement);                </script>

# Additional Visualization and EDA

In [69]:
%%HTML
<div class='tableauPlaceholder' id='viz1491392940635' style='position: relative'><noscript><a href='#'><img alt='Dashboard 1 ' src='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;cs&#47;cs109b_genre_proportion&#47;Dashboard1&#47;1_rss.png' style='border: none' /></a></noscript><object class='tableauViz'  style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='site_root' value='' /><param name='name' value='cs109b_genre_proportion&#47;Dashboard1' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='static_image' value='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;cs&#47;cs109b_genre_proportion&#47;Dashboard1&#47;1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /></object></div>                <script type='text/javascript'>                    var divElement = document.getElementById('viz1491392940635');                    var vizElement = divElement.getElementsByTagName('object')[0];                    vizElement.style.width='1004px';vizElement.style.height='869px';                    var scriptElement = document.createElement('script');                    scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js';                    vizElement.parentNode.insertBefore(scriptElement, vizElement);                </script>

# Question List

1. Is there a relationship between cast size and genre?
2. Is there a relationship between overview text and genre?