In [82]:
import pandas as pd
import numpy as np
from pprint import pprint

pd.options.display.max_columns = None

In [109]:
from ast import literal_eval

def to_array_of_fields(field):
    def inner(string):
        return [
            item[field]
            for item in 
            ([] if string == "" else literal_eval(string))
        ]
    return inner

In [110]:
credits = pd.read_csv(
    'data/credits.csv',
    dtype={
        'id': 'int64'
    },
    converters={
        'cast': to_array_of_fields('name'),
        'crew': to_array_of_fields('name'),
    }
)
credits

Unnamed: 0,cast,crew,id
0,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[John Lasseter, Joss Whedon, Andrew Stanton, J...",862
1,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[Larry J. Franco, Jonathan Hensleigh, James Ho...",8844
2,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[Howard Deutch, Mark Steven Johnson, Mark Stev...",15602
3,"[Whitney Houston, Angela Bassett, Loretta Devi...","[Forest Whitaker, Ronald Bass, Ronald Bass, Ez...",31357
4,"[Steve Martin, Diane Keaton, Martin Short, Kim...","[Alan Silvestri, Elliot Davis, Nancy Meyers, N...",11862
...,...,...,...
45471,"[Leila Hatami, Kourosh Tahami, Elham Korda]","[Hamid Nematollah, Hamid Nematollah, Farshad M...",439050
45472,"[Angel Aquino, Perry Dizon, Hazel Orencio, Joe...","[Lav Diaz, Lav Diaz, Dante Perez, Lav Diaz, La...",111109
45473,"[Erika Eleniak, Adam Baldwin, Julie du Page, J...","[Mark L. Lester, C. Courtney Joyner, Jeffrey G...",67758
45474,"[Iwan Mosschuchin, Nathalie Lissenko, Pavel Pa...","[Yakov Protazanov, Joseph N. Ermolieff]",227506


In [111]:
keywords = pd.read_csv(
    'data/keywords.csv',
    dtype={
        'id': 'int64'
    },
    converters={
        'keywords': to_array_of_fields('name')
    }
)
keywords

Unnamed: 0,id,keywords
0,862,"[jealousy, toy, boy, friendship, friends, riva..."
1,8844,"[board game, disappearance, based on children'..."
2,15602,"[fishing, best friend, duringcreditsstinger, o..."
3,31357,"[based on novel, interracial relationship, sin..."
4,11862,"[baby, midlife crisis, confidence, aging, daug..."
...,...,...
46414,439050,[tragic love]
46415,111109,"[artist, play, pinoy]"
46416,67758,[]
46417,227506,[]


In [113]:
movies = pd.read_csv(
    'data/movies_metadata.csv',
    dtype={
        'adult': 'boolean',
        'budget': 'int64',
        'homepage': 'str',
        'id': 'int64',
        'imdb_id': 'str',
        'original_language': 'str',
        'original_title': 'str',
        'overview': 'str',
        'popularity': 'float64',
        'poster_path': 'str',
        'revenue': 'float64',
        'runtime': 'float64',
        'status': 'str',
        'tagline': 'str',
        'title': 'str',
        'video': 'boolean',
        'vote_average': 'float64',
        'vote_count': 'int'
    },
    converters={
        'belongs_to_collection': lambda string: "" if string == "" else literal_eval(string)['name'],
        'genres': to_array_of_fields('name'),
        'production_companies': to_array_of_fields('name'),
        'production_countries': to_array_of_fields('name'),
        'spoken_languages': to_array_of_fields('name'),
    },
    skiprows=[
        19731, 29504, 35588, # strings in 'adult'
        19730, 29503, 35587  # 'NA' in 'vote_count' 
    ],
    parse_dates=['release_date']
)
movies

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,Toy Story Collection,30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,[Pixar Animation Studios],[United States of America],1995-10-30,373554033.0,81.0,[English],Released,,Toy Story,False,7.7,5415
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[TriStar Pictures, Teitler Film, Interscope Co...",[United States of America],1995-12-15,262797249.0,104.0,"[English, Français]",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413
2,False,Grumpy Old Men Collection,0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.712900,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[Warner Bros., Lancaster Gate]",[United States of America],1995-12-22,0.0,101.0,[English],Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[Twentieth Century Fox Film Corporation],[United States of America],1995-12-22,81452156.0,127.0,[English],Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34
4,False,Father of the Bride Collection,0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[Sandollar Productions, Touchstone Pictures]",[United States of America],1995-02-10,76578911.0,106.0,[English],Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45455,False,,0,"[Drama, Family]",http://www.imdb.com/title/tt6209470/,439050,tt6209470,fa,رگ خواب,Rising and falling between a man and woman.,0.072051,/jldsYflnId4tTWPx8es3uzsB1I8.jpg,[],[Iran],NaT,0.0,90.0,[فارسی],Released,Rising and falling between a man and woman,Subdue,False,4.0,1
45456,False,,0,[Drama],,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,0.178241,/xZkmxsNmYXJbKVsTRLLx3pqGHx7.jpg,[Sine Olivia],[Philippines],2011-11-17,0.0,360.0,[],Released,,Century of Birthing,False,9.0,3
45457,False,,0,"[Action, Drama, Thriller]",,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",0.903007,/d5bX92nDsISNhu3ZT69uHwmfCGw.jpg,[American World Pictures],[United States of America],2003-08-01,0.0,90.0,[English],Released,A deadly game of wits.,Betrayal,False,3.8,6
45458,False,,0,[],,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",0.003503,/aorBPO7ak8e8iJKT5OcqYxU3jlK.jpg,[Yermoliev],[Russia],1917-10-21,0.0,87.0,[],Released,,Satan Triumphant,False,0.0,0


In [122]:
credits['id'].count(), movies['id'].count(), keywords['id'].count()

(45476, 45460, 46419)

In [131]:
everything = pd.merge(
    left=pd.merge(right=credits, left=movies, on='id'),
    right=keywords,
    on='id'
)
everything

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords
0,False,Toy Story Collection,30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,[Pixar Animation Studios],[United States of America],1995-10-30,373554033.0,81.0,[English],Released,,Toy Story,False,7.7,5415,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[John Lasseter, Joss Whedon, Andrew Stanton, J...","[jealousy, toy, boy, friendship, friends, riva..."
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[TriStar Pictures, Teitler Film, Interscope Co...",[United States of America],1995-12-15,262797249.0,104.0,"[English, Français]",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[Larry J. Franco, Jonathan Hensleigh, James Ho...","[board game, disappearance, based on children'..."
2,False,Grumpy Old Men Collection,0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.712900,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[Warner Bros., Lancaster Gate]",[United States of America],1995-12-22,0.0,101.0,[English],Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[Howard Deutch, Mark Steven Johnson, Mark Stev...","[fishing, best friend, duringcreditsstinger, o..."
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[Twentieth Century Fox Film Corporation],[United States of America],1995-12-22,81452156.0,127.0,[English],Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34,"[Whitney Houston, Angela Bassett, Loretta Devi...","[Forest Whitaker, Ronald Bass, Ronald Bass, Ez...","[based on novel, interracial relationship, sin..."
4,False,Father of the Bride Collection,0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[Sandollar Productions, Touchstone Pictures]",[United States of America],1995-02-10,76578911.0,106.0,[English],Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173,"[Steve Martin, Diane Keaton, Martin Short, Kim...","[Alan Silvestri, Elliot Davis, Nancy Meyers, N...","[baby, midlife crisis, confidence, aging, daug..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46619,False,,0,"[Drama, Family]",http://www.imdb.com/title/tt6209470/,439050,tt6209470,fa,رگ خواب,Rising and falling between a man and woman.,0.072051,/jldsYflnId4tTWPx8es3uzsB1I8.jpg,[],[Iran],NaT,0.0,90.0,[فارسی],Released,Rising and falling between a man and woman,Subdue,False,4.0,1,"[Leila Hatami, Kourosh Tahami, Elham Korda]","[Hamid Nematollah, Hamid Nematollah, Farshad M...",[tragic love]
46620,False,,0,[Drama],,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,0.178241,/xZkmxsNmYXJbKVsTRLLx3pqGHx7.jpg,[Sine Olivia],[Philippines],2011-11-17,0.0,360.0,[],Released,,Century of Birthing,False,9.0,3,"[Angel Aquino, Perry Dizon, Hazel Orencio, Joe...","[Lav Diaz, Lav Diaz, Dante Perez, Lav Diaz, La...","[artist, play, pinoy]"
46621,False,,0,"[Action, Drama, Thriller]",,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",0.903007,/d5bX92nDsISNhu3ZT69uHwmfCGw.jpg,[American World Pictures],[United States of America],2003-08-01,0.0,90.0,[English],Released,A deadly game of wits.,Betrayal,False,3.8,6,"[Erika Eleniak, Adam Baldwin, Julie du Page, J...","[Mark L. Lester, C. Courtney Joyner, Jeffrey G...",[]
46622,False,,0,[],,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",0.003503,/aorBPO7ak8e8iJKT5OcqYxU3jlK.jpg,[Yermoliev],[Russia],1917-10-21,0.0,87.0,[],Released,,Satan Triumphant,False,0.0,0,"[Iwan Mosschuchin, Nathalie Lissenko, Pavel Pa...","[Yakov Protazanov, Joseph N. Ermolieff]",[]


In [150]:
main = everything.drop([
    'adult', # there's only 9 adult films
    'budget', # might be useful for decision trees
    'homepage', 
    'imdb_id', 
    'poster_path',
    'revenue', # might be useful for decision trees
    'original_title', # its in the original language, 'title' is in english
    'video', # there's only 95 non video movies
], axis=1)
main = main[[
    'id', 
    'title', 
    'overview', 
    'genres',
    'belongs_to_collection',
    'original_language',
    'popularity',
    'production_companies',
    'production_countries', 
    'release_date', 
    'runtime', 
    'spoken_languages',
    'status', 
    'tagline', 
    'vote_average', 
    'vote_count', 
    'cast',
    'crew', 
    'keywords'
]]
main

Unnamed: 0,id,title,overview,genres,belongs_to_collection,original_language,popularity,production_companies,production_countries,release_date,runtime,spoken_languages,status,tagline,vote_average,vote_count,cast,crew,keywords
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]",Toy Story Collection,en,21.946943,[Pixar Animation Studios],[United States of America],1995-10-30,81.0,[English],Released,,7.7,5415,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[John Lasseter, Joss Whedon, Andrew Stanton, J...","[jealousy, toy, boy, friendship, friends, riva..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[Adventure, Fantasy, Family]",,en,17.015539,"[TriStar Pictures, Teitler Film, Interscope Co...",[United States of America],1995-12-15,104.0,"[English, Français]",Released,Roll the dice and unleash the excitement!,6.9,2413,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[Larry J. Franco, Jonathan Hensleigh, James Ho...","[board game, disappearance, based on children'..."
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[Romance, Comedy]",Grumpy Old Men Collection,en,11.712900,"[Warner Bros., Lancaster Gate]",[United States of America],1995-12-22,101.0,[English],Released,Still Yelling. Still Fighting. Still Ready for...,6.5,92,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[Howard Deutch, Mark Steven Johnson, Mark Stev...","[fishing, best friend, duringcreditsstinger, o..."
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[Comedy, Drama, Romance]",,en,3.859495,[Twentieth Century Fox Film Corporation],[United States of America],1995-12-22,127.0,[English],Released,Friends are the people who let you be yourself...,6.1,34,"[Whitney Houston, Angela Bassett, Loretta Devi...","[Forest Whitaker, Ronald Bass, Ronald Bass, Ez...","[based on novel, interracial relationship, sin..."
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,[Comedy],Father of the Bride Collection,en,8.387519,"[Sandollar Productions, Touchstone Pictures]",[United States of America],1995-02-10,106.0,[English],Released,Just When His World Is Back To Normal... He's ...,5.7,173,"[Steve Martin, Diane Keaton, Martin Short, Kim...","[Alan Silvestri, Elliot Davis, Nancy Meyers, N...","[baby, midlife crisis, confidence, aging, daug..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46619,439050,Subdue,Rising and falling between a man and woman.,"[Drama, Family]",,fa,0.072051,[],[Iran],NaT,90.0,[فارسی],Released,Rising and falling between a man and woman,4.0,1,"[Leila Hatami, Kourosh Tahami, Elham Korda]","[Hamid Nematollah, Hamid Nematollah, Farshad M...",[tragic love]
46620,111109,Century of Birthing,An artist struggles to finish his work while a...,[Drama],,tl,0.178241,[Sine Olivia],[Philippines],2011-11-17,360.0,[],Released,,9.0,3,"[Angel Aquino, Perry Dizon, Hazel Orencio, Joe...","[Lav Diaz, Lav Diaz, Dante Perez, Lav Diaz, La...","[artist, play, pinoy]"
46621,67758,Betrayal,"When one of her hits goes wrong, a professiona...","[Action, Drama, Thriller]",,en,0.903007,[American World Pictures],[United States of America],2003-08-01,90.0,[English],Released,A deadly game of wits.,3.8,6,"[Erika Eleniak, Adam Baldwin, Julie du Page, J...","[Mark L. Lester, C. Courtney Joyner, Jeffrey G...",[]
46622,227506,Satan Triumphant,"In a small town live two brothers, one a minis...",[],,en,0.003503,[Yermoliev],[Russia],1917-10-21,87.0,[],Released,,0.0,0,"[Iwan Mosschuchin, Nathalie Lissenko, Pavel Pa...","[Yakov Protazanov, Joseph N. Ermolieff]",[]


In [167]:
def analyse_array_column(column):
    counter = defaultdict(int)
    for array in main[column]:
        for item in array:
            counter[item] += 1
    print('=' * 20, column,'=' * 20)
    print('Unique Items', len(counter))
    items = sorted(counter.items(), key=lambda x: x[1], reverse=True)
    pprint(items[:20])
    print('...' if len(items) > 20 else '')

In [168]:
analyse_array_column('genres')
analyse_array_column('production_companies')
analyse_array_column('production_countries')
analyse_array_column('spoken_languages')
analyse_array_column('cast')
analyse_array_column('crew')
analyse_array_column('keywords')

Unique Items 20
[('Drama', 20808),
 ('Comedy', 13467),
 ('Thriller', 7793),
 ('Romance', 6923),
 ('Action', 6727),
 ('Horror', 4760),
 ('Crime', 4387),
 ('Documentary', 4050),
 ('Adventure', 3587),
 ('Science Fiction', 3137),
 ('Family', 2831),
 ('Mystery', 2541),
 ('Fantasy', 2385),
 ('Animation', 1997),
 ('Foreign', 1681),
 ('Music', 1630),
 ('History', 1427),
 ('War', 1348),
 ('Western', 1059),
 ('TV Movie', 791)]

Unique Items 23537
[('Warner Bros.', 1256),
 ('Metro-Goldwyn-Mayer (MGM)', 1094),
 ('Paramount Pictures', 1025),
 ('Twentieth Century Fox Film Corporation', 841),
 ('Universal Pictures', 836),
 ('Canal+', 454),
 ('Columbia Pictures Corporation', 451),
 ('Columbia Pictures', 434),
 ('RKO Radio Pictures', 292),
 ('United Artists', 281),
 ('New Line Cinema', 278),
 ('Walt Disney Pictures', 268),
 ('Touchstone Pictures', 225),
 ('TriStar Pictures', 197),
 ('Mosfilm', 190),
 ('Miramax Films', 189),
 ('France 2 Cinéma', 179),
 ('Centre National de la Cinématographie (CNC)', 173