In [68]:
import pandas as pd
import numpy as np
from pprint import pprint

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

In [69]:
from ast import literal_eval

def to_array_of_fields(field):
    """Build a converter that pulls one field out of a stringified list of dicts.

    The returned callable is intended as a ``pd.read_csv`` converter for cells
    that hold a Python-literal list of dicts, for example
    ``"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name': 'Comedy'}]"``.

    Parameters
    ----------
    field : hashable
        Key to read from every dict of the parsed list.

    Returns
    -------
    callable
        ``inner(string) -> tuple`` holding the ``field`` value of every item,
        sorted ascending (repeated values are kept). Empty, whitespace-only
        and missing (``None`` / NaN) cells yield ``()``.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``literal_eval`` when a non-blank cell is not a valid
        Python literal.
    KeyError
        When a parsed item has no ``field`` key.
    """
    def inner(string):
        # Missing values: None, or a float NaN (NaN is the only float that
        # is not equal to itself).
        if string is None or (isinstance(string, float) and string != string):
            return ()
        # Blank cells: literal_eval would raise SyntaxError on them.
        if isinstance(string, str) and not string.strip():
            return ()
        return tuple(sorted(item[field] for item in literal_eval(string)))
    return inner

In [70]:
# Load the credits table. The `cast` and `crew` cells are turned into sorted
# tuples of names by the `to_array_of_fields('name')` converter.
credits = pd.read_csv(
    'data/credits.csv',
    converters={
        column: to_array_of_fields('name')
        for column in ('cast', 'crew')
    },
    dtype={'id': 'int64'},
)
credits

Unnamed: 0,cast,crew,id
0,"(Annie Potts, Don Rickles, Erik von Detten, Ji...","(Ada Cochavi, Alan Sperling, Alec Sokolow, And...",862
1,"(Adam Hann-Byrd, Annabel Kershaw, Bebe Neuwirt...","(Chris van Allsburg, Greg Taylor, James D. Bis...",8844
2,"(Ann-Margret, Burgess Meredith, Daryl Hannah, ...","(Howard Deutch, Jack Keller, Mark Steven Johns...",15602
3,"(Angela Bassett, Dennis Haysbert, Gregory Hine...","(Caron K, Deborah Schindler, Ezra Swerdlow, Fo...",31357
4,"(BD Wong, Diane Keaton, Eugene Levy, George Ne...","(Adam Bernardi, Alan Silvestri, Albert Hackett...",11862
...,...,...,...
45471,"(Elham Korda, Kourosh Tahami, Leila Hatami)","(Azadeh Ghavam, Babak Ardalan, Farshad Mohamma...",439050
45472,"(Angel Aquino, Angeli Bayani, Bart Guingona, B...","(Dante Perez, Lav Diaz, Lav Diaz, Lav Diaz, La...",111109
45473,"(Adam Baldwin, Damian Chapa, Darrell Dubovsky,...","(C. Courtney Joyner, Jeffrey Goldenberg, João ...",67758
45474,"(Aleksandr Chabrov, Iwan Mosschuchin, Nathalie...","(Joseph N. Ermolieff, Yakov Protazanov)",227506


In [71]:
# Rows whose crew value is missing. NaN never compares equal to anything
# (itself included), so an `== float('nan')` filter can never match a row;
# `isna()` is the reliable missing-value test.
credits[credits['crew'].isna()]

Unnamed: 0,cast,crew,id


In [72]:
# Keep only the first row encountered for each movie id.
credits_unique = credits.drop_duplicates(subset=['id'], keep='first')
credits_unique

Unnamed: 0,cast,crew,id
0,"(Annie Potts, Don Rickles, Erik von Detten, Ji...","(Ada Cochavi, Alan Sperling, Alec Sokolow, And...",862
1,"(Adam Hann-Byrd, Annabel Kershaw, Bebe Neuwirt...","(Chris van Allsburg, Greg Taylor, James D. Bis...",8844
2,"(Ann-Margret, Burgess Meredith, Daryl Hannah, ...","(Howard Deutch, Jack Keller, Mark Steven Johns...",15602
3,"(Angela Bassett, Dennis Haysbert, Gregory Hine...","(Caron K, Deborah Schindler, Ezra Swerdlow, Fo...",31357
4,"(BD Wong, Diane Keaton, Eugene Levy, George Ne...","(Adam Bernardi, Alan Silvestri, Albert Hackett...",11862
...,...,...,...
45471,"(Elham Korda, Kourosh Tahami, Leila Hatami)","(Azadeh Ghavam, Babak Ardalan, Farshad Mohamma...",439050
45472,"(Angel Aquino, Angeli Bayani, Bart Guingona, B...","(Dante Perez, Lav Diaz, Lav Diaz, Lav Diaz, La...",111109
45473,"(Adam Baldwin, Damian Chapa, Darrell Dubovsky,...","(C. Courtney Joyner, Jeffrey Goldenberg, João ...",67758
45474,"(Aleksandr Chabrov, Iwan Mosschuchin, Nathalie...","(Joseph N. Ermolieff, Yakov Protazanov)",227506


In [73]:
# Load the keywords table. Each `keywords` cell becomes a sorted tuple of
# keyword names via the `to_array_of_fields('name')` converter.
keywords = pd.read_csv(
    'data/keywords.csv',
    converters={'keywords': to_array_of_fields('name')},
    dtype={'id': 'int64'},
)
keywords

Unnamed: 0,id,keywords
0,862,"(boy, boy next door, friends, friendship, jeal..."
1,8844,"(based on children's book, board game, disappe..."
2,15602,"(best friend, duringcreditsstinger, fishing, o..."
3,31357,"(based on novel, chick flick, divorce, interra..."
4,11862,"(aging, baby, confidence, contraception, daugh..."
...,...,...
46414,439050,"(tragic love,)"
46415,111109,"(artist, pinoy, play)"
46416,67758,()
46417,227506,()


In [74]:
# Keep only the first row encountered for each movie id.
keywords_unique = keywords.drop_duplicates(subset=['id'], keep='first')
keywords_unique

Unnamed: 0,id,keywords
0,862,"(boy, boy next door, friends, friendship, jeal..."
1,8844,"(based on children's book, board game, disappe..."
2,15602,"(best friend, duringcreditsstinger, fishing, o..."
3,31357,"(based on novel, chick flick, divorce, interra..."
4,11862,"(aging, baby, confidence, contraception, daugh..."
...,...,...
46414,439050,"(tragic love,)"
46415,111109,"(artist, pinoy, play)"
46416,67758,()
46417,227506,()


In [75]:
# Load the movie metadata with explicit column types. List-of-dict cells are
# reduced to sorted tuples of names; `belongs_to_collection` holds a single
# dict and is reduced to its name ("" when the cell is empty).
movies = pd.read_csv(
    'data/movies_metadata.csv',
    parse_dates=['release_date'],
    skiprows=[
        19730, 29503, 35587,  # 'NA' in 'vote_count'
        19731, 29504, 35588,  # strings in 'adult'
    ],
    converters={
        'belongs_to_collection': (
            lambda string: literal_eval(string)['name'] if string != "" else ""
        ),
        **{
            column: to_array_of_fields('name')
            for column in (
                'genres',
                'production_companies',
                'production_countries',
                'spoken_languages',
            )
        },
    },
    dtype={
        **dict.fromkeys(('adult', 'video'), 'boolean'),
        **dict.fromkeys(('budget', 'id'), 'int64'),
        'vote_count': 'int',
        **dict.fromkeys(
            ('popularity', 'revenue', 'runtime', 'vote_average'),
            'float64',
        ),
        **dict.fromkeys(
            (
                'homepage',
                'imdb_id',
                'original_language',
                'original_title',
                'overview',
                'poster_path',
                'status',
                'tagline',
                'title',
            ),
            'str',
        ),
    },
)
movies

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,Toy Story Collection,30000000,"(Animation, Comedy, Family)",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"(Pixar Animation Studios,)","(United States of America,)",1995-10-30,373554033.0,81.0,"(English,)",Released,,Toy Story,False,7.7,5415
1,False,,65000000,"(Adventure, Family, Fantasy)",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"(Interscope Communications, Teitler Film, TriS...","(United States of America,)",1995-12-15,262797249.0,104.0,"(English, Français)",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413
2,False,Grumpy Old Men Collection,0,"(Comedy, Romance)",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.712900,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"(Lancaster Gate, Warner Bros.)","(United States of America,)",1995-12-22,0.0,101.0,"(English,)",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92
3,False,,16000000,"(Comedy, Drama, Romance)",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,"(Twentieth Century Fox Film Corporation,)","(United States of America,)",1995-12-22,81452156.0,127.0,"(English,)",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34
4,False,Father of the Bride Collection,0,"(Comedy,)",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"(Sandollar Productions, Touchstone Pictures)","(United States of America,)",1995-02-10,76578911.0,106.0,"(English,)",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45455,False,,0,"(Drama, Family)",http://www.imdb.com/title/tt6209470/,439050,tt6209470,fa,رگ خواب,Rising and falling between a man and woman.,0.072051,/jldsYflnId4tTWPx8es3uzsB1I8.jpg,(),"(Iran,)",NaT,0.0,90.0,"(فارسی,)",Released,Rising and falling between a man and woman,Subdue,False,4.0,1
45456,False,,0,"(Drama,)",,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,0.178241,/xZkmxsNmYXJbKVsTRLLx3pqGHx7.jpg,"(Sine Olivia,)","(Philippines,)",2011-11-17,0.0,360.0,"(,)",Released,,Century of Birthing,False,9.0,3
45457,False,,0,"(Action, Drama, Thriller)",,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",0.903007,/d5bX92nDsISNhu3ZT69uHwmfCGw.jpg,"(American World Pictures,)","(United States of America,)",2003-08-01,0.0,90.0,"(English,)",Released,A deadly game of wits.,Betrayal,False,3.8,6
45458,False,,0,(),,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",0.003503,/aorBPO7ak8e8iJKT5OcqYxU3jlK.jpg,"(Yermoliev,)","(Russia,)",1917-10-21,0.0,87.0,(),Released,,Satan Triumphant,False,0.0,0


In [76]:
# Example of one title shared by several rows.
movies.loc[movies['title'].eq('The Avengers')]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
2044,False,,60000000,"(Thriller,)",,9320,tt0118661,en,The Avengers,"British Ministry agent John Steed, under direc...",9.562953,/7cJGRajXMU2aYdTbElIl6FtzOl2.jpg,"(Jerry Weintraub Productions, Warner Bros.)","(United States of America,)",1998-08-13,48585420.0,89.0,"(English,)",Released,Saving the World in Style.,The Avengers,False,4.4,205
17818,False,The Avengers Collection,220000000,"(Action, Adventure, Science Fiction)",http://marvel.com/avengers_movie/,24428,tt0848228,en,The Avengers,When an unexpected enemy emerges and threatens...,89.887648,/cezWGskPY5x7GaglTTRN4Fugfb8.jpg,"(Marvel Studios, Paramount Pictures)","(United States of America,)",2012-04-25,1519558000.0,143.0,"(English,)",Released,Some assembly required.,The Avengers,False,7.4,12000


In [77]:
# Map every title to the id of the first movie carrying it, and every movie id
# to that first ("original") id, so later same-titled movies point at the first.
# Iterating the two columns directly avoids the per-row Series that
# `iterrows()` builds, and the loop names no longer rebind the builtin `id`
# at notebook scope.
movie_title_to_id = dict()
movie_id_to_original = dict()
for movie_id, movie_title in zip(movies['id'], movies['title']):
    # setdefault stores the id only the first time a title is seen and
    # always returns the id recorded for that title.
    movie_id_to_original[movie_id] = movie_title_to_id.setdefault(movie_title, movie_id)

len(movie_title_to_id), len(movie_id_to_original)

(42277, 45430)

In [78]:
# Keep only the first row encountered for each title.
movies_unique = movies.drop_duplicates(subset='title', keep='first')
movies_unique

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,Toy Story Collection,30000000,"(Animation, Comedy, Family)",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"(Pixar Animation Studios,)","(United States of America,)",1995-10-30,373554033.0,81.0,"(English,)",Released,,Toy Story,False,7.7,5415
1,False,,65000000,"(Adventure, Family, Fantasy)",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"(Interscope Communications, Teitler Film, TriS...","(United States of America,)",1995-12-15,262797249.0,104.0,"(English, Français)",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413
2,False,Grumpy Old Men Collection,0,"(Comedy, Romance)",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.712900,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"(Lancaster Gate, Warner Bros.)","(United States of America,)",1995-12-22,0.0,101.0,"(English,)",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92
3,False,,16000000,"(Comedy, Drama, Romance)",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,"(Twentieth Century Fox Film Corporation,)","(United States of America,)",1995-12-22,81452156.0,127.0,"(English,)",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34
4,False,Father of the Bride Collection,0,"(Comedy,)",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"(Sandollar Productions, Touchstone Pictures)","(United States of America,)",1995-02-10,76578911.0,106.0,"(English,)",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45453,False,,0,"(Science Fiction,)",,222848,tt0112613,en,Caged Heat 3000,It's the year 3000 AD. The world's most danger...,0.661558,/4lF9LH0b0Z1X94xGK9IOzqEW6k1.jpg,"(Concorde-New Horizons,)","(United States of America,)",1995-01-01,0.0,85.0,"(English,)",Released,,Caged Heat 3000,False,3.5,1
45455,False,,0,"(Drama, Family)",http://www.imdb.com/title/tt6209470/,439050,tt6209470,fa,رگ خواب,Rising and falling between a man and woman.,0.072051,/jldsYflnId4tTWPx8es3uzsB1I8.jpg,(),"(Iran,)",NaT,0.0,90.0,"(فارسی,)",Released,Rising and falling between a man and woman,Subdue,False,4.0,1
45456,False,,0,"(Drama,)",,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,0.178241,/xZkmxsNmYXJbKVsTRLLx3pqGHx7.jpg,"(Sine Olivia,)","(Philippines,)",2011-11-17,0.0,360.0,"(,)",Released,,Century of Birthing,False,9.0,3
45458,False,,0,(),,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",0.003503,/aorBPO7ak8e8iJKT5OcqYxU3jlK.jpg,"(Yermoliev,)","(Russia,)",1917-10-21,0.0,87.0,(),Released,,Satan Triumphant,False,0.0,0


In [79]:
# Movies whose genre tuple is empty.
movies.loc[movies['genres'].map(len) == 0]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
55,False,,0,(),,124057,tt0113541,en,Kids of the Round Table,"Set in modern times, Alex finds King Arthur's ...",0.307075,/tL9ImxOccEshRlgmrKQabKH2tCd.jpg,"(Melenny Productions, Telefilm Canada)",(),1997-07-08,0.0,89.0,"(English,)",Released,,Kids of the Round Table,False,3.0,1
83,False,,0,(),,188588,tt0113612,en,Last Summer in the Hamptons,"Filmed entirely on location in East Hampton, L...",0.531159,/pfgpkDNcwSi1x4jVzeLqvxTwX5a.jpg,(),(),1995-11-22,0.0,108.0,(),Released,,Last Summer in the Hamptons,False,0.0,0
126,False,,0,(),,290157,tt0110217,en,Jupiter's Wife,"Michel Negroponte, a documentary filmmaker, me...",0.001178,/uUi23HjvDFYGfuVlCBGozUY1Ab4.jpg,(),(),1995-01-01,0.0,87.0,(),Released,A Haunting Real Life Mystery,Jupiter's Wife,False,0.0,0
137,False,,0,(),,124639,tt0114618,en,Target,A subtle yet violent commentary on feudal lords.,0.001205,/z0ezqAFMeGYd5mLEWaN8jC9eczF.jpg,(),(),1995-08-01,0.0,122.0,(),Released,,Target,False,0.0,0
390,False,,0,(),,267188,tt0112849,en,Desert Winds,Jackie and Eugene are joined by a mystical win...,0.251223,/taQ3WgPtiGT9bVPTlQ4caxFlvUA.jpg,(),(),1997-12-31,0.0,97.0,(),Released,It began with a whisper,Desert Winds,False,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45441,False,,0,(),,44324,tt0135631,fr,Le Roi du maquillage,The background of this picture represents a sc...,0.213973,/ifWvveLPWWrzitz4oL01YMEokiB.jpg,"(Star Film Company,)","(France,)",1904-03-05,0.0,3.0,(),Released,,The Untameable Whiskers,False,6.0,6
45442,False,,0,(),,122036,tt0224286,fr,Les Transmutations imperceptibles,This shows a prince entering upon the stage of...,0.071782,/ifWvveLPWWrzitz4oL01YMEokiB.jpg,"(Star Film Company,)","(France,)",1904-01-01,0.0,2.0,(),Released,,The Imperceptable Transmutations,False,5.0,2
45449,False,,0,(),,67179,tt0069215,it,San Michele aveva un gallo,Sentenced to life imprisonment for illegal act...,0.225051,/j1AN0L4motTt8SBxeTMXDtExsYl.jpg,(),(),1972-01-01,0.0,90.0,"(Italiano,)",Released,,St. Michael Had a Rooster,False,6.0,3
45458,False,,0,(),,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",0.003503,/aorBPO7ak8e8iJKT5OcqYxU3jlK.jpg,"(Yermoliev,)","(Russia,)",1917-10-21,0.0,87.0,(),Released,,Satan Triumphant,False,0.0,0


In [80]:
# Sanity check: count the rows of `movies_unique` whose id differs from the
# "original" id recorded for it in `movie_id_to_original`. Iterating the id
# column directly avoids `iterrows()` and avoids rebinding the builtin `id`
# at notebook scope.
duplicates = sum(
    movie_id_to_original[movie_id] != movie_id
    for movie_id in movies_unique['id']
)
print(duplicates)

0


In [81]:
# Number of non-null ids in each de-duplicated table: credits, movies, keywords.
tuple(
    frame['id'].count()
    for frame in (credits_unique, movies_unique, keywords_unique)
)

(45432, 42277, 45432)

In [82]:
# Count keyword rows whose id is either absent from `movie_id_to_original` or
# mapped to a different ("original") id. `.get()` returns None for an unknown
# id, which never equals the id itself, so both cases are covered by a single
# comparison. Iterating the id column avoids `iterrows()` and avoids rebinding
# the builtin `id` at notebook scope.
not_real = sum(
    movie_id_to_original.get(keyword_movie_id) != keyword_movie_id
    for keyword_movie_id in keywords_unique['id']
)

not_real

3156

In [83]:
# Attach credits, then keywords, to the movie metadata. Inner joins keep only
# ids present in all three tables.
# NOTE(review): the left table is `movies`, not `movies_unique`, so rows that
# share a title are all kept here. Confirm this is intended.
everything = (
    movies
    .merge(credits_unique, on='id', how='inner')
    .merge(keywords_unique, on='id', how='inner')
)
everything

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords
0,False,Toy Story Collection,30000000,"(Animation, Comedy, Family)",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"(Pixar Animation Studios,)","(United States of America,)",1995-10-30,373554033.0,81.0,"(English,)",Released,,Toy Story,False,7.7,5415,"(Annie Potts, Don Rickles, Erik von Detten, Ji...","(Ada Cochavi, Alan Sperling, Alec Sokolow, And...","(boy, boy next door, friends, friendship, jeal..."
1,False,,65000000,"(Adventure, Family, Fantasy)",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"(Interscope Communications, Teitler Film, TriS...","(United States of America,)",1995-12-15,262797249.0,104.0,"(English, Français)",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413,"(Adam Hann-Byrd, Annabel Kershaw, Bebe Neuwirt...","(Chris van Allsburg, Greg Taylor, James D. Bis...","(based on children's book, board game, disappe..."
2,False,Grumpy Old Men Collection,0,"(Comedy, Romance)",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.712900,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"(Lancaster Gate, Warner Bros.)","(United States of America,)",1995-12-22,0.0,101.0,"(English,)",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92,"(Ann-Margret, Burgess Meredith, Daryl Hannah, ...","(Howard Deutch, Jack Keller, Mark Steven Johns...","(best friend, duringcreditsstinger, fishing, o..."
3,False,,16000000,"(Comedy, Drama, Romance)",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,"(Twentieth Century Fox Film Corporation,)","(United States of America,)",1995-12-22,81452156.0,127.0,"(English,)",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34,"(Angela Bassett, Dennis Haysbert, Gregory Hine...","(Caron K, Deborah Schindler, Ezra Swerdlow, Fo...","(based on novel, chick flick, divorce, interra..."
4,False,Father of the Bride Collection,0,"(Comedy,)",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"(Sandollar Productions, Touchstone Pictures)","(United States of America,)",1995-02-10,76578911.0,106.0,"(English,)",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173,"(BD Wong, Diane Keaton, Eugene Levy, George Ne...","(Adam Bernardi, Alan Silvestri, Albert Hackett...","(aging, baby, confidence, contraception, daugh..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45454,False,,0,"(Drama, Family)",http://www.imdb.com/title/tt6209470/,439050,tt6209470,fa,رگ خواب,Rising and falling between a man and woman.,0.072051,/jldsYflnId4tTWPx8es3uzsB1I8.jpg,(),"(Iran,)",NaT,0.0,90.0,"(فارسی,)",Released,Rising and falling between a man and woman,Subdue,False,4.0,1,"(Elham Korda, Kourosh Tahami, Leila Hatami)","(Azadeh Ghavam, Babak Ardalan, Farshad Mohamma...","(tragic love,)"
45455,False,,0,"(Drama,)",,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,0.178241,/xZkmxsNmYXJbKVsTRLLx3pqGHx7.jpg,"(Sine Olivia,)","(Philippines,)",2011-11-17,0.0,360.0,"(,)",Released,,Century of Birthing,False,9.0,3,"(Angel Aquino, Angeli Bayani, Bart Guingona, B...","(Dante Perez, Lav Diaz, Lav Diaz, Lav Diaz, La...","(artist, pinoy, play)"
45456,False,,0,"(Action, Drama, Thriller)",,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",0.903007,/d5bX92nDsISNhu3ZT69uHwmfCGw.jpg,"(American World Pictures,)","(United States of America,)",2003-08-01,0.0,90.0,"(English,)",Released,A deadly game of wits.,Betrayal,False,3.8,6,"(Adam Baldwin, Damian Chapa, Darrell Dubovsky,...","(C. Courtney Joyner, Jeffrey Goldenberg, João ...",()
45457,False,,0,(),,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",0.003503,/aorBPO7ak8e8iJKT5OcqYxU3jlK.jpg,"(Yermoliev,)","(Russia,)",1917-10-21,0.0,87.0,(),Released,,Satan Triumphant,False,0.0,0,"(Aleksandr Chabrov, Iwan Mosschuchin, Nathalie...","(Joseph N. Ermolieff, Yakov Protazanov)",()


In [84]:
# Build the working table: discard the columns that are not needed (reason
# noted per column), then put the remaining ones in a reader-friendly order.
main = (
    everything
    .drop(columns=[
        'adult',           # there are only 9 adult films
        'budget',          # might be useful for decision trees
        'homepage',
        'imdb_id',
        'poster_path',
        'revenue',         # might be useful for decision trees
        'original_title',  # it's in the original language, 'title' is in English
        'video',           # there are only 95 non-video movies
    ])
    .loc[:, [
        'id',
        'title',
        'overview',
        'genres',
        'belongs_to_collection',
        'original_language',
        'popularity',
        'production_companies',
        'production_countries',
        'release_date',
        'runtime',
        'spoken_languages',
        'status',
        'tagline',
        'vote_average',
        'vote_count',
        'cast',
        'crew',
        'keywords',
    ]]
)

main

Unnamed: 0,id,title,overview,genres,belongs_to_collection,original_language,popularity,production_companies,production_countries,release_date,runtime,spoken_languages,status,tagline,vote_average,vote_count,cast,crew,keywords
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","(Animation, Comedy, Family)",Toy Story Collection,en,21.946943,"(Pixar Animation Studios,)","(United States of America,)",1995-10-30,81.0,"(English,)",Released,,7.7,5415,"(Annie Potts, Don Rickles, Erik von Detten, Ji...","(Ada Cochavi, Alan Sperling, Alec Sokolow, And...","(boy, boy next door, friends, friendship, jeal..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"(Adventure, Family, Fantasy)",,en,17.015539,"(Interscope Communications, Teitler Film, TriS...","(United States of America,)",1995-12-15,104.0,"(English, Français)",Released,Roll the dice and unleash the excitement!,6.9,2413,"(Adam Hann-Byrd, Annabel Kershaw, Bebe Neuwirt...","(Chris van Allsburg, Greg Taylor, James D. Bis...","(based on children's book, board game, disappe..."
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"(Comedy, Romance)",Grumpy Old Men Collection,en,11.712900,"(Lancaster Gate, Warner Bros.)","(United States of America,)",1995-12-22,101.0,"(English,)",Released,Still Yelling. Still Fighting. Still Ready for...,6.5,92,"(Ann-Margret, Burgess Meredith, Daryl Hannah, ...","(Howard Deutch, Jack Keller, Mark Steven Johns...","(best friend, duringcreditsstinger, fishing, o..."
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","(Comedy, Drama, Romance)",,en,3.859495,"(Twentieth Century Fox Film Corporation,)","(United States of America,)",1995-12-22,127.0,"(English,)",Released,Friends are the people who let you be yourself...,6.1,34,"(Angela Bassett, Dennis Haysbert, Gregory Hine...","(Caron K, Deborah Schindler, Ezra Swerdlow, Fo...","(based on novel, chick flick, divorce, interra..."
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,"(Comedy,)",Father of the Bride Collection,en,8.387519,"(Sandollar Productions, Touchstone Pictures)","(United States of America,)",1995-02-10,106.0,"(English,)",Released,Just When His World Is Back To Normal... He's ...,5.7,173,"(BD Wong, Diane Keaton, Eugene Levy, George Ne...","(Adam Bernardi, Alan Silvestri, Albert Hackett...","(aging, baby, confidence, contraception, daugh..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45454,439050,Subdue,Rising and falling between a man and woman.,"(Drama, Family)",,fa,0.072051,(),"(Iran,)",NaT,90.0,"(فارسی,)",Released,Rising and falling between a man and woman,4.0,1,"(Elham Korda, Kourosh Tahami, Leila Hatami)","(Azadeh Ghavam, Babak Ardalan, Farshad Mohamma...","(tragic love,)"
45455,111109,Century of Birthing,An artist struggles to finish his work while a...,"(Drama,)",,tl,0.178241,"(Sine Olivia,)","(Philippines,)",2011-11-17,360.0,"(,)",Released,,9.0,3,"(Angel Aquino, Angeli Bayani, Bart Guingona, B...","(Dante Perez, Lav Diaz, Lav Diaz, Lav Diaz, La...","(artist, pinoy, play)"
45456,67758,Betrayal,"When one of her hits goes wrong, a professiona...","(Action, Drama, Thriller)",,en,0.903007,"(American World Pictures,)","(United States of America,)",2003-08-01,90.0,"(English,)",Released,A deadly game of wits.,3.8,6,"(Adam Baldwin, Damian Chapa, Darrell Dubovsky,...","(C. Courtney Joyner, Jeffrey Goldenberg, João ...",()
45457,227506,Satan Triumphant,"In a small town live two brothers, one a minis...",(),,en,0.003503,"(Yermoliev,)","(Russia,)",1917-10-21,87.0,(),Released,,0.0,0,"(Aleksandr Chabrov, Iwan Mosschuchin, Nathalie...","(Joseph N. Ermolieff, Yakov Protazanov)",()


In [85]:
from collections import defaultdict

def analyse_array_column(column, frame=None, top=20):
    """Print how often each item occurs across a column of item sequences.

    Prints a header with the column name, the number of distinct items, the
    ``top`` most frequent ``(item, count)`` pairs (most frequent first; ties
    keep first-seen order) and a final ``...`` line when more items exist
    than were shown (an empty line otherwise).

    Parameters
    ----------
    column : str
        Name of the column to analyse; each cell must be an iterable of
        hashable items (e.g. a tuple of names).
    frame : mapping of column name -> iterable of cells, optional
        Table to read from (e.g. a DataFrame). Defaults to the notebook-level
        ``main`` table, looked up when the function is called.
    top : int or None, optional
        How many of the most frequent items to print (non-negative);
        ``None`` prints all of them. Defaults to 20.

    Returns
    -------
    None
    """
    # Local import so the function does not rely on another cell's imports.
    from collections import Counter

    if frame is None:
        frame = main
    counter = Counter(item for array in frame[column] for item in array)
    print('=' * 20, column, '=' * 20)
    print('Unique Items', len(counter))
    print(counter.most_common(top))
    print('...' if top is not None and len(counter) > top else '')

In [86]:
# Print the item-frequency summary for every tuple-valued column, in this order.
for column in (
    'genres',
    'production_companies',
    'production_countries',
    'spoken_languages',
    'cast',
    'crew',
    'keywords',
):
    analyse_array_column(column)

Unique Items 20
[('Drama', 20264), ('Comedy', 13182), ('Thriller', 7623), ('Romance', 6735), ('Action', 6594), ('Horror', 4672), ('Crime', 4307), ('Documentary', 3932), ('Adventure', 3496), ('Science Fiction', 3047), ('Family', 2770), ('Mystery', 2467), ('Fantasy', 2313), ('Animation', 1934), ('Foreign', 1622), ('Music', 1598), ('History', 1398), ('War', 1323), ('Western', 1042), ('TV Movie', 766)]

Unique Items 23537
[('Warner Bros.', 1250), ('Metro-Goldwyn-Mayer (MGM)', 1076), ('Paramount Pictures', 1003), ('Twentieth Century Fox Film Corporation', 836), ('Universal Pictures', 830), ('Columbia Pictures Corporation', 448), ('Canal+', 438), ('Columbia Pictures', 431), ('RKO Radio Pictures', 290), ('United Artists', 279), ('New Line Cinema', 277), ('Walt Disney Pictures', 263), ('Touchstone Pictures', 225), ('TriStar Pictures', 197), ('Mosfilm', 188), ('Miramax Films', 183), ('France 2 Cinéma', 168), ('Centre National de la Cinématographie (CNC)', 163), ('Toho Company', 142), ('BBC Film

In [87]:
# Persist the working table. The row index is written as the first, unnamed
# CSV column (index=True is the pandas default, spelled out here).
main.to_csv('./data/basic.csv', index=True)

In [88]:
# Display the working table `main`.
main

Unnamed: 0,id,title,overview,genres,belongs_to_collection,original_language,popularity,production_companies,production_countries,release_date,runtime,spoken_languages,status,tagline,vote_average,vote_count,cast,crew,keywords
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","(Animation, Comedy, Family)",Toy Story Collection,en,21.946943,"(Pixar Animation Studios,)","(United States of America,)",1995-10-30,81.0,"(English,)",Released,,7.7,5415,"(Annie Potts, Don Rickles, Erik von Detten, Ji...","(Ada Cochavi, Alan Sperling, Alec Sokolow, And...","(boy, boy next door, friends, friendship, jeal..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"(Adventure, Family, Fantasy)",,en,17.015539,"(Interscope Communications, Teitler Film, TriS...","(United States of America,)",1995-12-15,104.0,"(English, Français)",Released,Roll the dice and unleash the excitement!,6.9,2413,"(Adam Hann-Byrd, Annabel Kershaw, Bebe Neuwirt...","(Chris van Allsburg, Greg Taylor, James D. Bis...","(based on children's book, board game, disappe..."
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"(Comedy, Romance)",Grumpy Old Men Collection,en,11.712900,"(Lancaster Gate, Warner Bros.)","(United States of America,)",1995-12-22,101.0,"(English,)",Released,Still Yelling. Still Fighting. Still Ready for...,6.5,92,"(Ann-Margret, Burgess Meredith, Daryl Hannah, ...","(Howard Deutch, Jack Keller, Mark Steven Johns...","(best friend, duringcreditsstinger, fishing, o..."
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","(Comedy, Drama, Romance)",,en,3.859495,"(Twentieth Century Fox Film Corporation,)","(United States of America,)",1995-12-22,127.0,"(English,)",Released,Friends are the people who let you be yourself...,6.1,34,"(Angela Bassett, Dennis Haysbert, Gregory Hine...","(Caron K, Deborah Schindler, Ezra Swerdlow, Fo...","(based on novel, chick flick, divorce, interra..."
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,"(Comedy,)",Father of the Bride Collection,en,8.387519,"(Sandollar Productions, Touchstone Pictures)","(United States of America,)",1995-02-10,106.0,"(English,)",Released,Just When His World Is Back To Normal... He's ...,5.7,173,"(BD Wong, Diane Keaton, Eugene Levy, George Ne...","(Adam Bernardi, Alan Silvestri, Albert Hackett...","(aging, baby, confidence, contraception, daugh..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45454,439050,Subdue,Rising and falling between a man and woman.,"(Drama, Family)",,fa,0.072051,(),"(Iran,)",NaT,90.0,"(فارسی,)",Released,Rising and falling between a man and woman,4.0,1,"(Elham Korda, Kourosh Tahami, Leila Hatami)","(Azadeh Ghavam, Babak Ardalan, Farshad Mohamma...","(tragic love,)"
45455,111109,Century of Birthing,An artist struggles to finish his work while a...,"(Drama,)",,tl,0.178241,"(Sine Olivia,)","(Philippines,)",2011-11-17,360.0,"(,)",Released,,9.0,3,"(Angel Aquino, Angeli Bayani, Bart Guingona, B...","(Dante Perez, Lav Diaz, Lav Diaz, Lav Diaz, La...","(artist, pinoy, play)"
45456,67758,Betrayal,"When one of her hits goes wrong, a professiona...","(Action, Drama, Thriller)",,en,0.903007,"(American World Pictures,)","(United States of America,)",2003-08-01,90.0,"(English,)",Released,A deadly game of wits.,3.8,6,"(Adam Baldwin, Damian Chapa, Darrell Dubovsky,...","(C. Courtney Joyner, Jeffrey Goldenberg, João ...",()
45457,227506,Satan Triumphant,"In a small town live two brothers, one a minis...",(),,en,0.003503,"(Yermoliev,)","(Russia,)",1917-10-21,87.0,(),Released,,0.0,0,"(Aleksandr Chabrov, Iwan Mosschuchin, Nathalie...","(Joseph N. Ermolieff, Yakov Protazanov)",()


In [89]:
# Persist the title-deduplicated metadata. The row index is written as the
# first, unnamed CSV column (index=True is the pandas default, spelled out here).
movies_unique.to_csv('data/metadata.csv', index=True)