In [278]:
import pandas as pd
from ast import literal_eval
import numpy as np
from sklearn.cross_validation import train_test_split

In [251]:
# load datasets
df_tmdb = pd.read_csv('detailed_movie_data_tmdb_final.csv')
df_imdb = pd.read_csv('detailed_movie_data_imdb_final.csv')

In [252]:
# filter out records without imdb dataa
df_imdb = df_imdb[df_imdb['movie_status_code'] == True]
print(df_tmdb.shape)
print(df_imdb.shape)

(13386, 18)
(13383, 19)


In [253]:
# merge tmdb and imdb
df = df_tmdb.merge(df_imdb, how = 'inner', on = 'imdb_id', suffixes = ('_tmdb', '_imdb'))

In [254]:
df.shape

(13383, 36)

In [255]:
# compare titles
df[df['title_tmdb'] !=  df['title_imdb']][['title_imdb', 'title_imdb']]

Unnamed: 0,title_imdb,title_imdb.1
0,[u'Beauty and the Beast'],[u'Beauty and the Beast']
1,[u'Logan'],[u'Logan']
2,[u'Sing'],[u'Sing']
3,[u'Kong: Skull Island'],[u'Kong: Skull Island']
4,[u'Jurassic World'],[u'Jurassic World']
5,[u'Fantastic Beasts and Where to Find Them'],[u'Fantastic Beasts and Where to Find Them']
6,[u'Finding Dory'],[u'Finding Dory']
7,[u'Ghost in the Shell'],[u'Ghost in the Shell']
8,[u'Interstellar'],[u'Interstellar']
9,[u'Life'],[u'Life']


In [256]:
# save raw data
df.to_csv('../Milestone 3/combined_data_full.csv', index = False)

In [257]:
# convert genre dicts to genre names for tmdb
df['genres_tmdb'] = df['genres_tmdb'].apply(lambda x:[genre['name'] for genre in literal_eval(x)])

In [258]:
# encode imdb genres in same format as tmdb
df['genres_imdb'] = df['genres_imdb'].apply(lambda x:[genre.encode('utf8') for genre in literal_eval(x)] if x == x else [])

In [259]:
# view genres
df[df['genres_tmdb'] !=  df['genres_imdb']][['genres_tmdb', 'genres_imdb']]

Unnamed: 0,genres_tmdb,genres_imdb
0,"[Fantasy, Music, Romance]","[Family, Fantasy, Musical, Romance]"
1,"[Action, Drama, Science Fiction]","[Action, Drama, Sci-Fi, Thriller]"
2,"[Animation, Comedy, Drama, Family, Music]","[Animation, Comedy, Family, Music]"
3,"[Science Fiction, Action, Adventure, Fantasy]","[Action, Adventure, Fantasy, Sci-Fi]"
4,"[Action, Adventure, Science Fiction, Thriller]","[Action, Adventure, Sci-Fi]"
5,"[Adventure, Action, Fantasy]","[Adventure, Family, Fantasy]"
6,"[Adventure, Animation, Comedy, Family]","[Animation, Adventure, Comedy, Family]"
7,"[Action, Drama, Science Fiction]","[Action, Crime, Drama, Mystery, Sci-Fi, Thriller]"
8,"[Adventure, Drama, Science Fiction]","[Adventure, Drama, Sci-Fi]"
9,"[Horror, Science Fiction, Thriller]","[Horror, Sci-Fi, Thriller]"


In [260]:
# encode genre matrix if movie contains genre
def movie_id_machine(genre_list, genres):
    new_row = []
    for genre in genres:
        if genre in genre_list:
            new_row.append(1)
        else:
            new_row.append(0)
    return new_row

In [261]:
# encode genre
genres = ["Action", "Adventure", "Animation", "Comedy", "Crime", "Documentary", "Drama", "Family", "Fantasy", "History",
          "Horror", "Music", "Mystery", "Romance", "Science Fiction", "TV Movie", "Thriller", "War", "Western", "Foreign"]
df_genres = df[['genres_tmdb']]
df_genres['genre_encoding'] = df_genres['genres_tmdb'].apply(lambda genres_list: movie_id_machine(genres_list, genres))
df_genres = pd.DataFrame(df_genres['genre_encoding'].tolist(), columns = genres)
df_genres['id'] = df['id']
df_genres.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(13383, 21)

In [262]:
# get genre totals
genres_num = df_genres.drop('id', axis = 1).sum()

In [263]:
# choose single genre as outcome variable
def get_genre(tmdb_genres, imdb_genres, genres_num):
    intersection = list(set(tmdb_genres).intersection(set(imdb_genres)))
    if len(intersection) == 0:
        if len(tmdb_genres) > 0:
            intersection = tmdb_genres
        else:
            intersection = imdb_genres
    if len(intersection) == 1:
        return intersection[0]
    else:
        max_genre_count = 0
        max_genre = ''
        for genre in intersection:
            if genre in genres_num.keys():
                if genres_num[genre] > max_genre_count:
                    max_genre_count = genres_num[genre]
                    max_genre = genre
        return max_genre

In [264]:
# choose single genre as outcome variable
df['genre_intersect'] = df.apply(lambda row: get_genre(row['genres_tmdb'], row['genres_imdb'], genres_num), axis = 1)

In [265]:
df['genre_intersect'].value_counts()

Drama              5501
Comedy             3034
Thriller           1763
Action              793
Horror              717
Documentary         328
Adventure           326
Family              289
Romance             129
Western             100
Animation            93
Music                72
Science Fiction      59
Fantasy              52
Crime                45
Mystery              36
History              17
War                  14
                     12
Short                 1
Sci-Fi                1
Foreign               1
Name: genre_intersect, dtype: int64

In [266]:
# filter out small genres
print(df.shape)
df = df[(df['genre_intersect'] != '') & (df['genre_intersect'] != 'Short') & (df['genre_intersect'] != 'Sci-Fi') & (df['genre_intersect'] != 'Foreign')]
df = df[df['release_date'].notnull() & df['runtime'].notnull() & df['votes'].notnull()]
print(df.shape)

(13383, 37)
(13245, 37)


In [267]:
# add additional features
df['release_month'] = pd.to_datetime(df['release_date']).dt.month
df['release_year'] = pd.to_datetime(df['release_date']).dt.year
df['production_companies'] = df['production_companies'].apply(lambda x: len(literal_eval(x)))
df['spoken_languages'] = df['spoken_languages'].apply(lambda x: len(literal_eval(x)))

In [268]:
# calculate length of text predictor
def get_text_length(text):
    try:
        text = literal_eval(text)
        text = text[0]
        return len(text)
    except:
        return 0

In [269]:
# get length of text predictors
df['overview_length'] = df['overview'].apply(lambda x: get_text_length(x))
df['tagline_length'] = df['tagline'].apply(lambda x: get_text_length(x))
df['plot_length'] = df['plot'].apply(lambda x: get_text_length(x))
df['plot_outline_length'] = df['plot outline'].apply(lambda x: get_text_length(x))

In [270]:
# extract mpaa rating
df['mpaa_rating'] = df['mpaa'].apply(lambda x: literal_eval(x)[0].split(' ')[1] if x == x else  None)

In [271]:
# extract mpaa rating text and length
df ['mpaa_rating_text'] = df.apply(lambda row: row['mpaa'].replace('Rated ' + row['mpaa_rating'] + ' for ', '') if row['mpaa'] == row['mpaa'] else None, axis = 1)
df ['mpaa_rating_text_length'] = df['mpaa_rating_text'].apply(lambda x: get_text_length(x))

In [272]:
# keep only predictor columns
columns_to_keep = ['part_of_collection', 'budget', 'overview', 'popularity', 'production_companies', 'release_month', 'release_year', 
                   'revenue', 'runtime', 'spoken_languages', 'tagline', 'vote_average', 'vote_count', 'animation department',
                   'art department', 'camera and electrical department', 'cast', 'casting department', 'costume department',
                   'distributors', 'editorial department', 'music department', 'plot', 'plot outline', 'rating',
                   'visual effects', 'votes', 'genre_intersect', 'overview_length', 'tagline_length', 'plot_length',
                   'plot_outline_length', 'mpaa_rating_text', 'mpaa_rating_text_length', 'mpaa_rating', 'id']
df_data = df[columns_to_keep]

In [273]:
df_data.shape

(13245, 36)

In [274]:
# fill null values
df_data['plot'] = df_data['plot'].fillna('[]')
df_data['plot outline'] = df_data['plot outline'].fillna('[]')
df_data['mpaa_rating_text'] = df_data['mpaa_rating_text'].fillna('[]')
df_data['mpaa_rating'] = df_data['mpaa_rating'].fillna('NR')
df_data = df_data.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas

In [275]:
df_data.isnull().sum()

part_of_collection                  0
budget                              0
overview                            0
popularity                          0
production_companies                0
release_month                       0
release_year                        0
revenue                             0
runtime                             0
spoken_languages                    0
tagline                             0
vote_average                        0
vote_count                          0
animation department                0
art department                      0
camera and electrical department    0
cast                                0
casting department                  0
costume department                  0
distributors                        0
editorial department                0
music department                    0
plot                                0
plot outline                        0
rating                              0
visual effects                      0
votes       

In [276]:
# save entire dataset
df_data.to_csv('../Milestone 3/combined_data_clean.csv', index = False)

In [281]:
# split into train and test sets
train, test = train_test_split(df_data, test_size = 0.3, random_state = 42)

In [282]:
print(train.shape)
print(test.shape)

(9271, 36)
(3974, 36)


In [283]:
train.to_csv('../Milestone 3/combined_data_clean_train.csv', index = False)
test.to_csv('../Milestone 3/combined_data_clean_test.csv', index = False)