# Generating Similarity Search CSV and Prediction CSV

### 1 Load Packages

In [68]:
# ========================General========================
import json
import joblib
import scipy
import pandas as pd
import numpy as np
import requests
# ignore warnings
import warnings
warnings.filterwarnings('ignore')
# ========================Data Preprocessing========================
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
# ========================Modeling========================
from sklearn.tree import DecisionTreeClassifier
# ========================Evaluation========================
from sklearn.model_selection import  cross_val_score
from sklearn.model_selection import GridSearchCV

In [69]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### 2 Load dataset
In this kernel, we standardize the TMDB dataset
to the same pattern as the IMDB dataset to make the best use of existing
models on Kaggle. This whole load dataset session we referred:
https://www.kaggle.com/sohier/getting-imdb-kernels-working-with-tmdb-data/

In [70]:
# load movie
def load_tmdb_movies(path):
    df = pd.read_csv(path)
    df = df[df.release_date.notnull()]
    df = df[df.release_date.str.contains("^\d{4}\-\d{2}.")]
    df = df[df.release_date>="1970-01-01"]
    df['release_year'] = pd.to_datetime(df['release_date']).apply(lambda x: int(x.year) if x==x else None)
    df['release_month'] = pd.to_datetime(df['release_date']).apply(lambda x: int(x.month) if x==x else None)
    json_columns = ['genres', 'keywords', 'production_countries', 'production_companies', 'spoken_languages']
    for column in json_columns:
        df[column] = df[column].apply(json.loads)
    return df

In [71]:
# load movie credits
def load_tmdb_credits(path):
    df = pd.read_csv(path)
    json_columns = ['cast', 'crew']
    for column in json_columns:
        df[column] = df[column].apply(json.loads)
    return df

In [72]:
# Columns that existed in IMDB version of the dataset and are gone.
LOST_COLUMNS = [
    'actor_1_facebook_likes',
    'actor_2_facebook_likes',
    'actor_3_facebook_likes',
    'aspect_ratio',
    'cast_total_facebook_likes',
    'color',
    'content_rating',
    'director_facebook_likes',
    'facenumber_in_poster',
    'movie_facebook_likes',
    'movie_imdb_link',
    'num_critic_for_reviews',
    'num_user_for_reviews']

# Columns in TMDb that had direct equivalents in the IMDB version. 
TMDB_TO_IMDB_SIMPLE_EQUIVALENCIES = {
    'budget': 'budget',
    'genres': 'genres',
    'revenue': 'box_office',
    'title': 'movie_title',
    'runtime': 'duration',
    'original_language': 'language',  
    'keywords': 'plot_keywords',
    'vote_count': 'num_voted_users'}

IMDB_COLUMNS_TO_REMAP = {'imdb_score': 'vote_average'}


def safe_access(container, index_values):
    # return a missing value rather than an error upon indexing/key failure
    result = container
    try:
        for idx in index_values:
            result = result[idx]
        return result
    except IndexError or KeyError:
        return pd.np.nan


def get_director(crew_data):
    directors = [x['name'] for x in crew_data if x['job'] == 'Director']
    return safe_access(directors, [0])


def pipe_flatten_names(keywords):
    return '|'.join([x['name'] for x in keywords])

# Function to convert dataset to IMDB format
def convert_to_original_format(movies, credits):
    tmdb_movies = movies.copy()
    tmdb_movies.rename(columns=TMDB_TO_IMDB_SIMPLE_EQUIVALENCIES, inplace=True)
    tmdb_movies['country'] = tmdb_movies['production_countries'].apply(lambda x: safe_access(x, [0, 'name']))
    tmdb_movies['language'] = tmdb_movies['spoken_languages'].apply(lambda x: safe_access(x, [0, 'name']))
    tmdb_movies['director_name'] = credits['crew'].apply(get_director)
    tmdb_movies['actor_1_name'] = credits['cast'].apply(lambda x: safe_access(x, [0, 'name']))
    tmdb_movies['actor_2_name'] = credits['cast'].apply(lambda x: safe_access(x, [1, 'name']))
    tmdb_movies['actor_3_name'] = credits['cast'].apply(lambda x: safe_access(x, [2, 'name']))
    tmdb_movies['companies_1'] = tmdb_movies['production_companies'].apply(lambda x: safe_access(x, [0, 'name']))
    tmdb_movies['companies_2'] = tmdb_movies['production_companies'].apply(lambda x: safe_access(x, [1, 'name']))
    tmdb_movies['companies_3'] = tmdb_movies['production_companies'].apply(lambda x: safe_access(x, [2, 'name']))
    tmdb_movies['genres'] = tmdb_movies['genres'].apply(pipe_flatten_names)
    tmdb_movies['plot_keywords'] = tmdb_movies['plot_keywords'].apply(pipe_flatten_names)
    return tmdb_movies

In [73]:
# link to raw data: https://drive.google.com/file/d/19C8m6CwRu9I-eydnTbp4gPCp_eeFMb7t/view?usp=sharing
movies_path = '/content/drive/My Drive/INFO7374/movie_data.csv'
# link to raw data: https://drive.google.com/file/d/1mnH9UaaXZ-gP3At0Q2Qus-Gz0EJOqiYx/view?usp=sharing
credits_path = '/content/drive/My Drive/INFO7374/movie_credits.csv'

In [74]:
movies = load_tmdb_movies(movies_path)
credits = load_tmdb_credits(credits_path)
original_format =convert_to_original_format(movies, credits)

### 3 Data Preprocessing
Drop duplicate and invalid values

In [75]:
original_format.head()

Unnamed: 0,budget,genres,homepage,id,plot_keywords,language,original_title,overview,popularity,production_companies,production_countries,release_date,box_office,duration,spoken_languages,status,tagline,movie_title,vote_average,num_voted_users,release_year,release_month,country,director_name,actor_1_name,actor_2_name,actor_3_name,companies_1,companies_2,companies_3
3,0,Comedy,,21624,,,Jatts in Golmaal,Jatts in Golmal is an Comedy based movie. In w...,0.6,[],[],2003-02-21,0.0,,[],Released,,Jatts in Golmaal,0.0,0.0,2003,2,,,,,,,,
4,0,,http://www.nwdfilms.com,25449,sport|mountain bike,English,New World Disorder 9: Never Enough,Gee Atherton ripping the Worlds course the day...,1.475,[],[],2008-12-08,0.0,69.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,New World Disorder 9: Never Enough,4.5,2.0,2008,12,,Derek Westerlund,Darren Berrecloth,Cameron McCaul,Paul Basagoitia,,,
5,0,Family,,31975,,,Sesame Street: Elmo Loves You!,"Elmo is making a very, very super special surp...",0.843,[],[],2010-01-05,0.0,46.0,[],Released,,Sesame Street: Elmo Loves You!,0.0,0.0,2010,1,,,,,,,,
6,0,Drama|Crime|Comedy,,2,underdog|prison|factory worker|prisoner|helsin...,Deutsch,Ariel,Taisto Kasurinen is a Finnish coal miner whose...,8.665,"[{'id': 2303, 'logo_path': None, 'name': 'Vill...","[{'iso_3166_1': 'FI', 'name': 'Finland'}]",1988-10-21,0.0,73.0,"[{'iso_639_1': 'de', 'name': 'Deutsch'}, {'iso...",Released,,Ariel,6.8,99.0,1988,10,Finland,Aki Kaurismäki,Turo Pajala,Susanna Haavisto,Matti Pellonpää,Villealfa Filmproductions,,
7,0,Drama|Comedy,,3,salesclerk|helsinki|garbage,svenska,Varjoja paratiisissa,"An episode in the life of Nikander, a garbage ...",7.43,"[{'id': 2303, 'logo_path': None, 'name': 'Vill...","[{'iso_3166_1': 'FI', 'name': 'Finland'}]",1986-10-17,0.0,72.0,"[{'iso_639_1': 'sv', 'name': 'svenska'}, {'iso...",Released,,Shadows in Paradise,7.4,105.0,1986,10,Finland,Aki Kaurismäki,Matti Pellonpää,Kati Outinen,Sakari Kuosmanen,Villealfa Filmproductions,,


In [76]:
# Drop Duplication Values
duplicated_values = ('num_voted_users', 'popularity', 'budget', 'genres', 'id', 'plot_keywords', 'language', 'original_title','overview', 'popularity','release_date', 'box_office', 'duration','status','movie_title','vote_average','num_voted_users')
original_format.drop_duplicates(subset=duplicated_values, keep='first', inplace=True)

In [77]:
# Homepage and tagline are useless in our model
original_format.drop(['homepage','tagline'], axis=1, inplace=True)

In [78]:
# Drop rows with invalid voting score
original_format = original_format[original_format['vote_average'] > 0]

In [79]:
# Drop rows with invalid duration
original_format = original_format[10 < original_format['duration']]
original_format = original_format[original_format['duration'] < 300]

In [80]:
# Drop rows with invalid budget
original_format.budget=original_format.budget.astype(int)
original_format = original_format[original_format.budget!=0]

In [81]:
original_format.head()

Unnamed: 0,budget,genres,id,plot_keywords,language,original_title,overview,popularity,production_companies,production_countries,release_date,box_office,duration,spoken_languages,status,movie_title,vote_average,num_voted_users,release_year,release_month,country,director_name,actor_1_name,actor_2_name,actor_3_name,companies_1,companies_2,companies_3
8,4000000,Crime|Comedy,5,hotel|new year's eve|witch|bet|hotel room|sper...,English,Four Rooms,It's Ted the Bellhop's first night on the job....,9.841,"[{'id': 14, 'logo_path': '/m6AHu84oZQxvq7n1rsv...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-09,4257354.0,98.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Four Rooms,6.0,1468.0,1995,12,United States of America,Allison Anders,Tim Roth,Sammi Davis,Amanda de Cadenet,Miramax,A Band Apart,
9,21000000,Action|Thriller|Crime,6,"chicago, usa|drug dealer|boxing match|escape|o...",English,Judgment Night,"While racing to a boxing match, Frank, Mike, J...",6.773,"[{'id': 33, 'logo_path': '/8lvHyhjr8oUKOOy2dKX...","[{'iso_3166_1': 'JP', 'name': 'Japan'}, {'iso_...",1993-10-15,12136938.0,110.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Judgment Night,6.5,140.0,1993,10,Japan,Stephen Hopkins,Emilio Estevez,Cuba Gooding Jr.,Denis Leary,Universal Pictures,Largo Entertainment,JVC
10,42000,Documentary,8,remix|megacities,English,Life in Loops (A Megacities RMX),Timo Novotny labels his new project an experim...,1.35,"[{'id': 19123, 'logo_path': None, 'name': 'inL...","[{'iso_3166_1': 'AT', 'name': 'Austria'}]",2006-01-01,0.0,80.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Life in Loops (A Megacities RMX),7.5,11.0,2006,1,Austria,Timo Novotny,,,,inLoops,,
12,11000000,Adventure|Action|Science Fiction,11,android|galaxy|hermit|death star|lightsaber|je...,English,Star Wars,Princess Leia is captured and held hostage by ...,67.56,"[{'id': 1, 'logo_path': '/o86DbpburjxrqAzEDhXZ...","[{'iso_3166_1': 'US', 'name': 'United States o...",1977-05-25,775398007.0,121.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Star Wars,8.2,13071.0,1977,5,United States of America,George Lucas,Mark Hamill,Harrison Ford,Carrie Fisher,Lucasfilm,20th Century Fox,
13,94000000,Animation|Family,12,parent child relationship|harbor|anthropomorph...,English,Finding Nemo,"Nemo, an adventurous young clownfish, is unexp...",33.368,"[{'id': 3, 'logo_path': '/1TjvGVDMYsj6JBxOAkUH...","[{'iso_3166_1': 'US', 'name': 'United States o...",2003-05-30,940335536.0,100.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Finding Nemo,7.8,13065.0,2003,5,United States of America,Andrew Stanton,Albert Brooks,Ellen DeGeneres,Alexander Gould,Pixar,,


### 4 Definition of wise investment
There are a lot of ways we can define whether a movie is successful or not.
Here we use a simple definition: the box is 1.5 times bigger than the budget. If the
box is 1.5 times bigger, we can assume investors can earn profit from this movie, which means this is a wise investment.

In [82]:
# Numeric budget to float, so that we can do dive on it
original_format.budget = original_format.budget.astype(float)

In [83]:
original_format['ratio'] = original_format['box_office']/original_format['budget']
original_format['success'] = original_format['ratio'].map(lambda s :1  if s > 1.5 else 0)

In [84]:
original_format.drop(['ratio', 'box_office'], axis=1, inplace=True)

In [85]:
original_format.success.head()

8     0
9     0
10    0
12    1
13    1
Name: success, dtype: int64

### 5 Give threshold for each column to assist similar search
Based on the discussion with Professor in class, we decide to numeralize and give a threshold for each column, so that we can make a similar search more clearly. For example, we only allow the top ten producing companies in the dataset, all other companies will fall in 'other company'. Thus, when we ask users to put into the company name, they are only allowed to select from these 10 + 1 names. And the similarity search will then become very clear since there are very clear 11 categories to choose from. 

In [86]:
# For budget column
original_format.budget=original_format.budget.astype(int)

In [87]:
# Create threshold to decide five levels of budget: VeryLowBudget, LowBudget, MedBudget, HighBudget, VeryHighBudget
# After this step, each new movie's budget will only fall in one of them. Thus, we can have a much clear way to make
# similar seach. 
level_1 = original_format.budget[original_format.budget>0].quantile(0.25)
level_2 = original_format.budget[original_format.budget>0].quantile(0.5)
level_3 = original_format.budget[original_format.budget>0].quantile(0.75)
level_4 = original_format.budget[original_format.budget>0].quantile(0.95)

In [88]:

original_format['VeryLowBudget'] = original_format['budget'].map(lambda s: 1 if 0< s < level_1 else 0)
original_format['LowBudget'] = original_format['budget'].map(lambda s: 1 if level_1 <= s < level_2 else 0)
original_format['MedBudget'] = original_format['budget'].map(lambda s: 1 if  level_2 <= s < level_3 else 0)
original_format['HighBudget'] = original_format['budget'].map(lambda s: 1 if level_3 <= s < level_4 else 0)
original_format['VeryHighBudget'] = original_format['budget'].map(lambda s: 1 if s >= level_4 else 0)

In [89]:
# Similarly, we also seperate the length of movie into three levels
original_format['ShortMovie'] = original_format['duration'].map(lambda s: 1 if s < 90 else 0)
original_format['NormalMovie'] = original_format['duration'].map(lambda s: 1 if 90 <= s < 120 else 0)
original_format['LongMovie'] = original_format['duration'].map(lambda s: 1 if s >= 120 else 0)

In [90]:
# For genres, we spread the genre column and combined them with the main columns
# So that, for each new movie, it will fall in one or more columns, instead of getting a list of genres
genre = []

In [91]:
def get_genre_lists(data, column):
    genre = []
    for i in data[column]:
        splited_genre = list(map(str, i.split('|')))
        for j in splited_genre:
            if j not in genre:
                genre.append(j)
    return genre

In [92]:
genre = get_genre_lists(original_format, "genres")
# Drop invalid genre name
genre = list(filter(lambda x:len(x)>0, genre))

In [93]:
# Numeralize
for g in genre:
    original_format[g] = original_format['genres'].map(lambda s: 1 if g in str(s) else 0)

In [94]:
original_format.head()

Unnamed: 0,budget,genres,id,plot_keywords,language,original_title,overview,popularity,production_companies,production_countries,release_date,duration,spoken_languages,status,movie_title,vote_average,num_voted_users,release_year,release_month,country,director_name,actor_1_name,actor_2_name,actor_3_name,companies_1,companies_2,companies_3,success,VeryLowBudget,LowBudget,MedBudget,HighBudget,VeryHighBudget,ShortMovie,NormalMovie,LongMovie,Crime,Comedy,Action,Thriller,Documentary,Adventure,Science Fiction,Animation,Family,Drama,Romance,Fantasy,War,Music,Western,Mystery,History,Horror,TV Movie
8,4000000,Crime|Comedy,5,hotel|new year's eve|witch|bet|hotel room|sper...,English,Four Rooms,It's Ted the Bellhop's first night on the job....,9.841,"[{'id': 14, 'logo_path': '/m6AHu84oZQxvq7n1rsv...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-09,98.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Four Rooms,6.0,1468.0,1995,12,United States of America,Allison Anders,Tim Roth,Sammi Davis,Amanda de Cadenet,Miramax,A Band Apart,,0,0,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,21000000,Action|Thriller|Crime,6,"chicago, usa|drug dealer|boxing match|escape|o...",English,Judgment Night,"While racing to a boxing match, Frank, Mike, J...",6.773,"[{'id': 33, 'logo_path': '/8lvHyhjr8oUKOOy2dKX...","[{'iso_3166_1': 'JP', 'name': 'Japan'}, {'iso_...",1993-10-15,110.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Judgment Night,6.5,140.0,1993,10,Japan,Stephen Hopkins,Emilio Estevez,Cuba Gooding Jr.,Denis Leary,Universal Pictures,Largo Entertainment,JVC,0,0,0,1,0,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10,42000,Documentary,8,remix|megacities,English,Life in Loops (A Megacities RMX),Timo Novotny labels his new project an experim...,1.35,"[{'id': 19123, 'logo_path': None, 'name': 'inL...","[{'iso_3166_1': 'AT', 'name': 'Austria'}]",2006-01-01,80.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Life in Loops (A Megacities RMX),7.5,11.0,2006,1,Austria,Timo Novotny,,,,inLoops,,,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
12,11000000,Adventure|Action|Science Fiction,11,android|galaxy|hermit|death star|lightsaber|je...,English,Star Wars,Princess Leia is captured and held hostage by ...,67.56,"[{'id': 1, 'logo_path': '/o86DbpburjxrqAzEDhXZ...","[{'iso_3166_1': 'US', 'name': 'United States o...",1977-05-25,121.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Star Wars,8.2,13071.0,1977,5,United States of America,George Lucas,Mark Hamill,Harrison Ford,Carrie Fisher,Lucasfilm,20th Century Fox,,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0
13,94000000,Animation|Family,12,parent child relationship|harbor|anthropomorph...,English,Finding Nemo,"Nemo, an adventurous young clownfish, is unexp...",33.368,"[{'id': 3, 'logo_path': '/1TjvGVDMYsj6JBxOAkUH...","[{'iso_3166_1': 'US', 'name': 'United States o...",2003-05-30,100.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Finding Nemo,7.8,13065.0,2003,5,United States of America,Andrew Stanton,Albert Brooks,Ellen DeGeneres,Alexander Gould,Pixar,,,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0


In [95]:
# We use the same method as genre for director and company columns
original_format.director_name.fillna('null', inplace=True)

In [96]:
# We cannot numeric all the directors name, so we choose to numeric
# top 10 directors, and put all the other directors into 'other director'
def get_frenquency(data):
    frequency = {}
    for key in data:
        if key in frequency and key:
            frequency[key] += 1
        else:
            frequency[key] = 1
    return frequency

In [97]:
director_freq = get_frenquency(original_format['director_name'])
director_list = list(director_freq.items())
director_list.sort(key=lambda tup:tup[1],reverse=True)

In [98]:
# drop 'null' item
director_list=list(filter(lambda x:x[0]!='null', director_list))

In [99]:
def Top(List, top):
    Top = list()
    for i in range(0, top):
        Top.append(List[i][0])
    return Top

In [100]:
# Numeric
top_list = Top(director_list, 10)
for t in top_list:
    original_format[t] = original_format['director_name'].map(lambda s: 1 if t in str(s) else 0)
original_format['Other Director'] = original_format['director_name'].map(lambda x:1 if x not in top_list else 0)

In [101]:
original_format.head()

Unnamed: 0,budget,genres,id,plot_keywords,language,original_title,overview,popularity,production_companies,production_countries,release_date,duration,spoken_languages,status,movie_title,vote_average,num_voted_users,release_year,release_month,country,director_name,actor_1_name,actor_2_name,actor_3_name,companies_1,companies_2,companies_3,success,VeryLowBudget,LowBudget,MedBudget,HighBudget,VeryHighBudget,ShortMovie,NormalMovie,LongMovie,Crime,Comedy,Action,Thriller,Documentary,Adventure,Science Fiction,Animation,Family,Drama,Romance,Fantasy,War,Music,Western,Mystery,History,Horror,TV Movie,Steven Spielberg,Clint Eastwood,Woody Allen,Martin Scorsese,Ridley Scott,Brian De Palma,Steven Soderbergh,Wes Craven,Francis Ford Coppola,Ron Howard,Other Director
8,4000000,Crime|Comedy,5,hotel|new year's eve|witch|bet|hotel room|sper...,English,Four Rooms,It's Ted the Bellhop's first night on the job....,9.841,"[{'id': 14, 'logo_path': '/m6AHu84oZQxvq7n1rsv...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-09,98.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Four Rooms,6.0,1468.0,1995,12,United States of America,Allison Anders,Tim Roth,Sammi Davis,Amanda de Cadenet,Miramax,A Band Apart,,0,0,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
9,21000000,Action|Thriller|Crime,6,"chicago, usa|drug dealer|boxing match|escape|o...",English,Judgment Night,"While racing to a boxing match, Frank, Mike, J...",6.773,"[{'id': 33, 'logo_path': '/8lvHyhjr8oUKOOy2dKX...","[{'iso_3166_1': 'JP', 'name': 'Japan'}, {'iso_...",1993-10-15,110.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Judgment Night,6.5,140.0,1993,10,Japan,Stephen Hopkins,Emilio Estevez,Cuba Gooding Jr.,Denis Leary,Universal Pictures,Largo Entertainment,JVC,0,0,0,1,0,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
10,42000,Documentary,8,remix|megacities,English,Life in Loops (A Megacities RMX),Timo Novotny labels his new project an experim...,1.35,"[{'id': 19123, 'logo_path': None, 'name': 'inL...","[{'iso_3166_1': 'AT', 'name': 'Austria'}]",2006-01-01,80.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Life in Loops (A Megacities RMX),7.5,11.0,2006,1,Austria,Timo Novotny,,,,inLoops,,,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
12,11000000,Adventure|Action|Science Fiction,11,android|galaxy|hermit|death star|lightsaber|je...,English,Star Wars,Princess Leia is captured and held hostage by ...,67.56,"[{'id': 1, 'logo_path': '/o86DbpburjxrqAzEDhXZ...","[{'iso_3166_1': 'US', 'name': 'United States o...",1977-05-25,121.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Star Wars,8.2,13071.0,1977,5,United States of America,George Lucas,Mark Hamill,Harrison Ford,Carrie Fisher,Lucasfilm,20th Century Fox,,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
13,94000000,Animation|Family,12,parent child relationship|harbor|anthropomorph...,English,Finding Nemo,"Nemo, an adventurous young clownfish, is unexp...",33.368,"[{'id': 3, 'logo_path': '/1TjvGVDMYsj6JBxOAkUH...","[{'iso_3166_1': 'US', 'name': 'United States o...",2003-05-30,100.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Finding Nemo,7.8,13065.0,2003,5,United States of America,Andrew Stanton,Albert Brooks,Ellen DeGeneres,Alexander Gould,Pixar,,,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [102]:
# Similar process for companies. The raw data has three columns for companies,
# we need to merge them together
for i in range(1, 4):
  column_name = 'companies_' + str(i)
  original_format[column_name].fillna('null',inplace=True)

In [103]:
tmp_list = []
for i in range(1, 4):
  column_name = 'companies_' + str(i)
  # sort company names by frequency
  company_freq = get_frenquency(original_format[column_name])
  company_list = list(company_freq.items())
  company_list.sort(key=lambda tup:tup[1],reverse=True)
  # drop 'null' item
  company_list =list(filter(lambda x:x[0]!='null', company_list))
  tmp_list += company_list

In [104]:
# Merge three columns of company names together, and remove duplicates 
company_list=sorted(tmp_list)
company_dict={}
for a,b in company_list:
    if a not in company_dict.keys():
        company_dict[a] = b
    elif a in company_dict.keys():
        company_dict[a] = company_dict.get(a) + b

merged_company_list = list(company_dict.items())
merged_company_list.sort(key=lambda tup:tup[1],reverse=True)
# drop 'null' item
merged_company_list=list(filter(lambda x:x[0]!='null', merged_company_list))

In [105]:
# We cannot numeric all the companies name, so we choose to numeric
# top 10 companies, and put all the other companies into others
toplist = Top(merged_company_list, 10)
for t in toplist:
    original_format[t] = 0

In [106]:
original_format.head()

Unnamed: 0,budget,genres,id,plot_keywords,language,original_title,overview,popularity,production_companies,production_countries,release_date,duration,spoken_languages,status,movie_title,vote_average,num_voted_users,release_year,release_month,country,director_name,actor_1_name,actor_2_name,actor_3_name,companies_1,companies_2,companies_3,success,VeryLowBudget,LowBudget,MedBudget,HighBudget,VeryHighBudget,ShortMovie,NormalMovie,LongMovie,Crime,Comedy,Action,Thriller,Documentary,Adventure,Science Fiction,Animation,Family,Drama,Romance,Fantasy,War,Music,Western,Mystery,History,Horror,TV Movie,Steven Spielberg,Clint Eastwood,Woody Allen,Martin Scorsese,Ridley Scott,Brian De Palma,Steven Soderbergh,Wes Craven,Francis Ford Coppola,Ron Howard,Other Director,Universal Pictures,Paramount,Columbia Pictures,Warner Bros. Pictures,20th Century Fox,New Line Cinema,Walt Disney Pictures,Touchstone Pictures,Miramax,Metro-Goldwyn-Mayer
8,4000000,Crime|Comedy,5,hotel|new year's eve|witch|bet|hotel room|sper...,English,Four Rooms,It's Ted the Bellhop's first night on the job....,9.841,"[{'id': 14, 'logo_path': '/m6AHu84oZQxvq7n1rsv...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-09,98.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Four Rooms,6.0,1468.0,1995,12,United States of America,Allison Anders,Tim Roth,Sammi Davis,Amanda de Cadenet,Miramax,A Band Apart,,0,0,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
9,21000000,Action|Thriller|Crime,6,"chicago, usa|drug dealer|boxing match|escape|o...",English,Judgment Night,"While racing to a boxing match, Frank, Mike, J...",6.773,"[{'id': 33, 'logo_path': '/8lvHyhjr8oUKOOy2dKX...","[{'iso_3166_1': 'JP', 'name': 'Japan'}, {'iso_...",1993-10-15,110.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Judgment Night,6.5,140.0,1993,10,Japan,Stephen Hopkins,Emilio Estevez,Cuba Gooding Jr.,Denis Leary,Universal Pictures,Largo Entertainment,JVC,0,0,0,1,0,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
10,42000,Documentary,8,remix|megacities,English,Life in Loops (A Megacities RMX),Timo Novotny labels his new project an experim...,1.35,"[{'id': 19123, 'logo_path': None, 'name': 'inL...","[{'iso_3166_1': 'AT', 'name': 'Austria'}]",2006-01-01,80.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Life in Loops (A Megacities RMX),7.5,11.0,2006,1,Austria,Timo Novotny,,,,inLoops,,,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
12,11000000,Adventure|Action|Science Fiction,11,android|galaxy|hermit|death star|lightsaber|je...,English,Star Wars,Princess Leia is captured and held hostage by ...,67.56,"[{'id': 1, 'logo_path': '/o86DbpburjxrqAzEDhXZ...","[{'iso_3166_1': 'US', 'name': 'United States o...",1977-05-25,121.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Star Wars,8.2,13071.0,1977,5,United States of America,George Lucas,Mark Hamill,Harrison Ford,Carrie Fisher,Lucasfilm,20th Century Fox,,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
13,94000000,Animation|Family,12,parent child relationship|harbor|anthropomorph...,English,Finding Nemo,"Nemo, an adventurous young clownfish, is unexp...",33.368,"[{'id': 3, 'logo_path': '/1TjvGVDMYsj6JBxOAkUH...","[{'iso_3166_1': 'US', 'name': 'United States o...",2003-05-30,100.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Finding Nemo,7.8,13065.0,2003,5,United States of America,Andrew Stanton,Albert Brooks,Ellen DeGeneres,Alexander Gould,Pixar,,,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [107]:
# Numeric
for i in range(1, 4):
  for t in toplist:
      column_name = 'companies_' + str(i)
      original_format[t][original_format[t]!=1] = original_format[column_name].map(lambda x: 1 if x in t else 0)

In [108]:
original_format.head(10)

Unnamed: 0,budget,genres,id,plot_keywords,language,original_title,overview,popularity,production_companies,production_countries,release_date,duration,spoken_languages,status,movie_title,vote_average,num_voted_users,release_year,release_month,country,director_name,actor_1_name,actor_2_name,actor_3_name,companies_1,companies_2,companies_3,success,VeryLowBudget,LowBudget,MedBudget,HighBudget,VeryHighBudget,ShortMovie,NormalMovie,LongMovie,Crime,Comedy,Action,Thriller,Documentary,Adventure,Science Fiction,Animation,Family,Drama,Romance,Fantasy,War,Music,Western,Mystery,History,Horror,TV Movie,Steven Spielberg,Clint Eastwood,Woody Allen,Martin Scorsese,Ridley Scott,Brian De Palma,Steven Soderbergh,Wes Craven,Francis Ford Coppola,Ron Howard,Other Director,Universal Pictures,Paramount,Columbia Pictures,Warner Bros. Pictures,20th Century Fox,New Line Cinema,Walt Disney Pictures,Touchstone Pictures,Miramax,Metro-Goldwyn-Mayer
8,4000000,Crime|Comedy,5,hotel|new year's eve|witch|bet|hotel room|sper...,English,Four Rooms,It's Ted the Bellhop's first night on the job....,9.841,"[{'id': 14, 'logo_path': '/m6AHu84oZQxvq7n1rsv...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-09,98.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Four Rooms,6.0,1468.0,1995,12,United States of America,Allison Anders,Tim Roth,Sammi Davis,Amanda de Cadenet,Miramax,A Band Apart,,0,0,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
9,21000000,Action|Thriller|Crime,6,"chicago, usa|drug dealer|boxing match|escape|o...",English,Judgment Night,"While racing to a boxing match, Frank, Mike, J...",6.773,"[{'id': 33, 'logo_path': '/8lvHyhjr8oUKOOy2dKX...","[{'iso_3166_1': 'JP', 'name': 'Japan'}, {'iso_...",1993-10-15,110.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Judgment Night,6.5,140.0,1993,10,Japan,Stephen Hopkins,Emilio Estevez,Cuba Gooding Jr.,Denis Leary,Universal Pictures,Largo Entertainment,JVC,0,0,0,1,0,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0
10,42000,Documentary,8,remix|megacities,English,Life in Loops (A Megacities RMX),Timo Novotny labels his new project an experim...,1.35,"[{'id': 19123, 'logo_path': None, 'name': 'inL...","[{'iso_3166_1': 'AT', 'name': 'Austria'}]",2006-01-01,80.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Life in Loops (A Megacities RMX),7.5,11.0,2006,1,Austria,Timo Novotny,,,,inLoops,,,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
12,11000000,Adventure|Action|Science Fiction,11,android|galaxy|hermit|death star|lightsaber|je...,English,Star Wars,Princess Leia is captured and held hostage by ...,67.56,"[{'id': 1, 'logo_path': '/o86DbpburjxrqAzEDhXZ...","[{'iso_3166_1': 'US', 'name': 'United States o...",1977-05-25,121.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Star Wars,8.2,13071.0,1977,5,United States of America,George Lucas,Mark Hamill,Harrison Ford,Carrie Fisher,Lucasfilm,20th Century Fox,,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0
13,94000000,Animation|Family,12,parent child relationship|harbor|anthropomorph...,English,Finding Nemo,"Nemo, an adventurous young clownfish, is unexp...",33.368,"[{'id': 3, 'logo_path': '/1TjvGVDMYsj6JBxOAkUH...","[{'iso_3166_1': 'US', 'name': 'United States o...",2003-05-30,100.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Finding Nemo,7.8,13065.0,2003,5,United States of America,Andrew Stanton,Albert Brooks,Ellen DeGeneres,Alexander Gould,Pixar,,,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
14,55000000,Comedy|Drama|Romance,13,vietnam veteran|hippie|washington d.c.|mentall...,English,Forrest Gump,A man with a low IQ has accomplished great thi...,34.297,"[{'id': 4, 'logo_path': '/fycMZt242LVjagMByZOL...","[{'iso_3166_1': 'US', 'name': 'United States o...",1994-07-06,142.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Forrest Gump,8.4,17136.0,1994,7,United States of America,Robert Zemeckis,Tom Hanks,Robin Wright,Gary Sinise,Paramount,,,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0
15,15000000,Drama,14,adultery|movie business|parent child relations...,English,American Beauty,"Lester Burnham, a depressed suburban father in...",25.847,"[{'id': 2721, 'logo_path': None, 'name': 'Jink...","[{'iso_3166_1': 'US', 'name': 'United States o...",1999-09-15,122.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,American Beauty,8.0,7559.0,1999,9,United States of America,Sam Mendes,Kevin Spacey,Annette Bening,Thora Birch,Jinks/Cohen Company,DreamWorks Pictures,,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
17,12800000,Drama|Crime,16,individual|dancing|robbery|factory worker|secr...,English,Dancer in the Dark,"Selma, a Czech immigrant on the verge of blind...",10.994,"[{'id': 53671, 'logo_path': None, 'name': 'Lan...","[{'iso_3166_1': 'AR', 'name': 'Argentina'}, {'...",2000-05-17,141.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Dancer in the Dark,8.0,935.0,2000,5,Argentina,Lars von Trier,Björk,Catherine Deneuve,David Morse,Lantia Cinema & Audiovisivi,Fine Line Features,Zentropa Entertainments,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
19,90000000,Adventure|Fantasy|Action|Thriller|Science Fiction,18,"new york city, usa|clone|taxi|cyborg|egypt|fut...",English,The Fifth Element,"In 2257, a taxi driver is unintentionally give...",21.71,"[{'id': 5, 'logo_path': '/71BqEFAF4V3qjjMPCpLu...","[{'iso_3166_1': 'FR', 'name': 'France'}]",1997-05-02,126.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,The Fifth Element,7.5,6992.0,1997,5,France,Luc Besson,Bruce Willis,Gary Oldman,Ian Holm,Columbia Pictures,Gaumont,Sony Pictures,1,0,0,0,0,1,0,0,1,0,0,1,1,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0
23,140000000,Adventure|Fantasy|Action,22,exotic island|blacksmith|east india trading co...,English,Pirates of the Caribbean: The Curse of the Bla...,"Jack Sparrow, a freewheeling 18th-century pira...",43.331,"[{'id': 130, 'logo_path': '/c9dVHPOL3cqCr2593A...","[{'iso_3166_1': 'US', 'name': 'United States o...",2003-07-09,143.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Pirates of the Caribbean: The Curse of the Bla...,7.7,13864.0,2003,7,United States of America,Gore Verbinski,Johnny Depp,Geoffrey Rush,Orlando Bloom,Jerry Bruckheimer Films,Walt Disney Pictures,,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0


In [109]:
def check_other_company (row):
   if row['companies_1'] not in toplist and row['companies_2'] not in toplist and row['companies_3'] not in toplist:
      return 1
   return 0

In [110]:
# create column for other company
original_format['Other Company'] = original_format.apply (lambda row: check_other_company(row), axis=1)

In [111]:
original_format.head()

Unnamed: 0,budget,genres,id,plot_keywords,language,original_title,overview,popularity,production_companies,production_countries,release_date,duration,spoken_languages,status,movie_title,vote_average,num_voted_users,release_year,release_month,country,director_name,actor_1_name,actor_2_name,actor_3_name,companies_1,companies_2,companies_3,success,VeryLowBudget,LowBudget,MedBudget,HighBudget,VeryHighBudget,ShortMovie,NormalMovie,LongMovie,Crime,Comedy,Action,Thriller,Documentary,Adventure,Science Fiction,Animation,Family,Drama,Romance,Fantasy,War,Music,Western,Mystery,History,Horror,TV Movie,Steven Spielberg,Clint Eastwood,Woody Allen,Martin Scorsese,Ridley Scott,Brian De Palma,Steven Soderbergh,Wes Craven,Francis Ford Coppola,Ron Howard,Other Director,Universal Pictures,Paramount,Columbia Pictures,Warner Bros. Pictures,20th Century Fox,New Line Cinema,Walt Disney Pictures,Touchstone Pictures,Miramax,Metro-Goldwyn-Mayer,Other Company
8,4000000,Crime|Comedy,5,hotel|new year's eve|witch|bet|hotel room|sper...,English,Four Rooms,It's Ted the Bellhop's first night on the job....,9.841,"[{'id': 14, 'logo_path': '/m6AHu84oZQxvq7n1rsv...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-09,98.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Four Rooms,6.0,1468.0,1995,12,United States of America,Allison Anders,Tim Roth,Sammi Davis,Amanda de Cadenet,Miramax,A Band Apart,,0,0,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
9,21000000,Action|Thriller|Crime,6,"chicago, usa|drug dealer|boxing match|escape|o...",English,Judgment Night,"While racing to a boxing match, Frank, Mike, J...",6.773,"[{'id': 33, 'logo_path': '/8lvHyhjr8oUKOOy2dKX...","[{'iso_3166_1': 'JP', 'name': 'Japan'}, {'iso_...",1993-10-15,110.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Judgment Night,6.5,140.0,1993,10,Japan,Stephen Hopkins,Emilio Estevez,Cuba Gooding Jr.,Denis Leary,Universal Pictures,Largo Entertainment,JVC,0,0,0,1,0,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0
10,42000,Documentary,8,remix|megacities,English,Life in Loops (A Megacities RMX),Timo Novotny labels his new project an experim...,1.35,"[{'id': 19123, 'logo_path': None, 'name': 'inL...","[{'iso_3166_1': 'AT', 'name': 'Austria'}]",2006-01-01,80.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Life in Loops (A Megacities RMX),7.5,11.0,2006,1,Austria,Timo Novotny,,,,inLoops,,,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
12,11000000,Adventure|Action|Science Fiction,11,android|galaxy|hermit|death star|lightsaber|je...,English,Star Wars,Princess Leia is captured and held hostage by ...,67.56,"[{'id': 1, 'logo_path': '/o86DbpburjxrqAzEDhXZ...","[{'iso_3166_1': 'US', 'name': 'United States o...",1977-05-25,121.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Star Wars,8.2,13071.0,1977,5,United States of America,George Lucas,Mark Hamill,Harrison Ford,Carrie Fisher,Lucasfilm,20th Century Fox,,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0
13,94000000,Animation|Family,12,parent child relationship|harbor|anthropomorph...,English,Finding Nemo,"Nemo, an adventurous young clownfish, is unexp...",33.368,"[{'id': 3, 'logo_path': '/1TjvGVDMYsj6JBxOAkUH...","[{'iso_3166_1': 'US', 'name': 'United States o...",2003-05-30,100.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Finding Nemo,7.8,13065.0,2003,5,United States of America,Andrew Stanton,Albert Brooks,Ellen DeGeneres,Alexander Gould,Pixar,,,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1


In [112]:
# Generate a back_up dataset
data_back_up = original_format.copy()

In [113]:
# After numeric all the columns, let's drop the original columns
original_format.drop(['budget','genres','id','plot_keywords','language','original_title','overview',
               'production_companies','production_countries','release_date','duration','spoken_languages','status','movie_title','vote_average','country',
               'director_name',
               'actor_1_name','actor_2_name','actor_3_name',
               'companies_1','companies_2','companies_3'], axis=1, inplace=True)

In [114]:
# Export Similarity search dataset
# original_format.to_csv ('/content/drive/My Drive/INFO7374/similar_movie_data.csv', index = False, header=True)

In [115]:
original_format.drop(['release_year','release_month', 'popularity', 'num_voted_users'], axis=1, inplace=True)

### 6 Algorithm learned from Class: Decision Tree


In [116]:
# Split data into training and testing data set
y = original_format['success']
y = np.array(y).reshape(-1,1)
x = original_format.drop('success',axis=1)

In [117]:
x_train_all, x_test, y_train_all, y_test = train_test_split(x,y,random_state=10, test_size=.15)
x_train, x_valid, y_train, y_valid = train_test_split(x_train_all, y_train_all, random_state=10, test_size=0.15)

In [118]:
decision_tree = DecisionTreeClassifier(criterion='entropy',max_depth=20, min_samples_leaf=10)
decision_tree.fit(x_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=20, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=10, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [119]:
score_all=cross_val_score(decision_tree, x_train_all, y_train_all, cv=5)
avg_score=score_all.mean()
print("Accuracy Score based on 5-fold Cross Validation: {}\n".format(round(avg_score,2)))

Accuracy Score based on 5-fold Cross Validation: 0.7



In [120]:
x_test.head()

Unnamed: 0,VeryLowBudget,LowBudget,MedBudget,HighBudget,VeryHighBudget,ShortMovie,NormalMovie,LongMovie,Crime,Comedy,Action,Thriller,Documentary,Adventure,Science Fiction,Animation,Family,Drama,Romance,Fantasy,War,Music,Western,Mystery,History,Horror,TV Movie,Steven Spielberg,Clint Eastwood,Woody Allen,Martin Scorsese,Ridley Scott,Brian De Palma,Steven Soderbergh,Wes Craven,Francis Ford Coppola,Ron Howard,Other Director,Universal Pictures,Paramount,Columbia Pictures,Warner Bros. Pictures,20th Century Fox,New Line Cinema,Walt Disney Pictures,Touchstone Pictures,Miramax,Metro-Goldwyn-Mayer,Other Company
98339,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
276,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
4735,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
7971,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
67126,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1


In [121]:
y_test_pred=decision_tree.predict(x_test)

In [122]:
# Assuming the test data are new datas users put
x_test['success'] = y_test_pred

In [123]:
# Export the predicted value
# x_test.to_csv ('/content/drive/My Drive/INFO7374/predicted_data.csv', index = False, header=True)