In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

#Imports
import numpy as np
import pandas as pd
# Use the fully qualified option name: the bare 'max_columns' pattern matches more than
# one pandas option on recent versions and raises OptionError.
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')
import datetime
import lightgbm as lgb
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, KFold
from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
stop = set(stopwords.words('english'))
import os
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import xgboost as xgb
import lightgbm as lgb
from sklearn import model_selection
from sklearn.metrics import accuracy_score
import json
import ast
import eli5
import shap
from catboost import CatBoostRegressor
from urllib.request import urlopen
from PIL import Image
from sklearn.preprocessing import LabelEncoder
import time
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from pathlib import Path
from pandas.plotting import scatter_matrix

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**BOX OFFICE PREDICTION**

# **1. Big picture**

As mentioned in the report belonging to this project, our model will be used as a proof of concept for predicting movie revenues. If this is successful, a more comprehensive model with more input can be made for production companies, cinemas and other parties involved in movies to support future decisions. Therefore, an important role of the model is to provide insight as well as predictions. We will explore all the features in the dataset, even if some of them can't be used in our interactive proof of concept.


**Performance measure**

RMSE?

# **2. Get the data**

In [None]:
DATA = Path('/kaggle/input/box-office')

In [None]:
test_df = pd.read_csv(DATA/'test.csv')
train_df = pd.read_csv(DATA/'train.csv')

The following cell is copied from https://www.kaggle.com/code/artgor/eda-feature-engineering-and-model-interpretation/comments and converts string objects into python dictionaries.

In [None]:
train_exp = train_df.copy()

In [None]:
#Copied code
dict_columns = ['belongs_to_collection', 'genres', 'production_companies',
                'production_countries', 'spoken_languages', 'Keywords', 'cast', 'crew']

def text_to_dict(df, cols):
    """Parse stringified Python literals in the given columns into Python objects.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are converted. The columns are reassigned on ``df`` itself.
    cols : iterable of str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with every listed column parsed.

    Notes
    -----
    Missing values become an empty dict. Values that are already a ``dict`` or ``list``
    are kept as they are, so calling the function twice on the same frame (for example
    when a notebook cell is re-run) is safe.
    """
    def _parse(value):
        # Already parsed on a previous call: ast.literal_eval only accepts strings/AST nodes.
        if isinstance(value, (dict, list)):
            return value
        if pd.isna(value):
            return {}
        return ast.literal_eval(value)

    for column in cols:
        df[column] = df[column].apply(_parse)
    return df

train_exp = text_to_dict(train_exp, dict_columns)
#test_df = text_to_dict(test_df)

# **3. Explore data**

In [None]:
#train_exp.head()

We have 23 columns, 1 being id and one being revenue. Meaning we have 21 features.

In [None]:
train_exp.info()

18 of 21 relevant features for engineering are objects. This means we have some work to do. We will look at each object-feature, and try to gain some insights into how we should tackle these. Firstly we will look at the numerical attributes.

**Numerical attributes**

In [None]:
train_exp.drop('id', axis=1, inplace=True)

In [None]:
train_exp.hist(bins=50, figsize=(15,10))
plt.show()

In [None]:
scatter_matrix(train_exp, figsize=(15,10))
plt.show()

In [None]:
train_exp[['revenue', 'budget']].head(15)

In [None]:
len(train_exp[train_exp['budget'] == 0])

In [None]:
len(train_exp[train_exp['revenue'] == 0])

Both budget and revenue are very skewed. Budget also contains 812 zero values. While some movies could theoretically have a budget of zero, these are most likely just missing values. We will impute these values with the mean budget.

In [None]:
#Replace zero budgets (treated as missing) with the mean of the known budgets.
#The zeros are left out of the mean so the placeholders do not drag the imputed value down.
nonzero_budget_mean = train_exp.loc[train_exp['budget'] != 0, 'budget'].mean()
train_exp['budget'] = train_exp['budget'].replace(0, nonzero_budget_mean)

In [None]:
#New log of budget and revenue features
train_exp['log_budget'] = np.log1p(train_exp['budget'])
train_exp['log_revenue'] = np.log1p(train_exp['revenue'])

In [None]:
#We will check correlation of old/new attributes several times so lets make a function

def show_corr(df):
    """Return each numeric column's correlation with ``revenue``, highest first.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric ``revenue`` column.

    Returns
    -------
    pandas.Series
        Correlation coefficients with ``revenue``, sorted in descending order.
    """
    # Restrict to numeric/bool columns explicitly: recent pandas versions no longer drop
    # object columns silently in DataFrame.corr() and raise on them instead.
    numeric_cols = df.select_dtypes(include=['number', 'bool'])
    corr_matrix = numeric_cols.corr()
    return corr_matrix["revenue"].sort_values(ascending=False)

In [None]:
show_corr(train_exp)

In [None]:
train_exp['runtime'].head(10)

In [None]:
len(train_exp[train_exp['runtime'] == 0])

In [None]:
train_exp[train_exp['runtime'].isnull()]

Runtime is in minutes it seems, and 12 movies have 0 in runtime. 2 movies have a null value as runtime. This will need to be fixed

In [None]:
train_exp = train_exp.loc[(train_exp['runtime'] > 0)]

In [None]:
len(train_exp[train_exp['runtime'] == 0])

In [None]:
train_exp['popularity'].head(10)

Popularity seems to be pretty weird. I guess it is taken from a ratings system on the website where the movie data was gathered. From the correlation it seems to be a good feature, but if we can't understand it and it is hard to reproduce when we make an interactive model, we might need to consider dropping it.

**imdb_id, poster_path**

These values are only used to access external resources. They will not be relevant for training the model, and in the preparation step we will simply remove them.

In [None]:
train_exp.drop(['imdb_id','poster_path'], axis=1, inplace=True)

**belongs_to_collection**

This could be a relevant feature, as movie collections such as "star wars" or "harry potter" are very popular. Fans are likely to watch a new film in the same series.

Lets take a look at what this attribute looks like:


In [None]:
train_exp['belongs_to_collection'].head()

In [None]:
len(train_exp[train_exp['belongs_to_collection'] != {}])

604 movies belong to a collection. Let's look at the way a collection is represented:

In [None]:
pd.set_option('display.max_colwidth', None)
#collection['belongs_to_collection'].head(10)

belongs_to_collection has an id for the collection, a name, a poster path and a backdrop path. Our strategy for this attribute will probably be to replace it with a 1 or 0 value indicating whether the movie belongs to a collection or not. Let's check the correlation with revenue for such an attribute:

In [None]:
train_exp['belongs_to_collection'] = (train_exp['belongs_to_collection'] != {}).astype(int)

train_exp['belongs_to_collection'].head(10)

In [None]:
show_corr(train_exp)

Our new version of belongs_to_collection seems well correlated to the revenue, so we will implement this strategy.

**genres**

Lets explore genres attribute

In [None]:
train_exp['genres'].head()

In [None]:
train_exp['genres'].value_counts()

In [None]:
train_exp['genres'].apply(lambda x: len(x) if x != {} else 0).value_counts()

It appears that most movies have 1,2 or 3 genres, while some have more with drama being the most popular. Each genre has an id. One idea for transforming these objects into numerical values could be to look at median revenue for different genres, and possible one-hot-encode the most popular genres. First we do some more exploring:

Lets get a list of genres with frequency

In [None]:
genres = list(train_exp['genres'].apply(lambda x: [i['name'] for i in x]
                                    if x != {} else []).values)

In [None]:
genres_freq = Counter([i for j in genres for i in j]).most_common()
genres_freq

In [None]:
top_genres = [m[0] for m in genres_freq]
top_genres

Drama is the most popular genre, with comedy, thriller, action, romance, crime and adventure following. Lets try to make a new column for each of the genres, and have values 1 if present and 0 if not for each movie. Maybe some genres are correlated to the revenue.

In [None]:
train_exp['genres'].head()

lets make temporary column with a simple list of all the genres for a movie (instead of a list of dictionaries)

In [None]:
train_exp['genres'][0]

In [None]:
for g in top_genres:
    train_exp[g] = train_exp['genres'].apply(lambda x: 1 if any(d['name'] == g for d in x) else 0)

In [None]:
#genres_df.head()

show_corr(train_exp)

Some of the genres are indeed correlated, if not strongly. Some good indicative genres seem to be Adventure, Action, Fantasy and Drama. This will be useful for our pipeline later.

In [None]:
train_exp.drop('genres', axis=1, inplace=True)

**homepage**

Homepage feature has a link to the movies homepage which is not in itself useful. We could however turn it into a feature that is 1 if movie has a homepage and 0 otherwise. This seems like it could be good, as most likely only big productions will make a homepage for a movie.

In [None]:
train_exp['homepage'].head()

In [None]:
train_exp['homepage'].value_counts()

As suspected big productions such as transformers movies and lotr/hobbit have homepages for several of their movies

In [None]:
train_exp['homepage'] = train_exp['homepage'].notnull().astype(int)

In [None]:
train_exp['homepage'].head()

In [None]:
#Checking correlation

show_corr(train_exp)

Having a homepage or not seems to be somewhat correlated, and we will implement this in the pipeline.

**original_language**

In [None]:
train_exp['original_language'].head(10)

In [None]:
train_exp['original_language'].value_counts()

An overwhelming majority of movies have english as their original language. It may therefore make little sense to have a column with 1 for english as ol and 0 otherwise. Lets try it and take a look at the correlation anyway.

In [None]:
#coll_copy['belongs_to_collection'] = coll_copy['belongs_to_collection'].notnull().astype(int)

train_exp['og_language_en'] = (train_exp['original_language'] == 'en').astype(int)

In [None]:
train_exp['og_language_en'].head(10)

In [None]:
#Checking correlation
show_corr(train_exp)

original language in english seems to be correlated, this might need some more exploration in relation to other features.

In [None]:
train_exp.drop('original_language', axis=1, inplace=True)

**production companies**

As with genres, it could be interesting to have a 1 or 0 value for each of the most popular production companies. We could choose a subset of companies based on how many movies they are involved in, or the median revenue of the movies they are involved in. Possibly both?

Lets first explore what kind of values are in this column

In [None]:
train_exp['production_companies'].head(10)

In [None]:
train_exp['production_companies'].apply(lambda x: len(x) if x != {} else 0).value_counts()

Most movies have 1 or 2 production companies, and 1 movie has a whopping 17 production companies! Since there are some outliers with very many production companies, we should be careful about making a feature based on the number of companies per movie. This was not the plan either.

In [None]:
comps = list(train_exp['production_companies'].apply(lambda x: [i['name'] for i in x]
                                    if x != {} else []).values)

In [None]:
comps_freq = Counter([i for j in comps for i in j]).most_common()
comps_freq[:36]

The 35 most common production companies are all pretty well known. Lets make some new attributes to indicate if the movie is produced by some of these popular companies

In [None]:
# The slice end is exclusive, so [:10] keeps exactly the 10 most frequent companies.
top_comps = [m[0] for m in comps_freq][:10]

In [None]:
print(top_comps)

In [None]:
#for c in top_comps:
#    train_exp[c] = train_exp['production_companies'].apply(lambda x: 1 if any(d['name'] == c for d in x) else 0)

In [None]:
#show_corr(train_exp)

Having a new column for each of the most popular companies gets messy and as we can see from the correlations, not that relevant. Instead we try another strategy: An attribute that determines if the movie has a production company among the most popular ones. So a 1 if present and 0 otherwise

In [None]:
train_exp['popular_prod_comp'] = train_exp['production_companies'].apply(lambda x: 1 if any(d['name'] in top_comps for d in x) else 0)

In [None]:
show_corr(train_exp)

Making a feature popular_prod_comp that is 1 if the movie is produced by one of the 10 (can be changed) most popular companies gives a good correlation. I suspect that making the list of popular prod companies based on their median revenue could be even better.

In [None]:
train_exp.drop('production_companies', axis=1, inplace=True)

**production countries**

The first things that springs to mind is that the huge productions are mostly made in US, so we could possibly have a made_in_us column. Another interesting attribute could be number of productions countries. Lets explore some of the data before we decide: 

In [None]:
train_exp['production_countries'].head(15)

In [None]:
train_exp['production_countries'].value_counts()

1752, over half of the movies are produced in the US. Lets make the made_in_us feature:

In [None]:
train_exp['made_in_us'] = train_exp['production_countries'].apply(lambda x: 1 if any (d['iso_3166_1'] == 'US' for d in x) else 0)

#lambda x: 1 if any(d['name'] == c for d in x) else 0

In [None]:
show_corr(train_exp)

Our new attribute made_in_us is somewhat correlated. Due to the sheer number of movies made in the US, some of them are bound to be unsuccessful or small movies as well. We will probably add this attribute as part of our pipeline.

Another idea could be to one-hot-encode each of the available countries.

In [None]:
train_exp.drop('production_countries', axis=1, inplace=True)

**Release date**

The first thought that comes to mind is that old movies would make less money. Since our model is a proof of concept of a model that could be used in the future to predict or gain insight, maybe the time of year a movie is released could be more relevant? Lets explore both options,
by adding each part of the date as a new attribute (day, month and year)

In [None]:
train_exp['release_date'][620:636]

In [None]:
test_date = train_exp['release_date'][631]
test_date

In [None]:
def formatDate(date_str, pivot=20):
    """Expand the two-digit year of an ``m/d/yy`` date string to four digits.

    Parameters
    ----------
    date_str : str
        Date in ``month/day/year`` form; the year is the part after the last ``/``.
    pivot : int, optional
        Two-digit years less than or equal to ``pivot`` are placed in the 2000s, all
        others in the 1900s. Defaults to 20.

    Returns
    -------
    str
        The date with a four-digit year. A date whose year already has four or more
        digits is returned unchanged, so formatting a date twice is harmless.
    """
    prefix, year = date_str.rsplit('/', 1)
    if len(year) >= 4:
        # Already expanded (e.g. the function was applied before); do not prepend a century again.
        return date_str
    century = '20' if int(year) <= pivot else '19'
    # zfill keeps single-digit years such as '5' well-formed ('2005' rather than '205').
    return prefix + '/' + century + year.zfill(2)
    
    

In [None]:
fd = formatDate(test_date)
fd

In [None]:
fd.split('/')[2]

In [None]:
def dateAttribs(df):
    """Normalise ``release_date`` and add its month, day and year as integer columns.

    The frame is modified in place: ``release_date`` is overwritten with the output of
    ``formatDate`` and the columns ``release_month``, ``release_day`` and
    ``release_year`` are added. Nothing is returned.
    """
    df['release_date'] = df['release_date'].apply(formatDate)

    # The date string is 'month/day/year', so the position in the split gives the part.
    part_columns = ('release_month', 'release_day', 'release_year')
    for position, column in enumerate(part_columns):
        df[column] = df['release_date'].apply(
            lambda s, position=position: int(s.split('/')[position])
        )
    

In [None]:
dateAttribs(train_exp)

In [None]:
show_corr(train_exp)

The date values dont seem to be correlated that much. It is surprising that release_year does not correlate better. Maybe modern films are much more hit-or-miss than I thought, since there are so many movies made. This could need some more exploring.

In [None]:
train_exp.drop('release_date', axis=1, inplace=True)

**spoken_languages**

This should be the languages that the movie is available in. The first strategy that comes to mind is to make an attribute with number of spoken languages. Dubbing the movie in several languages gives some extra cost to a movie but will most likely spread it to even more viewers which means more revenue. Lets see if that is the case

In [None]:
train_exp['spoken_languages'].head(15)

I suspect most movies to have english as their spoken language. As we can see, some movies have several spoken languages. Lets try and also make a column that is 1 if spoken language includes english, 0 otherwise

In [None]:
train_exp['spoken_lang_en'] = train_exp['spoken_languages'].apply(lambda x: 1 if any((d['name'] == 'English') for d in x) else 0)

In [None]:
train_exp['spoken_langs'] = train_exp['spoken_languages'].apply(lambda x: len(x))

In [None]:
show_corr(train_exp)

Number of spoken languages is not all that correlated, spoken languages including english is somewhat correlated. We can keep the latter, if we decide to keep any of them.

In [None]:
train_exp.drop('spoken_languages', axis=1, inplace=True)

**status**

Here i am curious to see what values are present

In [None]:
train_exp['status'].value_counts()

A vast majority of released movies. This feature can simply be dropped, without creating any new ones. It could however be useful to see if the movies with status Rumored are outliers that skew our predictions. But for now the strategy is to just drop this feature.

In [None]:
train_exp.drop('status', axis=1, inplace=True)

**cast**

With this feature there is definitely some possibility to search for mainstream popular actors among the cast. First let's see if the size of the cast has any relevance.

In [None]:
train_exp['cast_size'] = train_exp['cast'].apply(lambda x: len(x))

Now lets do something similar to what we did with production companies, making a list of the most frequent cast members and making attributes by comparing cast to that list

In [None]:
casts = list(train_exp['cast'].apply(lambda x: [i['name'] for i in x]
                                    if x != {} else []).values)
casts[1]

In [None]:
cast_freq = Counter([i for j in casts for i in j]).most_common()

In [None]:
cast_freq[:11]

Not surprisingly, the most frequent cast members are very famous actors.

In [None]:
top_actors = [m[0] for m in cast_freq][:50]

In [None]:
def cast_in_top_actors(x):
    """Count how many entries of the cast list ``x`` have a name found in ``top_actors``."""
    return sum(1 for member in x if member['name'] in top_actors)
        

In [None]:
#train_exp['popular_actors'] = train_exp['cast'].apply(lambda x: 1 if any(d['name'] in top_actors for d in x) else 0)
train_exp['nof_pop_actors'] = train_exp['cast'].apply(cast_in_top_actors)

In [None]:
#for c in top_actors:
#    train_exp[c] = train_exp['cast'].apply(lambda x: 1 if any(d['name'] == c for d in x) else 0)

In [None]:
show_corr(train_exp)

Cast size looks good, let's keep that in mind for our pipeline. Both having separate columns for each popular actor and having a single feature popular_actors seem to give OK-ish results. We will make our decision when making the pipeline; maybe we need to come back to this step to do some more experimenting.

In [None]:
train_exp.drop('cast', axis=1, inplace=True)

**crew**

Lets look at some values first, but I suspect we should/could drop this without much thought. It might be very useful to check which director or producer is a part of the crew, but more general crew members are irrelevant. Perhaps number of crew members could be useful, but we dont really know the quality of the data (are numbers for movies equally accurate?)

In [None]:
train_exp['crew'].head(1)

These are a lot of crew members listed for a single movie. I think it would be useful to experiment with a crew_size variable, and perhaps also getting a list of popular directors and searching if any crew member with job director is in that list. 

In [None]:
train_exp['crew_size'] = train_exp['crew'].apply(lambda x: len(x))

In [None]:
show_corr(train_exp)

Okay so crew size is actually a pretty good attribute

In [None]:
train_exp.drop('crew', axis=1, inplace=True)

**original_title, overview, tagline, title, keywords**

These columns include a lot of text. One idea could be to make some search for each row, how many popular buzzwords like "war", "kill", "chase", "explosion" etc. appear in these text columns. This would however take a lot of time in the processing/training step. Additionally it does not really fit our project scope of making a proof of concept that can be interacted with by normal people. They would need to copy and paste a lot of text.

In [None]:
train_exp['original_title'][0]

In [None]:
train_exp['overview'][0]

In [None]:
train_exp['tagline'][0]

In [None]:
train_exp['title'][0]

In [None]:
train_exp['Keywords'][0]

Our strategy will be to simply delete these columns. Keywords is the only column that could be interesting for the use mentioned above, as it only contains a few very relevant words for the movie. It is also formatted as a dictionary so searching for buzzwords would be more effective here. We will note this as a possible future improvement.

In [None]:
del_columns = ['original_title', 'overview', 'tagline', 'title', 'Keywords']
train_exp.drop(del_columns, axis=1, inplace=True)

# **Combining features**

Now we have experimented and made some useful numerical features for the train_exp dataframe. Lets take a look at them, and try to combine some of them to make useful attributes

In [None]:
train_exp.head(20)

In [None]:
show_corr(train_exp)

In [None]:
#budget to year
train_exp['budget_to_year'] = train_exp['budget'] / train_exp['release_year']

#budget to runtime
train_exp['budget_to_runtime'] = train_exp['budget'] / train_exp['runtime']

#Budget to number of famous actors
#Movies without any popular actor would divide by zero and produce inf, which turns the
#correlation for this column into NaN. Use NaN for those rows instead so corr() skips them.
pop_actor_counts = train_exp['nof_pop_actors'].replace(0, np.nan)
train_exp['budget_to_pop_actors'] = train_exp['budget'] / pop_actor_counts

#runtime to year
train_exp['runtime_to_year'] = train_exp['runtime'] / train_exp['release_year']




In [None]:
show_corr(train_exp)

# **4. Prep the data**

Here we will make our pipeline. Our **strategy** so far, based on the exploration of the data in step 3:

* belongs_to_collection: 1-hot
* budget: Keep
* genres: New feature for at least some of the genres, 1-hot
* homepage: 1-hot
* imdb_id: Drop
* original_language: 1-hot if english or not
* original_title: Drop
* overview: Drop
* popularity: Drop (because of user interaction)
* poster_path: Drop
* production_companies: 1-hot some of the most popular ones
* production_countries: 1-hot if made in us
* release_date: New feature for year (possibly month)
* runtime: Keep
* spoken_languages: 1-hot if english included
* status: Drop
* tagline: Drop
* title: Drop
* Keywords: Drop
* cast: 1-hot if present actors from popular actors list
* crew: Drop (because of user interaction)
* new combined features:
* budget to year, budget to runtime, budget to number of popular actors, runtime to year

Our results should be easy to reproduce, and our plan is to have an interactive version of the model with a ui. In this ui it is unlikely that the user will be able to fill in the whole crew and the whole cast. So these sizes should probably not be a part of our proof of concept model, but instead be a part of the final and professional model if it is made. Popularity might also need to be dropped, as we discovered early it is a weird attribute possibly gathered from a movie website

**Pipeline components**

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [None]:
#Our lists of different popular values, used for several attributes
final_pop_prod_companies = top_comps
final_pop_actors = top_actors
final_pop_genres = top_genres

In [None]:
dict_cols_after_drop = ['cast','genres','belongs_to_collection','production_companies', 'production_countries', 'spoken_languages']

In [None]:
class transform_to_dictionaries(BaseEstimator, TransformerMixin):
    """Parse the stringified columns listed in ``dict_cols_after_drop`` into Python objects."""

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` for scikit-learn API compatibility."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the listed columns parsed by ``text_to_dict``."""
        # text_to_dict reassigns columns on the frame it receives, so hand it a copy and
        # leave the caller's frame (with its raw string columns) untouched.
        return text_to_dict(X.copy(), dict_cols_after_drop)

In [None]:
class drop_irrelevant_features(BaseEstimator, TransformerMixin):
    """Remove the columns that are not used as model input."""

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` for scikit-learn API compatibility."""
        return self

    def transform(self, X, y=None):
        """Return a new frame without the irrelevant columns; ``X`` itself is not modified.

        Raises ``KeyError`` if one of the columns is missing from ``X``.
        """
        drops = ['imdb_id', 'poster_path', 'original_title', 'overview', 'popularity',
                 'status', 'tagline', 'title', 'Keywords', 'crew']

        # Non-inplace drop: the caller's frame stays intact, so the pipeline can be run
        # again on the same input without hitting a KeyError for already-removed columns.
        return X.drop(columns=drops)

In [None]:
class impute_zeroes(BaseEstimator, TransformerMixin):
    """Replace zeros in ``runtime`` and ``budget`` with the mean of the non-zero values.

    Zeros are treated as missing values, so they are excluded from the mean that
    replaces them. The means are learned in ``fit`` and reused in ``transform``;
    if ``transform`` is called on an unfitted instance they are computed from the
    frame being transformed.
    """

    columns = ('runtime', 'budget')

    @staticmethod
    def _nonzero_means(X, columns):
        """Mean of each column over its non-zero entries (NaN is skipped by ``mean``)."""
        return {col: X.loc[X[col] != 0, col].mean() for col in columns}

    def fit(self, X, y=None):
        """Learn the replacement value for each column from ``X``."""
        self.fill_values_ = self._nonzero_means(X, self.columns)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with zeros replaced; ``X`` itself is not modified."""
        fill_values = getattr(self, 'fill_values_', None)
        if fill_values is None:
            fill_values = self._nonzero_means(X, self.columns)

        X = X.copy()
        for col, value in fill_values.items():
            # A NaN mean means there was no non-zero value to learn from; keep the zeros then.
            if pd.notna(value):
                X[col] = X[col].replace(0, value)

        return X

In [None]:
class impute_nans(BaseEstimator, TransformerMixin):
    """Fill missing values in ``budget`` and ``runtime`` with the column mean.

    The means are learned in ``fit`` and reused in ``transform``; if ``transform`` is
    called on an unfitted instance they are computed from the frame being transformed.
    """

    columns = ('budget', 'runtime')

    def fit(self, X, y=None):
        """Learn the mean of each column from ``X``."""
        self.fill_values_ = {col: X[col].mean() for col in self.columns}
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with NaNs filled; ``X`` itself is not modified."""
        fill_values = getattr(self, 'fill_values_', None)
        if fill_values is None:
            fill_values = {col: X[col].mean() for col in self.columns}

        X = X.copy()
        for col, value in fill_values.items():
            X[col] = X[col].fillna(value)

        return X
        

In [None]:
class date_transformer(BaseEstimator, TransformerMixin):
    """Replace ``release_date`` with the columns added by ``dateAttribs``."""

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` for scikit-learn API compatibility."""
        return self

    def transform(self, X, y=None):
        """Return a new frame with the date parts added and ``release_date`` removed."""
        # dateAttribs works in place, so run it on a copy to keep the caller's frame intact.
        X = X.copy()
        dateAttribs(X)

        return X.drop(columns='release_date')
        
        

In [None]:
class add_nof_pop_actors(BaseEstimator, TransformerMixin):
    """Replace ``cast`` with ``nof_pop_actors``, the number of cast members in ``final_pop_actors``."""

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` for scikit-learn API compatibility."""
        return self

    def transform(self, X, y=None):
        """Return a new frame with ``nof_pop_actors`` added and ``cast`` removed."""
        # Build the lookup once per call; set membership avoids scanning the list per cast member.
        popular = set(final_pop_actors)

        def count_popular(cast):
            return sum(1 for member in cast if member.get("name") in popular)

        return (
            X.assign(nof_pop_actors=X['cast'].apply(count_popular))
             .drop(columns='cast')
        )
        

In [None]:
class one_hot_collection(BaseEstimator, TransformerMixin):
    """Turn ``belongs_to_collection`` into a 0/1 flag."""

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` for scikit-learn API compatibility."""
        return self

    def transform(self, X, y=None):
        """Return a new frame where the column is 1 unless the value equals an empty dict."""
        # assign() builds a new frame; overwriting the column in place would make a second
        # call on the same input see integers and flag every row as 1.
        in_collection = (X['belongs_to_collection'] != {}).astype(int)
        return X.assign(belongs_to_collection=in_collection)

In [None]:
class one_hot_genres(BaseEstimator, TransformerMixin):
    """Replace ``genres`` with one 0/1 column per genre name in ``final_pop_genres``."""

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` for scikit-learn API compatibility."""
        return self

    def transform(self, X, y=None):
        """Return a new frame with the genre indicator columns added and ``genres`` removed."""
        # Copy first so the caller's frame keeps its 'genres' column.
        X = X.copy()
        for g in final_pop_genres:
            X[g] = X['genres'].apply(
                lambda entries, genre=g: 1 if any(d['name'] == genre for d in entries) else 0
            )

        return X.drop(columns='genres')
            
        

In [None]:
class one_hot_homepage(BaseEstimator, TransformerMixin):
    """Turn ``homepage`` into a 0/1 flag: 1 when a value is present, 0 when it is null."""

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` for scikit-learn API compatibility."""
        return self

    def transform(self, X, y=None):
        """Return a new frame with ``homepage`` replaced by the flag; ``X`` is not modified."""
        # Overwriting in place would make a second call on the same input see only
        # non-null integers and flag every row as 1.
        return X.assign(homepage=X['homepage'].notnull().astype(int))

In [None]:
class one_hot_og_language_en(BaseEstimator, TransformerMixin):
    """Replace ``original_language`` with ``og_language_en`` (1 when the language is 'en')."""

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` for scikit-learn API compatibility."""
        return self

    def transform(self, X, y=None):
        """Return a new frame with the flag added and ``original_language`` removed."""
        is_english = (X['original_language'] == 'en').astype(int)

        # assign/drop return new frames, so the caller's frame keeps 'original_language'.
        return X.assign(og_language_en=is_english).drop(columns='original_language')

In [None]:
class one_hot_if_pop_prod_company(BaseEstimator, TransformerMixin):
    """Replace ``production_companies`` with ``popular_prod_comp``.

    The flag is 1 when at least one company name is in ``final_pop_prod_companies``.
    """

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` for scikit-learn API compatibility."""
        return self

    def transform(self, X, y=None):
        """Return a new frame with the flag added and ``production_companies`` removed."""
        # Build the lookup once per call instead of scanning the list for every company.
        popular = set(final_pop_prod_companies)
        has_popular = X['production_companies'].apply(
            lambda companies: 1 if any(d['name'] in popular for d in companies) else 0
        )

        return X.assign(popular_prod_comp=has_popular).drop(columns='production_companies')

In [None]:
class made_in_us(BaseEstimator, TransformerMixin):
    """Replace ``production_countries`` with ``made_in_us``.

    The flag is 1 when any listed country has ``iso_3166_1`` equal to 'US'.
    """

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` for scikit-learn API compatibility."""
        return self

    def transform(self, X, y=None):
        """Return a new frame with the flag added and ``production_countries`` removed."""
        us_flag = X['production_countries'].apply(
            lambda countries: 1 if any(d['iso_3166_1'] == 'US' for d in countries) else 0
        )

        # assign/drop return new frames, so the caller's frame keeps 'production_countries'.
        return X.assign(made_in_us=us_flag).drop(columns='production_countries')

In [None]:
class english_spoken(BaseEstimator, TransformerMixin):
    """Replace ``spoken_languages`` with ``spoken_lang_en``.

    The flag is 1 when any listed language has the name 'English'.
    """

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` for scikit-learn API compatibility."""
        return self

    def transform(self, X, y=None):
        """Return a new frame with the flag added and ``spoken_languages`` removed."""
        english_flag = X['spoken_languages'].apply(
            lambda languages: 1 if any(d['name'] == 'English' for d in languages) else 0
        )

        # assign/drop return new frames, so the caller's frame keeps 'spoken_languages'.
        return X.assign(spoken_lang_en=english_flag).drop(columns='spoken_languages')

In [None]:
class add_combined_features(BaseEstimator, TransformerMixin):
    """Add ratio features built from ``budget``, ``runtime`` and ``release_year``."""

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` for scikit-learn API compatibility."""
        return self

    def transform(self, X, y=None):
        """Return a new frame with the ratio columns appended; ``X`` is not modified."""
        # TODO: a budget / nof_pop_actors ratio is intentionally left out for now, because
        # rows with zero popular actors would divide by zero and produce inf.
        return X.assign(
            # budget to year
            budget_to_year=X['budget'] / X['release_year'],
            # budget to runtime
            budget_to_runtime=X['budget'] / X['runtime'],
            # runtime to year
            runtime_to_year=X['runtime'] / X['release_year'],
        )

In [None]:
#Not in use
class drop_processed_features(BaseEstimator, TransformerMixin):
    """Placeholder transformer that passes its input through unchanged."""
    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self
    def transform(self, X, y=None):
        """Return ``X`` as received."""
        return X

class add_features_transformer(BaseEstimator, TransformerMixin):
    """Add a ``length`` column holding the character count of ``full_text``."""

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` for scikit-learn API compatibility."""
        return self

    def transform(self, X, y=None):
        """Add the ``length`` column to ``X`` in place and return ``X``."""
        # findLength lives on the class, so it has to be reached through self;
        # a bare findLength(X) is looked up as a global and raises NameError.
        self.findLength(X)

        return X

    @staticmethod
    def findLength(df):
        """Write the string length of ``df['full_text']`` into ``df['length']`` (in place)."""
        df['length'] = df['full_text'].str.len()

# Single-step pipeline that wraps add_features_transformer.
pipe = Pipeline(
    steps=[
        ("addLength", add_features_transformer())
    ]
)

# NOTE(review): test_copy is not defined anywhere in the visible notebook, so this call
# would raise NameError on a fresh kernel. Confirm whether this snippet is a leftover example.
pipe.fit_transform(test_copy)
test_copy.head()

In [None]:
# Not used: single end-to-end pipeline kept for reference. The notebook preprocesses the
# data through raw_data_pipeline, which chains the smaller staged pipelines instead.
# Unlike the staged version, this one has no NaN/zero imputation and no scaling step
# (those entries are commented out below).
final_pipeline = Pipeline(
        steps=[ 
            ("drop_irrelevant_features", drop_irrelevant_features()),
            ("to_dictionaries", transform_to_dictionaries()),
            ("fix_dates", date_transformer()),
            ("pop_actors", add_nof_pop_actors()),
            #("zeroes_imputer", SimpleImputer(missing_values=0)),
            ("one_hot_collection", one_hot_collection()),
            ("one_hot_genres", one_hot_genres()),
            ("one_hot_homepage", one_hot_homepage()),
            ("one_hot_og_lang_en", one_hot_og_language_en()),
            ("one_hot_prod_comp", one_hot_if_pop_prod_company()),
            ("one_hot_made_in_us", made_in_us()),
            ("one_hot_english_spoken", english_spoken()),
            #("nan_imputer", SimpleImputer())
            ("add_combined_features", add_combined_features()),
            #("std_scaler", StandardScaler())
        ]
)

# Stage 1: drop unused columns, then parse the stringified columns into Python objects.
drop_and_dict = Pipeline(
        steps=[
            ("drop_irr", drop_irrelevant_features()),
            ("to_dict", transform_to_dictionaries())
              ]
)

# NOTE(review): nums is not referenced by any of the pipelines below — confirm it is still needed.
nums = ['budget', 'runtime']

# Stage 2a: fill NaN values in budget and runtime.
nans = Pipeline(
        steps=[("impute_nans", impute_nans())]
)

# Stage 2b: replace zero values in runtime and budget.
zeroes = Pipeline(
        steps=[("impute_zeroes", impute_zeroes())]
)

# Stage 3: derive the new features from the raw columns.
new_features = Pipeline(
        steps=[
            ("fix_dates", date_transformer()),
            ("pop_actors", add_nof_pop_actors()),
            ("one_hot_collection", one_hot_collection()),
            ("one_hot_genres", one_hot_genres()),
            ("one_hot_homepage", one_hot_homepage()),
            ("one_hot_og_lang_en", one_hot_og_language_en()),
            ("one_hot_prod_comp", one_hot_if_pop_prod_company()),
            ("one_hot_made_in_us", made_in_us()),
            ("one_hot_english_spoken", english_spoken())
        ]
)

# Stage 4: add the ratio features that combine existing columns.
combining_features = Pipeline(
        steps=[
            ("add_combined_features", add_combined_features())
        ]
)

# Stage 5: standardise all features.
scaling_features = Pipeline(
        steps=[
            ("std_scale", StandardScaler())
        ]
)



#1. Remove irrelevant features and #2 Text -> dictionaries
#3. Impute NaN and 0 in budget and runtime
#4. Add new features
#5. Combine and add new features
#6. Scale features

def toDf(X):
    """Wrap ``X`` in a pandas DataFrame and return it."""
    frame = pd.DataFrame(data=X)
    return frame

def raw_data_pipeline(X, fit=True):
    """Run raw movie data through all preprocessing stages and return the scaled features.

    The stages are applied in order: ``drop_and_dict``, ``nans``, ``zeroes``,
    ``new_features``, ``combining_features`` and finally ``scaling_features``.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw feature frame (without the target column).
    fit : bool, optional
        When True (the default) every stage is fitted on ``X`` before transforming it.
        Pass False to push new data through the already fitted stages, so that it is
        scaled with the statistics learned earlier instead of its own.

    Returns
    -------
    The output of the scaling stage.
    """
    stages = (drop_and_dict, nans, zeroes, new_features, combining_features)

    frame = X
    for stage in stages:
        run_stage = stage.fit_transform if fit else stage.transform
        # Each stage result is wrapped again so the next stage always receives a DataFrame.
        frame = toDf(run_stage(frame))

    run_scaler = scaling_features.fit_transform if fit else scaling_features.transform
    return run_scaler(frame)
    
    
#For the user input data we will make a pipeline function like this:
def user_input_pipeline():
    """Stub for the user-input preprocessing pipeline; currently returns a placeholder string."""
    return "Not implemented"
    
    

In [None]:
X_train = train_df.drop(['revenue', 'id'], axis=1)
y_train = train_df['revenue']

In [None]:
#X_train_prepped = final_pipeline.fit_transform(X_train)
#X_train_prepped_df = pd.DataFrame(X_train_prepped)
X_train_prepped = raw_data_pipeline(X_train)
X_train_prepped_df = toDf(X_train_prepped) 
X_train_prepped_df.head()

In [None]:
X_train_prepped_df.info()

# **5. Select and train a model**

Lets evaluate Random Forests and Gradient Boosting models



In [None]:
from sklearn.model_selection import cross_val_score

def display_scores(scores):
    """Print the individual scores followed by their mean and standard deviation."""
    summary = (
        ("Scores:", scores),
        ("Mean:", scores.mean()),
        ("Std:", scores.std()),
    )
    for label, value in summary:
        print(label, value)

# Random forests

In [None]:
from sklearn.ensemble import RandomForestRegressor

scores = cross_val_score(RandomForestRegressor(random_state=42), X_train_prepped, y_train,
                        scoring="neg_mean_squared_error", cv=5)
forest_rmse_scores = np.sqrt(-scores)

display_scores(forest_rmse_scores)

# Gradient boosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

scores = cross_val_score(GradientBoostingRegressor(random_state=42), X_train_prepped, y_train,
                        scoring="neg_mean_squared_error", cv=5)
gradient_rmse_scores = np.sqrt(-scores)

display_scores(gradient_rmse_scores)

Lets use gradient boosting for our model

# **6. Fine tune model**

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {
        "learning_rate": [0.2, 0.3, 0.4, 0.5],
        "max_depth":[4, 8, 10, 12],
        "n_estimators":[10, 20, 30]
    }
]

grid_search = GridSearchCV(GradientBoostingRegressor(random_state=42), param_grid, cv=5,
                          scoring='neg_mean_squared_error',
                          return_train_score=True)

In [None]:
grid_search.fit(X_train_prepped, y_train)

cvres = grid_search.cv_results_

for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

In [None]:
grid_search.best_estimator_

In [None]:
final_model = grid_search.best_estimator_
#Save final model?

# **7. Present solution**

Our model will be used in a web app that is made in another notebook. It can be used to predict movie revenue based on user input.