In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior() 

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from statistics import mean

Instructions for updating:
non-resource variables are not supported in the long term


In [29]:
def calculate_profitability(revenue, budget):
    revenue, budget = float(revenue), float(budget)
    profit = revenue - budget
    return profit/budget if profit > 0 else profit/revenue

def calculate_revenue(profitability, budget):
    profitability, budget = float(profitability), float(budget)
    return budget * (profitability + 1) if profitability > 0 else budget / (1 - profitability)

def get_profitability_col(df, col_revenue, col_budget):
    return [calculate_profitability(row[col_revenue], row[col_budget]) for i, row in df.iterrows()]

def get_revenue_col(df, col_profitability, col_budget):
    return [calculate_revenue(row[col_profitability], row[col_budget]) for i, row in df.iterrows()]

In [30]:
df_init = pd.read_csv('16 topics.csv')

In [31]:
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def get_metrics(y_test, y_pred):
    return  {
        'mae': metrics.mean_absolute_error(y_test, y_pred),
        'mse': metrics.mean_squared_error(y_test, y_pred),
        'rmse': np.sqrt(metrics.mean_squared_error(y_test, y_pred)),
        'mape': mean_absolute_percentage_errorr(y_test, y_pred),
    }

def print_metrics(y_test, y_pred):
    res = get_metrics(y_test, y_pred)
    for key in res:
        print(f'{key}: {res[key]}')
    return res

In [32]:
def prepare_X_y(df):
    dff = df[[col for col in df.columns if not 'META' in col and not 'PROCESS' in col]]
    min_prof = dff['profitability'].min()
    dff['profitability_moved'] = dff['profitability'] - min_prof + 1
    dff = dff.drop(['revenue','profitability'], axis=1)
    
    X = dff.drop('profitability_moved', axis=1)
    X.fillna(X.mean(), inplace=True)
    y = dff['profitability_moved']
    
    return X, y

In [33]:
def prepare_X_y_revenue(df):
    dff = df[[col for col in df.columns if not 'META' in col and not 'PROCESS' in col]]
    dff = dff.drop(['profitability'], axis=1)
    
    X = dff.drop('revenue', axis=1)
    X.fillna(X.mean(), inplace=True)
    y = dff['revenue']
    
    return X, y

In [34]:
def run_RF_splits(X, y, splits):
    metrics_dict = {key: [] for key in ['mae', 'mse', 'rmse', 'mape']}
    skf = KFold(n_splits=splits)
    split_num=0
    for train_index, test_index in skf.split(X, y):
        split_num += 1
        print(f'split: {split_num}')
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        RFR = RandomForestRegressor(n_estimators=100, min_samples_split=3, random_state=0, n_jobs=-1)
        RFR.fit(X_train, y_train)
        y_pred = RFR.predict(X_test)

        validation = print_metrics(y_test, y_pred)

        for v in validation:
            metrics_dict[v].append(validation[v])
        print('\n')

    for v in metrics_dict:
        print(f'mean {v}: {mean(metrics_dict[v])}')
    return y_pred, metrics_dict

In [None]:
# run full dataset with splits
X, y = prepare_X_y(df_init)
full_y_pred, full_metrics = run_RF(X, y, splits=10)

In [36]:
# run dataset without extremes with splits
df_without_extremes = df_init.loc[(df_init.profitability < 1000) & (df_init.profitability > -1000)].reset_index(drop=True)
X, y = prepare_X_y(df_without_extremes)
y_pred, without_extremes_metrics = run_RF_splits(X, y, splits=15)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


split: 1
mae: 12.353108394734958
mse: 1135.4465362717951
rmse: 33.69638758489989
mape: 1.366217992298707


split: 2
mae: 8.597917243645593
mse: 339.45564616756747
rmse: 18.424322135904145
mape: 0.9603231949601524


split: 3
mae: 7.61737384464277
mse: 448.93301681510394
rmse: 21.188039475494282
mape: 0.8478932255723619


split: 4
mae: 8.217579051916989
mse: 647.5009548990606
rmse: 25.446040063221243
mape: 0.886341282014275


split: 5
mae: 6.889201020133799
mse: 276.26241083198073
rmse: 16.621143487497505
mape: 0.7736321784616385


split: 6
mae: 9.224845120346592
mse: 528.195455200131
rmse: 22.982503240511704
mape: 1.066119129624807


split: 7
mae: 11.922782653824356
mse: 1764.3238197577384
rmse: 42.00385482021547
mape: 1.8070498466349358


split: 8
mae: 17.61052927205968
mse: 3778.7271090350496
rmse: 61.47135193759
mape: 177.5612496885289


split: 9
mae: 11.423137619153719
mse: 687.5564418626388
rmse: 26.221297486254162
mape: 1.33527331924161


split: 10
mae: 20.010325922457884
mse: 416

In [161]:
# run dataset without extremes and 2000 - 2016 years with splits
df_2000_2016 = df_init.loc[
    (df_init.profitability < 1000) & (df_init.profitability > -1000) &
    (df_init.META__year > 1999) & (df_init.META__year < 2017)
].reset_index(drop=True)
X, y = prepare_X_y(df_2000_2016)
without_extremes_2000_2016_metrics = run_RF(X, y, splits=15)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


split: 1
mae: 11.08249938567679
mse: 893.2698357742238
rmse: 29.887620108905022
mape: 1.1925120938581122


split: 2
mae: 6.828108396328576
mse: 213.70049773850744
rmse: 14.618498477562852
mape: 0.7768119090636273


split: 3
mae: 8.584473412543147
mse: 942.6786609742697
rmse: 30.703072500553912
mape: 0.9116273533086406


split: 4
mae: 7.527343998258807
mse: 405.7135460348314
rmse: 20.14233218956612
mape: 0.8604458913788755


split: 5
mae: 8.130501210608445
mse: 323.0738768014977
rmse: 17.974255945699053
mape: 0.9487204470599803


split: 6
mae: 13.747660469191786
mse: 2335.807907610154
rmse: 48.33019664361147
mape: 2.3064985480560516


split: 7
mae: 15.608990167777568
mse: 3508.1134902245667
rmse: 59.22932964524051
mape: 303.3971161708261


split: 8
mae: 11.956768000131042
mse: 855.6338134180618
rmse: 29.25121900738603
mape: 1.4514730438478496


split: 9
mae: 22.232388197188893
mse: 4595.641605850821
rmse: 67.79116170896337
mape: 5.0421883799812415


split: 10
mae: 19.595153813141348
mse

In [35]:
# run dataset without extremes without splits, validation on profitability
df_without_extremes = df_init.loc[
    (df_init.profitability < 1000) & (df_init.profitability > -1000)
].reset_index(drop=True)
X, y = prepare_X_y(df_without_extremes)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

RFR = RandomForestRegressor(n_estimators=100, min_samples_split=3, n_jobs=8)
RFR.fit(X_train, y_train)
y_pred = RFR.predict(X_test)

validation = print_metrics(y_test, y_pred)

mae: 0.0069421859812688145
mse: 0.0004643333584251547
rmse: 0.021548395727412163
mape: 0.47894776029959757


In [13]:
# run dataset without extremes without splits, validation on profitability, test with revenue
df_without_extremes = df_init.loc[
    (df_init.profitability < 1000) & (df_init.profitability > -1000)
].reset_index(drop=True)
X, y = prepare_X_y(df_without_extremes)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

RFR = RandomForestRegressor(n_estimators=100, min_samples_split=3, n_jobs=8)
RFR.fit(X_train, y_train)
y_pred = RFR.predict(X_test)

print("predicted profitability")
print_metrics(y_test, y_pred)

print("predicted revenue from profitability")
df_test = df_without_extremes[df_without_extremes.index.isin(X_test.index)].reindex(X_test.index)

min_prof = df_test['profitability'].min()
df_test['profitability_pred'] = y_pred - 1 + min_prof
y_pred_revenue = get_revenue_col(df_test, 'profitability_pred', 'budget')
validation = print_metrics(list(df_test['revenue']), y_pred_revenue)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


predicted profitability
mae: 11.95857582142652
mse: 1510.2667052802956
rmse: 38.86215003419517
mape: 1.8445687662727128
predicted revenue from profitability
mae: 2649217754.8656735
mse: 2.1404302000301167e+19
rmse: 4626478358.352189
mape: 51235.729547628456


In [26]:
# revenue
df_without_extremes = df_init.loc[
    (df_init.profitability < 1000) & (df_init.profitability > -1000)
].reset_index(drop=True)
X, y = prepare_X_y_revenue(df_without_extremes)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

RFR = RandomForestRegressor(n_estimators=100, min_samples_split=3, n_jobs=8)
RFR.fit(X_train, y_train)
y_pred = RFR.predict(X_test)

print("predicted revenue")
print_metrics(y_test, y_pred)

print("predicted profitability from revenue")
df_test_revenue = df_without_extremes[df_without_extremes.index.isin(X_test.index)].reindex(X_test.index)
min_prof = df_test_revenue['profitability'].min()
df_test_revenue['revenue_pred'] = y_pred
df_test_revenue['profitability_moved'] = df_test_revenue['profitability'] + 1 - min_prof

df_test_revenue['profitability_pred'] = get_profitability_col(df_test_revenue, 'revenue_pred', 'budget')
df_test_revenue['profitability_pred'] = df_test_revenue['profitability_pred'] + 1 - min_prof
validation = print_metrics(list(df_test_revenue['profitability_moved']), list(df_test_revenue['profitability_pred']))

predicted revenue
mae: 45857965.225732565
mse: 8555634838383524.0
rmse: 92496674.7423037
mape: 2618.897657751647
predicted profitability from revenue
mae: 13.210061534540921
mse: 2743.3991550485694
rmse: 52.37746800914081
mape: 69.62818195767936


In [173]:
list(df_init.columns)

['budget',
 'META__id',
 'PROCESS__original_language',
 'revenue',
 'runtime',
 'spoken_languages',
 'META__year',
 'META__month',
 'META__day',
 'weekend',
 'META__collection_name',
 'META__cast_1_name',
 'META__cast_2_name',
 'META__cast_3_name',
 'META__cast_4_name',
 'META__cast_5_name',
 'META__cast_6_name',
 'META__cast_7_name',
 'META__cast_8_name',
 'META__crew__sound__music_editor',
 'META__crew__sound__original_music_composer',
 'META__crew__sound__sound_designer',
 'META__crew__sound__sound_effects_editor',
 'META__crew__sound__sound_re_recording_mixer',
 'META__crew__sound__supervising_sound_editor',
 'META__crew__directing__director__1',
 'META__crew__directing__script_supervisor',
 'META__crew__production__casting',
 'META__crew__production__executive_producer__1',
 'META__crew__production__producer__1',
 'META__crew__production__producer__2',
 'META__crew__editing__editor__1',
 'META__crew__costume__costume_designer',
 'META__crew__costume__costume_supervisor',
 'META__c