In [1]:
import numpy as np
import pandas as pd
import time
import pickle


from joblib import dump, load
from sklearn.utils import shuffle
from sklearn.model_selection import KFold
from sklearn.impute import KNNImputer

pd.options.display.float_format = '{:20,.15f}'.format
pd.options.display.max_columns = None

In [2]:
from sklearn.ensemble import IsolationForest
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM

from tensorflow.keras.callbacks import EarlyStopping

from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.model_selection import ParameterSampler, ParameterGrid
import sklearn
from statistics import mean
from sklearn import metrics

In [3]:
from importlib import reload
# module references for reload
import process_df, output, run_models

from process_df import Process, create_average_columns, split_process_df, split_df
from output import output_metrics, mape, smape, get_metrics, print_sorted_actual_to_predicted_graphs_only_test
from run_models import run_nn, run_lgb, run_rf, build_nn_model

reload(process_df)
reload(output)
reload(run_models)

# reimport in case changed
from process_df import Process, create_average_columns, split_process_df, split_df
from output import output_metrics, mape, smape, get_metrics, print_sorted_actual_to_predicted_graphs_only_test
from run_models import run_nn, run_lgb, run_rf, build_nn_model

In [4]:
def get_df_work_columns(df):
    return df[[col for col in df_full.columns if not 'META' in col or col == 'META__revenue']]

In [16]:
df_raw = pd.read_csv('outliers/df_raw.csv', index_col='id')
df_full = df_raw.copy()
df_full_name = 'outliers/df_full.csv'
df_full = create_average_columns(df_full, verbose=0)
df_full.to_csv(df_full_name)

In [17]:
df_full = pd.read_csv(df_full_name, index_col='id')

data_raw = split_df(get_df_work_columns(df_full))
with open(f'{df_full_name}_raw_data.pickle', 'wb') as handle:
    pickle.dump(data_raw, handle)

data, process = split_process_df(get_df_work_columns(df_full))
with open(f'{df_full_name}_data.pickle', 'wb') as handle:
    pickle.dump(data, handle)
with open(f'{df_full_name}_process.pickle', 'wb') as handle:
    pickle.dump(process, handle)

In [5]:
df_full_name = 'outliers/df_full.csv'

df_full = pd.read_csv(df_full_name, index_col='id')
df_raw = pd.read_csv('outliers/df_raw.csv', index_col='id')

with open(f'{df_full_name}_raw_data.pickle', 'rb') as handle:
    data_raw = pickle.load(handle)
X_raw = pd.concat([data_raw['X_train'], data_raw['X_test'], data_raw['X_val']])
y_raw = pd.concat([data_raw['y_train'], data_raw['y_test'], data_raw['y_val']])

with open(f'{df_full_name}_data.pickle', 'rb') as handle:
    data = pickle.load(handle)
X = pd.concat([data['X_train'], data['X_test'], data['X_val']])
y = np.concatenate([data['y_train'], data['y_test'], data['y_val']])
    
with open(f'{df_full_name}_process.pickle', 'rb') as handle:
    process = pickle.load(handle)

In [6]:
class NN_estimator(BaseEstimator, RegressorMixin):
    def __init__(self, print_graphs = False):
        self.print_graphs = print_graphs

    
    def fit(self, X, y, **kwargs):
        self.process_ = kwargs['process']
        input_shape = len(data['X_train'].keys())
        model = build_nn_model(input_shape)

        es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=20)

        model.fit(X, y, epochs=10000, validation_split=0.1, verbose=0, batch_size=256,
            shuffle=True, callbacks=[es], use_multiprocessing=True)
        self.model_ = model
        return self
    
    def score(self, X, y):
        yhat = self.model_.predict(X.values)
        if len(yhat.shape) == 2 and yhat.shape[1] == 1:
            yhat = yhat.flatten()

        print_log=False
        for apply_function in reversed(self.process_.y_process):
            print_log=True
            y = apply_function(y)
            yhat = apply_function(yhat)

        if self.print_graphs:
            print_sorted_actual_to_predicted_graphs_only_test(y, yhat, print_log=print_log)
        return get_metrics(y, yhat, X.shape[1])

def param_search(estimator, param_dict, n_iter=None, seed=None):
    if n_iter is None:
        param_iter = ParameterGrid(param_dict)
    else:
        param_iter = ParameterSampler(param_dict,
                                      n_iter,
                                      random_state=seed)
 
    estimators = []
    for params in param_iter:
        new_estimator = sklearn.clone(estimator)
        new_estimator.set_params(**params)
        estimators.append(new_estimator)
    return estimators

In [7]:
class OutliersEstimator(BaseEstimator, RegressorMixin):
    def __init__(self, outliers_detector=None, drop_index_list=None, folds=10, print_graphs=False):
        self.outliers_detector = outliers_detector
        self.drop_index_list = drop_index_list
        self.folds = folds
        self.print_graphs = print_graphs

    def fit(self, X, y, **kwargs):
        def get_df_work_column(df):
            return df[[col for col in df.columns if not 'META' in col or col == 'META__revenue']]

        df_raw = pd.read_csv('outliers/df_raw.csv', index_col='id')

        print("######################################################")
        if self.outliers_detector is not None:
            print(self.outliers_detector.get_params())
            X_full = X.copy()
            X_full['revenue'] = y
            mask_to_drop = self.outliers_detector.fit_predict(X_full) == -1
            drop_index_list = df_raw[mask_to_drop].index
        else:
            drop_index_list = self.drop_index_list
        self.drop_index_list_ = drop_index_list
        print('Number of outliers: ', len(drop_index_list))
        new_df = df_raw.drop(drop_index_list)
        df = create_average_columns(new_df, verbose=0)
        print('Average created')
        df = get_df_work_columns(df)
        data, self.process_ = split_process_df(df)
        print('Processed')
        self.X_ = pd.concat([data['X_train'], data['X_test'], data['X_val']])
        self.y_ = np.concatenate([data['y_train'], data['y_test'], data['y_val']])
        return self

    def score(self, X=None, y=None):
        cv_results = []
        kf = KFold(n_splits=self.folds, random_state=0)
        for train_index, test_index in kf.split(self.X_, self.y_):
            X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
            y_train, y_test = y[train_index], y[test_index]
            data_part = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test}
            estimator = NN_estimator(print_graphs=self.print_graphs)
            estimator.fit(X_train, y_train, process=self.process_)
            fm = estimator.score(X_test, y_test)
            print(f'smape: {fm["smape"]}, mape: {fm["mape"]}, mae: {fm["mae"]}')
            cv_results.append(fm)
        metrics = list(cv_results[0].keys())
        metric_results = {metric: np.mean([cv_results[split_index][metric] for split_index in range(len(cv_results))]) for metric in metrics}
        print(metric_results)
        return {'cv_iterations': cv_results, 'cv_metrics': metric_results}

In [8]:
def get_sorted_metric(arr, metric):
    return sorted([{
        'i': i+1, 
        'smape': fold['cv_metrics']['smape'],
        'mape': fold['cv_metrics']['mape'],
        'mae': fold['cv_metrics']['mae']
        } for i, fold in enumerate(arr)],
    key=lambda x: x[metric])

In [22]:
del_revenue = df_full.META__revenue < 250000
print('del revenue', sum(del_revenue))
del_year = df_raw.META__year < 1970
print('del year', sum(del_year))
del_budget = df_full.budget < 250000
print('del budget', sum(del_budget))
profitability_quantiles = df_raw.META__profitability.quantile([0.1, 0.9])
del_profitability = (df_raw.META__profitability < profitability_quantiles.iloc[0]) | (df_raw.META__profitability > profitability_quantiles.iloc[1])
print('del profitability', sum(del_profitability))

common_del = del_revenue.astype(int) + del_year.astype(int) + del_budget.astype(int) + del_profitability.astype(int)

for c in df_full.columns:
    if 'profit' in c:
        quantiles = df_full[c].quantile([0.1, 0.9])
        print(c, quantiles.iloc[0], quantiles.iloc[1])
        del_pr = (df_full[c] < quantiles.iloc[0]) | (df_full[c] > quantiles.iloc[1])
        print(c, sum(del_pr))
        common_del = common_del.astype(int) + del_pr.astype(int)

for i in range(max(common_del)):
    print(i, sum(common_del > i))

del revenue 534
del year 382
del budget 364
del profitability 1500
META__profitability -4.424662959957207 7.39777777777778
META__profitability 1500
META__year_avg_profitability -10.812845724436539 16.025788582653266
META__year_avg_profitability 1267
cast_1_avg_profit 1571591.4 143041135.70000005
cast_1_avg_profit 1160
cast_2_avg_profit 197068.00000000116 133116862.20000002
cast_2_avg_profit 1064
cast_3_avg_profit -328240.0 124520203.0
cast_3_avg_profit 986
cast_4_avg_profit -917845.5999999997 127494148.40000002
cast_4_avg_profit 910
cast_5_avg_profit -1728369.5999999999 124271503.39999998
cast_5_avg_profit 818
cast_6_avg_profit -2677605.1999999993 121049458.39999996
cast_6_avg_profit 746
cast_7_avg_profit -3828683.0 119660173.60000005
cast_7_avg_profit 657
cast_8_avg_profit -5025942.1 119892339.10000002
cast_8_avg_profit 586
production_company_1_avg_profit 273410.2000000003 124954690.80000004
production_company_1_avg_profit 966
production_company_2_avg_profit 527914.4000000003 12953267

In [11]:
###########################  custom outliers

full_custom = []
outlier_detector_number = 0
for i in range(5, 17):
    outlier_detector_number += 1
    print('outlier detector number', outlier_detector_number)
    drop_index_list = df_full[common_del > i].index
    outliers_estimator = OutliersEstimator(drop_index_list=drop_index_list)
    outliers_estimator.fit(X, y)
    score = outliers_estimator.score(X,y)
    full_custom.append({
        'cv_iteration_metrics': score['cv_iterations'],
        'cv_metrics': score['cv_metrics'],
        'drop_index_list': outliers_estimator.drop_index_list_,
    })

outlier detector number 1
######################################################
Number of outliers:  1548
Average created
Processed
Epoch 00147: early stopping
smape: 75.96709583399445, mape: 595.8460261822111, mae: 23342589.0
Epoch 00139: early stopping
smape: 75.87053000609036, mape: 442.4296505141194, mae: 22157547.0
Epoch 00142: early stopping
smape: 74.27619573051292, mape: 576.6120743598985, mae: 28019832.0
Epoch 00137: early stopping
smape: 72.16243871871151, mape: 710.7722284827328, mae: 23966384.0
Epoch 00163: early stopping
smape: 73.3391275785005, mape: 480.6883463722996, mae: 24131503.0
Epoch 00130: early stopping
smape: 75.3865731387193, mape: 322.6378158091641, mae: 30748370.0
Epoch 00150: early stopping
smape: 74.01804650821138, mape: 568.742288638622, mae: 23667672.0
Epoch 00127: early stopping
smape: 71.15274940170175, mape: 598.4095652936913, mae: 25672328.0
Epoch 00156: early stopping
smape: 76.18397238790205, mape: 426.6392404261476, mae: 22712642.0
Epoch 00156: ea

In [12]:
dump(full_custom, 'outliers/drop_indices/full_custom.joblib')

[&#39;outliers/drop_indices/full_custom.joblib&#39;]

In [9]:
full_custom = load('outliers/drop_indices/full_custom.joblib')

In [32]:
get_sorted_metric(full_custom, 'mape')

[{&#39;i&#39;: 1,
  &#39;smape&#39;: 74.196265957187,
  &#39;mape&#39;: 528.2764523375284,
  &#39;mae&#39;: 24976081.5},
 {&#39;i&#39;: 11,
  &#39;smape&#39;: 78.03229367730759,
  &#39;mape&#39;: 619.6738770950244,
  &#39;mae&#39;: 36659319.0},
 {&#39;i&#39;: 10,
  &#39;smape&#39;: 78.3010358222167,
  &#39;mape&#39;: 649.0413721410722,
  &#39;mae&#39;: 37210851.3},
 {&#39;i&#39;: 12,
  &#39;smape&#39;: 77.47238669327609,
  &#39;mape&#39;: 675.8189951995835,
  &#39;mae&#39;: 36655062.8},
 {&#39;i&#39;: 7,
  &#39;smape&#39;: 77.11133194726828,
  &#39;mape&#39;: 683.1573867303961,
  &#39;mae&#39;: 32799923.8},
 {&#39;i&#39;: 9,
  &#39;smape&#39;: 77.87027466525929,
  &#39;mape&#39;: 696.7555279357057,
  &#39;mae&#39;: 35941667.0},
 {&#39;i&#39;: 8,
  &#39;smape&#39;: 77.32282136595421,
  &#39;mape&#39;: 709.7558981408156,
  &#39;mae&#39;: 34154078.2},
 {&#39;i&#39;: 6,
  &#39;smape&#39;: 77.24465505296797,
  &#39;mape&#39;: 741.0067377067674,
  &#39;mae&#39;: 31861354.3},
 {&#39;i&#39;: 4

In [48]:
outliers_estimator = OutliersEstimator(drop_index_list=full_custom[12]['drop_index_list'])
outliers_estimator.fit(X, y)
best_custom = outliers_estimator.score(X,y)

######################################################
Number of outliers:  63
Average created
Processed
1:	split num
Epoch 00116: early stopping
2:	split num
Epoch 00123: early stopping
3:	split num
Epoch 00149: early stopping
4:	split num
Epoch 00152: early stopping
5:	split num
Epoch 00141: early stopping
{&#39;smape&#39;: 78.86781900493382, &#39;mape&#39;: 758.802669203645, &#39;mae&#39;: 37201768.0, &#39;rmse&#39;: 85191143.2, &#39;adj_r2&#39;: 0.5730848608486628}


In [9]:
###########################  ISO forest 
param_grid = {
    'max_samples': sp_randint(256, 7500),
    'max_features': sp_uniform(loc=0.5, scale=0.5),
    'contamination': sp_uniform(loc=0.01, scale=0.05),
}
outlier_detectors = param_search(IsolationForest(n_jobs=-1, random_state=0, n_estimators=5000), param_grid, n_iter=10)

full_iso = []
outlier_detector_number = 0
for outlier_detector in outlier_detectors:
    outlier_detector_number += 1
    print('outlier detector number', outlier_detector_number)
    outliers_estimator = OutliersEstimator(outliers_detector=outlier_detector)
    outliers_estimator.fit(X, y)
    score = outliers_estimator.score(X,y)
    full_iso.append({
        'cv_iteration_metrics': score['cv_iterations'],
        'cv_metrics': score['cv_metrics'],
        'drop_index_list': outliers_estimator.drop_index_list_,
    })

outlier detector number 1
######################################################
{&#39;behaviour&#39;: &#39;deprecated&#39;, &#39;bootstrap&#39;: False, &#39;contamination&#39;: 0.05170343400034513, &#39;max_features&#39;: 0.8458931108551717, &#39;max_samples&#39;: 4505, &#39;n_estimators&#39;: 5000, &#39;n_jobs&#39;: -1, &#39;random_state&#39;: 0, &#39;verbose&#39;: 0, &#39;warm_start&#39;: False}
Number of outliers:  388
Average created
Processed
Epoch 00191: early stopping
smape: 81.73468966966412, mape: 618.5310912010285, mae: 36884349.0
Epoch 00120: early stopping
smape: 83.34586236178802, mape: 607.415424975081, mae: 41627842.0
Epoch 00164: early stopping
smape: 79.2118253710323, mape: 800.3718340329801, mae: 43809896.0
Epoch 00199: early stopping
smape: 77.49835115096748, mape: 489.1120759846312, mae: 38168593.0
Epoch 00157: early stopping
smape: 79.84398297953815, mape: 386.9801985053163, mae: 50318929.0
Epoch 00199: early stopping
smape: 79.56465006981394, mape: 459.9157983966

In [10]:
dump(full_iso, 'outliers/drop_indices/full_iso.joblib')

[&#39;outliers/drop_indices/full_iso.joblib&#39;]

In [10]:
full_iso = load('outliers/drop_indices/full_iso.joblib')

In [36]:
get_sorted_metric(full_iso, 'mape')

[{&#39;i&#39;: 2,
  &#39;smape&#39;: 78.82767176091583,
  &#39;mape&#39;: 556.9972645924881,
  &#39;mae&#39;: 40408405.5},
 {&#39;i&#39;: 9,
  &#39;smape&#39;: 79.4892839428293,
  &#39;mape&#39;: 593.821083949237,
  &#39;mae&#39;: 40806196.0},
 {&#39;i&#39;: 6,
  &#39;smape&#39;: 79.44932274312981,
  &#39;mape&#39;: 606.723543148081,
  &#39;mae&#39;: 40835787.8},
 {&#39;i&#39;: 10,
  &#39;smape&#39;: 78.48033567772087,
  &#39;mape&#39;: 611.1444016665662,
  &#39;mae&#39;: 40024131.8},
 {&#39;i&#39;: 8,
  &#39;smape&#39;: 78.64346241874092,
  &#39;mape&#39;: 611.4869083835815,
  &#39;mae&#39;: 40552172.0},
 {&#39;i&#39;: 5,
  &#39;smape&#39;: 78.88249709906673,
  &#39;mape&#39;: 628.9174522530113,
  &#39;mae&#39;: 40672952.7},
 {&#39;i&#39;: 4,
  &#39;smape&#39;: 79.01388451625655,
  &#39;mape&#39;: 630.3565414904372,
  &#39;mae&#39;: 41277595.6},
 {&#39;i&#39;: 3,
  &#39;smape&#39;: 78.95408500108609,
  &#39;mape&#39;: 655.6216396322618,
  &#39;mae&#39;: 41000789.5},
 {&#39;i&#39;: 1,


In [51]:
outliers_estimator = OutliersEstimator(drop_index_list=full_iso[5]['drop_index_list'])
outliers_estimator.fit(X, y)
best_iso = outliers_estimator.score(X,y)

######################################################
Number of outliers:  451
Average created
Processed
1:	split num
Epoch 00189: early stopping
2:	split num
Epoch 00208: early stopping
3:	split num
Epoch 00141: early stopping
4:	split num
Epoch 00206: early stopping
5:	split num
Epoch 00171: early stopping
{&#39;smape&#39;: 79.9151963446374, &#39;mape&#39;: 759.6551037917527, &#39;mae&#39;: 40362289.2, &#39;rmse&#39;: 95566578.2, &#39;adj_r2&#39;: 0.572733518675042}


In [11]:
###########################  Local outlier factor
param_grid = {
    'n_neighbors': sp_randint(1, 30),
    'contamination': sp_uniform(0.01, 0.05),
}
outlier_detectors = param_search(LocalOutlierFactor(n_jobs=-1, algorithm='kd_tree'), param_grid, n_iter=10)

full_lof = []
outlier_detector_number = 0
for outlier_detector in outlier_detectors:
    outlier_detector_number += 1
    print('outlier detector number', outlier_detector_number)
    outliers_estimator = OutliersEstimator(outliers_detector=outlier_detector)
    outliers_estimator.fit(X, y)
    score = outliers_estimator.score(X,y)
    full_lof.append({
        'cv_iteration_metrics': score['cv_iterations'],
        'cv_metrics': score['cv_metrics'],
        'drop_index_list': outliers_estimator.drop_index_list_,
    })

outlier detector number 1
######################################################
{&#39;algorithm&#39;: &#39;kd_tree&#39;, &#39;contamination&#39;: 0.017492854051696313, &#39;leaf_size&#39;: 30, &#39;metric&#39;: &#39;minkowski&#39;, &#39;metric_params&#39;: None, &#39;n_jobs&#39;: -1, &#39;n_neighbors&#39;: 2, &#39;novelty&#39;: False, &#39;p&#39;: 2}
Number of outliers:  132
Average created
Processed
Epoch 00149: early stopping
smape: 80.70788169344364, mape: 684.9268241262993, mae: 35888046.0
Epoch 00149: early stopping
smape: 80.84412321332162, mape: 732.3050105632881, mae: 44115348.0
Epoch 00150: early stopping
smape: 79.99147917270365, mape: 762.9809276590008, mae: 40271848.0
Epoch 00112: early stopping
smape: 77.54039838985521, mape: 397.3358878856311, mae: 38988811.0
Epoch 00204: early stopping
smape: 80.09598819958248, mape: 419.73888113781214, mae: 52276235.0
Epoch 00140: early stopping
smape: 80.32149628992704, mape: 785.1168411699467, mae: 38761071.0
Epoch 00147: early stopp

In [12]:
dump(full_lof, 'outliers/drop_indices/full_lof.joblib')

[&#39;outliers/drop_indices/full_lof.joblib&#39;]

In [11]:
full_lof = load('outliers/drop_indices/full_lof.joblib')

In [42]:
get_sorted_metric(full_lof, 'mape')

[{&#39;i&#39;: 3,
  &#39;smape&#39;: 79.6836454289854,
  &#39;mape&#39;: 582.8558829932995,
  &#39;mae&#39;: 40934213.9},
 {&#39;i&#39;: 8,
  &#39;smape&#39;: 78.98223124978108,
  &#39;mape&#39;: 602.426446482492,
  &#39;mae&#39;: 40524790.4},
 {&#39;i&#39;: 10,
  &#39;smape&#39;: 79.24105422508345,
  &#39;mape&#39;: 610.7955922861822,
  &#39;mae&#39;: 40370176.7},
 {&#39;i&#39;: 2,
  &#39;smape&#39;: 78.91865878898382,
  &#39;mape&#39;: 630.6836009786718,
  &#39;mae&#39;: 39592009.7},
 {&#39;i&#39;: 7,
  &#39;smape&#39;: 78.69167426828265,
  &#39;mape&#39;: 637.414599345647,
  &#39;mae&#39;: 40922653.5},
 {&#39;i&#39;: 1,
  &#39;smape&#39;: 78.68297270239688,
  &#39;mape&#39;: 640.395660705046,
  &#39;mae&#39;: 40882052.7},
 {&#39;i&#39;: 4,
  &#39;smape&#39;: 79.09687922046163,
  &#39;mape&#39;: 661.5074214929475,
  &#39;mae&#39;: 40300491.2},
 {&#39;i&#39;: 6,
  &#39;smape&#39;: 78.89549248980984,
  &#39;mape&#39;: 688.2950540689587,
  &#39;mae&#39;: 40829837.9},
 {&#39;i&#39;: 9,
 

In [53]:
outliers_estimator = OutliersEstimator(drop_index_list=full_lof[9]['drop_index_list'])
outliers_estimator.fit(X, y)
best_lof = outliers_estimator.score(X,y)

######################################################
Number of outliers:  107
Average created
Processed
1:	split num
Epoch 00122: early stopping
2:	split num
Epoch 00150: early stopping
3:	split num
Epoch 00164: early stopping
4:	split num
Epoch 00148: early stopping
5:	split num
Epoch 00177: early stopping
{&#39;smape&#39;: 79.47346793674883, &#39;mape&#39;: 699.5024022170949, &#39;mae&#39;: 40887748.2, &#39;rmse&#39;: 97364345.4, &#39;adj_r2&#39;: 0.5785134663548075}


In [13]:
###########################  One class SVM
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'nu': sp_uniform(0.01, 0.05),
}
outlier_detectors = param_search(OneClassSVM(gamma='auto'), param_grid, n_iter=10)

full_ocsvm = []
outlier_detector_number = 0
for outlier_detector in outlier_detectors:
    outlier_detector_number += 1
    print('outlier detector number', outlier_detector_number)
    outliers_estimator = OutliersEstimator(outliers_detector=outlier_detector)
    outliers_estimator.fit(X, y)
    score = outliers_estimator.score(X,y)
    full_ocsvm.append({
        'cv_iteration_metrics': score['cv_iterations'],
        'cv_metrics': score['cv_metrics'],
        'drop_index_list': outliers_estimator.drop_index_list_,
    })

outlier detector number 1
######################################################
{&#39;cache_size&#39;: 200, &#39;coef0&#39;: 0.0, &#39;degree&#39;: 3, &#39;gamma&#39;: &#39;auto&#39;, &#39;kernel&#39;: &#39;poly&#39;, &#39;max_iter&#39;: -1, &#39;nu&#39;: 0.04823798844950552, &#39;shrinking&#39;: True, &#39;tol&#39;: 0.001, &#39;verbose&#39;: False}
Number of outliers:  1843
Average created
Processed
Epoch 00190: early stopping
smape: 79.75890475491413, mape: 947.4304460061085, mae: 37714917.0
Epoch 00165: early stopping
smape: 82.16733520144373, mape: 574.8765532726355, mae: 35903647.0
Epoch 00206: early stopping
smape: 79.24787886898449, mape: 1092.9300564854768, mae: 43957640.0
Epoch 00192: early stopping
smape: 79.73875199442008, mape: 1039.1107582318207, mae: 41241989.0
Epoch 00182: early stopping
smape: 77.79890974002643, mape: 615.1280980276774, mae: 38631471.0
Epoch 00183: early stopping
smape: 79.48945054642604, mape: 358.78416596770114, mae: 45815733.0
Epoch 00146: early sto

In [14]:
dump(full_ocsvm, 'outliers/drop_indices/full_ocsvm.joblib')

[&#39;outliers/drop_indices/full_ocsvm.joblib&#39;]

In [12]:
full_ocsvm = load('outliers/drop_indices/full_ocsvm.joblib')

In [44]:
get_sorted_metric(full_ocsvm, 'mape')

[{&#39;i&#39;: 6,
  &#39;smape&#39;: 79.43193296031026,
  &#39;mape&#39;: 573.2936140726778,
  &#39;mae&#39;: 41375935.0},
 {&#39;i&#39;: 10,
  &#39;smape&#39;: 79.09731293741514,
  &#39;mape&#39;: 599.9230846540452,
  &#39;mae&#39;: 40188386.7},
 {&#39;i&#39;: 7,
  &#39;smape&#39;: 78.86431581464589,
  &#39;mape&#39;: 612.8706042045851,
  &#39;mae&#39;: 40814944.7},
 {&#39;i&#39;: 5,
  &#39;smape&#39;: 78.52610415741063,
  &#39;mape&#39;: 617.3815822404961,
  &#39;mae&#39;: 40505700.2},
 {&#39;i&#39;: 8,
  &#39;smape&#39;: 79.20968517229899,
  &#39;mape&#39;: 631.1203163734467,
  &#39;mae&#39;: 41077979.7},
 {&#39;i&#39;: 9,
  &#39;smape&#39;: 79.88640871862154,
  &#39;mape&#39;: 642.2755522621966,
  &#39;mae&#39;: 40508811.9},
 {&#39;i&#39;: 3,
  &#39;smape&#39;: 78.62049163083798,
  &#39;mape&#39;: 653.3049455330945,
  &#39;mae&#39;: 39751562.6},
 {&#39;i&#39;: 4,
  &#39;smape&#39;: 79.06735363310574,
  &#39;mape&#39;: 684.0379783422791,
  &#39;mae&#39;: 40614179.7},
 {&#39;i&#39;: 

In [None]:
outliers_estimator = OutliersEstimator(drop_index_list=full_ocsvm[9]['drop_index_list'])
outliers_estimator.fit(X, y)
best_ocsvm = outliers_estimator.score(X,y)

In [13]:
indices = {
    'bc': full_custom[10]['drop_index_list'],
    'bi': full_iso[1]['drop_index_list'],
    'bo': full_ocsvm[9]['drop_index_list'],
    'bl': full_lof[7]['drop_index_list'],
}

In [14]:
best_combinations = {
    key: None for key in ['bc_bi', 'bc_bo', 'bc_bl', 'bi_bo', 'bi_bl', 'bo_bl', 'bc_bo_bl', 'bc_bi_bo', 'bc_bi_bl', 'bi_bo_bl', 'bc_bi_bo_bl']
}

In [15]:
for combination in best_combinations:
    print(combination)
    parts = [indices[key] for key in combination.split('_')]
    drop_index_list = list(set([y for x in parts for y in x]))
    outliers_estimator = OutliersEstimator(drop_index_list=drop_index_list)
    outliers_estimator.fit(X, y)
    best_combinations[combination] = outliers_estimator.score(X,y)

bc_bi
######################################################
Number of outliers:  188
Average created
Processed
Epoch 00121: early stopping
smape: 80.33988645877008, mape: 827.6071665739337, mae: 33134049.0
Epoch 00194: early stopping
smape: 82.31289991504389, mape: 622.9094059855489, mae: 41065382.0
Epoch 00203: early stopping
smape: 78.76420414048542, mape: 743.8492283973674, mae: 35585430.0
Epoch 00149: early stopping
smape: 76.18561788755693, mape: 569.8808261956663, mae: 35429273.0
Epoch 00165: early stopping
smape: 79.69961229471922, mape: 406.0294561605531, mae: 46030547.0
Epoch 00168: early stopping
smape: 78.8212047022682, mape: 510.3832402072987, mae: 34569535.0
Epoch 00160: early stopping
smape: 76.9178599402494, mape: 777.0801378825558, mae: 36708331.0
Epoch 00146: early stopping
smape: 77.70993875957204, mape: 986.0971612206644, mae: 33897354.0
Epoch 00112: early stopping
smape: 76.79740250449778, mape: 966.7878268419074, mae: 36109398.0
Epoch 00140: early stopping
smape: 

In [16]:
best_combinations

{&#39;bc_bi&#39;: {&#39;cv_iterations&#39;: [{&#39;smape&#39;: 80.33988645877008,
    &#39;mape&#39;: 827.6071665739337,
    &#39;mae&#39;: 33134049.0,
    &#39;rmse&#39;: 74090977.0,
    &#39;adj_r2&#39;: 0.525846949914238},
   {&#39;smape&#39;: 82.31289991504389,
    &#39;mape&#39;: 622.9094059855489,
    &#39;mae&#39;: 41065382.0,
    &#39;rmse&#39;: 110838373.0,
    &#39;adj_r2&#39;: 0.39365850743409825},
   {&#39;smape&#39;: 78.76420414048542,
    &#39;mape&#39;: 743.8492283973674,
    &#39;mae&#39;: 35585430.0,
    &#39;rmse&#39;: 73342294.0,
    &#39;adj_r2&#39;: 0.49829730679694395},
   {&#39;smape&#39;: 76.18561788755693,
    &#39;mape&#39;: 569.8808261956663,
    &#39;mae&#39;: 35429273.0,
    &#39;rmse&#39;: 75643476.0,
    &#39;adj_r2&#39;: 0.5129168745743246},
   {&#39;smape&#39;: 79.69961229471922,
    &#39;mape&#39;: 406.0294561605531,
    &#39;mae&#39;: 46030547.0,
    &#39;rmse&#39;: 114290731.0,
    &#39;adj_r2&#39;: 0.4192433506223593},
   {&#39;smape&#39;: 78.821204

In [28]:
sorted([{
        'key': key, 
        'smape': best_combinations[key]['cv_metrics']['smape'],
        'mape': best_combinations[key]['cv_metrics']['mape'],
        'mae': best_combinations[key]['cv_metrics']['mae']
        } for i, key in enumerate(best_combinations)],
    key=lambda x: x['mae'])

[{&#39;key&#39;: &#39;bc_bi_bo_bl&#39;,
  &#39;smape&#39;: 78.11561334390336,
  &#39;mape&#39;: 733.5563913402956,
  &#39;mae&#39;: 36048028.9},
 {&#39;key&#39;: &#39;bc_bi_bl&#39;,
  &#39;smape&#39;: 77.90068195288313,
  &#39;mape&#39;: 743.0890924978464,
  &#39;mae&#39;: 36622088.1},
 {&#39;key&#39;: &#39;bc_bo_bl&#39;,
  &#39;smape&#39;: 77.9812299409007,
  &#39;mape&#39;: 692.9400337275691,
  &#39;mae&#39;: 36702222.8},
 {&#39;key&#39;: &#39;bc_bl&#39;,
  &#39;smape&#39;: 78.2075581036431,
  &#39;mape&#39;: 719.1582025088153,
  &#39;mae&#39;: 36793684.3},
 {&#39;key&#39;: &#39;bc_bi&#39;,
  &#39;smape&#39;: 78.3635955946614,
  &#39;mape&#39;: 678.1343878084289,
  &#39;mae&#39;: 36997349.3},
 {&#39;key&#39;: &#39;bc_bi_bo&#39;,
  &#39;smape&#39;: 78.32105323159448,
  &#39;mape&#39;: 659.3382773124229,
  &#39;mae&#39;: 37124694.5},
 {&#39;key&#39;: &#39;bc_bo&#39;,
  &#39;smape&#39;: 78.24322762485863,
  &#39;mape&#39;: 681.746139503129,
  &#39;mae&#39;: 37209692.8},
 {&#39;key&#39;:

In [21]:
 [key for i, key in enumerate(best_combinations)]

[&#39;bc_bi&#39;,
 &#39;bc_bo&#39;,
 &#39;bc_bl&#39;,
 &#39;bi_bo&#39;,
 &#39;bi_bl&#39;,
 &#39;bo_bl&#39;,
 &#39;bc_bo_bl&#39;,
 &#39;bc_bi_bo&#39;,
 &#39;bc_bi_bl&#39;,
 &#39;bi_bo_bl&#39;,
 &#39;bc_bi_bo_bl&#39;]

In [33]:
full_custom[10]['drop_index_list']

Int64Index([    89,    121,    122,    285,    330,    672,    673,    674,
               675,    767,   1892,   1894,   1895,   5255,  12444,  12445,
            374720, 188927, 446354, 118340,  24021, 122917, 131634, 135397,
            140607, 281338, 282035, 283995, 284052, 284053, 404368, 271110,
            351286, 353081, 354912, 392044, 429617, 512200, 336843, 337339,
            206647, 338952,  49026,  50620,  54138,  57158, 479455,  93456,
             99861, 297802, 299534, 299536, 299537, 301528, 384018, 141052,
            315635, 324852, 166426, 168259, 177677, 181808, 181812],
           dtype=&#39;int64&#39;, name=&#39;id&#39;)

In [36]:
df_raw = pd.read_csv('outliers/df_raw.csv', index_col='id')
df_raw_dropped = df_raw.drop(full_custom[10]['drop_index_list'])
df_raw_dropped.to_csv('df_raw_dropped')
df_full_dropped = df_raw_dropped.copy()
df_full_name = 'outliers/df_full_dropped.csv'
df_full_dropped = create_average_columns(df_full_dropped, verbose=0)
df_full_dropped.to_csv(df_full_name)

In [37]:
df_full_dropped = pd.read_csv(df_full_name, index_col='id')

data_raw = split_df(get_df_work_columns(df_full))
with open(f'{df_full_name}_raw_data.pickle', 'wb') as handle:
    pickle.dump(data_raw, handle)

data, process = split_process_df(get_df_work_columns(df_full))
with open(f'{df_full_name}_data.pickle', 'wb') as handle:
    pickle.dump(data, handle)
with open(f'{df_full_name}_process.pickle', 'wb') as handle:
    pickle.dump(process, handle)

In [79]:
def run_cv(X, y, process):
    cv_results = []

    split_num=0
    n_splits=10
    kf = KFold(n_splits=n_splits)
    for train_index, test_index in kf.split(X, y):
        split_num+=1
        print('###################################')
        print(f'{split_num}:\tsplit num')
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y[train_index], y[test_index]
        data_part = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test}
        cv_results.append({'metrics': run_nn(data_part, process, with_val=False), 'data': data_part})
    metrics = ['smape', 'mape', 'mae', 'rmse', 'adj_r2']
    metric_results = {metric: np.mean([cv_results[split_index]['metrics']['test'][metric] for split_index in range(n_splits)]) for metric in metrics}
    return {'cv_iterations': cv_results, 'cv_metrics': metric_results}

In [84]:
def drop_rows_and_output_result(df, drop_index_list, dirname):
    new_df = df.drop(drop_index_list)
    if not os.path.exists(dirname):
        os.mkdir(dirname)
    new_df.to_csv(f'{dirname}/df_raw.csv')
    df = create_average_columns(new_df, verbose=0)
    print('average created')

    df.to_csv(f'{dirname}/df_all_columns.csv')
    df = get_df_work_columns(df)
    df.to_csv(f'{dirname}/df.csv')

    data, process = split_process_df(df)
    print('processed')
    with open(f'{dirname}/data.pickle', 'wb') as handle:
        pickle.dump(data, handle)
    with open(f'{dirname}/process.pickle', 'wb') as handle:
        pickle.dump(process, handle)

    X = pd.concat([data['X_train'], data['X_test'], data['X_val']])
    y = np.concatenate([data['y_train'], data['y_test'], data['y_val']])

    return run_cv(X, y, process)


In [None]:

df_raw = pd.read_csv('outliers/df_raw.csv', index_col='id')

start = time.time()
o__full = drop_rows_and_output_result(df_raw, [], 'outliers/full')
print(time.time() - start)


In [86]:
dump(o__full, 'outliers/full/results.joblib')

['outliers/full/results.joblib']

In [91]:
o__full['cv_metrics']

{'smape': 80.04576707883504,
 'mape': 728.2753358367537,
 'mae': 41332135.4,
 'rmse': 97884334.3,
 'adj_r2': 0.4765097127225809}

In [94]:
with open(f'outliers/data.pickle', 'rb') as handle:
    data = pickle.load(handle)
with open(f'outliers/process.pickle', 'rb') as handle:
    process = pickle.load(handle)

X = pd.concat([data['X_train'], data['X_test'], data['X_val']])
y = np.concatenate([data['y_train'], data['y_test'], data['y_val']])

In [18]:
iso_results = {}
df_raw = pd.read_csv('outliers/df_raw.csv', index_col='id')

for n_estimators in [50, 100, 250, 500, 2000]:
    iso = IsolationForest(random_state=0, n_jobs=-1, 
        n_estimators=n_estimators)
    yhat = iso.fit_predict(X)
    mask_to_keep = yhat != -1
    print('outliers: ', sum(yhat == -1))
    df_raw[mask_to_keep].index
    # iso_results[contamination] = drop_rows_and_output_result(df_raw, mask_to_keep, f'iso_contamination_{contamination}')

NameError: name &#39;pd&#39; is not defined

In [9]:
for i in iso_results:
    print(i, iso_results[i][1])

0.05 {'smape': 80.05861126141413, 'mape': 793.7242540662393, 'mae': 41239250.8, 'rmse': 96947720.8, 'adj_r2': 0.5125935171000893}


In [76]:
for i in iso_results:
    print(i, iso_results[i][1])

0.01 {'smape': 79.8379840350851, 'mape': 803.3126516871604, 'mae': 41064231.4, 'rmse': 97550786.2, 'adj_r2': 0.5103597131813838}
0.05 {'smape': 79.80010070996218, 'mape': 829.701486791581, 'mae': 40850199.6, 'rmse': 96056112.8, 'adj_r2': 0.525200785872264}
0.1 {'smape': 79.96970676386368, 'mape': 770.8597545685058, 'mae': 41238765.5, 'rmse': 98997202.7, 'adj_r2': 0.4968208757292799}


In [None]:
iso_results = {}
df_raw = pd.read_csv('outliers/df_raw.csv')
df_full = shuffle(pd.read_csv('outliers/df.csv'), random_state=0)

data, process = split_process_df(df_full, reskew=False)
with open('outliers/df_full_no_reskew_data.pickle', 'wb') as handle:
    pickle.dump(data, handle)
with open('outliers/df_full_no_reskew_process.pickle', 'wb') as handle:
    pickle.dump(process, handle)

X = pd.concat([data['X_train'], data['X_test'], data['X_val']])
y = np.concatenate([data['y_train'], data['y_test'], data['y_val']])
Xy = X.copy()
Xy['revenue'] = y

for contamination in [0.05]:
    print(f'contamination {contamination}')
    iso = IsolationForest(random_state=0, n_jobs=-1, contamination=contamination)
    yhat = iso.fit_predict(Xy)
    mask_to_keep = yhat != -1
    print('outliers: ', sum(yhat == -1))
    iso_results[contamination] = drop_rows_and_output_result(df_raw, mask_to_keep, f'iso_contamination_{contamination}')

In [18]:
for i in iso_results:
    print(i, iso_results[i][1])

0.05 {'smape': 80.3892237338859, 'mape': 779.3976743293541, 'mae': 41463328.0, 'rmse': 99280973.6, 'adj_r2': 0.49631429870037513}


In [None]:
df_raw = pd.read_csv('outliers/df_raw.csv')

with open(f'outliers/data.pickle', 'rb') as handle:
    data = pickle.load(handle)
with open(f'outliers/process.pickle', 'rb') as handle:
    process = pickle.load(handle)

X = pd.concat([data['X_train'], data['X_test'], data['X_val']])
y = np.concatenate([data['y_train'], data['y_test'], data['y_val']])
Xy = X.copy()
Xy['revenue'] = y

lof = LocalOutlierFactor(n_jobs=-1)
yhat = lof.fit_predict(Xy)
mask_to_keep = yhat != -1
print('outliers: ', sum(yhat == -1))

lof_results = drop_rows_and_output_result(df_raw, mask_to_keep, f'outliers/lof')

In [23]:
print(lof_results[1])

{'smape': 80.53399847638956, 'mape': 717.054116369367, 'mae': 41526546.3, 'rmse': 100136195.9, 'adj_r2': 0.48482544878353123}


In [None]:
df_raw = pd.read_csv('outliers/df_raw.csv')

with open(f'outliers/df_full_no_reskew_data.pickle', 'rb') as handle:
    data = pickle.load(handle)
with open(f'outliers/df_full_no_reskew_process.pickle', 'rb') as handle:
    process = pickle.load(handle)

X = pd.concat([data['X_train'], data['X_test'], data['X_val']])
y = np.concatenate([data['y_train'], data['y_test'], data['y_val']])
Xy = X.copy()
Xy['revenue'] = y

lof = LocalOutlierFactor(n_jobs=-1)
yhat = lof.fit_predict(Xy)
mask_to_keep = yhat == 1
print('outliers: ', sum(yhat == -1))
lof_results_no_reskew = drop_rows_and_output_result(df_raw, mask_to_keep, f'outliers/lof_no_reskew')

In [22]:
print(lof_results_no_reskew[1])

{'smape': 80.81342601447645, 'mape': 790.9061310569335, 'mae': 42286085.8, 'rmse': 102400490.5, 'adj_r2': 0.4043295902645877}


In [9]:
df_full = shuffle(pd.read_csv('outliers/df.csv'), random_state=0)
df_raw = pd.read_csv('outliers/df_raw.csv')

In [65]:
del_revenue = df_full.META__revenue < 100000
print('del revenue', sum(del_revenue))
del_year = df_raw.META__year < 1960
print('del year', sum(del_year))
del_budget = df_full.budget < 100000
print('del budget', sum(del_budget))
profitability_quantiles = df_raw.META__profitability.quantile([0.01, 0.99])
del_profitability = (df_raw.META__profitability < profitability_quantiles.iloc[0]) | (df_raw.META__profitability > profitability_quantiles.iloc[1])
print('del profitability', sum(del_profitability))


common_del = del_revenue | del_year | del_budget | del_profitability

for c in df_full.columns:
    if 'profit' in c:
        quantiles = df_full[c].quantile([0.005, 0.995])
        print(c, quantiles.iloc[0], quantiles.iloc[1])
        del_pr = (df_full[c] < quantiles.iloc[0]) | (df_full[c] > quantiles.iloc[1])
        print(c, sum(del_pr))
        common_del = common_del | del_pr
print('common del', sum(common_del))
mask_to_keep = ~common_del
print(sum(mask_to_keep))

del revenue 338
del year 236
del budget 228
del profitability 150
production_company_1_avg_profit -25582026.47 531870114.62999946
production_company_1_avg_profit 50
production_company_2_avg_profit -29505344.73 479501103.95000017
production_company_2_avg_profit 38
production_company_3_avg_profit -37763245.285 576594041.6550019
production_company_3_avg_profit 24
cast_1_avg_profit -17955340.985 466358733.3400005
cast_1_avg_profit 58
cast_2_avg_profit -22613313.2 515134067.1599946
cast_2_avg_profit 54
cast_3_avg_profit -23760753.299999997 566813288.9000019
cast_3_avg_profit 50
cast_4_avg_profit -28205837.58 480897833.9200022
cast_4_avg_profit 46
cast_5_avg_profit -35858384.120000005 426095410.3599998
cast_5_avg_profit 42
cast_6_avg_profit -29000000.0 469479050.3800031
cast_6_avg_profit 37
cast_7_avg_profit -35252431.44 386006740.0
cast_7_avg_profit 33
cast_8_avg_profit -34318244.58 476673308.00499964
cast_8_avg_profit 30
crew__sound__music_editor_avg_profit -33883639.78 551002710.1599996
c

In [None]:
wo_mb_cols = [c for c in df_full.columns if not ('crew' in c and 'movies_before' in c)]
df_mb = df_full[wo_mb_cols]
mask_to_keep = ~common_del
res_drops = drop_rows_and_output_result(df_raw, mask_to_keep, f'outliers/try')

In [67]:
res_drops[1]

{'smape': 76.29234777780124,
 'mape': 241.38499922717833,
 'mae': 39967714.0,
 'rmse': 93752979.6,
 'adj_r2': 0.2977592547017185}

In [14]:




full_raw = shuffle(pd.read_csv(f'datasets/dataset_all.csv'), random_state=0)

feature_sets = load('feature_sets.joblib')

with open('processed/dataset_all_data.pickle', 'rb') as handle:
    data = pickle.load(handle)
    
with open('processed/dataset_all_process.pickle', 'rb') as handle:
    process = pickle.load(handle)

X = pd.concat([data['X_train'], data['X_test'], data['X_val']])
y = np.concatenate([data['y_train'], data['y_test'], data['y_val']])
X = X[feature_sets[2]]

with open('processed/dataset_all_no_process_data.pickle', 'rb') as handle:
    raw_data = pickle.load(handle)
X_raw = pd.concat([raw_data['X_train'], raw_data['X_test'], raw_data['X_val']])
y_raw = pd.concat([raw_data['y_train'], raw_data['y_test'], raw_data['y_val']])
X_raw = X_raw[feature_sets[2]]

81.0005169305043
778.262245592468
41670831.5
99801069.1
0.4860886432172409


In [None]:
cv10_nn_wo_model = [(cv_results[i][0][0], cv_results[i][1]['X_test']) for i in range(10)]
dump(cv10_nn_wo_model, 'cv10_nn.joblib')

In [5]:
cv_nn = load('cv10_nn.joblib')

all_movies = pd.DataFrame(columns=full_raw.columns)
X_train_ = pd.DataFrame(columns=cv_nn[0][1].columns)
all_test_a = []
all_test_p = []
for i in range(10):
    batch_df = full_raw.loc[cv_nn[i][1].index, :]
    all_movies = pd.concat([all_movies, batch_df])
    X_train_ = pd.concat([X_train_, cv_nn[i][1]])
    all_test_a.extend(cv_nn[i][0]['result'][2])
    all_test_p.extend(cv_nn[i][0]['result'][3])

In [6]:
A = np.array(all_test_a)
P = np.array(all_test_p)

mape_list = 100*np.abs((A - P)/A)
mae_list = np.absolute(A - P)

movies = []
for i in range(len(A)):
    movies.append({
        'movie': all_movies.iloc[[i], :].to_dict(orient='index'),
        'predicted': P[i],
        'actual': A[i],
        'mape': mape_list[i],
        'mae': mae_list[i],
    })

In [7]:
sorted_mape = sorted(movies, key=lambda m: m['mape'], reverse=True)
sorted_mae = sorted(movies, key=lambda m: m['mae'], reverse=True)

dump(sorted_mape, 'sorted_mape_nn.joblib')
dump(sorted_mae, 'sorted_mae_nn.joblib')

['sorted_mae_nn.joblib']

In [51]:
sorted_mape = load('sorted_mape_nn.joblib')
sorted_mae = load('sorted_mae_nn.joblib')

bad_mape_index = [list(sorted_mape[i]['movie'].keys())[0] for i in range(100)]
bad_mae_index = [list(sorted_mae[i]['movie'].keys())[0] for i in range(100)]

In [52]:
for i in range(10):
    print('=================')
    movie = sorted_mape[i]
    movie_index = list(movie['movie'].keys())[0]
    movie_dict = movie['movie'][movie_index]
    print(f"index\t{movie_index}")
    print(f"title\t{movie_dict['META__title']}")
    print(f"revenue\t{movie_dict['META__revenue']}")
    print(f"budget\t{movie_dict['budget']}")
    print(f"predicted\t{movie['predicted']}")
    print(f"actual\t{movie['actual']}")
    print(f"mape\t{movie['mape']}")
    print(f"mae\t{movie['mae']}")

index	3837
title	The Room
revenue	1800
budget	6000000
predicted	6518502.0
actual	1800.0
mape	362039.0
mae	6516702.0
index	5642
title	Philadelphia Experiment II
revenue	2970
budget	5000000
predicted	5489262.0
actual	2970.0
mape	184723.63636363635
mae	5486292.0
index	6021
title	The Sacrament
revenue	9221
budget	4000000
predicted	16563665.0
actual	9221.0
mape	179529.8123847739
mae	16554444.0
index	3554
title	The Perfect Circle
revenue	1000
budget	1000
predicted	1683464.0
actual	1000.0
mape	168246.4
mae	1682464.0
index	3285
title	Fishtales
revenue	9216
budget	14000000
predicted	12208473.0
actual	9216.0
mape	132370.41015625
mae	12199257.0
index	3725
title	Foodfight!
revenue	73706
budget	65000000
predicted	80301592.0
actual	73706.0
mape	108848.51436789407
mae	80227886.0
index	5327
title	Area 51
revenue	7556
budget	5000000
predicted	7515991.0
actual	7556.0
mape	99370.50026469032
mae	7508435.0
index	505
title	City Lights
revenue	19181
budget	1500000
predicted	17663120.0
actual	19181.0
mape	919

In [53]:
for i in range(10):
    print('=================')
    movie = sorted_mae[i]
    movie_index = list(movie['movie'].keys())[0]
    movie_dict = movie['movie'][movie_index]
    print(f"index\t{movie_index}")
    print(f"title\t{movie_dict['META__title']}")
    print(f"revenue\t{movie_dict['META__revenue']}")
    print(f"budget\t{movie_dict['budget']}")
    print(f"predicted\t{movie['predicted']}")
    print(f"actual\t{movie['actual']}")
    print(f"mape\t{movie['mape']}")
    print(f"mae\t{movie['mae']}")

index	3989
title	Avatar
revenue	2787965087
budget	237000000
predicted	743898432.0
actual	2787965087.0
mape	73.31751263784746
mae	2044066655.0
index	6890
title	Avengers: Endgame
revenue	2797800564
budget	356000000
predicted	1093104000.0
actual	2797800564.0
mape	60.92988134804022
mae	1704696564.0
index	320
title	Titanic
revenue	1845034188
budget	200000000
predicted	439773376.0
actual	1845034188.0
mape	76.16448633525266
mae	1405260812.0
index	4616
title	Star Wars: The Force Awakens
revenue	2068223624
budget	245000000
predicted	663310976.0
actual	2068223624.0
mape	67.92846922823854
mae	1404912648.0
index	4583
title	Jurassic World
revenue	1671713208
budget	150000000
predicted	428131456.0
actual	1671713208.0
mape	74.38965882717366
mae	1243581752.0
index	6891
title	Avengers: Infinity War
revenue	2046239637
budget	300000000
predicted	895192064.0
actual	2046239637.0
mape	56.25184617611823
mae	1151047573.0
index	7320
title	Furious 7
revenue	1506249360
budget	190000000
predicted	430888032.0
actua

In [54]:
def split_process_outliers_df(df_raw, features, train=0.8, test=0.1):
    def get_train_test_revenue(df):
        df['revenue'] = df['META__revenue']
        X = df.drop(['revenue'], axis=1)
        y = df['revenue']
        X = X[[col for col in X.columns if col in features]]
        return X, y

    df = shuffle(df_raw, random_state=0)
    num_in_train = int(df.shape[0]*0.8)
    num_in_test = int(df.shape[0]*0.1)
    df_train = df[:num_in_train].copy()
    df_test = df[num_in_train:num_in_train+num_in_test].copy()
    df_val = df[num_in_train+num_in_test:].copy()
    X_train, y_train = get_train_test_revenue(df_train)
    X_test, y_test = get_train_test_revenue(df_test)
    X_val, y_val = get_train_test_revenue(df_val)
    
    data = {}
    imputer_func = KNNImputer(n_neighbors=30, weights='distance')
    process = Process(X_train, X_test, X_val, y_train, y_test, y_val, imputer='func', imputer_func=imputer_func).skew_X().skew_y().fill_nan()
    data['X_train'], data['X_test'], data['X_val'], data['y_train'], data['y_test'], data['y_val'] = process.return_processed()
    return data, process

In [None]:
cv_outliers = {}
for num_outliers in [50, 100, 250, 500, 1000]:
    print("#######################################################")
    print(num_outliers, "outliers")
    print("#######################################################")
    cv_outliers[num_outliers] = []
    bad_mape_index = [list(sorted_mape[i]['movie'].keys())[0] for i in range(num_outliers)]

    df_raw = pd.read_csv('datasets/dataset_all.csv')
    drop_positions = [i for i in range(df_raw.shape[0]) if df_raw.index.values[i] in bad_mape_index]
    df_drop = df_raw.drop(bad_mape_index)
    data_outliers, process_outliers = split_process_outliers_df(df_drop, features=feature_sets[2])

    X_outliers = pd.concat([data_outliers['X_train'], data_outliers['X_test'], data_outliers['X_val']])
    y_outliers = np.concatenate([data_outliers['y_train'], data_outliers['y_test'], data_outliers['y_val']])

    split_num=0
    kf = KFold(n_splits=10)
    for train_index, test_index in kf.split(X_outliers, y_outliers):
        split_num+=1
        print('###################################')
        print(f'{split_num}:\tsplit num')
        X_train, X_test = X_outliers.iloc[train_index, :], X_outliers.iloc[test_index, :]
        y_train, y_test = y_outliers[train_index], y_outliers[test_index]
        data_part = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test}
        cv_outliers[num_outliers].append((run_nn(data_part, process, with_val=False), data_part))

In [60]:
for num_outliers in cv_outliers:
    print('########################')
    print(f'{num_outliers} outliers')
    for metric in ['smape', 'mape', 'mae', 'rmse', 'adj_r2']:
            print(f"{metric}\t{np.mean([cv_outliers[num_outliers][i][0][0]['test'][metric] for i in range(10)])}")

########################
50 outliers
smape	81.5270622418259
mape	526.8497158142545
mae	42033665.0
rmse	100605524.5
adj_r2	0.4993588031199866
########################
100 outliers
smape	81.79352823775858
mape	513.5482155442227
mae	42518482.7
rmse	102112952.7
adj_r2	0.5055141487241462
########################
250 outliers
smape	81.42302418363708
mape	390.7568752387924
mae	42501235.8
rmse	101487479.7
adj_r2	0.49547231879442377
########################
500 outliers
smape	81.03919788984942
mape	397.0456291781021
mae	42135243.6
rmse	103343059.1
adj_r2	0.47790268168860817
########################
1000 outliers
smape	78.937314080687
mape	740.9970386980442
mae	42340276.1
rmse	103470473.9
adj_r2	0.49577382043303675


In [61]:
cv_outliers_lgb = {}
for num_outliers in [50, 100, 250, 500, 1000]:
    print("#######################################################")
    print(num_outliers, "outliers")
    print("#######################################################")
    cv_outliers_lgb[num_outliers] = []
    bad_mape_index = [list(sorted_mape[i]['movie'].keys())[0] for i in range(num_outliers)]

    df_raw = pd.read_csv('datasets/dataset_all.csv')
    drop_positions = [i for i in range(df_raw.shape[0]) if df_raw.index.values[i] in bad_mape_index]
    df_drop = df_raw.drop(bad_mape_index)
    data_outliers, process_outliers = split_process_outliers_df(df_drop, features=feature_sets[2])

    X_outliers = pd.concat([data_outliers['X_train'], data_outliers['X_test'], data_outliers['X_val']])
    y_outliers = np.concatenate([data_outliers['y_train'], data_outliers['y_test'], data_outliers['y_val']])

    split_num=0
    kf = KFold(n_splits=10)
    for train_index, test_index in kf.split(X_outliers, y_outliers):
        split_num+=1
        print('###################################')
        print(f'{split_num}:\tsplit num')
        X_train, X_test = X_outliers.iloc[train_index, :], X_outliers.iloc[test_index, :]
        y_train, y_test = y_outliers[train_index], y_outliers[test_index]
        data_part = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test}
        cv_outliers_lgb[num_outliers].append((run_lgb(data_part, process, with_val=False), data_part))

#######################################################
50 outliers
#######################################################
###################################
1:	split num


KeyError: 50

In [None]:
cv10_lgb_results = []

split_num=0
kf = KFold(n_splits=10)
for train_index, test_index in kf.split(X, y):
    split_num+=1
    print('###################################')
    print(f'{split_num}:\tsplit num')
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y[train_index], y[test_index]
    data_part = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test}
    cv10_lgb_results.append((run_nn(data_part, process, with_val=False), data_part))

In [68]:
for metric in ['smape', 'mape', 'mae', 'rmse', 'adj_r2']:
    print(np.mean([cv10_lgb_results[i][0]['test'][metric] for i in range(10)]))

80.8674970783687
840.3828405942328
41836584.7
100194973.8
0.4840395163327765


In [69]:
cv10_lgb_wo_model = [cv10_lgb_results[i][0] for i in range(10)]
dump(cv10_lgb_wo_model, 'cv10_lgb.joblib')

['cv10_lgb.joblib']