In [16]:
import numpy as np
import pandas as pd
import time
import pickle


from joblib import dump, load
from sklearn.utils import shuffle
from sklearn.model_selection import KFold
from sklearn.impute import KNNImputer

pd.options.display.float_format = '{:20,.15f}'.format
pd.options.display.max_columns = None

In [17]:
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import cross_val_score
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import tensorflow.keras.backend as K
import inspect
from sklearn.model_selection import ShuffleSplit

from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.model_selection import ParameterSampler, ParameterGrid
import sklearn

In [18]:
from importlib import reload
# module references for reload
import process_df, output, run_models

from process_df import Process, create_average_columns, split_process_df, split_df
from output import output_metrics
from run_models import run_nn, run_lgb, run_rf, build_nn_model

reload(process_df)
reload(output)
reload(run_models)

# reimport in case changed
from process_df import Process, create_average_columns, split_process_df, split_df
from output import output_metrics
from run_models import run_nn, run_lgb, run_rf, build_nn_model

In [19]:
def get_df_work_columns(df):
    return df[[col for col in df_full.columns if not 'META' in col or col == 'META__revenue']]

In [12]:
df_raw = pd.read_csv('outliers/df_raw.csv', index_col='id')
df_full = df_raw.copy()
df_full_name = 'outliers/df_full.csv'
df_full = create_average_columns(df_full, verbose=0)
df_full.to_csv(df_full_name)

In [13]:
df_full = pd.read_csv(df_full_name, index_col='id')

data_raw = split_df(get_df_work_columns(df_full))
with open(f'{df_full_name}_raw_data.pickle', 'wb') as handle:
    pickle.dump(data_raw, handle)

data, process = split_process_df(get_df_work_columns(df_full))
with open(f'{df_full_name}_data.pickle', 'wb') as handle:
    pickle.dump(data, handle)
with open(f'{df_full_name}_process.pickle', 'wb') as handle:
    pickle.dump(process, handle)

In [20]:
df_full_name = 'outliers/df_full.csv'

df_full = pd.read_csv(df_full_name, index_col='id')
df_raw = pd.read_csv('outliers/df_raw.csv', index_col='id')

with open(f'{df_full_name}_raw_data.pickle', 'rb') as handle:
    data_raw = pickle.load(handle)
X_raw = pd.concat([data_raw['X_train'], data_raw['X_test'], data_raw['X_val']])
y_raw = pd.concat([data_raw['y_train'], data_raw['y_test'], data_raw['y_val']])

with open(f'{df_full_name}_data.pickle', 'rb') as handle:
    data = pickle.load(handle)
X = pd.concat([data['X_train'], data['X_test'], data['X_val']])
y = np.concatenate([data['y_train'], data['y_test'], data['y_val']])
    
with open(f'{df_full_name}_process.pickle', 'rb') as handle:
    process = pickle.load(handle)

In [21]:
del_revenue = df_full.META__revenue < 250000
print('del revenue', sum(del_revenue))
del_year = df_raw.META__year < 1970
print('del year', sum(del_year))
del_budget = df_full.budget < 250000
print('del budget', sum(del_budget))
profitability_quantiles = df_raw.META__profitability.quantile([0.1, 0.9])
del_profitability = (df_raw.META__profitability < profitability_quantiles.iloc[0]) | (df_raw.META__profitability > profitability_quantiles.iloc[1])
print('del profitability', sum(del_profitability))

common_del = del_revenue.astype(int) + del_year.astype(int) + del_budget.astype(int) + del_profitability.astype(int)

for c in df_full.columns:
    if 'profit' in c:
        quantiles = df_full[c].quantile([0.1, 0.9])
        print(c, quantiles.iloc[0], quantiles.iloc[1])
        del_pr = (df_full[c] < quantiles.iloc[0]) | (df_full[c] > quantiles.iloc[1])
        print(c, sum(del_pr))
        common_del = common_del.astype(int) + del_pr.astype(int)

for i in range(max(common_del)):
    print(i, sum(common_del > i))

del revenue 534
del year 382
del budget 364
del profitability 1500
META__profitability -4.424662959957207 7.39777777777778
META__profitability 1500
META__year_avg_profitability -10.812845724436539 16.025788582653266
META__year_avg_profitability 1267
cast_1_avg_profit 1571591.4 143041135.70000005
cast_1_avg_profit 1160
cast_2_avg_profit 197068.00000000116 133116862.20000002
cast_2_avg_profit 1064
cast_3_avg_profit -328240.0 124520203.0
cast_3_avg_profit 986
cast_4_avg_profit -917845.5999999997 127494148.40000002
cast_4_avg_profit 910
cast_5_avg_profit -1728369.5999999999 124271503.39999998
cast_5_avg_profit 818
cast_6_avg_profit -2677605.1999999993 121049458.39999996
cast_6_avg_profit 746
cast_7_avg_profit -3828683.0 119660173.60000005
cast_7_avg_profit 657
cast_8_avg_profit -5025942.1 119892339.10000002
cast_8_avg_profit 586
production_company_1_avg_profit 273410.2000000003 124954690.80000004
production_company_1_avg_profit 966
production_company_2_avg_profit 527914.4000000003 12953267

In [23]:
from statistics import mean
from sklearn import metrics

def smape(A,P):
    return 100/len(A) * np.sum(2 * np.abs(P - A) / (np.abs(A) + np.abs(P)))

def mape(A, P):
    return 100 * np.mean(np.abs((A - P)/A))

def get_metrics(y_test, y_pred, cols):
    SS_Residual = sum((y_test - y_pred)**2)
    SS_Total = sum((y_test - np.mean(y_test))**2)
    r2 = 1 - (float(SS_Residual))/SS_Total
    adj_r2 = 1 - (1-r2)*(len(y_test) - 1)/(len(y_test) - cols -1)
    return  {
        'smape': smape(y_test, y_pred),
        'mape': mape(y_test, y_pred),
        'mae': np.rint(metrics.mean_absolute_error(y_test, y_pred)),
        'rmse': np.rint(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),
        'adj_r2': adj_r2,
    }


class OutliersEstimator(BaseEstimator, RegressorMixin):
    def __init__(self, outliers_detector=None, drop_index_list=None):
        self.outliers_detector = outliers_detector
        self.drop_index_list = drop_index_list

    def fit(self, X, y, **kwargs):
        def get_df_work_column(df):
            return df[[col for col in df.columns if not 'META' in col or col == 'META__revenue']]

        df_raw = pd.read_csv('outliers/df_raw.csv', index_col='id')

        print("######################################################")
        if self.outliers_detector is not None:
            print(self.outliers_detector.get_params())
            X_full = X.copy()
            X_full['revenue'] = y
            mask_to_drop = self.outliers_detector.fit_predict(X_full) == -1
            drop_index_list = df_raw[mask_to_drop].index
        else:
            drop_index_list = self.drop_index_list
        self.drop_index_list_ = drop_index_list
        print('Number of outliers: ', len(drop_index_list))
        new_df = df_raw.drop(drop_index_list)
        df = create_average_columns(new_df, verbose=0)
        print('Average created')
        df = get_df_work_columns(df)
        data, self.process_ = split_process_df(df)
        print('Processed')
        self.X_ = pd.concat([data['X_train'], data['X_test'], data['X_val']])
        self.y_ = np.concatenate([data['y_train'], data['y_test'], data['y_val']])
        return self

    def score(self, X=None, y=None):
        cv_results = []

        split_num=0
        kf = KFold(n_splits=5)
        for train_index, test_index in kf.split(self.X_, self.y_):
            split_num+=1
            print(f'{split_num}:\tsplit num')
            X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
            y_train, y_test = y[train_index], y[test_index]
            data_part = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test}
            estimator = NN_estimator()
            estimator.fit(X_train, y_train, process=self.process_)
            cv_results.append({'metrics': estimator.score(X_test, y_test), 'data': data_part})
        metrics = list(cv_results[0]['metrics'].keys())
        metric_results = {
            metric: np.mean(
                [cv_results[split_index]['metrics'][metric] for split_index in range(len(cv_results))]
            ) for metric in metrics
        }
        print(metric_results)
        return {'cv_iterations': cv_results, 'cv_metrics': metric_results}


class NN_estimator(BaseEstimator, RegressorMixin):
    def fit(self, X, y, **kwargs):
        self.process_ = kwargs['process']
        input_shape = len(data['X_train'].keys())
        model = build_nn_model(input_shape)

        es = EarlyStopping(
            monitor='val_loss', 
            mode='min', 
            verbose=1, 
            patience=20)

        model.fit(X, y,
            epochs=10000,
            validation_split=0.1,
            verbose=0,
            batch_size=256,
            shuffle=True,
            callbacks=[es],
            use_multiprocessing=True)
        self.model_ = model
        return self
    
    def score(self, X, y):
        def mape(y, yhat):
            return 100 * np.mean(np.abs((y - yhat)/y))

        yhat = self.model_.predict(X.values)
        if len(yhat.shape) == 2 and yhat.shape[1] == 1:
            yhat = yhat.flatten()

        for apply_function in reversed(self.process_.y_process):
            y = apply_function(y)
            yhat = apply_function(yhat)

        return get_metrics(y, yhat, X.shape[1])

def param_search(estimator, param_dict, n_iter=None, seed=None):
    if n_iter is None:
        param_iter = ParameterGrid(param_dict)
    else:
        param_iter = ParameterSampler(param_dict,
                                      n_iter,
                                      random_state=seed)
 
    estimators = []
    for params in param_iter:
        new_estimator = sklearn.clone(estimator)
        new_estimator.set_params(**params)
        estimators.append(new_estimator)
    return estimators

In [24]:
###########################  custom outliers

full_custom = []
outlier_detector_number = 0
for i in range(3, 16):
    outlier_detector_number += 1
    print('outlier detector number', outlier_detector_number)
    drop_index_list = df_full[common_del > i].index
    outliers_estimator = OutliersEstimator(drop_index_list=drop_index_list)
    outliers_estimator.fit(X, y)
    score = outliers_estimator.score(X,y)
    full_custom.append({
        'cv_iteration_metrics': [fold['metrics'] for fold in score['cv_iterations']],
        'cv_metrics': score['cv_metrics'],
        'drop_index_list': outliers_estimator.drop_index_list_,
    })

outlier detector number 1
######################################################
Number of outliers:  3376
Average created
Processed
1:	split num
Epoch 00181: early stopping
2:	split num
Epoch 00261: early stopping
3:	split num
Epoch 00178: early stopping
4:	split num
Epoch 00273: early stopping
5:	split num
Epoch 00157: early stopping
{'smape': 70.86815791236145, 'mape': 286.93412373611983, 'mae': 20782452.8, 'rmse': 41606764.2, 'adj_r2': 0.5565246562251759}
outlier detector number 2
######################################################
Number of outliers:  2319
Average created
Processed
1:	split num
Epoch 00200: early stopping
2:	split num
Epoch 00244: early stopping
3:	split num
Epoch 00161: early stopping
4:	split num
Epoch 00168: early stopping
5:	split num
Epoch 00131: early stopping
{'smape': 72.1188929978657, 'mape': 477.629558270916, 'mae': 21991288.8, 'rmse': 43999880.0, 'adj_r2': 0.6006168290628364}
outlier detector number 3
#################################################

In [25]:
dump(full_custom, 'outliers/full_custom.joblib')

['outliers/full_custom.joblib']

In [26]:
###########################  ISO forest 
param_grid = {
    'n_estimators': sp_randint(200, 10000),
    'max_samples': sp_randint(256, 7500),
    'max_features': sp_uniform(loc=0.5, scale=0.5),
    'contamination': sp_uniform(loc=0.0001, scale=0.1),
}
outlier_detectors = param_search(IsolationForest(n_jobs=-1, random_state=0), param_grid, n_iter=25)

full_iso = []
outlier_detector_number = 0
for outlier_detector in outlier_detectors:
    outlier_detector_number += 1
    print('outlier detector number', outlier_detector_number)
    outliers_estimator = OutliersEstimator(outliers_detector=outlier_detector)
    outliers_estimator.fit(X, y)
    score = outliers_estimator.score(X,y)
    full_iso.append({
        'cv_iteration_metrics': [fold['metrics'] for fold in score['cv_iterations']],
        'cv_metrics': score['cv_metrics'],
        'drop_index_list': outliers_estimator.drop_index_list_,
    })

outlier detector number 1
######################################################
{'behaviour': 'deprecated', 'bootstrap': False, 'contamination': 0.04122016554920903, 'max_features': 0.5193714626152008, 'max_samples': 4431, 'n_estimators': 7765, 'n_jobs': -1, 'random_state': 0, 'verbose': 0, 'warm_start': False}
Number of outliers:  309
Average created
Processed
1:	split num
Epoch 00226: early stopping
2:	split num
Epoch 00194: early stopping
3:	split num
Epoch 00171: early stopping
4:	split num
Epoch 00126: early stopping
5:	split num
Epoch 00153: early stopping
{'smape': 79.63032620515492, 'mape': 796.2565406168256, 'mae': 40337263.8, 'rmse': 95435811.6, 'adj_r2': 0.5832925694468117}
outlier detector number 2
######################################################
{'behaviour': 'deprecated', 'bootstrap': False, 'contamination': 0.0769459127431091, 'max_features': 0.7164218392954724, 'max_samples': 1048, 'n_estimators': 316, 'n_jobs': -1, 'random_state': 0, 'verbose': 0, 'warm_start': 

In [27]:
dump(full_iso, 'outliers/full_iso.joblib')

['outliers/full_iso.joblib']

In [207]:
reduced_iso = [{'metrics': x['metrics']['cv_metrics'], 'params': x['estimator'].outliers_detector.get_params()} for x in full_iso]

In [208]:
dump(reduced_iso, 'outliers/reduced_iso.joblib')

[&#39;outliers/reduced_iso.joblib&#39;]

In [193]:
np.argmin([x['metrics']['cv_metrics']['mae'] for x in full_iso])

1

In [224]:
sorted(full_iso, key=lambda x: x['metrics']['cv_metrics']['mae'])[0]['metrics']['cv_metrics']

{&#39;smape&#39;: 78.92044432135074,
 &#39;mape&#39;: 794.1533429747885,
 &#39;mae&#39;: 39584212.2,
 &#39;rmse&#39;: 92004337.6,
 &#39;adj_r2&#39;: 0.5765857942632931}

In [225]:
full_iso[0]['estimator'].outliers_detector

LocalOutlierFactor(algorithm=&#39;ball_tree&#39;, contamination=0.055709011589515606,
                   n_jobs=-1, n_neighbors=26)

In [222]:
estim = IsolationForest(n_jobs=-1, random_state=0, max_features=0.5, max_samples=1000, n_estimators=2000)
outliers_estimator = OutliersEstimator(outliers_detector=estim)
outliers_estimator.fit(X, y)
best_iso = {'metrics': outliers_estimator.score(X, y), 'estimator': estim}

######################################################
{&#39;behaviour&#39;: &#39;deprecated&#39;, &#39;bootstrap&#39;: False, &#39;contamination&#39;: &#39;auto&#39;, &#39;max_features&#39;: 0.5, &#39;max_samples&#39;: 1000, &#39;n_estimators&#39;: 2000, &#39;n_jobs&#39;: -1, &#39;random_state&#39;: 0, &#39;verbose&#39;: 0, &#39;warm_start&#39;: False}
Number of outliers:  183
Average created
Processed
1:	split num
Epoch 00143: early stopping
2:	split num
Epoch 00180: early stopping
3:	split num
Epoch 00114: early stopping
4:	split num
Epoch 00229: early stopping
5:	split num
Epoch 00168: early stopping
{&#39;smape&#39;: 79.13025062079224, &#39;mape&#39;: 719.794957718316, &#39;mae&#39;: 40525757.6, &#39;rmse&#39;: 95366367.8, &#39;adj_r2&#39;: 0.5840590392307148}


In [28]:
from sklearn.neighbors import LocalOutlierFactor
from scipy.spatial import distance
###########################  Local outlier factor
param_grid = {
    'n_neighbors': sp_randint(1, 30),
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'contamination': sp_uniform(0.001, 0.1),
}
outlier_detectors = param_search(LocalOutlierFactor(n_jobs=-1), param_grid, n_iter=25)

full_lof = []
outlier_detector_number = 0
for outlier_detector in outlier_detectors:
    outlier_detector_number += 1
    print('outlier detector number', outlier_detector_number)
    outliers_estimator = OutliersEstimator(outliers_detector=outlier_detector)
    outliers_estimator.fit(X, y)
    score = outliers_estimator.score(X,y)
    full_lof.append({
        'cv_iteration_metrics': [fold['metrics'] for fold in score['cv_iterations']],
        'cv_metrics': score['cv_metrics'],
        'drop_index_list': outliers_estimator.drop_index_list_,
    })

outlier detector number 1
######################################################
{'algorithm': 'ball_tree', 'contamination': 0.0394526171193084, 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': -1, 'n_neighbors': 21, 'novelty': False, 'p': 2}
Number of outliers:  296
Average created
Processed
1:	split num
Epoch 00206: early stopping
2:	split num
Epoch 00134: early stopping
3:	split num
Epoch 00154: early stopping
4:	split num
Epoch 00197: early stopping
5:	split num
Epoch 00126: early stopping
{'smape': 79.79423137257612, 'mape': 747.003870257744, 'mae': 40608926.6, 'rmse': 96391540.8, 'adj_r2': 0.5767083786518814}
outlier detector number 2
######################################################
{'algorithm': 'brute', 'contamination': 0.0910360129864193, 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': -1, 'n_neighbors': 26, 'novelty': False, 'p': 2}
Number of outliers:  683
Average created
Processed
1:	split num
Epoch 00208: early stoppin

In [29]:
dump(full_lof, 'outliers/full_lof.joblib')

['outliers/full_lof.joblib']

In [30]:
from sklearn.svm import OneClassSVM
###########################  One class SVM
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto'],
    'nu': sp_uniform(0.0001, 0.12),
}
outlier_detectors = param_search(OneClassSVM(), param_grid, n_iter=25)

full_ocsvm = []
outlier_detector_number = 0
for outlier_detector in outlier_detectors:
    outlier_detector_number += 1
    print('outlier detector number', outlier_detector_number)
    outliers_estimator = OutliersEstimator(outliers_detector=outlier_detector)
    outliers_estimator.fit(X, y)
    score = outliers_estimator.score(X,y)
    full_ocsvm.append({
        'cv_iteration_metrics': [fold['metrics'] for fold in score['cv_iterations']],
        'cv_metrics': score['cv_metrics'],
        'drop_index_list': outliers_estimator.drop_index_list_,
    })

outlier detector number 1
######################################################
{'cache_size': 200, 'coef0': 0.0, 'degree': 3, 'gamma': 'scale', 'kernel': 'sigmoid', 'max_iter': -1, 'nu': 0.04815250938208488, 'shrinking': True, 'tol': 0.001, 'verbose': False}
Number of outliers:  357
Average created
Processed
1:	split num
Epoch 00142: early stopping
2:	split num
Epoch 00184: early stopping
3:	split num
Epoch 00199: early stopping
4:	split num
Epoch 00165: early stopping
5:	split num
Epoch 00195: early stopping
{'smape': 79.65788202426072, 'mape': 864.7271747418145, 'mae': 40184171.4, 'rmse': 93718036.4, 'adj_r2': 0.5975068818791802}
outlier detector number 2
######################################################
{'cache_size': 200, 'coef0': 0.0, 'degree': 3, 'gamma': 'auto', 'kernel': 'sigmoid', 'max_iter': -1, 'nu': 0.08115585290025654, 'shrinking': True, 'tol': 0.001, 'verbose': False}
Number of outliers:  608
Average created
Processed
1:	split num
Epoch 00112: early stopping
2:	spl

In [31]:
dump(full_ocsvm, 'outliers/full_ocsvm.joblib')

['outliers/full_ocsvm.joblib']

In [79]:
def run_cv(X, y, process):
    cv_results = []

    split_num=0
    n_splits=10
    kf = KFold(n_splits=n_splits)
    for train_index, test_index in kf.split(X, y):
        split_num+=1
        print('###################################')
        print(f'{split_num}:\tsplit num')
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y[train_index], y[test_index]
        data_part = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test}
        cv_results.append({'metrics': run_nn(data_part, process, with_val=False), 'data': data_part})
    metrics = ['smape', 'mape', 'mae', 'rmse', 'adj_r2']
    metric_results = {metric: np.mean([cv_results[split_index]['metrics']['test'][metric] for split_index in range(n_splits)]) for metric in metrics}
    return {'cv_iterations': cv_results, 'cv_metrics': metric_results}

In [84]:
def drop_rows_and_output_result(df, drop_index_list, dirname):
    new_df = df.drop(drop_index_list)
    if not os.path.exists(dirname):
        os.mkdir(dirname)
    new_df.to_csv(f'{dirname}/df_raw.csv')
    df = create_average_columns(new_df, verbose=0)
    print('average created')

    df.to_csv(f'{dirname}/df_all_columns.csv')
    df = get_df_work_columns(df)
    df.to_csv(f'{dirname}/df.csv')

    data, process = split_process_df(df)
    print('processed')
    with open(f'{dirname}/data.pickle', 'wb') as handle:
        pickle.dump(data, handle)
    with open(f'{dirname}/process.pickle', 'wb') as handle:
        pickle.dump(process, handle)

    X = pd.concat([data['X_train'], data['X_test'], data['X_val']])
    y = np.concatenate([data['y_train'], data['y_test'], data['y_val']])

    return run_cv(X, y, process)


In [None]:

df_raw = pd.read_csv('outliers/df_raw.csv', index_col='id')

start = time.time()
o__full = drop_rows_and_output_result(df_raw, [], 'outliers/full')
print(time.time() - start)


In [86]:
dump(o__full, 'outliers/full/results.joblib')

['outliers/full/results.joblib']

In [91]:
o__full['cv_metrics']

{'smape': 80.04576707883504,
 'mape': 728.2753358367537,
 'mae': 41332135.4,
 'rmse': 97884334.3,
 'adj_r2': 0.4765097127225809}

In [94]:
with open(f'outliers/data.pickle', 'rb') as handle:
    data = pickle.load(handle)
with open(f'outliers/process.pickle', 'rb') as handle:
    process = pickle.load(handle)

X = pd.concat([data['X_train'], data['X_test'], data['X_val']])
y = np.concatenate([data['y_train'], data['y_test'], data['y_val']])

In [18]:
from sklearn.ensemble import IsolationForest
iso_results = {}
df_raw = pd.read_csv('outliers/df_raw.csv', index_col='id')

for n_estimators in [50, 100, 250, 500, 2000]:
    iso = IsolationForest(random_state=0, n_jobs=-1, 
        n_estimators=n_estimators)
    yhat = iso.fit_predict(X)
    mask_to_keep = yhat != -1
    print('outliers: ', sum(yhat == -1))
    df_raw[mask_to_keep].index
    # iso_results[contamination] = drop_rows_and_output_result(df_raw, mask_to_keep, f'iso_contamination_{contamination}')

NameError: name &#39;pd&#39; is not defined

In [9]:
for i in iso_results:
    print(i, iso_results[i][1])

0.05 {'smape': 80.05861126141413, 'mape': 793.7242540662393, 'mae': 41239250.8, 'rmse': 96947720.8, 'adj_r2': 0.5125935171000893}


In [76]:
for i in iso_results:
    print(i, iso_results[i][1])

0.01 {'smape': 79.8379840350851, 'mape': 803.3126516871604, 'mae': 41064231.4, 'rmse': 97550786.2, 'adj_r2': 0.5103597131813838}
0.05 {'smape': 79.80010070996218, 'mape': 829.701486791581, 'mae': 40850199.6, 'rmse': 96056112.8, 'adj_r2': 0.525200785872264}
0.1 {'smape': 79.96970676386368, 'mape': 770.8597545685058, 'mae': 41238765.5, 'rmse': 98997202.7, 'adj_r2': 0.4968208757292799}


In [None]:
iso_results = {}
df_raw = pd.read_csv('outliers/df_raw.csv')
df_full = shuffle(pd.read_csv('outliers/df.csv'), random_state=0)

data, process = split_process_df(df_full, reskew=False)
with open('outliers/df_full_no_reskew_data.pickle', 'wb') as handle:
    pickle.dump(data, handle)
with open('outliers/df_full_no_reskew_process.pickle', 'wb') as handle:
    pickle.dump(process, handle)

X = pd.concat([data['X_train'], data['X_test'], data['X_val']])
y = np.concatenate([data['y_train'], data['y_test'], data['y_val']])
Xy = X.copy()
Xy['revenue'] = y

for contamination in [0.05]:
    print(f'contamination {contamination}')
    iso = IsolationForest(random_state=0, n_jobs=-1, contamination=contamination)
    yhat = iso.fit_predict(Xy)
    mask_to_keep = yhat != -1
    print('outliers: ', sum(yhat == -1))
    iso_results[contamination] = drop_rows_and_output_result(df_raw, mask_to_keep, f'iso_contamination_{contamination}')

In [18]:
for i in iso_results:
    print(i, iso_results[i][1])

0.05 {'smape': 80.3892237338859, 'mape': 779.3976743293541, 'mae': 41463328.0, 'rmse': 99280973.6, 'adj_r2': 0.49631429870037513}


In [None]:
from sklearn.neighbors import LocalOutlierFactor
df_raw = pd.read_csv('outliers/df_raw.csv')

with open(f'outliers/data.pickle', 'rb') as handle:
    data = pickle.load(handle)
with open(f'outliers/process.pickle', 'rb') as handle:
    process = pickle.load(handle)

X = pd.concat([data['X_train'], data['X_test'], data['X_val']])
y = np.concatenate([data['y_train'], data['y_test'], data['y_val']])
Xy = X.copy()
Xy['revenue'] = y

lof = LocalOutlierFactor(n_jobs=-1)
yhat = lof.fit_predict(Xy)
mask_to_keep = yhat != -1
print('outliers: ', sum(yhat == -1))

lof_results = drop_rows_and_output_result(df_raw, mask_to_keep, f'outliers/lof')

In [23]:
print(lof_results[1])

{'smape': 80.53399847638956, 'mape': 717.054116369367, 'mae': 41526546.3, 'rmse': 100136195.9, 'adj_r2': 0.48482544878353123}


In [None]:
df_raw = pd.read_csv('outliers/df_raw.csv')

with open(f'outliers/df_full_no_reskew_data.pickle', 'rb') as handle:
    data = pickle.load(handle)
with open(f'outliers/df_full_no_reskew_process.pickle', 'rb') as handle:
    process = pickle.load(handle)

X = pd.concat([data['X_train'], data['X_test'], data['X_val']])
y = np.concatenate([data['y_train'], data['y_test'], data['y_val']])
Xy = X.copy()
Xy['revenue'] = y

lof = LocalOutlierFactor(n_jobs=-1)
yhat = lof.fit_predict(Xy)
mask_to_keep = yhat == 1
print('outliers: ', sum(yhat == -1))
lof_results_no_reskew = drop_rows_and_output_result(df_raw, mask_to_keep, f'outliers/lof_no_reskew')

In [22]:
print(lof_results_no_reskew[1])

{'smape': 80.81342601447645, 'mape': 790.9061310569335, 'mae': 42286085.8, 'rmse': 102400490.5, 'adj_r2': 0.4043295902645877}


In [9]:
df_full = shuffle(pd.read_csv('outliers/df.csv'), random_state=0)
df_raw = pd.read_csv('outliers/df_raw.csv')

In [65]:
del_revenue = df_full.META__revenue < 100000
print('del revenue', sum(del_revenue))
del_year = df_raw.META__year < 1960
print('del year', sum(del_year))
del_budget = df_full.budget < 100000
print('del budget', sum(del_budget))
profitability_quantiles = df_raw.META__profitability.quantile([0.01, 0.99])
del_profitability = (df_raw.META__profitability < profitability_quantiles.iloc[0]) | (df_raw.META__profitability > profitability_quantiles.iloc[1])
print('del profitability', sum(del_profitability))


common_del = del_revenue | del_year | del_budget | del_profitability

for c in df_full.columns:
    if 'profit' in c:
        quantiles = df_full[c].quantile([0.005, 0.995])
        print(c, quantiles.iloc[0], quantiles.iloc[1])
        del_pr = (df_full[c] < quantiles.iloc[0]) | (df_full[c] > quantiles.iloc[1])
        print(c, sum(del_pr))
        common_del = common_del | del_pr
print('common del', sum(common_del))
mask_to_keep = ~common_del
print(sum(mask_to_keep))

del revenue 338
del year 236
del budget 228
del profitability 150
production_company_1_avg_profit -25582026.47 531870114.62999946
production_company_1_avg_profit 50
production_company_2_avg_profit -29505344.73 479501103.95000017
production_company_2_avg_profit 38
production_company_3_avg_profit -37763245.285 576594041.6550019
production_company_3_avg_profit 24
cast_1_avg_profit -17955340.985 466358733.3400005
cast_1_avg_profit 58
cast_2_avg_profit -22613313.2 515134067.1599946
cast_2_avg_profit 54
cast_3_avg_profit -23760753.299999997 566813288.9000019
cast_3_avg_profit 50
cast_4_avg_profit -28205837.58 480897833.9200022
cast_4_avg_profit 46
cast_5_avg_profit -35858384.120000005 426095410.3599998
cast_5_avg_profit 42
cast_6_avg_profit -29000000.0 469479050.3800031
cast_6_avg_profit 37
cast_7_avg_profit -35252431.44 386006740.0
cast_7_avg_profit 33
cast_8_avg_profit -34318244.58 476673308.00499964
cast_8_avg_profit 30
crew__sound__music_editor_avg_profit -33883639.78 551002710.1599996
c

In [None]:
wo_mb_cols = [c for c in df_full.columns if not ('crew' in c and 'movies_before' in c)]
df_mb = df_full[wo_mb_cols]
mask_to_keep = ~common_del
res_drops = drop_rows_and_output_result(df_raw, mask_to_keep, f'outliers/try')

In [67]:
res_drops[1]

{'smape': 76.29234777780124,
 'mape': 241.38499922717833,
 'mae': 39967714.0,
 'rmse': 93752979.6,
 'adj_r2': 0.2977592547017185}

In [14]:




full_raw = shuffle(pd.read_csv(f'datasets/dataset_all.csv'), random_state=0)

feature_sets = load('feature_sets.joblib')

with open('processed/dataset_all_data.pickle', 'rb') as handle:
    data = pickle.load(handle)
    
with open('processed/dataset_all_process.pickle', 'rb') as handle:
    process = pickle.load(handle)

X = pd.concat([data['X_train'], data['X_test'], data['X_val']])
y = np.concatenate([data['y_train'], data['y_test'], data['y_val']])
X = X[feature_sets[2]]

with open('processed/dataset_all_no_process_data.pickle', 'rb') as handle:
    raw_data = pickle.load(handle)
X_raw = pd.concat([raw_data['X_train'], raw_data['X_test'], raw_data['X_val']])
y_raw = pd.concat([raw_data['y_train'], raw_data['y_test'], raw_data['y_val']])
X_raw = X_raw[feature_sets[2]]

81.0005169305043
778.262245592468
41670831.5
99801069.1
0.4860886432172409


In [None]:
cv10_nn_wo_model = [(cv_results[i][0][0], cv_results[i][1]['X_test']) for i in range(10)]
dump(cv10_nn_wo_model, 'cv10_nn.joblib')

In [5]:
cv_nn = load('cv10_nn.joblib')

all_movies = pd.DataFrame(columns=full_raw.columns)
X_train_ = pd.DataFrame(columns=cv_nn[0][1].columns)
all_test_a = []
all_test_p = []
for i in range(10):
    batch_df = full_raw.loc[cv_nn[i][1].index, :]
    all_movies = pd.concat([all_movies, batch_df])
    X_train_ = pd.concat([X_train_, cv_nn[i][1]])
    all_test_a.extend(cv_nn[i][0]['result'][2])
    all_test_p.extend(cv_nn[i][0]['result'][3])

In [6]:
A = np.array(all_test_a)
P = np.array(all_test_p)

mape_list = 100*np.abs((A - P)/A)
mae_list = np.absolute(A - P)

movies = []
for i in range(len(A)):
    movies.append({
        'movie': all_movies.iloc[[i], :].to_dict(orient='index'),
        'predicted': P[i],
        'actual': A[i],
        'mape': mape_list[i],
        'mae': mae_list[i],
    })

In [7]:
sorted_mape = sorted(movies, key=lambda m: m['mape'], reverse=True)
sorted_mae = sorted(movies, key=lambda m: m['mae'], reverse=True)

dump(sorted_mape, 'sorted_mape_nn.joblib')
dump(sorted_mae, 'sorted_mae_nn.joblib')

['sorted_mae_nn.joblib']

In [51]:
sorted_mape = load('sorted_mape_nn.joblib')
sorted_mae = load('sorted_mae_nn.joblib')

bad_mape_index = [list(sorted_mape[i]['movie'].keys())[0] for i in range(100)]
bad_mae_index = [list(sorted_mae[i]['movie'].keys())[0] for i in range(100)]

In [52]:
for i in range(10):
    print('=================')
    movie = sorted_mape[i]
    movie_index = list(movie['movie'].keys())[0]
    movie_dict = movie['movie'][movie_index]
    print(f"index\t{movie_index}")
    print(f"title\t{movie_dict['META__title']}")
    print(f"revenue\t{movie_dict['META__revenue']}")
    print(f"budget\t{movie_dict['budget']}")
    print(f"predicted\t{movie['predicted']}")
    print(f"actual\t{movie['actual']}")
    print(f"mape\t{movie['mape']}")
    print(f"mae\t{movie['mae']}")

index	3837
title	The Room
revenue	1800
budget	6000000
predicted	6518502.0
actual	1800.0
mape	362039.0
mae	6516702.0
index	5642
title	Philadelphia Experiment II
revenue	2970
budget	5000000
predicted	5489262.0
actual	2970.0
mape	184723.63636363635
mae	5486292.0
index	6021
title	The Sacrament
revenue	9221
budget	4000000
predicted	16563665.0
actual	9221.0
mape	179529.8123847739
mae	16554444.0
index	3554
title	The Perfect Circle
revenue	1000
budget	1000
predicted	1683464.0
actual	1000.0
mape	168246.4
mae	1682464.0
index	3285
title	Fishtales
revenue	9216
budget	14000000
predicted	12208473.0
actual	9216.0
mape	132370.41015625
mae	12199257.0
index	3725
title	Foodfight!
revenue	73706
budget	65000000
predicted	80301592.0
actual	73706.0
mape	108848.51436789407
mae	80227886.0
index	5327
title	Area 51
revenue	7556
budget	5000000
predicted	7515991.0
actual	7556.0
mape	99370.50026469032
mae	7508435.0
index	505
title	City Lights
revenue	19181
budget	1500000
predicted	17663120.0
actual	19181.0
mape	919

In [53]:
for i in range(10):
    print('=================')
    movie = sorted_mae[i]
    movie_index = list(movie['movie'].keys())[0]
    movie_dict = movie['movie'][movie_index]
    print(f"index\t{movie_index}")
    print(f"title\t{movie_dict['META__title']}")
    print(f"revenue\t{movie_dict['META__revenue']}")
    print(f"budget\t{movie_dict['budget']}")
    print(f"predicted\t{movie['predicted']}")
    print(f"actual\t{movie['actual']}")
    print(f"mape\t{movie['mape']}")
    print(f"mae\t{movie['mae']}")

index	3989
title	Avatar
revenue	2787965087
budget	237000000
predicted	743898432.0
actual	2787965087.0
mape	73.31751263784746
mae	2044066655.0
index	6890
title	Avengers: Endgame
revenue	2797800564
budget	356000000
predicted	1093104000.0
actual	2797800564.0
mape	60.92988134804022
mae	1704696564.0
index	320
title	Titanic
revenue	1845034188
budget	200000000
predicted	439773376.0
actual	1845034188.0
mape	76.16448633525266
mae	1405260812.0
index	4616
title	Star Wars: The Force Awakens
revenue	2068223624
budget	245000000
predicted	663310976.0
actual	2068223624.0
mape	67.92846922823854
mae	1404912648.0
index	4583
title	Jurassic World
revenue	1671713208
budget	150000000
predicted	428131456.0
actual	1671713208.0
mape	74.38965882717366
mae	1243581752.0
index	6891
title	Avengers: Infinity War
revenue	2046239637
budget	300000000
predicted	895192064.0
actual	2046239637.0
mape	56.25184617611823
mae	1151047573.0
index	7320
title	Furious 7
revenue	1506249360
budget	190000000
predicted	430888032.0
actua

In [54]:
def split_process_outliers_df(df_raw, features, train=0.8, test=0.1):
    def get_train_test_revenue(df):
        df['revenue'] = df['META__revenue']
        X = df.drop(['revenue'], axis=1)
        y = df['revenue']
        X = X[[col for col in X.columns if col in features]]
        return X, y

    df = shuffle(df_raw, random_state=0)
    num_in_train = int(df.shape[0]*0.8)
    num_in_test = int(df.shape[0]*0.1)
    df_train = df[:num_in_train].copy()
    df_test = df[num_in_train:num_in_train+num_in_test].copy()
    df_val = df[num_in_train+num_in_test:].copy()
    X_train, y_train = get_train_test_revenue(df_train)
    X_test, y_test = get_train_test_revenue(df_test)
    X_val, y_val = get_train_test_revenue(df_val)
    
    data = {}
    imputer_func = KNNImputer(n_neighbors=30, weights='distance')
    process = Process(X_train, X_test, X_val, y_train, y_test, y_val, imputer='func', imputer_func=imputer_func).skew_X().skew_y().fill_nan()
    data['X_train'], data['X_test'], data['X_val'], data['y_train'], data['y_test'], data['y_val'] = process.return_processed()
    return data, process

In [None]:
cv_outliers = {}
for num_outliers in [50, 100, 250, 500, 1000]:
    print("#######################################################")
    print(num_outliers, "outliers")
    print("#######################################################")
    cv_outliers[num_outliers] = []
    bad_mape_index = [list(sorted_mape[i]['movie'].keys())[0] for i in range(num_outliers)]

    df_raw = pd.read_csv('datasets/dataset_all.csv')
    drop_positions = [i for i in range(df_raw.shape[0]) if df_raw.index.values[i] in bad_mape_index]
    df_drop = df_raw.drop(bad_mape_index)
    data_outliers, process_outliers = split_process_outliers_df(df_drop, features=feature_sets[2])

    X_outliers = pd.concat([data_outliers['X_train'], data_outliers['X_test'], data_outliers['X_val']])
    y_outliers = np.concatenate([data_outliers['y_train'], data_outliers['y_test'], data_outliers['y_val']])

    split_num=0
    kf = KFold(n_splits=10)
    for train_index, test_index in kf.split(X_outliers, y_outliers):
        split_num+=1
        print('###################################')
        print(f'{split_num}:\tsplit num')
        X_train, X_test = X_outliers.iloc[train_index, :], X_outliers.iloc[test_index, :]
        y_train, y_test = y_outliers[train_index], y_outliers[test_index]
        data_part = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test}
        cv_outliers[num_outliers].append((run_nn(data_part, process, with_val=False), data_part))

In [60]:
for num_outliers in cv_outliers:
    print('########################')
    print(f'{num_outliers} outliers')
    for metric in ['smape', 'mape', 'mae', 'rmse', 'adj_r2']:
            print(f"{metric}\t{np.mean([cv_outliers[num_outliers][i][0][0]['test'][metric] for i in range(10)])}")

########################
50 outliers
smape	81.5270622418259
mape	526.8497158142545
mae	42033665.0
rmse	100605524.5
adj_r2	0.4993588031199866
########################
100 outliers
smape	81.79352823775858
mape	513.5482155442227
mae	42518482.7
rmse	102112952.7
adj_r2	0.5055141487241462
########################
250 outliers
smape	81.42302418363708
mape	390.7568752387924
mae	42501235.8
rmse	101487479.7
adj_r2	0.49547231879442377
########################
500 outliers
smape	81.03919788984942
mape	397.0456291781021
mae	42135243.6
rmse	103343059.1
adj_r2	0.47790268168860817
########################
1000 outliers
smape	78.937314080687
mape	740.9970386980442
mae	42340276.1
rmse	103470473.9
adj_r2	0.49577382043303675


In [61]:
cv_outliers_lgb = {}
for num_outliers in [50, 100, 250, 500, 1000]:
    print("#######################################################")
    print(num_outliers, "outliers")
    print("#######################################################")
    cv_outliers_lgb[num_outliers] = []
    bad_mape_index = [list(sorted_mape[i]['movie'].keys())[0] for i in range(num_outliers)]

    df_raw = pd.read_csv('datasets/dataset_all.csv')
    drop_positions = [i for i in range(df_raw.shape[0]) if df_raw.index.values[i] in bad_mape_index]
    df_drop = df_raw.drop(bad_mape_index)
    data_outliers, process_outliers = split_process_outliers_df(df_drop, features=feature_sets[2])

    X_outliers = pd.concat([data_outliers['X_train'], data_outliers['X_test'], data_outliers['X_val']])
    y_outliers = np.concatenate([data_outliers['y_train'], data_outliers['y_test'], data_outliers['y_val']])

    split_num=0
    kf = KFold(n_splits=10)
    for train_index, test_index in kf.split(X_outliers, y_outliers):
        split_num+=1
        print('###################################')
        print(f'{split_num}:\tsplit num')
        X_train, X_test = X_outliers.iloc[train_index, :], X_outliers.iloc[test_index, :]
        y_train, y_test = y_outliers[train_index], y_outliers[test_index]
        data_part = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test}
        cv_outliers_lgb[num_outliers].append((run_lgb(data_part, process, with_val=False), data_part))

#######################################################
50 outliers
#######################################################
###################################
1:	split num


KeyError: 50

In [None]:
cv10_lgb_results = []

split_num=0
kf = KFold(n_splits=10)
for train_index, test_index in kf.split(X, y):
    split_num+=1
    print('###################################')
    print(f'{split_num}:\tsplit num')
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y[train_index], y[test_index]
    data_part = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test}
    cv10_lgb_results.append((run_nn(data_part, process, with_val=False), data_part))

In [68]:
for metric in ['smape', 'mape', 'mae', 'rmse', 'adj_r2']:
    print(np.mean([cv10_lgb_results[i][0]['test'][metric] for i in range(10)]))

80.8674970783687
840.3828405942328
41836584.7
100194973.8
0.4840395163327765


In [69]:
cv10_lgb_wo_model = [cv10_lgb_results[i][0] for i in range(10)]
dump(cv10_lgb_wo_model, 'cv10_lgb.joblib')

['cv10_lgb.joblib']