In [None]:
import time
import pickle
from os import listdir
from os.path import isfile, join

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statistics import mean

from sklearn import metrics
from sklearn.utils import shuffle
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestRegressor
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import cross_validate
from sklearn.decomposition import PCA

from impyute.imputation import cs
pd.options.display.float_format = '{:20,.15f}'.format

from pprint import pprint

import copy
from sklearn.ensemble import ExtraTreesRegressor

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Activation, Dropout
from keras.layers.normalization import BatchNormalization
from tensorflow.keras.initializers import GlorotNormal, GlorotUniform, he_normal, he_uniform
from tensorflow.keras.optimizers import Adam, RMSprop, SGD, Adamax
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras import backend as K
import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling

from hyperopt import Trials, STATUS_OK, tpe, rand
from hyperas import optim
from hyperas.distributions import choice, uniform
import lightgbm as lgb

In [None]:
from process import Process

In [None]:
# Never truncate the column list when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)  

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` to smaller dtypes and print the memory saved.

    The frame is modified in place and also returned.

    * signed / unsigned integer columns are cast to the narrowest integer type
      of the same signedness whose range (bounds included) holds the column's
      min and max;
    * float columns are cast to float16, else float32, else float64 depending
      on their value range.  Note that float16 keeps only about three
      significant decimal digits, so values are rounded;
    * ``object`` columns become ``category``;
    * every other dtype (bool, datetime, timedelta, pandas extension dtypes
      such as an existing ``category``) is left untouched.

    Args:
        df: pandas DataFrame to shrink.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes have no generic numeric min/max based downcast.
        if not isinstance(col_type, np.dtype):
            continue

        kind = col_type.kind
        if kind == 'O':
            df[col] = df[col].astype('category')
            continue
        if kind not in ('i', 'u', 'f'):
            # bool, datetime, timedelta, complex, ...: nothing sensible to do.
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if kind == 'f':
            # Falls back to float64 when neither smaller type fits (this is
            # also the outcome for all-NaN / empty columns, whose min is NaN).
            target = np.float64
            for candidate in float_types:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    target = candidate
                    break
        else:
            # Inclusive bounds: a column spanning exactly [-128, 127] is int8.
            target = None
            candidates = signed_types if kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break

        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
from importlib import reload
# module references for reload
import process_class, output, run_models

# Re-execute every local helper module so edits made on disk are picked up
# without restarting the kernel.  run_models goes last.
# NOTE(review): this order assumes run_models may import from the other two - confirm.
reload(process_class)
reload(output)
reload(run_models)

# Bind the notebook-level names after the reload so they refer to the
# freshly executed objects.
from process_class import Process
from output import output_metrics
from run_models import run_nn, run_lgb

In [None]:
path = 'datasets'
# Plain files located directly in `path` (directories are ignored).
onlyfiles = [entry for entry in listdir(path) if isfile(join(path, entry))]
# Keep the names that mention 'dataset' but not 'raw', sorted alphabetically.
datasets = sorted(
    fname for fname in onlyfiles
    if 'dataset' in fname and 'raw' not in fname
)

In [None]:
# cols_to_exclude = ['genre__western', 'genre__documentary', 'genre__history', 'country__es', 'country__jp', 'country__ca', 'country__de', 'country__in', 'country__fr', 'country__ru', 'country__it', 'country__au', 'rating__nc-17', 'country__other', 'tag__satire', 'tag__neo_noir', 'tag__sadist', 'tag__cruelty', 'tag__dark', 'tag__storytelling', 'tag__sci_fi', 'tag__psychological', 'tag__absurd', 'tag__philosophical', 'tag__depressing', 'tag__plot_twist', 'tag__realism', 'tag__home_movie', 'tag__thought_provoking']
# cols_to_exclude = [col for col in df.columns if (('budget' in col or 'profit' in col) and ('crew' in col or 'cast' in col))]

def split_process_df(name, train=0.8, test=0.1):
    """Load ``datasets/<name>``, split it three ways and run the preprocessing chain.

    The rows are shuffled with a fixed seed (``random_state=0``) and then cut,
    in order, into a train slice, a test slice and a validation slice made of
    whatever rows remain.  The target is the ``META__revenue`` column; every
    column whose name contains ``META`` is excluded from the features.

    Args:
        name: CSV file name inside the ``datasets`` directory.
        train: fraction of the rows placed in the train slice.
        test: fraction of the rows placed in the test slice.

    Returns:
        ``(data, process)`` where ``data`` is a dict with the keys
        ``X_train, X_test, X_val, y_train, y_test, y_val`` holding the values
        returned by ``process.return_processed()``, and ``process`` is the
        ``Process`` object the chain was run on.

    Raises:
        ValueError: if a fraction is negative or ``train + test`` exceeds 1.
    """
    def get_train_test_revenue(df):
        """Return ``(X, y)``: non-META feature columns and the revenue target."""
        df['revenue'] = df['META__revenue']
        dff = df[[col for col in df.columns if 'META' not in col]]
        X = dff.drop(['revenue'], axis=1)
        y = dff['revenue']
        return X, y

    # Small tolerance so that pairs such as 0.7 + 0.3 are not rejected by
    # floating-point rounding.
    if train < 0 or test < 0 or train + test > 1 + 1e-9:
        raise ValueError(
            f'train and test must be non-negative fractions summing to at most 1, '
            f'got train={train}, test={test}'
        )

    df_raw = pd.read_csv(f'datasets/{name}')
    # df_raw = reduce_mem_usage(df_raw)
    df = shuffle(df_raw, random_state=0)

    # Use the requested fractions (they were previously hard-coded to 0.8 / 0.1).
    num_in_train = int(df.shape[0] * train)
    num_in_test = int(df.shape[0] * test)
    df_train = df[:num_in_train].copy()
    df_test = df[num_in_train:num_in_train+num_in_test].copy()
    df_val = df[num_in_train+num_in_test:].copy()
    X_train, y_train = get_train_test_revenue(df_train)
    X_test, y_test = get_train_test_revenue(df_test)
    X_val, y_val = get_train_test_revenue(df_val)

    data = {}
    imputer_func = KNNImputer(n_neighbors=30, weights='distance')
    process = Process(X_train, X_test, X_val, y_train, y_test, y_val, imputer='func', imputer_func=imputer_func).skew_X().skew_y().fill_nan()
    # Alternative chains that were tried:
#     process = Process(X_train, X_test, X_val, y_train, y_test, y_val, imputer='knn').skew_X().skew_y().robustscale_Y().fill_nan()
#     process = Process(X_train, X_test, X_val, y_train, y_test, y_val, imputer='knn').skew_X().skew_y().robustscale_X().fill_nan()
#     process = Process(X_train, X_test, X_val, y_train, y_test, y_val, imputer='knn').robustscale_X().robustscale_Y().skew_X().skew_y().fill_nan()
#     process = Process(X_train, X_test, X_val, y_train, y_test, y_val, imputer='knn').skew_X().skew_y().robustscale_X().robustscale_Y().fill_nan()
    data['X_train'], data['X_test'], data['X_val'], data['y_train'], data['y_test'], data['y_val'] = process.return_processed()
    return data, process

In [None]:
# Raw, unsplit copy of the full dataset, used below for inspection.
df_all = pd.read_csv('datasets/dataset_all.csv')

In [None]:
# One line per column: "<number of missing values>\t: <column name>".
null_counts = df_all.isnull().sum()
for col, n_missing in null_counts.items():
    print(f'{n_missing}\t: {col}')

In [None]:
# Split and preprocess the full dataset with the default split fractions.
data, process = split_process_df('dataset_all.csv')

In [None]:
def split_df(name, train=0.8, test=0.1):
    """Load ``datasets/<name>`` and split it three ways without any preprocessing.

    The rows are shuffled with a fixed seed (``random_state=0``) and then cut,
    in order, into a train slice, a test slice and a validation slice made of
    whatever rows remain.  The target is the ``META__revenue`` column; every
    column whose name contains ``META`` is excluded from the features.

    Args:
        name: CSV file name inside the ``datasets`` directory.
        train: fraction of the rows placed in the train slice.
        test: fraction of the rows placed in the test slice.

    Returns:
        dict with the keys ``X_train, X_test, X_val, y_train, y_test, y_val``.

    Raises:
        ValueError: if a fraction is negative or ``train + test`` exceeds 1.
    """
    def get_train_test_revenue(df):
        """Return ``(X, y)``: non-META feature columns and the revenue target."""
        df['revenue'] = df['META__revenue']
        dff = df[[col for col in df.columns if 'META' not in col]]
        X = dff.drop(['revenue'], axis=1)
        y = dff['revenue']
        return X, y

    # Small tolerance so that pairs such as 0.7 + 0.3 are not rejected by
    # floating-point rounding.
    if train < 0 or test < 0 or train + test > 1 + 1e-9:
        raise ValueError(
            f'train and test must be non-negative fractions summing to at most 1, '
            f'got train={train}, test={test}'
        )

    df_raw = pd.read_csv(f'datasets/{name}')
    df = shuffle(df_raw, random_state=0)

    # Use the requested fractions (they were previously hard-coded to 0.8 / 0.1).
    num_in_train = int(df.shape[0] * train)
    num_in_test = int(df.shape[0] * test)
    df_train = df[:num_in_train].copy()
    df_test = df[num_in_train:num_in_train+num_in_test].copy()
    df_val = df[num_in_train+num_in_test:].copy()
    X_train, y_train = get_train_test_revenue(df_train)
    X_test, y_test = get_train_test_revenue(df_test)
    X_val, y_val = get_train_test_revenue(df_val)
    return {
        'X_train': X_train,
        'X_test': X_test,
        'X_val': X_val,
        'y_train': y_train,
        'y_test': y_test,
        'y_val': y_val,
    }
# Store the split of the full dataset exactly as returned by split_df
# (no imputation or transformation applied).  This rebinds `data`.
data = split_df('dataset_all.csv')
raw_split_file = 'processed/dataset_all_no_process_data.pickle'
with open(raw_split_file, 'wb') as handle:
    pickle.dump(data, handle)

In [None]:
# Preprocess every dataset file and pickle both the resulting splits and the
# Process object that produced them.
for d_name in datasets:
    name = d_name.replace('.csv', '')
    print(f'processing {name}')
    data, process = split_process_df(d_name, train=0.8, test=0.1)

    data_file = f'processed/{name}_data.pickle'
    process_file = f'processed/{name}_process.pickle'

    with open(data_file, 'wb') as handle:
        pickle.dump(data, handle)
    with open(process_file, 'wb') as handle:
        pickle.dump(process, handle)

In [None]:
# Load the pickled splits and Process object of the
# 'dataset_all_movies_before' dataset into `data` / `process`.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were written by this notebook.
with open('processed/dataset_all_movies_before_data.pickle', 'rb') as handle:
    data = pickle.load(handle)
    
with open('processed/dataset_all_movies_before_process.pickle', 'rb') as handle:
    process = pickle.load(handle)

In [None]:
###########################################################
# Try NN and LGBMR with best features
###########################################################

In [None]:
features_ranking = {
    'features_in_11': [
        'budget',
        'production_company_1_avg_revenue',
        'production_company_2_avg_revenue',
        'production_company_3_avg_revenue',
        'crew__sound__sound_designer_avg_revenue',
        'crew__sound__sound_re_recording_mixer_avg_revenue',
        'crew__directing__director__1_avg_revenue',
        'crew__production__casting_avg_revenue',
        'crew__production__executive_producer__1_avg_revenue',
        'crew__production__producer__1_avg_revenue',
        'crew__production__producer__2_avg_revenue',
        'crew__costume__costume_designer_avg_revenue',
        'crew__costume__costume_designer_movies_before',
        'crew__costume__costume_supervisor_avg_revenue',
        'crew__costume__makeup_artist_avg_revenue',
        'crew__art__production_design_avg_revenue',
        'crew__art__property_master_avg_revenue',
        'crew__camera__director_of_photography_avg_revenue',
        'crew__camera__still_photographer_movies_before',
        'collection_avg_revenue',
        'cast_avg_revenue',  
    ],
    'features_in_10': [
        'production_company_1_avg_profit',
        'cast_1_avg_revenue',
        'cast_2_avg_revenue',
        'cast_3_avg_revenue',
        'cast_5_avg_revenue',
        'cast_6_avg_revenue',
        'cast_7_avg_revenue',
        'crew__sound__music_editor_movies_before',
        'crew__sound__original_music_composer_avg_profit',
        'crew__sound__original_music_composer_avg_revenue',
        'crew__sound__original_music_composer_movies_before',
        'crew__sound__sound_designer_movies_before',
        'crew__sound__sound_effects_editor_avg_revenue',
        'crew__sound__sound_re_recording_mixer_movies_before',
        'crew__sound__supervising_sound_editor_avg_profit',
        'crew__sound__supervising_sound_editor_avg_revenue',
        'crew__sound__supervising_sound_editor_movies_before',
        'crew__directing__director__1_avg_profit',
        'crew__production__casting_avg_profit',
        'crew__production__producer__1_avg_profit',
        'crew__production__producer__1_movies_before',
        'crew__production__producer__2_avg_profit',
        'crew__editing__editor__1_avg_profit',
        'crew__editing__editor__1_avg_revenue',
        'crew__editing__editor__1_movies_before',
        'crew__costume__costume_designer_avg_profit',
        'crew__costume__costume_supervisor_movies_before',
        'crew__crew__stunt_coordinator_avg_revenue',
        'crew__writing__screenplay__1_avg_profit',
        'crew__writing__screenplay__1_avg_revenue',
        'crew__art__art_direction_avg_revenue',
        'crew__art__property_master_avg_profit',
        'crew__art__property_master_movies_before',
        'crew__art__set_decoration_avg_revenue',
        'crew__art__set_decoration_movies_before',
        'crew__visualeffects__visual_effects_supervisor_avg_profit',
        'crew__visualeffects__visual_effects_supervisor_avg_revenue',
        'crew__camera__steadicam_operator_avg_revenue',
        'crew__camera__steadicam_operator_movies_before',
        'collection_avg_profit',
        'cast_avg_profit',
        'cast_avg_experience',
        'cast_avg_movies_before',
    ],
    'features_in_9': [
        'production_company_2_avg_profit',
        'production_company_3_avg_profit',
        'cast_1_movies_before',
        'cast_2_movies_before',
        'cast_4_avg_revenue',
        'cast_8_avg_revenue',
        'crew__sound__sound_designer_avg_profit',
        'crew__sound__sound_effects_editor_movies_before',
        'crew__directing__script_supervisor_avg_revenue',
        'crew__directing__script_supervisor_movies_before',
        'crew__production__casting_movies_before',
        'crew__production__executive_producer__1_avg_profit',
        'crew__production__producer__2_movies_before',
        'crew__costume__costume_supervisor_avg_profit',
        'crew__costume__makeup_artist_avg_profit',
        'crew__costume__makeup_artist_movies_before',
        'crew__crew__stunt_coordinator_avg_profit',
        'crew__crew__stunt_coordinator_movies_before',
        'crew__writing__screenplay__1_movies_before',
        'crew__art__production_design_avg_profit',
        'crew__art__production_design_movies_before',
        'crew__visualeffects__visual_effects_supervisor_movies_before',
        'crew__camera__director_of_photography_avg_profit',
        'crew__camera__director_of_photography_movies_before',
        'crew__camera__steadicam_operator_avg_profit',
        'crew__camera__still_photographer_avg_revenue',  
    ],
    'features_in_8': [
        'cast_1_avg_profit',
        'cast_1_experience',
        'cast_2_avg_profit',
        'cast_2_experience',
        'cast_3_movies_before',
        'cast_4_movies_before',
        'cast_5_movies_before',
        'cast_6_experience',
        'cast_6_movies_before',
        'cast_7_movies_before',
        'cast_8_movies_before',
        'crew__sound__music_editor_avg_revenue',
        'crew__sound__sound_re_recording_mixer_avg_profit',
        'crew__directing__director__1_movies_before',
        'crew__production__executive_producer__1_movies_before',
        'crew__art__art_direction_avg_profit',
        'crew__art__art_direction_movies_before',
        'crew__art__set_decoration_avg_profit',
        'crew__camera__still_photographer_avg_profit',
    ],
    'features_in_7': [
        'tag__action',
        'homepage_exists',
        'cast_3_avg_profit',
        'cast_3_experience',
        'cast_4_avg_profit',
        'cast_4_experience',
        'cast_5_experience',
        'cast_7_experience',
        'cast_8_avg_profit',
        'cast_8_experience',
        'crew__sound__sound_effects_editor_avg_profit',
        'crew__directing__script_supervisor_avg_profit',
    ],
    'features_in_6': [
     'runtime',
     'tag__entertaining',
     'cast_6_gender',
     'cast_5_avg_profit',
     'cast_6_avg_profit',
     'cast_7_avg_profit',
     'crew__sound__music_editor_avg_profit',
     'year_avg_revenue',
    ],
    'features_in_5': [
        'spoken_languages',
        'genre__adventure',
        'genre__comedy',
        'genre__horror',
        'country__us',
        'month_sin',
        'competition',
        'rating__pg-13',
        'rating__r',
        'tag__romantic',
        'tag__cult',
        'tag__comedy',
        'tag__humor',
        'cast_7_gender',
    ],
    'features_in_4': [
        'weekend',
        'genre__romance',
        'genre__action',
        'country__in',
        'country__fr',
        'country__gb',
        'day_sin',
        'day_cos',
        'month_cos',
        'rating__pg',
        'tag__murder',
        'tag__violence',
        'tag__revenge',
        'tag__suspenseful',
        'tag__good_versus_evil',
        'tag__boring',
        'tag__dramatic',
        'tag__other',
        'cast_1_gender',
        'cast_8_gender',
    ],
    'features_in_3': [
        'genre__thriller',
        'genre__fantasy',
        'tag__flashback',
        'tag__psychedelic',
        'tag__horror',
        'tag__cute',
        'cast_2_gender',
    ],
}

In [None]:
# Cumulative experiment: start with the 'features_in_11' group and add one
# lower group per round, re-running both models on the growing column set.
features_to_stay = []
for i in range(11, 2, -1):
    print('######################################')
    print(f'{i}: lowest features')

    # Reload untouched copies each round because the X_* entries are
    # overwritten with column subsets below.
    with open('processed/dataset_all_data.pickle', 'rb') as handle:
        data = pickle.load(handle)
    with open('processed/dataset_all_process.pickle', 'rb') as handle:
        process = pickle.load(handle)

    features_to_stay += features_ranking[f'features_in_{i}']
    for split in ('X_train', 'X_test', 'X_val'):
        data[split] = data[split][features_to_stay]
    run_lgb()
    run_nn()

In [None]:
# Reload the processed full dataset and keep only the feature groups 11..8.
with open('processed/dataset_all_data.pickle', 'rb') as handle:
    data = pickle.load(handle)
    
with open('processed/dataset_all_process.pickle', 'rb') as handle:
    process = pickle.load(handle)

# The groups live in the `features_ranking` dict; bare names such as
# `features_in_11` are not defined in this notebook.
features_to_stay = (
    features_ranking['features_in_11']
    + features_ranking['features_in_10']
    + features_ranking['features_in_9']
    + features_ranking['features_in_8']
)
data['X_train'] = data['X_train'][features_to_stay]
data['X_test'] = data['X_test'][features_to_stay]
data['X_val'] = data['X_val'][features_to_stay]
run_lgb()
run_nn()

In [None]:
# Reload the processed full dataset and keep only the feature groups 11..9.
with open('processed/dataset_all_data.pickle', 'rb') as handle:
    data = pickle.load(handle)
    
with open('processed/dataset_all_process.pickle', 'rb') as handle:
    process = pickle.load(handle)

# The groups live in the `features_ranking` dict; bare names such as
# `features_in_11` are not defined in this notebook.
features_to_stay = (
    features_ranking['features_in_11']
    + features_ranking['features_in_10']
    + features_ranking['features_in_9']
)
data['X_train'] = data['X_train'][features_to_stay]
data['X_test'] = data['X_test'][features_to_stay]
data['X_val'] = data['X_val'][features_to_stay]
run_lgb()
run_nn()

In [None]:
# Reload the processed full dataset and keep only the feature groups 11 and 10.
with open('processed/dataset_all_data.pickle', 'rb') as handle:
    data = pickle.load(handle)
    
with open('processed/dataset_all_process.pickle', 'rb') as handle:
    process = pickle.load(handle)

# The groups live in the `features_ranking` dict; bare names such as
# `features_in_11` are not defined in this notebook.
features_to_stay = (
    features_ranking['features_in_11']
    + features_ranking['features_in_10']
)
data['X_train'] = data['X_train'][features_to_stay]
data['X_test'] = data['X_test'][features_to_stay]
data['X_val'] = data['X_val'][features_to_stay]
run_lgb()
run_nn()

In [None]:
# features in 11
# NOTE(review): this cell does not select any columns itself; it relies on
# whatever state earlier cells left in the kernel. run_lgb/run_nn are called
# without arguments - confirm how they obtain their data.
run_lgb()
run_nn()

In [None]:
# Re-run both models.
# NOTE(review): the result depends on the current kernel state; the calls
# pass no arguments - confirm how run_lgb/run_nn obtain their data.
run_lgb()
run_nn()

In [None]:
# Restore the full processed dataset (all columns) and its Process object.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were written by this notebook.
with open('processed/dataset_all_data.pickle', 'rb') as handle:
    data = pickle.load(handle)
    
with open('processed/dataset_all_process.pickle', 'rb') as handle:
    process = pickle.load(handle)

In [None]:
# Run both models; when the cells are executed in order, `data` / `process`
# were reloaded in the cell just above.
# NOTE(review): the calls pass no arguments - confirm how run_lgb/run_nn
# obtain their data.
run_lgb()
run_nn()

In [None]:
selected_features = ['budget',
 'runtime',
 'spoken_languages',
 'weekend',
 'genre__romance',
 'genre__action',
 'genre__thriller',
 'genre__comedy',
 'genre__horror',
 'day_sin',
 'day_cos',
 'month_sin',
 'month_cos',
 'competition',
 'rating__pg',
 'rating__pg-13',
 'rating__r',
 'tag__murder',
 'tag__violence',
 'tag__flashback',
 'tag__romantic',
 'tag__cult',
 'tag__revenge',
 'tag__comedy',
 'tag__suspenseful',
 'tag__good_versus_evil',
 'tag__humor',
 'tag__entertaining',
 'tag__action',
 'tag__horror',
 'tag__dramatic',
 'tag__cute',
 'tag__other',
 'cast_1_gender',
 'cast_2_gender',
 'cast_6_gender',
 'cast_7_gender',
 'cast_8_gender',
 'homepage_exists',
 'production_company_1_avg_profit',
 'production_company_1_avg_revenue',
 'production_company_2_avg_profit',
 'production_company_2_avg_revenue',
 'production_company_3_avg_profit',
 'production_company_3_avg_revenue',
 'cast_1_avg_profit',
 'cast_1_experience',
 'cast_1_avg_revenue',
 'cast_1_movies_before',
 'cast_2_avg_profit',
 'cast_2_experience',
 'cast_2_avg_revenue',
 'cast_2_movies_before',
 'cast_3_avg_profit',
 'cast_3_experience',
 'cast_3_avg_revenue',
 'cast_3_movies_before',
 'cast_4_avg_profit',
 'cast_4_experience',
 'cast_4_avg_revenue',
 'cast_4_movies_before',
 'cast_5_avg_profit',
 'cast_5_experience',
 'cast_5_avg_revenue',
 'cast_5_movies_before',
 'cast_6_avg_profit',
 'cast_6_experience',
 'cast_6_avg_revenue',
 'cast_6_movies_before',
 'cast_7_avg_profit',
 'cast_7_experience',
 'cast_7_avg_revenue',
 'cast_7_movies_before',
 'cast_8_avg_profit',
 'cast_8_experience',
 'cast_8_avg_revenue',
 'cast_8_movies_before',
 'crew__sound__music_editor_avg_profit',
 'crew__sound__music_editor_avg_revenue',
 'crew__sound__music_editor_movies_before',
 'crew__sound__original_music_composer_avg_profit',
 'crew__sound__original_music_composer_avg_revenue',
 'crew__sound__original_music_composer_movies_before',
 'crew__sound__sound_designer_avg_profit',
 'crew__sound__sound_designer_avg_revenue',
 'crew__sound__sound_designer_movies_before',
 'crew__sound__sound_effects_editor_avg_profit',
 'crew__sound__sound_effects_editor_avg_revenue',
 'crew__sound__sound_effects_editor_movies_before',
 'crew__sound__sound_re_recording_mixer_avg_profit',
 'crew__sound__sound_re_recording_mixer_avg_revenue',
 'crew__sound__sound_re_recording_mixer_movies_before',
 'crew__sound__supervising_sound_editor_avg_profit',
 'crew__sound__supervising_sound_editor_avg_revenue',
 'crew__sound__supervising_sound_editor_movies_before',
 'crew__directing__director__1_avg_profit',
 'crew__directing__director__1_avg_revenue',
 'crew__directing__director__1_movies_before',
 'crew__directing__script_supervisor_avg_profit',
 'crew__directing__script_supervisor_avg_revenue',
 'crew__directing__script_supervisor_movies_before',
 'crew__production__casting_avg_profit',
 'crew__production__casting_avg_revenue',
 'crew__production__casting_movies_before',
 'crew__production__executive_producer__1_avg_profit',
 'crew__production__executive_producer__1_avg_revenue',
 'crew__production__executive_producer__1_movies_before',
 'crew__production__producer__1_avg_profit',
 'crew__production__producer__1_avg_revenue',
 'crew__production__producer__1_movies_before',
 'crew__production__producer__2_avg_profit',
 'crew__production__producer__2_avg_revenue',
 'crew__production__producer__2_movies_before',
 'crew__editing__editor__1_avg_profit',
 'crew__editing__editor__1_avg_revenue',
 'crew__editing__editor__1_movies_before',
 'crew__costume__costume_designer_avg_profit',
 'crew__costume__costume_designer_avg_revenue',
 'crew__costume__costume_designer_movies_before',
 'crew__costume__costume_supervisor_avg_profit',
 'crew__costume__costume_supervisor_avg_revenue',
 'crew__costume__costume_supervisor_movies_before',
 'crew__costume__makeup_artist_avg_profit',
 'crew__costume__makeup_artist_avg_revenue',
 'crew__costume__makeup_artist_movies_before',
 'crew__crew__stunt_coordinator_avg_profit',
 'crew__crew__stunt_coordinator_avg_revenue',
 'crew__crew__stunt_coordinator_movies_before',
 'crew__writing__screenplay__1_avg_profit',
 'crew__writing__screenplay__1_avg_revenue',
 'crew__writing__screenplay__1_movies_before',
 'crew__art__art_direction_avg_profit',
 'crew__art__art_direction_avg_revenue',
 'crew__art__art_direction_movies_before',
 'crew__art__production_design_avg_profit',
 'crew__art__production_design_avg_revenue',
 'crew__art__production_design_movies_before',
 'crew__art__property_master_avg_profit',
 'crew__art__property_master_avg_revenue',
 'crew__art__property_master_movies_before',
 'crew__art__set_decoration_avg_profit',
 'crew__art__set_decoration_avg_revenue',
 'crew__art__set_decoration_movies_before',
 'crew__visualeffects__visual_effects_supervisor_avg_profit',
 'crew__visualeffects__visual_effects_supervisor_avg_revenue',
 'crew__visualeffects__visual_effects_supervisor_movies_before',
 'crew__camera__director_of_photography_avg_profit',
 'crew__camera__director_of_photography_avg_revenue',
 'crew__camera__director_of_photography_movies_before',
 'crew__camera__steadicam_operator_avg_profit',
 'crew__camera__steadicam_operator_avg_revenue',
 'crew__camera__steadicam_operator_movies_before',
 'crew__camera__still_photographer_avg_profit',
 'crew__camera__still_photographer_avg_revenue',
 'crew__camera__still_photographer_movies_before',
 'collection_avg_profit',
 'collection_avg_revenue',
 'cast_avg_revenue',
 'cast_avg_profit',
 'cast_avg_experience',
 'cast_avg_movies_before',
 'year_avg_revenue']

In [None]:
# Restrict every feature matrix to the columns listed in `selected_features`.
for split_name in ('X_train', 'X_test', 'X_val'):
    frame = data[split_name]
    data[split_name] = frame[selected_features]

In [None]:
# Run both models; when the cells are executed in order, the X_* matrices
# were restricted to `selected_features` in the cell just above.
# NOTE(review): the calls pass no arguments - confirm how run_lgb/run_nn
# obtain their data.
run_lgb()
run_nn()

In [None]:
# Display the list of dataset file names.
datasets

In [None]:
Datasets							Comment			Number of movies	% from all movies dataset
All movies										7495			100
US											5695			76
GB											893			12
Years 2000-2018										4903			65.4
Years 1970-1999										2042			27
Profitable movies										5107			68
Not profitable movies									2335			31
Budget after 1 percentile 					starts from 8875$			7418			99
Budget after 5 percentile 					starts from 250000$		7129			95
Profitability 1 percentile cut 				profitability range [-13.8, 65]	7342			98
Profitability 5 percentile cut					profitability range [-14.9, 13.9]	6744			90
Profitability 10 percentile cut				profitability range [-4.42, 7.4]	5994			80
Profitability 1 percentile cut for movies 2000-2018						4814			64.2
Profitability 1 percentile cut for movies 2000-2018 and budget 1%+				4754			63.4
Profitability 1 percentile cut for movies 2000-2018 and budget 5%+				4599			61.3
Revenue after 1 percentile					starts from 10000			7354			98
Revenue after 5 percentile					starts from 111229		6743			90


In [None]:
# Scratch check: stripping the '.csv' suffix yields the dataset name.
'dataset_all.csv'.replace('.csv', '')

In [None]:
# Run both models on every processed dataset except the three named below.
skipped_names = ['dataset_all', 'dataset_us', 'dataset_gb']
for dataset in datasets:
    name = dataset.replace('.csv', '')
    if name not in skipped_names:
        data_file = f'processed/{name}_data.pickle'
        process_file = f'processed/{name}_process.pickle'
        with open(data_file, 'rb') as handle:
            data = pickle.load(handle)
        with open(process_file, 'rb') as handle:
            process = pickle.load(handle)

        # Two banner lines separate the output of consecutive datasets.
        print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^')
        print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^')
        print(dataset)
        run_lgb()
        run_nn()

In [None]:
def smape_lgbm(A, P):
    """Wrap ``smape(A, P)`` in the 3-tuple ``('smape', value, False)``.

    NOTE(review): ``smape`` is not defined in the visible part of this
    notebook.  The tuple presumably follows LightGBM's custom-metric shape
    (name, value, is_higher_better) - confirm against where it is used.
    """
    score = smape(A, P)
    return 'smape', score, False

In [None]:
# Run run_lgb once per objective name.
objectives = ('mse', 'mae', 'huber', 'fair', 'poisson', 'quantile', 'mape')
for objective in objectives:
    print('##################################')
    print(objective)
    run_lgb(objective)

In [None]:
# Run run_nn once per loss name.  Disabled candidates, kept for reference:
#     'mean_squared_error', 'mean_absolute_error',
#     'mean_absolute_percentage_error', 'cosine_similarity', 'huber_loss'
losses = ['logcosh']
for loss in losses:
    print('##################################')
    print(loss)
    run_nn(loss)

In [None]:
####################################################################################
# fine tune GradientBoostingRegressor
####################################################################################

In [None]:
from scipy.stats import randint as sp_randint
# sp_uniform is used below; without this import the cell only works if a
# later cell happened to be executed first.
from scipy.stats import uniform as sp_uniform

# Search space for the GradientBoostingRegressor randomized search.
# scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from [low, high).
random_grid = {
    'loss': ['ls', 'lad', 'huber', 'quantile'],
    'learning_rate': sp_uniform(loc=0.0001, scale=0.0999),
    'n_estimators': [100, 500, 1000],
    'subsample': sp_uniform(loc=0.3, scale=0.5),
    'criterion': ['friedman_mse', 'mse', 'mae'],
    'min_samples_leaf': sp_randint(1, 20),
    'min_samples_split': sp_randint(10, 100),
    'max_depth': sp_randint(5, 300),
    'alpha': sp_uniform(loc=0.85, scale=0.1),
}

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

# Base estimator with the early-stopping settings fixed
# (validation_fraction / n_iter_no_change); the rest comes from `random_grid`.
gbr = GradientBoostingRegressor(
    n_iter_no_change=50,
    validation_fraction=0.15,
    random_state=0,
    verbose=1,
)

# 30 sampled configurations x 3 CV folds, scored by negative MAE; the best
# configuration is refit on the whole training split.
gbr_random = RandomizedSearchCV(
    estimator=gbr,
    param_distributions=random_grid,
    scoring='neg_mean_absolute_error',
    n_iter=30,
    cv=3,
    refit=True,
    n_jobs=-1,
    random_state=42,
    verbose=True,
)
gbr_random.fit(data['X_train'], data['y_train'])

In [None]:
# Search space for the tree-ensemble (random forest) randomized search
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

# randint(low, high) samples integers from [low, high).
random_grid = dict(
    bootstrap=[True, False],
    max_depth=sp_randint(5, 300),
    min_samples_leaf=sp_randint(1, 20),
    min_samples_split=sp_randint(10, 100),
    n_estimators=[100, 500, 1000],
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Base random forest; the hyper-parameters being searched come from `random_grid`.
rf = RandomForestRegressor(
    criterion='mae',
    random_state=0,
    n_jobs=-1,
    verbose=1,
)

# 100 sampled configurations x 3 CV folds, scored by negative MAE.
# NOTE(review): n_jobs=-1 is set on both the forest and the search - check
# for CPU oversubscription.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    scoring='neg_mean_absolute_error',
    n_iter=100,
    cv=3,
    refit=True,
    n_jobs=-1,
    random_state=42,
    verbose=True,
)
rf_random.fit(data['X_train'], data['y_train'])

In [None]:
####################################################################################
# train AdaBoostRegressor
####################################################################################

In [None]:
# Random forest with fixed hyper-parameters; later cells pass it to
# AdaBoostRegressor as the base estimator.
rf_settings = dict(
    n_estimators=40,
    max_depth=15,
    min_samples_split=0.001,
    min_samples_leaf=0.0005,
    bootstrap=True,
    max_samples=0.95,
    criterion='mae',
    random_state=0,
    n_jobs=-1,
    verbose=1,
)
model = RandomForestRegressor(**rf_settings)

In [None]:
# Imported here because this cell runs before the cell that originally
# imported AdaBoostRegressor.
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import RandomizedSearchCV

# Search space for a default AdaBoostRegressor.
param_dist = {
    'n_estimators': [50, 100],
    'learning_rate' : [0.01,0.05,0.1,0.3,1],
    'loss' : ['linear', 'square', 'exponential']
 }

# 10 sampled configurations x 3 CV folds, default scoring.
pre_gs_inst = RandomizedSearchCV(AdaBoostRegressor(),
 param_distributions = param_dist,
 cv=3,
 n_iter = 10,
 n_jobs=-1)

# The training split lives in the `data` dict; bare X_train / y_train are
# not notebook-level names.
pre_gs_inst.fit(data['X_train'], data['y_train'])

In [None]:
from sklearn.ensemble import AdaBoostRegressor

# Boost the fixed random forest (`model`) with the listed AdaBoost loss and
# report metrics on all splits.
for loss in ('exponential',):
    print("##########################")
    print(loss)
    abr = AdaBoostRegressor(
        base_estimator=model,
        loss=loss,
        n_estimators=50,
        random_state=0,
    )
    abr.fit(data['X_train'], data['y_train'])
    output_metrics(abr, data, process, with_val=True)

In [None]:
from sklearn.ensemble import AdaBoostRegressor

# Boost the fixed random forest (`model`) once per AdaBoost loss and report
# metrics on all splits.
for loss in ('square', 'exponential'):
    print("##########################")
    print(loss)
    abr = AdaBoostRegressor(
        base_estimator=model,
        loss=loss,
        n_estimators=50,
        random_state=0,
    )
    abr.fit(data['X_train'], data['y_train'])
    output_metrics(abr, data, process, with_val=True)

In [None]:
# Refit the most recently created AdaBoost model on the training split.
abr.fit(data['X_train'], data['y_train'])

In [None]:
# Report metrics for the current `abr` model (with_val=True is passed through).
output_metrics(abr, data, process, with_val=True)

In [None]:
# Same call as the previous cell, repeated.
output_metrics(abr, data, process, with_val=True)

In [None]:
def _decayed_learning_rate(current_iter, base_learning_rate, decay, floor=1e-3):
    """Return ``base_learning_rate * decay**current_iter``, never below ``floor``."""
    lr = base_learning_rate * np.power(decay, current_iter)
    return lr if lr > floor else floor


def learning_rate_010_decay_power_099(current_iter):
    """Learning rate for iteration ``current_iter``: 0.1 * 0.99**iter, floored at 1e-3."""
    return _decayed_learning_rate(current_iter, 0.1, .99)


def learning_rate_010_decay_power_0995(current_iter):
    """Learning rate for iteration ``current_iter``: 0.1 * 0.995**iter, floored at 1e-3."""
    return _decayed_learning_rate(current_iter, 0.1, .995)


def learning_rate_005_decay_power_099(current_iter):
    """Learning rate for iteration ``current_iter``: 0.05 * 0.99**iter, floored at 1e-3."""
    return _decayed_learning_rate(current_iter, 0.05, .99)

In [None]:
import lightgbm as lgb
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Keyword arguments unpacked into the search's fit() call further down.
# NOTE(review): the evaluation set is built from data['X_test'] / data['y_test'],
# so early stopping is monitored on the test split -- confirm this is intended.
fit_params = dict(
    early_stopping_rounds=25,
    eval_metric='mse',
    eval_set=[(data['X_test'], data['y_test'])],
    # callbacks=[lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],
    verbose=100,
)

In [None]:
# Sampling distributions for the LightGBM randomized search.
# The original literal listed 'min_child_samples' twice with the same
# distribution; the redundant second entry is removed. Key order is unchanged.
# NOTE(review): LightGBM's parameter docs list subsample/bagging_fraction,
# colsample_bytree/feature_fraction, min_child_samples/min_data_in_leaf and
# min_child_weight/min_sum_hessian_in_leaf as alias pairs. Sampling both
# members of a pair likely leaves one of them without effect -- confirm and prune.
param_test = {
    'num_leaves': sp_randint(5, 1000),
    'max_depth': sp_randint(5, 100),
    'max_bin': sp_randint(100, 1000),
    'min_child_samples': sp_randint(100, 400),
    'learning_rate': sp_uniform(loc=0.0001, scale=0.0999),
#     'num_iterations': [2500, 5000, 7500, 10000],
    'min_child_weight': [1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1],
    'subsample': sp_uniform(loc=0.3, scale=0.5),
    'colsample_bytree': sp_uniform(loc=0.6, scale=0.3),
    'reg_alpha': sp_uniform(loc=0, scale=5),
    'reg_lambda': sp_randint(1, 50),
    'bagging_fraction': sp_uniform(loc=0.7, scale=0.3),
    'bagging_freq': sp_randint(5, 20),
    'feature_fraction': sp_uniform(loc=0.5, scale=0.5),
    'min_data_in_leaf': sp_randint(1, 500),
    'min_sum_hessian_in_leaf': sp_randint(1, 100),
}

In [None]:
# Base LightGBM regressor for the randomized search. Only a handful of settings
# are fixed here; the commented-out keywords below are kept from an earlier
# hand-tuned configuration.
# NOTE(review): both `num_threads=12` and `n_jobs=-1` are passed, as are both
# `silent=True` and `verbose=4`. LightGBM's docs appear to treat each pair as
# overlapping settings -- confirm which value takes effect.
clf = lgb.LGBMRegressor(
    num_threads=12,
    random_state=314, 
    silent=True, 
    n_jobs=-1,
    verbose=4,
    tree_learner='data',
# #     num_leaves=
#     max_depth=-1,
#     learning_rate=0.001,
#     num_iterations=100000,
#     min_child_weight=10,
# #     min_child_samples=
#     subsample=0.4,
#     colsample_bytree=0.73,
#     reg_alpha=3.15,
#     reg_lambda=26,
# #     max_bin=
#     bagging_fraction=0.96,
#     bagging_freq=6,
#     feature_fraction=0.6,
#     min_data_in_leaf=50,
#     min_sum_hessian_in_leaf=50,
)
# Randomized search over `param_test`: 1000 sampled candidates with 3-fold CV
# (3000 fits before the final refit), seeded with random_state=314.
gs = RandomizedSearchCV(
    estimator=clf, 
    param_distributions=param_test, 
    n_iter=1000,
    cv=3,
    refit=True,
    random_state=314,
    verbose=True)

In [None]:
# Run the LightGBM randomized search on data['X_train'] / data['y_train'];
# the entries of `fit_params` are unpacked as extra keyword arguments to fit().
gs.fit(
    data['X_train'], 
    data['y_train'], 
    **fit_params,
)

In [None]:
# Show the best cross-validated score of the LightGBM search and its parameters.
print(f'Best score reached: {gs.best_score_} with params: {gs.best_params_} ')

In [None]:
# Same report as the previous cell: best search score and the parameters behind it.
print(f'Best score reached: {gs.best_score_} with params: {gs.best_params_} ')

In [None]:
# LightGBM regressor with hand-set hyperparameters, fitted on the raw array
# behind data['X_train'] with an evaluation set built from data['X_test'] /
# data['y_test'] and `early_stopping_rounds=100`.
# NOTE(review): several keyword pairs here (subsample/bagging_fraction,
# colsample_bytree/feature_fraction, min_child_samples/min_data_in_leaf,
# min_child_weight/min_sum_hessian_in_leaf) are listed as aliases in LightGBM's
# parameter docs -- confirm which value of each pair is actually used.
mod_lgb = lgb.LGBMRegressor(
#     boosting='dart',
    num_iterations=50000,
    objective='regression',
    bagging_fraction=0.937,
    bagging_freq=14, 
    colsample_bytree=0.609,
    feature_fraction=0.626,
    learning_rate=0.086, 
    max_bin=117,
    max_depth=74,
    min_child_samples=169,
    min_child_weight=0.01,
    min_data_in_leaf=72, 
    min_sum_hessian_in_leaf=34,
#     n_estimators=2500,
    num_leaves=411,
    reg_alpha=0.5,
    reg_lambda=44,
    subsample=0.63,
    feature_fraction_seed=9,
    bagging_seed=9,
    tree_learner='data',
)
# NOTE(review): the training features are passed as `.values` (array) while the
# evaluation set passes data['X_test'] as-is, and `eval_set` is a bare tuple
# here but a list of tuples in the other LightGBM cells -- confirm both are intended.
mod_lgb.fit(
    data['X_train'].values, 
    data['y_train'],
    verbose=2,
    eval_metric='rmse',
    eval_set=(data['X_test'], data['y_test']),
    early_stopping_rounds=100,
)

In [None]:
# Report metrics for the fitted `mod_lgb` through the helper `output_metrics`
# (defined outside this section).
output_metrics(mod_lgb, data, process, with_val=True)

In [None]:
# Second hand-set LightGBM configuration (small learning rate, up to 7500
# estimators), fitted with `early_stopping_rounds=100` against an evaluation
# set built from data['X_test'] / data['y_test'], then scored with `output_metrics`.
# This rebinds `mod_lgb`, replacing the model fitted in the earlier cell.
# NOTE(review): as in the earlier LightGBM cell, several keyword pairs look like
# LightGBM aliases (e.g. subsample/bagging_fraction) -- confirm which one applies.
mod_lgb = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=34,
    learning_rate=0.001, 
    n_estimators=7500,
    max_bin=192,
    max_depth=0,
    min_child_samples=160,
    min_child_weight=0.001,
    bagging_fraction=0.98,
    bagging_freq=15, 
    feature_fraction=0.77,
    metric='l2',
    bagging_seed=9,
    min_data_in_leaf=1, 
    min_sum_hessian_in_leaf=50,
    colsample_bytree=0.87,
    reg_alpha=0.18,
    reg_lambda=30,
    subsample=0.39,
    tree_learner='data',
)
mod_lgb.fit(
    data['X_train'].values, 
    data['y_train'],
    verbose=2,
    eval_metric='mse',
    eval_set=[(data['X_test'], data['y_test'])],
    early_stopping_rounds=100
)
output_metrics(mod_lgb, data, process, with_val=True)

In [None]:
# Sampling distributions for the ExtraTreesRegressor randomized search.
# `min_samples_split` and `max_leaf_nodes` start at 2: scikit-learn rejects an
# integer value of 1 for either, so the previous lower bound of 1 could draw
# candidates whose fits fail.
# NOTE(review): `max_features` is sampled up to 224, which assumes the feature
# matrix has at least that many columns -- confirm.
param_test_etr = {
    'n_estimators': [100, 500, 750, 1000, 5000, 10000],
    'max_depth': sp_randint(1, 200),
    'max_features': sp_randint(10, 225),
    'min_samples_leaf': sp_randint(1, 50),
    'min_samples_split': sp_randint(2, 50),
    'min_weight_fraction_leaf': sp_uniform(0.0, 0.5),
    'max_leaf_nodes': sp_randint(2, 200),
    'min_impurity_decrease': sp_uniform(0.0, 5),
    'ccp_alpha': sp_uniform(0.0, 5),
}

In [None]:
# Base ExtraTrees estimator handed to the randomized search below.
reg_etr = ExtraTreesRegressor(random_state=0, n_jobs=-1, verbose=5)

# Randomized search over `param_test_etr`: 10 sampled candidates, 5-fold CV,
# refitting the best candidate once the search completes.
reg_etr_gs = RandomizedSearchCV(
    estimator=reg_etr,
    param_distributions=param_test_etr,
    n_iter=10,
    cv=5,
    refit=True,
    n_jobs=-1,
    random_state=314,
    verbose=10,
)

In [None]:
%%time

# Run the ExtraTrees randomized search on data['X_train'] / data['y_train'];
# the %%time magic at the top of this cell times the run.
reg_etr_gs.fit(data['X_train'], data['y_train'])

In [None]:
# Show the best cross-validated score of the ExtraTrees search and its parameters.
print(f'Best score reached: {reg_etr_gs.best_score_} with params: {reg_etr_gs.best_params_} ')

In [None]:
# ExtraTrees with hand-set hyperparameters, fitted on the raw array behind
# data['X_train'] and then scored with `output_metrics`.
# NOTE(review): no random_state is set here, unlike `reg_etr` above, so results
# may vary between runs -- confirm whether that is intended.
etr = ExtraTreesRegressor(
    n_estimators=5000,
    max_depth=10,
    max_features=71,
    max_leaf_nodes=55,
    min_samples_split=23,
    min_samples_leaf=8,
    min_weight_fraction_leaf=0.29,
    min_impurity_decrease=0.35,
    ccp_alpha=0.32,
    n_jobs=-1,
)
etr.fit(data['X_train'].values, data['y_train'])
output_metrics(etr, data, process, with_val=True)

In [None]:
def model(X_train, y_train, X_test, y_test):
    """Hyperas model template: build, train and score one candidate network.

    The double-curly-brace placeholders wrap `choice` / `uniform` search
    distributions; this function is passed as `model=` to `optim.minimize`
    in a later cell.

    The network is two hidden Dense layers, each followed by an Activation,
    an optional BatchNormalization and a Dropout, then a single-unit Dense
    output. It is compiled with an Adamax optimizer, 'mse' loss and an 'mae'
    metric, and trained for up to 500 epochs with early stopping on
    'val_loss', using (X_test, y_test) as validation data.

    Args:
        X_train: Training features; `X_train.shape[1]` sets the input width.
        y_train: Training targets.
        X_test: Features used for validation during fit and for the final evaluate.
        y_test: Targets matching `X_test`.

    Returns:
        dict with 'loss' (MSE from `model.evaluate` on X_test / y_test),
        'status' (STATUS_OK) and 'model' (the trained Keras model).
    """
    
    # NOTE(review): this local rebinds the name `model` inside the function of
    # the same name.
    model = tf.keras.Sequential()
    # First hidden block: Dense -> Activation -> optional BatchNormalization -> Dropout.
    model.add(Dense(
        {{choice([128, 192, 256, 512, 1024, 2048])}}, 
        input_shape=[X_train.shape[1]],
        kernel_initializer={{choice(['zeros', 'glorot_normal', 'he_normal'])}},           
        bias_initializer={{choice(['zeros', 'glorot_normal', 'he_normal'])}},  
        kernel_regularizer=l1_l2(
            l1={{uniform(0, 0.1)}}, 
            l2={{uniform(0, 0.1)}}),
        bias_regularizer=l1_l2(
            l1={{uniform(0, 0.1)}}, 
            l2={{uniform(0, 0.1)}}),
        activity_regularizer=l1_l2(
            l1={{uniform(0, 0.1)}}, 
            l2={{uniform(0, 0.1)}})))
    model.add(Activation({{choice(['sigmoid', 'relu', 'tanh'])}}))
    if {{choice(['bn1_yes', 'bn1_no'])}} == 'bn1_yes':
        model.add(BatchNormalization())
    # NOTE(review): the dropout rate is drawn from uniform(0, 1); rates close
    # to 1 drop nearly every unit -- consider a narrower range.
    model.add(Dropout({{uniform(0, 1)}}))
    # Second hidden block, same structure as the first (no input_shape needed).
    model.add(Dense(
        {{choice([128, 192, 256, 512, 1024, 2048])}},
        kernel_initializer={{choice(['zeros', 'glorot_normal', 'he_normal'])}},           
        bias_initializer={{choice(['zeros', 'glorot_normal', 'he_normal'])}},          
        kernel_regularizer=l1_l2(
            l1={{uniform(0, 0.1)}}, 
            l2={{uniform(0, 0.1)}}),
        bias_regularizer=l1_l2(
            l1={{uniform(0, 0.1)}}, 
            l2={{uniform(0, 0.1)}}),
        activity_regularizer=l1_l2(
            l1={{uniform(0, 0.1)}}, 
            l2={{uniform(0, 0.1)}})))
    model.add(Activation({{choice(['sigmoid', 'relu', 'tanh'])}}))
    if {{choice(['bn2_yes', 'bn2_no'])}} == 'bn2_yes':
        model.add(BatchNormalization())
    model.add(Dropout({{uniform(0, 1)}}))
    # Single-unit output layer.
    model.add(Dense(1))
    
    adamax = keras.optimizers.Adamax(
        learning_rate={{uniform(0.001, 0.1)}},
        beta_1={{uniform(0.75, 1)}},
        beta_2={{uniform(0.75, 1)}})
        
    model.compile(loss='mse', 
                  metrics=['mae'],
                  optimizer=adamax)

    # Early stopping watches 'val_loss' with patience=3; the validation data
    # passed to fit() is the (X_test, y_test) pair.
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
    model.fit(X_train, y_train,
              batch_size={{choice([16, 32, 64, 128, 256, 512])}},
              epochs=500,
              verbose=2,
              validation_data=(X_test, y_test),
#               validation_split=0.15,
              callbacks=[es])

    # evaluate() returns the loss followed by the compiled metric: (mse, mae).
    # The MSE is what the search minimises.
    mse, mae = model.evaluate(X_test, y_test, verbose=1)
    print('Test mse:', mse)
    return {'loss': mse, 'status': STATUS_OK, 'model': model}

# def model(X_train, y_train, X_test, y_test):
    
#     model = tf.keras.Sequential()
#     model.add(Dense(
#         256,
#         activation='sigmoid', 
#         input_shape=[X_train.shape[1]],
#         kernel_initializer='glorot_normal',
#         kernel_regularizer=keras.regularizers.l1_l2(l1=0.0001, l2=0.0001),
#         bias_regularizer=keras.regularizers.l1_l2(l1=0.001, l2=0.1)))        
#     model.add(Dropout(0.005))
#     model.add(Dense(
#         256,
#         activation='sigmoid',
#         kernel_initializer='glorot_normal', 
#         kernel_regularizer=keras.regularizers.l1_l2(l1=0, l2=0.001),
#         bias_regularizer=keras.regularizers.l1_l2(l1=0.01, l2=0.01)))
#     model.add(Dropout(0.5))
        
#     model.add(Dense(
#         1,
#         kernel_initializer='glorot_normal',
#         activation='linear'
#     ))
    
#     adamax = keras.optimizers.Adamax(
#         learning_rate=0.001,
#         beta_1={{uniform(0.9, 1)}},
#         beta_2={{uniform(0.9, 1)}})
        
#     model.compile(loss='mse', 
#                   metrics=['mae'],
#                   optimizer=adamax)

#     es = EarlyStopping(
#         monitor='val_loss', 
#         mode='min', 
#         verbose=1, 
#         patience=20)

#     model.fit(X_train, y_train,
#               batch_size=256,
#               epochs=500,
#               verbose=2,
#               shuffle=True,
#               validation_data=(X_test, y_test),
# #               validation_split=0.15,
#               callbacks=[es])
    
#     mse, mae = model.evaluate(X_test, y_test, verbose=1)
#     print('Test mse:', mse)
#     return {'loss': mse, 'status': STATUS_OK, 'model': model}

In [None]:
# Reset the result names before the search runs.
best_run = best_model = space = None
trials = Trials()

# Hyperas search over the templated `model` function using the TPE algorithm:
# up to 500 evaluations, with trial history collected in `trials`.
# `notebook_name='keras'` names the notebook hyperas should read.
best_run, best_model, space = optim.minimize(
    model=model,
    data=get_data,
    algo=tpe.suggest,
    max_evals=500,
    trials=trials,
    eval_space=True,
    return_space=True,
    notebook_name='keras',
)

In [None]:
# Display the first value returned by optim.minimize in the previous cell.
best_run

In [None]:
# Report metrics for the best hyperas model through the helper `output_metrics`
# (defined outside this section); `with_val` is left at the helper's default here.
output_metrics(best_model, data, process)

In [None]:
# Adamax optimizer instance with fixed hyperparameters; `build_model` below
# passes this module-level object to compile().
adamax = keras.optimizers.Adamax(learning_rate=0.001,beta_1=0.958,beta_2=0.987)

def build_model():
    """Build and compile the fixed two-hidden-layer regression network.

    Reads the module-level `data` (for the input width) and `adamax`
    (the optimizer). The network is Dense(256, sigmoid) -> Dropout(0.005) ->
    Dense(256, sigmoid) -> Dropout(0.5) -> Dense(1, linear), compiled with
    'mse' loss and 'mae' / 'mse' metrics.

    Returns:
        The compiled `tf.keras.Sequential` model.
    """
    n_inputs = len(data['X_train'].keys())

    net = tf.keras.Sequential()
    net.add(Dense(
        256,
        activation='sigmoid',
        input_shape=[n_inputs],
        kernel_initializer='glorot_normal',
        kernel_regularizer=keras.regularizers.l1_l2(l1=0.0001, l2=0.0001),
        bias_regularizer=keras.regularizers.l1_l2(l1=0.001, l2=0.1),
    ))
    net.add(Dropout(0.005))
    net.add(Dense(
        256,
        activation='sigmoid',
        kernel_initializer='glorot_normal',
        kernel_regularizer=keras.regularizers.l1_l2(l1=0, l2=0.001),
        bias_regularizer=keras.regularizers.l1_l2(l1=0.01, l2=0.01),
    ))
    net.add(Dropout(0.5))
    net.add(Dense(
        1,
        kernel_initializer='glorot_normal',
        activation='linear',
    ))

    net.compile(
        optimizer=adamax,
        loss='mse',
        metrics=['mae', 'mse'],
    )
    return net

model3 = build_model()
# model3.summary()

# Early stopping watches 'val_loss' with patience=20.
es = EarlyStopping(monitor='val_loss', patience=20, mode='min', verbose=1)

# Train for up to 10000 epochs, validating on data['X_test'] / data['y_test'];
# per-epoch Keras logging is off (verbose=0), so EpochDots is the progress callback.
history = model3.fit(
    data['X_train'],
    data['y_train'],
    validation_data=(data['X_test'], data['y_test']),
    epochs=10000,
    batch_size=256,
    shuffle=True,
    verbose=0,
    callbacks=[tfdocs.modeling.EpochDots(), es],
)
output_metrics(model3, data, process, with_val=True)

In [None]:
# Report metrics for the trained `model3` again (same call as at the end of the training cell).
output_metrics(model3, data, process, with_val=True)

In [None]:
# Pasted metrics output, kept for reference (presumably from the model3 run above):
# mape: 701.1628654644568
# mae: 41859822.98122131
# rmse: 92053624.95453802
# adj_r2: 0.44188860358712956

In [None]:
# Report metrics for `model3` once more.
# NOTE(review): this repeats an earlier cell -- confirm whether it is still needed.
output_metrics(model3, data, process, with_val=True)

In [None]:
from matplotlib import pyplot

# The loss is MSE (squared target units) while MAE is in target units, so the
# two differ by orders of magnitude. Sharing one unlabeled axis flattened the
# MAE curves; each quantity now gets its own labelled panel.
fig, (ax_loss, ax_mae) = pyplot.subplots(1, 2, figsize=(12, 4))

ax_loss.plot(history.history['loss'], label='train loss')
ax_loss.plot(history.history['val_loss'], label='test loss')
ax_loss.set(title='Loss (MSE) per epoch', xlabel='epoch', ylabel='mse')
ax_loss.legend()

ax_mae.plot(history.history['mae'], label='train mae')
ax_mae.plot(history.history['val_mae'], label='test mae')
ax_mae.set(title='MAE per epoch', xlabel='epoch', ylabel='mae')
ax_mae.legend()

fig.tight_layout()
pyplot.show()

In [None]:
# relu wo initialization

# mape: 6.6013761902604395
# mae: 47785029.21390841
# rmse: 117085745.28254725
# adj_r2: 0.2683608578800275

In [None]:
# sigmoid

# mape: 6.921294519771364
# mae: 46309187.6036963
# rmse: 106573312.92760979
# adj_r2: 0.39384203627853387

In [None]:
# sigmoid initialization glorot

# mape: 5.891309178898037
# mae: 45178191.806934245
# rmse: 105538830.70659108
# adj_r2: 0.40555259042845115

In [None]:
# relu initialization he-normal

# mape: 11.22876428439224
# mae: 49985922.89351641
# rmse: 124054429.88760668
# adj_r2: 0.1786780362328163

In [None]:
# {'Dense': 1024,
#  'Dense_1': 256,
#  'Dense_2': 1024,
#  'Dropout': 0.15106219282775246,
#  'Dropout_1': 0.6763508226311498,
#  'Dropout_2': 'two',
#  'Dropout_3': 0.30987873188582754,
#  'batch_size': 256,
#  'choiceval': 'adam',
#  'lr': 0.001,
#  'lr_1': 0.01,
#  'lr_2': 0.1}

# mape: 14.735849474528678
# mae: 42425257.59364493
# rmse: 93314406.47801651
# adj_r2: 0.5986823964694081

In [None]:
# {'Dense': 256,
#  'Dense_1': 1024,
#  'Dense_2': 1024,
#  'Dropout': 0.2974750265433348,
#  'Dropout_1': 0.39966419778642664,
#  'Dropout_2': 'two',
#  'Dropout_3': 0.526026582745637,
#  'activation': 'sigmoid',
#  'activation_1': 'sigmoid',
#  'activation_2': 'sigmoid',
#  'batch_size': 128,
#  'epochs': 75,
#  'kernel_initializer': <tensorflow.python.ops.init_ops_v2.VarianceScaling at 0x7f8b393ae710>,
#  'kernel_initializer_1': <tensorflow.python.ops.init_ops_v2.GlorotUniform at 0x7f8b393f3610>,
#  'kernel_initializer_2': <tensorflow.python.ops.init_ops_v2.VarianceScaling at 0x7f8b2aec9750>,
#  'optimizer': 'adam'}

# mape: 11.930947077248565
# mae: 41936947.39355901
# rmse: 94227681.26787286
# adj_r2: 0.5117092256416581

In [None]:
# {'Dense': 256,
#  'Dense_1': 256,
#  'Dense_2': 1024,
#  'Dropout': 0.12206225819915595,
#  'Dropout_1': 0.197064731659927,
#  'Dropout_2': 'two',
#  'Dropout_3': 0.00029842311592569865,
#  'activation': 'sigmoid',
#  'activation_1': 'sigmoid',
#  'activation_2': 'relu',
#  'batch_size': 128,
#  'epochs': 25,
#  'kernel_initializer': 'glorot_normal',
#  'kernel_initializer_1': 'glorot_normal',
#  'kernel_initializer_2': 'glorot_uniform',
#  'optimizer': 'adam'}

# mape: 6.224157960480135
# mae: 43643280.72317956
# rmse: 99891850.23558912
# adj_r2: 0.5618906445214958

In [None]:
# {'Dense': 512,
#  'Dense_1': 1024,
#  'Dense_2': 1024,
#  'Dropout': 0.003154114487844459,
#  'Dropout_1': 0.49453988643233404,
#  'Dropout_2': 'two',
#  'Dropout_3': 0.6057457357612506,
#  'activation': 'sigmoid',
#  'activation_1': 'sigmoid',
#  'activation_2': 'sigmoid',
#  'batch_size': 512,
#  'epochs': 100,
#  'kernel_initializer': 'glorot_normal',
#  'kernel_initializer_1': 'he_uniform',
#  'kernel_initializer_2': 'glorot_normal',
#  'optimizer': 'adam'}


# mape: 5.984611554637821
# mae: 43830607.21558445
# rmse: 99151995.47327316
# adj_r2: 0.5683563756398305

In [None]:
# def build_model():
#   model = tf.keras.Sequential([
#     Dense(
#         512, 
#         activation='sigmoid', 
#         input_shape=[len(data['X_train'].keys())],
#         kernel_initializer='glorot_normal',
#         kernel_regularizer=keras.regularizers.l1_l2(l1=0.0001, l2=0.0001),
#         bias_regularizer=keras.regularizers.l1_l2(l1=0.001, l2=0.1)
#     ),
#     Dropout(0.005),
#     Dense(
#         1024, 
#         activation='sigmoid',
#         kernel_initializer='he_uniform',
#         kernel_regularizer=keras.regularizers.l1_l2(l1=0, l2=0.001),
#         bias_regularizer=keras.regularizers.l1_l2(l1=0.01, l2=0.01),
#         activity_regularizer=l1_l2(l1=0, l2=0.00001)
#     ),
#     Dropout(0.7), 
#     Dense(1)
#   ])

#   model.compile(loss='mean_squared_error',
#                 optimizer='adam',
#                 metrics=['mae', 'mean_squared_error'])
#   return model

# model3 = build_model()
# model3.summary()

# mape: 8.161095980091112
# mae: 35879531.109611064
# rmse: 87476559.6580578
# adj_r2: 0.5608763424966564

In [None]:
# def build_model():
#   model = tf.keras.Sequential([
#     Dense(
#         256, 
#         activation='sigmoid', 
#         input_shape=[len(data['X_train'].keys())],
#         kernel_initializer='glorot_normal',
#         kernel_regularizer=keras.regularizers.l1_l2(l1=0.0001, l2=0.0001),
#         bias_regularizer=keras.regularizers.l1_l2(l1=0.001, l2=0.1)
#     ),
#     Dropout(0.005),
#     Dense(
#         256, 
#         activation='sigmoid',
#         kernel_initializer='glorot_normal',
#         kernel_regularizer=keras.regularizers.l1_l2(l1=0, l2=0.001),
#         bias_regularizer=keras.regularizers.l1_l2(l1=0.01, l2=0.01),
#     ),
#     Dropout(0.5),  
#     Dense(1)
#   ])

#   model.compile(loss='mae',
#                 optimizer='adam',
#                 metrics=['mae', 'mean_squared_error'])
#   return model

# model3 = build_model()
# # model3.summary()

# es = EarlyStopping(
#     monitor='val_loss', 
#     mode='min', 
#     verbose=1, 
#     patience=50)

# history = model3.fit(
#     data['X_train'], data['y_train'],
#     epochs=10000, 
#     validation_data=(data['X_test'], data['y_test']),
#     verbose=1,
#     batch_size=256,
#     shuffle=True,
# #     callbacks=[tfdocs.modeling.EpochDots(), es])
#     callbacks=[es])


# mape: 5.821131932635737
# mae: 42307258.2010261
# rmse: 95281127.2382761
# adj_r2: 0.4884981254695354

In [None]:
# skew x -> skew y

# adamax = keras.optimizers.Adamax(learning_rate=0.001,beta_1=0.95,beta_2=0.999)

# def build_model():
#   model = tf.keras.Sequential([
#     Dense(
#         256, 
#         activation='sigmoid', 
#         input_shape=[len(data['X_train'].keys())],
#         kernel_initializer='glorot_normal',
#         kernel_regularizer=keras.regularizers.l1_l2(l1=0.0001, l2=0.0001),
#         bias_regularizer=keras.regularizers.l1_l2(l1=0.001, l2=0.1)
#     ),
#     Dropout(0.005),
#     Dense(
#         256, 
#         activation='sigmoid',
#         kernel_initializer='glorot_normal',
#         kernel_regularizer=keras.regularizers.l1_l2(l1=0, l2=0.001),
#         bias_regularizer=keras.regularizers.l1_l2(l1=0.01, l2=0.01),
#     ),
#     Dropout(0.5),  
#     Dense(
#         1,
#         kernel_initializer='glorot_normal',
#         activation='linear'
#     )
#   ])

#   model.compile(loss='mse',
#                 optimizer=adamax,
#                 metrics=['mae', 'mse'])
#   return model

# model3 = build_model()
# # model3.summary()

# es = EarlyStopping(
#     monitor='val_loss', 
#     mode='min', 
#     verbose=1, 
#     patience=20)

# history = model3.fit(
#     data['X_train'], data['y_train'],
#     epochs=1000, 
#     validation_data=(data['X_test'], data['y_test']),
#     verbose=0,
#     batch_size=256,
#     shuffle=True,
# #     callbacks=[tfdocs.modeling.EpochDots(), es])
#     callbacks=[es])

# mape: 6.094501890528896
# mae: 40504269.625439286
# rmse: 87054173.39806816
# adj_r2: 0.5730149701057303

In [None]:
# # skew x -> skew y

# import lightgbm as lgb
# mod_lgb = lgb.LGBMRegressor(
#     objective='regression',
#     num_leaves=34,
#     learning_rate=0.001, 
#     n_estimators=7500,
#     max_bin=192,
#     max_depth=0,
#     min_child_samples=160,
#     min_child_weight=0.001,
#     bagging_fraction=0.98,
#     bagging_freq=15, 
#     feature_fraction=0.77,
#     feature_fraction_seed=9,
#     bagging_seed=9,
#     min_data_in_leaf=1, 
#     min_sum_hessian_in_leaf=50,
#     colsample_bytree=0.87,
#     reg_alpha=0.18,
#     reg_lambda=30,
#     subsample=0.39,
#     tree_learner='data',
# )
# mod_lgb.fit(
#     data['X_train'].values, 
#     data['y_train'],
#     verbose=0,
#     eval_metric='mse',
#     eval_set=[(data['X_test'], data['y_test'])],
#     early_stopping_rounds=25
# )
# output_metrics(mod_lgb, data, process, with_val=True)

# mape: 8.358551244895033
# mae: 40639301.811736666
# rmse: 83874926.05518548
# adj_r2: 0.49830307531562434

In [None]:
# adamax = keras.optimizers.Adamax(learning_rate=0.001,beta_1=0.958,beta_2=0.987)

# def build_model():
#   model = tf.keras.Sequential([
#     Dense(
#         256, 
#         activation='sigmoid', 
#         input_shape=[len(data['X_train'].keys())],
#         kernel_initializer='glorot_normal',
#         kernel_regularizer=keras.regularizers.l1_l2(l1=0.0001, l2=0.0001),
#         bias_regularizer=keras.regularizers.l1_l2(l1=0.001, l2=0.1)
#     ),
#     Dropout(0.005),
#     Dense(
#         256, 
#         activation='sigmoid',
#         kernel_initializer='glorot_normal',
#         kernel_regularizer=keras.regularizers.l1_l2(l1=0, l2=0.001),
#         bias_regularizer=keras.regularizers.l1_l2(l1=0.01, l2=0.01),
#     ),
#     Dropout(0.5),  
#     Dense(
#         1,
#         kernel_initializer='glorot_normal',
#         activation='linear'
#     )
#   ])

#   model.compile(loss='mse',
#                 optimizer=adamax,
#                 metrics=['mae', 'mse'])
#   return model

# model3 = build_model()
# # model3.summary()

# es = EarlyStopping(
#     monitor='val_loss', 
#     mode='min', 
#     verbose=1, 
#     patience=20)

# history = model3.fit(
#     data['X_train'], data['y_train'],
#     epochs=10000, 
#     validation_data=(data['X_test'], data['y_test']),
#     verbose=0,
#     batch_size=256,
#     shuffle=True,
#     callbacks=[tfdocs.modeling.EpochDots(), es])
# #     callbacks=[es])


# mape: 715.0661530743256
# mae: 41755141.579565056
# rmse: 91255901.0566581
# adj_r2: 0.45151972233459114

In [None]:
# import lightgbm as lgb
# mod_lgb = lgb.LGBMRegressor(
#     objective='regression',
#     num_leaves=34,
#     learning_rate=0.001, 
#     n_estimators=7500,
#     max_bin=192,
#     max_depth=0,
#     min_child_samples=160,
#     min_child_weight=0.001,
#     bagging_fraction=0.98,
#     bagging_freq=15, 
#     feature_fraction=0.77,
#     metric='l2',
#     bagging_seed=9,
#     min_data_in_leaf=1, 
#     min_sum_hessian_in_leaf=50,
#     colsample_bytree=0.87,
#     reg_alpha=0.18,
#     reg_lambda=30,
#     subsample=0.39,
#     tree_learner='data',
# )
# mod_lgb.fit(
#     data['X_train'].values, 
#     data['y_train'],
#     verbose=2,
#     eval_metric='mse',
#     eval_set=[(data['X_test'], data['y_test'])],
#     early_stopping_rounds=50
# )
# output_metrics(mod_lgb, data, process, with_val=True)