In [None]:
def nested_cross_val(df, model, space, n_iter, scoring, search_cv='random', outer=10, inner=3, seed=42):
  """
  Objective
  ---------
  Run nested cross-validation. For every outer fold a hyper-parameter
  search (using the inner folds) is fitted on the outer training split,
  and the refit best estimator is evaluated on the outer hold-out split.
  Returns a dataframe summarising the outer-fold results.

  MAE and MAPE are computed after applying np.expm1 to both the targets
  and the predictions (presumably 'box_office' is stored log1p-transformed;
  confirm against the data preparation step).


  Parameters
  ----------
  df : dataframe
    Input data. The 'box_office' column is used as the target, all
    remaining columns are used as features.

  model : machine learning model class
    Class (or zero-argument factory) of the model to cross validate; it is
    called as model() to create a fresh estimator for every outer fold.
    eg) CatboostRegressor, LightGBMRegressor

  space : dictionary of lists
    Search space for hyper parameter optimization.
    eg) {
          'num_leaves': [300,400,500],
          'max_bin': [175,255,510],
          'num_iterations': [700,800,900],
          'learning_rate':[0.05,0.1,0.15],
          'boosting_type': ['gbdt', 'dart'],
          'max_depth': [-1, 5, 10]
        }

  n_iter : number of parameter settings sampled
    n_iter trades off runtime vs quality of the solution.
    Only used when search_cv='random'.

  scoring : string
    score method passed to the search for evaluating candidate models

  search_cv : string, default='random'
    hyper-parameter optimization approach. Random and grid search options
    eg)'random' or 'grid'

  outer : int, default=10
    number of outer folds

  inner : int, default=3
    number of inner folds

  seed : int, default=42
    seed for reproducibility (fold shuffling and random search sampling)


  Returns
  -------
  dataframe
    Single-row dataframe whose cells hold:
      'outer_mae'            list of MAE values, one per outer fold
      'outer_mape'           list of MAPE values (percent), one per outer fold
      'outer_estimates'      list of the search's best_score_ per outer fold
      'outer_configurations' list of the search's best_params_ per outer fold
      'error_indices'        index labels of hold-out rows whose prediction
                             differs from the target
      'ground_truth'         hold-out targets of all folds (untransformed)
      'predicted_labels'     hold-out predictions of all folds (untransformed)
      'outer_average'        mean of 'outer_mae'


  Raises
  ------
  ValueError
    If search_cv is neither 'random' nor 'grid'.

  """
  from sklearn.model_selection import GridSearchCV, KFold, RandomizedSearchCV
  from sklearn.metrics import mean_absolute_error
  import numpy as np
  import pandas as pd
  from tqdm import tqdm

  def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

  # Fail before any model is trained if the search type is unknown.
  if search_cv not in ('random', 'grid'):
    raise ValueError(f"search_cv must be 'random' or 'grid', got {search_cv!r}")

  outer_scores = []
  outer_mape = []
  outer_estimates = []
  outer_configurations = []
  error_indices = []
  ground_truth = []
  predicted_labels = []

  X, y = df.drop(columns=['box_office']), df['box_office']

  # configure the cross-validation procedure
  # shuffle=True is required for random_state to take effect; recent
  # scikit-learn versions raise a ValueError when a seed is given without it.
  cv_outer = KFold(n_splits=outer, shuffle=True, random_state=seed)
  cv_inner = KFold(n_splits=inner, shuffle=True, random_state=seed)

  # enumerate splits
  for train_ix, test_ix in tqdm(cv_outer.split(X, y), total=outer):
    # Get the training data
    X_train, y_train = X.iloc[train_ix], y.iloc[train_ix]
    # Get the validation data
    X_test, y_test = X.iloc[test_ix], y.iloc[test_ix]

    # define search
    if search_cv == 'random':
      search = RandomizedSearchCV(
        estimator=model(),
        param_distributions=space,
        n_iter=n_iter,
        cv=cv_inner,
        random_state=seed,
        scoring=scoring
      )
    else:
      search = GridSearchCV(
        estimator=model(),
        param_grid=space,
        cv=cv_inner,
        scoring=scoring
      )

    # execute search
    result = search.fit(X_train, y_train)
    # get the best performing model fit on the whole training set
    best_model = result.best_estimator_
    # predict the hold out dataset once and reuse the result below
    y_pred = np.asarray(best_model.predict(X_test)).flatten()

    # evaluate the model on the back-transformed scale
    mae = mean_absolute_error(np.expm1(y_test), np.expm1(y_pred))
    mape = mean_absolute_percentage_error(np.expm1(y_test), np.expm1(y_pred))

    # gather the hold-out rows whose prediction is not exactly the target
    y_true = np.asarray(y_test)
    error_indices.extend(X_test.index[y_true != y_pred])
    ground_truth.extend(y_true)
    predicted_labels.extend(y_pred)

    # store the result
    outer_scores.append(mae)
    outer_mape.append(mape)
    outer_estimates.append(result.best_score_)
    outer_configurations.append(result.best_params_)
    # report progress
    print(f'>$USD={mae:.3f}, mape={mape:.3f} mae={mae:.3f},est={result.best_score_:.3f}, cfg={result.best_params_}')

  # every value is wrapped in a list so the whole run becomes a single row
  final_data = pd.DataFrame({
    'outer_mae': [outer_scores],
    'outer_mape': [outer_mape],
    'outer_estimates': [outer_estimates],
    'outer_configurations': [outer_configurations],
    'error_indices': [error_indices],
    'ground_truth': [ground_truth],
    'predicted_labels': [predicted_labels]
  })

  outer_average = np.mean(outer_scores)
  final_data['outer_average'] = outer_average

  # summarize the estimated performance of the model
  print(f"Mean Absolute Error: {outer_average:.3f} ({np.std(outer_scores):.3f})")

  return final_data

In [None]:
def base_performance(df, model, outer, seed, tuned_mae):
  """
  Objective
  ---------
  Assess performance increase from hyper-parameter optimization by
  cross-validating the model with its default settings and comparing the
  resulting mean MAE against the MAE of the tuned model.

  The MAE is computed after applying np.expm1 to both the targets and the
  predictions, the same scale nested_cross_val uses for its MAE, so that
  tuned_mae and the baseline are comparable.


  Parameters
  ----------
  df : dataframe
    Input data. The 'box_office' column is used as the target, all
    remaining columns are used as features.

  model : machine learning model
    Model in which to cross validate. Either a model class, which is
    instantiated with default settings for every fold, or an already
    constructed estimator, which is refit on every fold.
    eg) CatboostRegressor, LightGBMRegressor

  outer : int
    number of cross-validation folds

  seed : int
    seed for reproducibility of the fold shuffling

  tuned_mae : float
    tuned model performance (MAE) to compare the baseline against


  Returns
  -------
  float
    Mean MAE of the untuned model over all folds.

  """
  from sklearn.model_selection import KFold
  from sklearn.metrics import mean_absolute_error
  import numpy as np

  X, y = df.drop(columns=['box_office']), df['box_office']

  # configure the cross-validation procedure
  # shuffle=True is required for random_state to take effect; recent
  # scikit-learn versions raise a ValueError when a seed is given without it.
  cv_outer = KFold(n_splits=outer, shuffle=True, random_state=seed)
  outer_results = []
  for train_ix, test_ix in cv_outer.split(X, y):
    # Get the training data
    X_train, y_train = X.iloc[train_ix], y.iloc[train_ix]
    # Get the validation data
    X_test, y_test = X.iloc[test_ix], y.iloc[test_ix]

    # define the model: accept a class (as nested_cross_val does) or an instance
    estimator = model() if isinstance(model, type) else model

    # fit on the training fold and predict the hold out fold
    estimator.fit(X_train, y_train)
    yhat = np.asarray(estimator.predict(X_test)).flatten()

    # evaluate the model on the back-transformed scale
    mae = mean_absolute_error(np.expm1(y_test), np.expm1(yhat))
    # store the result
    outer_results.append(mae)
    # report progress
    print(f'>mae={mae:.2f}')

  # summarize the estimated performance of the model
  base_mae = np.mean(outer_results)
  print(f'Tuned MAE: {tuned_mae:.3f}')
  print(f'Base MAE: {base_mae:.3f}')

  # MAE is an error, so a positive improvement means the tuned MAE is lower
  print(f"Improvement of: { 100 * (base_mae - tuned_mae) / base_mae}%")

  return base_mae

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error between targets and predictions.

    Both inputs are converted to numpy arrays; the absolute error of every
    element is divided by the corresponding value of ``y_true``, averaged and
    scaled by 100 so the result is expressed in percent.

    Parameters
    ----------
    y_true : array-like
      Ground-truth values (used as the denominator).

    y_pred : array-like
      Predicted values.

    Returns
    -------
    float
      Mean absolute percentage error, in percent.
    """
    import numpy as np

    actual = np.array(y_true)
    predicted = np.array(y_pred)
    # element-wise error relative to the true value
    relative_error = np.abs((actual - predicted) / actual)
    return np.mean(relative_error) * 100