# Splitting, Preprocessing and Model Development
This notebook is used for:


### Declaring Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import ticker
from matplotlib.colors import LogNorm, Normalize
import sklearn
import time
import datetime
import joblib
import warnings
from sklearn.model_selection import ParameterGrid
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

### Color Palette & Typeface Sizing

In [2]:
YELLOW = '#F2DC5D'
GREEN = '#9BC53D'
DARK_GREEN = '#597222'
RED = '#C3423F'
LIGHT_BLUE = '#2596BE'
GRAY = '#666666'

AXIS_SIZE = 12
TITLE_SIZE = 16
DESCRIPTION_SIZE = 9
FIGURE_SIZE = (10*2/3,6*2/3)

RANDOM_STATE = 14

### Import Dataframe

In [3]:
#****************************************************import dataset****************************************************
df = pd.read_csv('../data/final.csv', dtype={'citizen': 'string', 'sex': 'string', 'age': 'string', 'decision': 'string', 'geo': 'string', 'TIME_PERIOD': 'string', 'GENCONV': "Int64", 'HUMSTAT': "Int64", 'SUB_PROT': "Int64", 'REJECTED': "Int64", 'TOTAL_POS': "Int64", 'TOTAL_APPS': "Int64", "POS_RATE": "Float64"}, keep_default_na=False, na_values=['nan'])

##remove partial 2023-Q3 Data
df = df[df["TIME_PERIOD"] != "2023-Q3"]

df

Unnamed: 0,citizen,sex,age,geo,TIME_PERIOD,GENCONV,HUMSTAT,SUB_PROT,REJECTED,TOTAL_POS,TOTAL_APPS
0,AD,F,UNK,AT,2008-Q1,0,0,0,0,0,0
1,AD,F,UNK,AT,2008-Q2,0,0,0,0,0,0
2,AD,F,UNK,AT,2008-Q3,0,0,0,0,0,0
3,AD,F,UNK,AT,2008-Q4,0,0,0,0,0,0
4,AD,F,UNK,AT,2009-Q1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
7221109,ZW,UNK,Y_LT14,UK,2019-Q3,0,0,0,0,0,0
7221110,ZW,UNK,Y_LT14,UK,2019-Q4,0,0,0,0,0,0
7221111,ZW,UNK,Y_LT14,UK,2020-Q1,0,0,0,0,0,0
7221112,ZW,UNK,Y_LT14,UK,2020-Q2,0,0,0,0,0,0


## 1. Introduce Lagged Features

In [4]:
#****************************************************re-sort dataframe****************************************************

sort_order = ['citizen', 'sex', 'age', 'geo', 'TIME_PERIOD']
df = df.sort_values(by =sort_order) 

#*********************************************create sequential list of quarters****************************************************

quarters = []
for i in range(2008, 2024):
    quarters.append(str(i) + "-Q1")
    quarters.append(str(i) + "-Q2")
    quarters.append(str(i) + "-Q3")
    quarters.append(str(i) + "-Q4")

#****************************************************lagged features****************************************************

QUARTERS_OF_LAG = (4 * 1)

def add_lagged_features(df, features, maintained_columns, QUARTERS_OF_LAG):
    quarters = np.unique(df["TIME_PERIOD"])
    def lagged_features(target_var, lag_count, unit):
        lagged = pd.DataFrame()
        columns = []
        for i in range(1, lag_count + 1):
            lagged = pd.concat([lagged, target_var.shift(i)], axis=1)
            name = target_var.name
            if (i == 1):
                columns.append(name + " - lag " + str(i) + " " + str(unit))
            else:
                columns.append(name + " - lag " + str(i) + " " + str(unit) + "s")
        lagged.columns = columns
        return lagged.astype('Int64')

    #introduce lag for each feature
    df_lagged = df
    for f in features:
        df_lagged = pd.concat([df_lagged, lagged_features(df[f], QUARTERS_OF_LAG, "quarter")], axis=1)

    #remove all features with less than the lag amount of historical data
    #df_lagged = df_lagged[df_lagged.eq()]
    for i in range(0, QUARTERS_OF_LAG):
        shift_eq = df_lagged.eq(df_lagged.shift())
        keep = shift_eq[maintained_columns[0]]
        for j in range(1, len(maintained_columns)):
            keep = keep & shift_eq[maintained_columns[j]]
        df_lagged = df_lagged[keep]
        #print("lagged i" + str(i) + " of " + str(QUARTERS_OF_LAG))
    
    return df_lagged

df_lagged = add_lagged_features(df, ["TOTAL_POS", "TOTAL_APPS"], ['citizen', 'age', 'sex', 'geo'], QUARTERS_OF_LAG)

df_lagged

Unnamed: 0,citizen,sex,age,geo,TIME_PERIOD,GENCONV,HUMSTAT,SUB_PROT,REJECTED,TOTAL_POS,TOTAL_APPS,TOTAL_POS - lag 1 quarter,TOTAL_POS - lag 2 quarters,TOTAL_POS - lag 3 quarters,TOTAL_POS - lag 4 quarters,TOTAL_APPS - lag 1 quarter,TOTAL_APPS - lag 2 quarters,TOTAL_APPS - lag 3 quarters,TOTAL_APPS - lag 4 quarters
4,AD,F,UNK,AT,2009-Q1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,AD,F,UNK,AT,2009-Q2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,AD,F,UNK,AT,2009-Q3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,AD,F,UNK,AT,2009-Q4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,AD,F,UNK,AT,2010-Q1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7221109,ZW,UNK,Y_LT14,UK,2019-Q3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7221110,ZW,UNK,Y_LT14,UK,2019-Q4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7221111,ZW,UNK,Y_LT14,UK,2020-Q1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7221112,ZW,UNK,Y_LT14,UK,2020-Q2,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Lagged Feature Testing

In [5]:
#***********************************************testing equivalence function****************************************************
def equivalent_dfs(df1, df2):
    return (df1.reset_index(drop=True) == df2.reset_index(drop=True)).all().all()

#****************************************************lagging testing****************************************************
print("[*] testing lagging function")
#simple tests
tdf0_data = [['AR', 1, 1], ['AR', 2, 2], ['AR', 3, 3], ['AR', 4, 4], ['AR', 5, 5]]
tdf0 = pd.DataFrame(data=tdf0_data, columns=['citizen', 'TIME_PERIOD', 'val'])
tdf1_data = [['AR', 2, 2, 1], ['AR', 3, 3, 2], ['AR', 4, 4, 3], ['AR', 5, 5, 4]]
tdf1 = pd.DataFrame(data=tdf1_data, columns=['citizen', 'TIME_PERIOD', 'val', 'val - lag 1 quarter'])
tdf2_data = [['AR', 3, 3, 2, 1], ['AR', 4, 4, 3, 2], ['AR', 5, 5, 4, 3]]
tdf2 = pd.DataFrame(data=tdf2_data, columns=['citizen', 'TIME_PERIOD', 'val', 'val - lag 1 quarter', 'val - lag 2 quarters'])
tdf3_data = [['AR', 4, 4, 3, 2, 1], ['AR', 5, 5, 4, 3, 2]]
tdf3 = pd.DataFrame(data=tdf3_data, columns=['citizen', 'TIME_PERIOD', 'val', 'val - lag 1 quarter', 'val - lag 2 quarters', 'val - lag 3 quarters'])

assert equivalent_dfs(add_lagged_features(tdf0, ["val"], ['citizen'], 1), 
                      tdf1), "basic lag test failed, shift=1"
assert equivalent_dfs(add_lagged_features(tdf0, ["val"], ['citizen'], 2), 
                      tdf2), "basic lag test failed, shift=2"
assert equivalent_dfs(add_lagged_features(tdf0, ["val"], ['citizen'], 3), 
                      tdf3), "basic lag test failed, shift=3"
print("\t[+] basic lag tests passed")

#more advanced tests
tdf4_data = [['AU', 1, 1], ['AU', 2, 2], ['AU', 3, 3], ['AU', 4, 4], ['AU', 5, 5],
             ['NZ', 1, 11], ['NZ', 2, 12], ['NZ', 3, 13], ['NZ', 4, 14], ['NZ', 5, 15]]
tdf4 = pd.DataFrame(data=tdf4_data, columns=['citizen', 'TIME_PERIOD', 'val'])
tdf5_data = [['AU', 2, 2, 1], ['AU', 3, 3, 2], ['AU', 4, 4, 3], ['AU', 5, 5, 4],
             ['NZ', 2, 12, 11], ['NZ', 3, 13, 12], ['NZ', 4, 14, 13], ['NZ', 5, 15, 14]]
tdf5 = pd.DataFrame(data=tdf5_data, columns=['citizen', 'TIME_PERIOD', 'val', 'val - lag 1 quarter'])
tdf6_data = [['AU', 3, 3, 2, 1], ['AU', 4, 4, 3, 2], ['AU', 5, 5, 4, 3],
             ['NZ', 3, 13, 12, 11], ['NZ', 4, 14, 13, 12], ['NZ', 5, 15, 14, 13]]
tdf6 = pd.DataFrame(data=tdf6_data, columns=['citizen', 'TIME_PERIOD', 'val', 'val - lag 1 quarter', 'val - lag 2 quarters'])


assert equivalent_dfs(add_lagged_features(tdf4, ["val"], ['citizen'], 1), 
                      tdf5), "lag with shared times test failed, shift=1"
assert equivalent_dfs(add_lagged_features(tdf4, ["val"], ['citizen'], 2), 
                      tdf6), "lag with shared times test failed, shift=2"
print("\t[+] tests with shared times passed")

#very complicated tests
tdf7_data = [['AU', 1, 1], ['AU', 2, 2], ['AU', 3, 3], ['AU', 4, 4], ['AU', 5, 5],
             ['NZ', 2, 12], ['NZ', 3, 13], ['NZ', 4, 14], ['NZ', 5, 15]]
tdf7 = pd.DataFrame(data=tdf7_data, columns=['citizen', 'TIME_PERIOD', 'val'])
tdf8_data = [['AU', 2, 2, 1], ['AU', 3, 3, 2], ['AU', 4, 4, 3], ['AU', 5, 5, 4],
             ['NZ', 3, 13, 12], ['NZ', 4, 14, 13], ['NZ', 5, 15, 14]]
tdf8 = pd.DataFrame(data=tdf8_data, columns=['citizen', 'TIME_PERIOD', 'val', 'val - lag 1 quarter'])
tdf9_data = [['AU', 3, 3, 2, 1], ['AU', 4, 4, 3, 2], ['AU', 5, 5, 4, 3],
             ['NZ', 4, 14, 13, 12], ['NZ', 5, 15, 14, 13]]
tdf9 = pd.DataFrame(data=tdf9_data, columns=['citizen', 'TIME_PERIOD', 'val', 'val - lag 1 quarter', 'val - lag 2 quarters'])
tdf10_data = [['AU', 1, 1], ['AU', 2, 2], ['AU', 3, 3], ['AU', 4, 4], ['AU', 5, 5],
              ['NZ', 2, 12], ['NZ', 3, 13], ['NZ', 4, 14], ['NZ', 5, 15],
              ['FJ', 3, 23], ['FJ', 4, 24], ['FJ', 5, 25], ['FJ', 6, 26], ['FJ', 7, 27], ['FJ', 8, 28],
              ['WS', 4, 34], ['WS', 5, 35]]
tdf10 = pd.DataFrame(data=tdf10_data, columns=['citizen', 'TIME_PERIOD', 'new_val'])
tdf11_data = [['AU', 2, 2, 1], ['AU', 3, 3, 2], ['AU', 4, 4, 3], ['AU', 5, 5, 4],
              ['NZ', 3, 13, 12], ['NZ', 4, 14, 13], ['NZ', 5, 15, 14],
              ['FJ', 4, 24, 23], ['FJ', 5, 25, 24], ['FJ', 6, 26, 25], ['FJ', 7, 27, 26], ['FJ', 8, 28, 27],
              ['WS', 5, 35, 34]]
tdf11 = pd.DataFrame(data=tdf11_data, columns=['citizen', 'TIME_PERIOD', 'new_val', 'new_val - lag 1 quarter'])
tdf12_data = [['AU', 5, 5, 4, 3, 2, 1],
              ['FJ', 7, 27, 26, 25, 24, 23], ['FJ', 8, 28, 27, 26, 25, 24]]
tdf12 = pd.DataFrame(data=tdf12_data, columns=['citizen', 'TIME_PERIOD', 'new_val', 'new_val - lag 1 quarter', 'new_val - lag 2 quarters', 'new_val - lag 3 quarters', 'new_val - lag 4 quarters'])

assert equivalent_dfs(add_lagged_features(tdf7, ["val"], ['citizen'], 1), 
                      tdf8), "lag with 2 different start times test failed, shift=1"
assert equivalent_dfs(add_lagged_features(tdf7, ["val"], ['citizen'], 2), 
                      tdf9), "lag with 2 different start times test failed, shift=2"
assert equivalent_dfs(add_lagged_features(tdf10, ["new_val"], ['citizen'], 1), 
                      tdf11), "lag with many different start times test failed, shift=1"
assert equivalent_dfs(add_lagged_features(tdf10, ["new_val"], ['citizen'], 4), 
                      tdf12), "lag with many different start times test failed, shift=4"
print("\t[+] tests with multiple start and end dates passed")

print("\t[+] \x1b[42mPASSED ALL\x1b[0m lagging tests")

[*] testing lagging function
	[+] basic lag tests passed
	[+] tests with shared times passed
	[+] tests with multiple start and end dates passed
	[+] [42mPASSED ALL[0m lagging tests


## 2. Dataset Splitting

In [6]:
#****************************************************splitting****************************************************
TARGET_VAR = "TOTAL_POS"

y = df_lagged[TARGET_VAR]
X = df_lagged.drop(['GENCONV', 'HUMSTAT', 'SUB_PROT', 'REJECTED', 'TOTAL_POS'], axis=1)
#PLAN:
#of 62 quarters...
#QUARTERS_OF_LAG are lost bc they wont have the needed lagged features
    
new_quarters = [q for q in quarters if q >= quarters[QUARTERS_OF_LAG]]
quarter_count = len(new_quarters) - 1

TRAIN_PORTION = 0.6
VAL_PORTION = 0.2
TEST_PORTION = 0.2

#take out last portion of quarters for testing
#div_0 = new_quarters[0]
#div_1 = new_quarters[int(quarter_count * train_split)]
div_2 = new_quarters[int(quarter_count * (1 - TEST_PORTION))]
div_3 = new_quarters[quarter_count]

#seperate out test section
X_test = X[(div_2 <= X["TIME_PERIOD"]) & (X["TIME_PERIOD"] < div_3)]
y_test = y[(div_2 <= X["TIME_PERIOD"]) & (X["TIME_PERIOD"] < div_3)]

In [7]:
def RMSE(y_pred, y_true):
    return np.sqrt(((y_pred - y_true) ** 2).sum() / len(y_pred))

def BASELINE_RMSE(y_true):
    return ALL_ZERO_BASELINE(y_true)

def ALL_ZERO_BASELINE(y_true):
    return np.sqrt(((y_true) ** 2).sum() / len(y_true))

def ALL_ONE_BASELINE(y_true):
    return np.sqrt(((1 - y_true) ** 2).sum() / len(y_true))

def ALL_MEAN_BASELINE(y_train, y_true):
    mean = y_train.mean()
    return np.sqrt(((mean - y_true) ** 2).sum() / len(y_true))
def 

In [29]:
#**********calculate baselines**********
baselines = []
for i in range(1, 3 + 1):
        train_portion = np.round(i * VAL_PORTION, 2)
        #div_0 = new_quarters[0]
        div_1 = new_quarters[int(quarter_count * train_portion)]
        div_2 = new_quarters[int(quarter_count * (train_portion + VAL_PORTION))]

        X_val = X[(div_1 <= X["TIME_PERIOD"]) & (X["TIME_PERIOD"] < div_2)]
        y_val = y[(div_1 <= X["TIME_PERIOD"]) & (X["TIME_PERIOD"] < div_2)]
        
        score = RMSE(X_val["TOTAL_POS - lag 1 quarter"], y_val)
        baselines.append(score)
        
        print("\t\t[*] fold " + str(i) + " complete (test score of " + str(score) + ")")
        
print("\t[*] final test score " + str(np.mean(baselines)))

		[*] fold 1 complete (test score of 5.016934313790199)
		[*] fold 2 complete (test score of 29.456813493791838)
		[*] fold 3 complete (test score of 9.40553193999117)
	[*] final test score 14.626426582524402


In [28]:
X["TOTAL_POS - lag 1 quarter"]

4          0
5          0
6          0
7          0
8          0
          ..
7221109    0
7221110    0
7221111    0
7221112    0
7221113    0
Name: TOTAL_POS - lag 1 quarter, Length: 6731478, dtype: Int64

In [8]:
UPDATE = True
def Status_Update(t, message):
    my_time = time.time()
    if UPDATE:
        print(str(message) + " \t-- in " + str(time.time() - t) + "s")
    return time.time()

def MLPipe_TimeSeries_RMSE(X, y, preprocessor, ML_algo, param_grid):
    ALGO_NAME = str(ML_algo)[str(ML_algo).rindex('.')+1:-2]
    MODEL_NAME = "{algo}({time})".format(algo=ALGO_NAME, time=TIMESTAMP_STR())
    NUM_FOLDS = np.round(TRAIN_PORTION / VAL_PORTION).astype(int)
    RANDOM_STATE = 14
    print("[!] looking at model: " + str(MODEL_NAME))
    
    print("\t[*] doing initial dataset splitting")
    
    X_trains = []
    y_trains = []
    X_vals = []
    y_vals = []
    for i in range(1, NUM_FOLDS + 1):
        train_portion = np.round(i * VAL_PORTION, 2)
        div_0 = new_quarters[0]
        div_1 = new_quarters[int(quarter_count * train_portion)]
        div_2 = new_quarters[int(quarter_count * (train_portion + VAL_PORTION))]

        X_trains.append(X[(div_0 <= X["TIME_PERIOD"]) & (X["TIME_PERIOD"] < div_1)])
        y_trains.append(y[(div_0 <= X["TIME_PERIOD"]) & (X["TIME_PERIOD"] < div_1)])
        X_vals.append(X[(div_1 <= X["TIME_PERIOD"]) & (X["TIME_PERIOD"] < div_2)])
        y_vals.append(y[(div_1 <= X["TIME_PERIOD"]) & (X["TIME_PERIOD"] < div_2)])
        print("\t\t[+] fold " + str(i))
    
    
    
    param_scores = []
    param_models = []
    for p in param_grid:
        T_PARAM_START = time.time()
        fold_scores = []
        print("\t[*] looking at hyperparameters " + str(p))
        for i in range(1, NUM_FOLDS + 1):
            T_FOLD_START = time.time()
            #ADD OPTIMIZATION HERE:
                #[] NO NEED TO CONSTANTLY REDIVIDE THE DATA, SAME THREE FOLDS WILL BE USED EACH TIME REGARDLES OF HYPERPARAMAETERS
                #[1] NO NEED TO CHECK FUTURE FOLDS IF EVEN A SCORE OF ZERO WOULDN'T BRING IT BELOW A BETTER FOUND PARAMETER ARRANGEMENT
            train_portion = np.round(i * VAL_PORTION, 2)
            
            #opt check 1
            if (i > 0 & len(param_scores) > 0):
                best_param_score = param_scores[np.argmin(param_scores)]
                if ((best_param_score * 3) < np.sum(fold_scores)):
                    print("[!] best average score I could get with " + str(np.sum(fold_scores)) + " score already is worse than a previous params score of " + str(best_param_score) + ", so giving up")

            mini_t = time.time()
            
            #div_0 = new_quarters[0]
            #div_1 = new_quarters[int(quarter_count * train_portion)]
            #div_2 = new_quarters[int(quarter_count * (train_portion + VAL_PORTION))]

            X_train = X_trains[i - 1]
            y_train = y_trains[i - 1]
            X_val = X_vals[i - 1]
            y_val = y_vals[i - 1]
            
            #PORTION_OF_POINTS = 0.001
            #X_train = X_train.sample(np.floor((len(X_train) * PORTION_OF_POINTS)).astype(int), random_state = RANDOM_STATE)
            #y_train = y_train.loc[X_train.index]
            #X_val = X_val.sample(np.floor((len(X_val) * PORTION_OF_POINTS)).astype(int), random_state = RANDOM_STATE)
            #y_val = y_val.loc[X_val.index]
            
            mini_t = Status_Update(mini_t, "split data")

            #make pipeline
            algo = ML_algo(**p, random_state = RANDOM_STATE)
            #print(algo)
            pipe = Pipeline(steps=[
                        ('preprocess', preprocessor),
                        ('model', algo)
                    ])
            
            mini_t = Status_Update(mini_t, "made pipeline")

            pipe.fit(X_train, y_train)
            mini_t = Status_Update(mini_t, "fit pipeline")
            
            y_pred = pipe.predict(X_val)
            score = RMSE(y_pred, y_val)
            #zero_baseline = ALL_ZERO_BASELINE(y_val)
            #one_baseline = ALL_ONE_BASELINE(y_val)
            #mean_baseline = ALL_MEAN_BASELINE(y_train, y_val)
            fold_scores.append(score)
            T_FOLD_END = time.time()
            T_FOLD_ELAPSED = T_FOLD_END - T_FOLD_START
            mini_t = Status_Update(mini_t, "eval pipeline")
            print("\t\t[*] fold " + str(i) + " complete (test score of " + str(score) + ") -- in " + str(np.round(T_FOLD_ELAPSED, 3)) + "s")
        score = np.mean(fold_scores)
        param_scores.append(score)
        param_models.append(pipe)
        
        T_PARAM_END = time.time()
        T_PARAM_ELAPSED = T_PARAM_END - T_PARAM_START
        print("\t\t[+] final score for params of " + str(score) + " -- in " + str(np.round(T_PARAM_ELAPSED, 3)) + "s")
        
        if (np.argmin(param_scores) == (len(param_scores) - 1)):
            best_model = param_models[np.argmin(param_scores)]
            print("\t\t[!] new best param configuration, so saving model to path")
            path = "../results/" + MODEL_NAME + ".pkl"
            joblib.dump(best_model, path, compress = 1)
            print("\t\t\t[+] saved model to " + str(path))
        
        
    i_best = np.argmin(param_scores)
    best_score = param_scores[i_best]
    best_params = param_grid[i_best]
    best_model = param_models[i_best]
    print("\t[+] best param configuration of " + str(best_params) + " found with score " + str(best_score))
    path = "../results/" + MODEL_NAME + ".pkl"
    joblib.dump(best_model, path, compress = 1)
    print("\t[+] saved model to " + str(path))


In [None]:
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

RIDGE_PARAM_GRID = ParameterGrid({
            'alpha': np.geomspace(0.01, 10000, 12), # no upper bound so the values are evenly spaced in log
            })
RIDGE_PARAM_GRID_SMALL = ParameterGrid({
            'alpha': [0.1, 1, 10, 100], # no upper bound so the values are evenly spaced in log
            })

RF_PARAM_GRID = ParameterGrid({
            'max_depth': [1, 3, 4, 7, 10, 14, 21, 31], # no upper bound so the values are evenly spaced in log
            'max_features': np.linspace(0.2, 1, 5)
            })
RF_PARAM_GRID_SMALL = ParameterGrid({
            'max_depth': [1, 3, 7, 14, 31], # no upper bound so the values are evenly spaced in log
            'max_features': np.linspace(0.2, 1, 3)
            })

#MLPipe_TimeSeries_RMSE(X.drop("citizen", axis=1), y, preprocessor, Ridge, RIDGE_PARAM_GRID_SMALL)
MLPipe_TimeSeries_RMSE(X, y, preprocessor, Ridge, RIDGE_PARAM_GRID)

#MLPipe_TimeSeries_RMSE(X, y, preprocessor, RandomForestRegressor, RF_PARAM_GRID)
#is taking about 70s for first fold
#fold time is about 6 times first fold time assuming linearity?? so 6ish minutes per hyperparameter


[!] looking at model: Ridge(20:30 on 2-12-2023)
	[*] doing initial dataset splitting
		[+] fold 1


In [10]:
#31 was best
def TIMESTAMP_STR():
    dt = datetime.datetime.now()
    txt = "{hour}:{minute:02.0f} on {day}-{month}-{year}"
    return txt.format(hour=dt.hour, minute=dt.minute, day=dt.day, month=dt.month, year=dt.year)

In [134]:
y_pred = model.predict(X_val)
pred_df = pd.DataFrame(y_pred, columns=["y_pred"])
pred_df["y_pred_5"] = np.round(y_pred / 5) * 5

In [135]:
pred_df[pred_df["y_pred_5"] > 40]

Unnamed: 0,y_pred,y_pred_5
14467,77.431586,75.0
14468,48.660720,50.0
14469,49.813108,50.0
14539,105.708442,105.0
14540,91.535207,90.0
...,...,...
1403173,65.049243,65.0
1403331,45.048866,45.0
1403333,47.363093,45.0
1403334,60.246972,60.0


In [57]:
pred_df["y_pred_5"].value_counts()

0.0     1318805
5.0       57231
10.0      44130
40.0       5948
45.0        874
15.0        688
50.0        216
20.0        174
Name: y_pred_5, dtype: int64

In [154]:
def DF_ALL_PREDICTED(model):
    df_new = X
    df_new["TOTAL_POS_TRUE"] = y
    df_new["TOTAL_POS_PRED"] = model.predict(X)
    return df_new
    
df_new = DF_ALL_PREDICTED(model)

In [158]:
df_new[df_new["TOTAL_APPS_TRUE"] > 100]

Unnamed: 0,citizen,sex,age,geo,TIME_PERIOD,TOTAL_APPS,TOTAL_POS - lag 1 quarter,TOTAL_POS - lag 2 quarters,TOTAL_POS - lag 3 quarters,TOTAL_POS - lag 4 quarters,TOTAL_APPS - lag 1 quarter,TOTAL_APPS - lag 2 quarters,TOTAL_APPS - lag 3 quarters,TOTAL_APPS - lag 4 quarters,TOTAL_APPS_TRUE,TOTAL_APPS_PRED
73551,AF,F,Y14-17,DE,2016-Q3,525,70,25,40,10,120,40,45,15,370,344.960033
73552,AF,F,Y14-17,DE,2016-Q4,1610,370,70,25,40,525,120,40,45,1245,1099.154564
73553,AF,F,Y14-17,DE,2017-Q1,1040,1245,370,70,25,1610,525,120,40,780,913.497072
73554,AF,F,Y14-17,DE,2017-Q2,735,780,1245,370,70,1040,1610,525,120,585,554.533535
73555,AF,F,Y14-17,DE,2017-Q3,210,585,780,1245,370,735,1040,1610,525,175,345.858228
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7203274,ZW,M,Y18-34,UK,2009-Q1,255,55,50,45,30,155,255,230,95,180,137.802353
7203275,ZW,M,Y18-34,UK,2009-Q2,455,180,55,50,45,255,155,255,230,120,305.110714
7203276,ZW,M,Y18-34,UK,2009-Q3,475,120,180,55,50,455,255,155,255,105,123.666926
7205251,ZW,M,Y35-64,UK,2009-Q1,205,45,30,20,25,105,150,125,70,155,118.993887


In [None]:
RMSE(y_pred, y_val)

In [133]:
model = joblib.load("../results/Ridge(19:57 on 2-12-2023).pkl")
cfs = pd.DataFrame(model.named_steps["model"].coef_)
cfs.index = model.named_steps['preprocess'].get_feature_names_out()
cfs[0].sort_values(ascending=False).head(40)

std__TOTAL_POS - lag 1 quarter      40.354379
std__TOTAL_APPS                     30.960840
std__TOTAL_POS - lag 3 quarters     13.258446
std__TOTAL_APPS - lag 2 quarters     9.245035
onehot__citizen_SY                   2.073114
onehot__citizen_ER                   0.444089
onehot__citizen_STLS                 0.146045
onehot__citizen_SO                   0.130571
onehot__geo_HR                       0.039161
onehot__geo_NL                       0.035456
onehot__geo_MT                       0.034956
onehot__geo_CH                       0.034744
onehot__citizen_YE                   0.034715
onehot__geo_BG                       0.033446
onehot__citizen_SK                   0.031374
onehot__citizen_SS                   0.031063
onehot__citizen_SA                   0.030582
onehot__citizen_CZ                   0.030449
onehot__citizen_KR                   0.030388
onehot__citizen_ST                   0.030306
onehot__sex_UNK                      0.030299
onehot__citizen_BH                

In [112]:
model.named_steps['preprocess'].get_feature_names_out()

array(['ord__age', 'ord__TIME_PERIOD', 'onehot__citizen_AD',
       'onehot__citizen_AE', 'onehot__citizen_AF', 'onehot__citizen_AG',
       'onehot__citizen_AL', 'onehot__citizen_AM', 'onehot__citizen_AO',
       'onehot__citizen_AR', 'onehot__citizen_AT', 'onehot__citizen_AU',
       'onehot__citizen_AZ', 'onehot__citizen_BA', 'onehot__citizen_BB',
       'onehot__citizen_BD', 'onehot__citizen_BE', 'onehot__citizen_BF',
       'onehot__citizen_BG', 'onehot__citizen_BH', 'onehot__citizen_BI',
       'onehot__citizen_BJ', 'onehot__citizen_BN', 'onehot__citizen_BO',
       'onehot__citizen_BR', 'onehot__citizen_BS', 'onehot__citizen_BT',
       'onehot__citizen_BW', 'onehot__citizen_BY', 'onehot__citizen_BZ',
       'onehot__citizen_CA', 'onehot__citizen_CD', 'onehot__citizen_CF',
       'onehot__citizen_CG', 'onehot__citizen_CH', 'onehot__citizen_CI',
       'onehot__citizen_CK', 'onehot__citizen_CL', 'onehot__citizen_CM',
       'onehot__citizen_CN', 'onehot__citizen_CO', 'onehot__cit

In [89]:
X_test

Unnamed: 0,citizen,sex,age,geo,TIME_PERIOD,TOTAL_APPS,TOTAL_POS - lag 1 quarter,TOTAL_POS - lag 2 quarters,TOTAL_POS - lag 3 quarters,TOTAL_POS - lag 4 quarters,TOTAL_APPS - lag 1 quarter,TOTAL_APPS - lag 2 quarters,TOTAL_APPS - lag 3 quarters,TOTAL_APPS - lag 4 quarters
51,AD,F,UNK,AT,2020-Q4,0,0,0,0,0,0,0,0,0
52,AD,F,UNK,AT,2021-Q1,0,0,0,0,0,0,0,0,0
53,AD,F,UNK,AT,2021-Q2,0,0,0,0,0,0,0,0,0
54,AD,F,UNK,AT,2021-Q3,0,0,0,0,0,0,0,0,0
55,AD,F,UNK,AT,2021-Q4,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7221058,ZW,UNK,Y_LT14,SK,2022-Q2,0,0,0,0,0,0,0,0,0
7221059,ZW,UNK,Y_LT14,SK,2022-Q3,0,0,0,0,0,0,0,0,0
7221060,ZW,UNK,Y_LT14,SK,2022-Q4,0,0,0,0,0,0,0,0,0
7221061,ZW,UNK,Y_LT14,SK,2023-Q1,0,0,0,0,0,0,0,0,0


In [None]:
X_trains = []
y_trains = []
X_vals = []
y_vals = []
for i in range(1, 3 + 1):
    train_portion = np.round(i * VAL_PORTION, 2)
    div_0 = new_quarters[0]
    div_1 = new_quarters[int(quarter_count * train_portion)]
    div_2 = new_quarters[int(quarter_count * (train_portion + VAL_PORTION))]

    X_trains.append(X[(div_0 <= X["TIME_PERIOD"]) & (X["TIME_PERIOD"] < div_1)])
    y_trains.append(y[(div_0 <= X["TIME_PERIOD"]) & (X["TIME_PERIOD"] < div_1)])
    X_vals.append(X[(div_1 <= X["TIME_PERIOD"]) & (X["TIME_PERIOD"] < div_2)])
    y_vals.append(y[(div_1 <= X["TIME_PERIOD"]) & (X["TIME_PERIOD"] < div_2)])

In [None]:
i_best = np.argmin(param_scores)
param_scores[i_best]
param_grid[i_best]

In [None]:
ML_algo.name

In [15]:
baselines = []
for i in range(1, 4):
    
    print("\t\t[*] fold " + str(i) + " complete (test score of " + str(score) + ")")

4          0
5          0
6          0
7          0
8          0
          ..
7221109    0
7221110    0
7221111    0
7221112    0
7221113    0
Name: TOTAL_APPS, Length: 6731478, dtype: Int64

In [None]:
#old split checking
def check_split_sizes(X, train, test, val):
    fails = 0
    print("[*] checking train test val split")
    train_set_qs = set(train["TIME_PERIOD"])
    test_set_qs = set(test["TIME_PERIOD"])
    val_set_qs = set(val["TIME_PERIOD"])
    
    #check for TIME_PERIOD overlap
    shared = (train_set_qs & test_set_qs) | (val_set_qs & test_set_qs) | (train_set_qs & val_set_qs)
    if (len(shared) != 0):
        warnings.warn('\t[-] overlap between train, test, or val time_periods')
        fails+=1
    else:
        print("\t[+] no overlap between train, test, or val TIME_PERIODS")
        
    #check for a fairly even 60/20/20 split
    NAMES = ['train', 'test ', 'val  ']
    TARGETS = [0.6, 0.2, 0.2]
    ALLOWED_FRACTION_ERROR = 0.02
    sizes = [len(train) / len(X), len(test) / len(X), len(val) / len(X)]
    for i in range(0, 3):
        if (np.abs(sizes[i] - TARGETS[i]) < ALLOWED_FRACTION_ERROR):
            print("\t[+] " + str(NAMES[i]) + " is " + str(np.round(sizes[i], 3)) + " of datapoints which is within bounds of its " + str(TARGETS[i]) + " target")
        else:
            warnings.warn("\t[-] " + str(NAMES[i]) + " is " + str(np.round(sizes[i], 3)) + " of datapoints which is out of bounds")
            fails+=1

    if (fails == 0):
        print("\t[+] \x1b[42mPASSED ALL\x1b[0m train test val split tests")
    else:
        print("\t[?] \033[91mFAILED " + str(fails) + "\033[0m train test val split tests")
        

#check_split_sizes(X, X_train, X_test, X_val)

In [128]:
preprocessor.get_feature_names_out()

NotFittedError: This ColumnTransformer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [122]:
std = [a for a in X.columns.to_list() if 'TOTAL' in a]
std

['TOTAL_APPS',
 'TOTAL_POS - lag 1 quarter',
 'TOTAL_POS - lag 2 quarters',
 'TOTAL_POS - lag 3 quarters',
 'TOTAL_POS - lag 4 quarters',
 'TOTAL_APPS - lag 1 quarter',
 'TOTAL_APPS - lag 2 quarters',
 'TOTAL_APPS - lag 3 quarters',
 'TOTAL_APPS - lag 4 quarters']

## 3. Feature Scaling

In [125]:
#****************************************************feature scaling********************************************************

ordinal_ftrs = ['age', 'TIME_PERIOD']
ordinal_cats = [['UNK','Y_LT14','Y14-17','Y18-34','Y35-64','Y_GE65'], quarters]
                                                                     #^^i'm using quarters not new_quarters here so that
                                                                     #  the model can still tell where in history this q is
onehot_ftrs = ['citizen', 'geo', 'sex']
#onehot_ftrs = ['geo', 'sex']
minmax_ftrs = []
std_ftrs = [a for a in X.columns.to_list() if 'TOTAL' in a]

# collect all the encoders
preprocessor = ColumnTransformer(
    transformers=[
        ('ord', OrdinalEncoder(categories = ordinal_cats), ordinal_ftrs),
        ('onehot', OneHotEncoder(sparse_output=False,handle_unknown='ignore'), onehot_ftrs),
        ('minmax', MinMaxScaler(), minmax_ftrs),
        ('std', StandardScaler(), std_ftrs)])

#clf = Pipeline(steps=[('preprocessor', preprocessor)]) # for now we only preprocess 
                                                       # later on we will add other steps here

#X_train_prep = clf.fit_transform(X_train)
#X_val_prep = clf.transform(X_val)
#X_test_prep = clf.transform(X_test)

#print(X_train.shape)
#print(X_train_prep.shape)
#print(X_train_prep)
#X_prep

In [None]:
from sklearn.linear_model import Ridge
ridge_param_grid = {
            'model__alpha': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100], # no upper bound so the values are evenly spaced in log
            } 

ridge_test_scores, ridge_best_models = MLpipe_RMSE(preprocessor, Ridge, ridge_param_grid)

In [None]:
#****************************************************training models****************************************************

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.model_selection import ParameterGrid
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error

# function for the ML pipeline as outlined above 
def MLpipe_RMSE(preprocessor, ML_algo, param_grid):
    '''
    !This function splits the data to other/test (80/20) and then applies KFold with 4 folds to other.
    !The RMSE is minimized in cross-validation.

    !You should:

    !1. Loop through 10 different random states
    !2. Split your data 
    !3. Fit a model using GridSearchCV with KFold and the predefined Preprocessor 
    !4. Calculate the model's error on the test set 
    !5. Return a list of 10 test scores and 10 best models 
    '''
    
    # lists to be returned 
    test_scores = []
    best_models = []
    
    # your code here...
    for RANDOM_STATE in range(0, 10):
        pipe = Pipeline(steps=[
                ('preprocess', preprocessor),
                ('model', ML_algo())
            ])

        grid = GridSearchCV(pipe, param_grid=param_grid,scoring = 'neg_root_mean_squared_error', \
                            return_train_score = True, n_jobs=-2, verbose=True)

        grid.fit(X_val, y_val)
        
        results = pd.DataFrame(grid.cv_results_)
        
        best_models.append(grid)
        y_pred = grid.predict(X_test)
        test_scores.append(np.sqrt(mean_squared_error(y_pred, y_test)))

    return test_scores, best_models