# Splitting, Preprocessing and Model Development
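This notebook is used for building lagged features, splitting the data chronologically, defining the preprocessing pipeline, and training and tuning the regression models.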


### Declaring Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import ticker
from matplotlib.colors import LogNorm, Normalize
import sklearn
import time
import datetime
import joblib
import warnings
from sklearn.model_selection import ParameterGrid
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

### Color Palette & Typeface Sizing

In [2]:
# Named hex colours (presumably for plots; no plotting code is visible in this section).
YELLOW = '#F2DC5D'
GREEN = '#9BC53D'
DARK_GREEN = '#597222'
RED = '#C3423F'
LIGHT_BLUE = '#2596BE'
GRAY = '#666666'

# Text sizes and default (width, height) figure size -- presumably matplotlib points / inches.
AXIS_SIZE = 12
TITLE_SIZE = 16
DESCRIPTION_SIZE = 9
FIGURE_SIZE = (10*2/3,6*2/3)

# Notebook-wide seed value; MLPipe_TimeSeries_RMSE also defines a local with the same value.
RANDOM_STATE = 14

### Import Dataframe

In [3]:
#****************************************************import dataset****************************************************
# Identifier columns are read as nullable strings and count columns as nullable integers.
# keep_default_na=False with na_values=['nan'] means only the literal text 'nan' is parsed as
# missing -- presumably so short codes such as 'NA' survive as strings (verify against the data).
df = pd.read_csv('../data/final.csv', dtype={'citizen': 'string', 'sex': 'string', 'age': 'string', 'decision': 'string', 'geo': 'string', 'TIME_PERIOD': 'string', 'GENCONV': "Int64", 'HUMSTAT': "Int64", 'SUB_PROT': "Int64", 'REJECTED': "Int64", 'TOTAL_POS': "Int64", 'TOTAL_APPS': "Int64", "POS_RATE": "Float64"}, keep_default_na=False, na_values=['nan'])

# Remove the partial 2023-Q3 data so every remaining quarter is complete.
df = df[df["TIME_PERIOD"] != "2023-Q3"]

df

Unnamed: 0,citizen,sex,age,geo,TIME_PERIOD,GENCONV,HUMSTAT,SUB_PROT,REJECTED,TOTAL_POS,TOTAL_APPS
0,AD,F,UNK,AT,2008-Q1,0,0,0,0,0,0
1,AD,F,UNK,AT,2008-Q2,0,0,0,0,0,0
2,AD,F,UNK,AT,2008-Q3,0,0,0,0,0,0
3,AD,F,UNK,AT,2008-Q4,0,0,0,0,0,0
4,AD,F,UNK,AT,2009-Q1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
7221109,ZW,UNK,Y_LT14,UK,2019-Q3,0,0,0,0,0,0
7221110,ZW,UNK,Y_LT14,UK,2019-Q4,0,0,0,0,0,0
7221111,ZW,UNK,Y_LT14,UK,2020-Q1,0,0,0,0,0,0
7221112,ZW,UNK,Y_LT14,UK,2020-Q2,0,0,0,0,0,0


## 1. Introduce Lagged Features

In [4]:
#****************************************************re-sort dataframe****************************************************

# Sort so the rows of each (citizen, sex, age, geo) series sit next to each other in
# TIME_PERIOD order; the lagging step below works on adjacent rows (Series.shift).
sort_order = ['citizen', 'sex', 'age', 'geo', 'TIME_PERIOD']
df = df.sort_values(by =sort_order) 

#*********************************************create sequential list of quarters****************************************************

# Every quarter label from 2008-Q1 through 2023-Q4, in chronological order.
quarters = [
    str(year) + suffix
    for year in range(2008, 2024)
    for suffix in ("-Q1", "-Q2", "-Q3", "-Q4")
]

#****************************************************lagged features****************************************************

# Number of previous quarters exposed as lag features (4 quarters x 1 year).
QUARTERS_OF_LAG = (4 * 1)

def add_lagged_features(df, features, maintained_columns, QUARTERS_OF_LAG):
    """Append lagged copies of ``features`` and drop rows lacking full history.

    For every name in ``features``, ``QUARTERS_OF_LAG`` new ``Int64`` columns
    are added, named ``"<feature> - lag 1 quarter"``,
    ``"<feature> - lag 2 quarters"``, ... . Lag ``k`` holds the value found
    ``k`` rows above, so ``df`` must already be ordered such that each series
    is contiguous and chronological. Lags are positional: a missing period
    inside a series is not detected.

    A "run" is a maximal stretch of consecutive rows whose values in all
    ``maintained_columns`` are identical. The first ``QUARTERS_OF_LAG`` rows
    of every run are removed, because their lag columns would contain values
    from the preceding run (or be missing). Runs are judged on the input
    order, so a key that reappears after an interruption starts a new run
    and never inherits history across the gap.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    features : list of str
        Columns to lag. Values must be castable to ``Int64``.
    maintained_columns : list of str
        Non-empty list of columns that identify a series.
    QUARTERS_OF_LAG : int
        Number of lag columns per feature and number of leading rows dropped
        per run.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the lag columns, restricted to rows with full history.
        Original index labels are preserved.
    """
    def lagged_features(target_var, lag_count, unit):
        # Collect every shifted copy first and concatenate once, instead of
        # growing a DataFrame inside the loop.
        name = target_var.name
        columns = []
        shifted = []
        for i in range(1, lag_count + 1):
            plural = "" if i == 1 else "s"
            columns.append(name + " - lag " + str(i) + " " + str(unit) + plural)
            shifted.append(target_var.shift(i))
        if not shifted:
            return pd.DataFrame(index=target_var.index)
        lagged = pd.concat(shifted, axis=1)
        lagged.columns = columns
        return lagged.astype('Int64')

    # introduce lag for each feature
    lag_frames = [lagged_features(df[f], QUARTERS_OF_LAG, "quarter") for f in features]
    df_lagged = pd.concat([df] + lag_frames, axis=1)

    # A row continues its run when every maintained column equals the row
    # above. Missing comparisons (first row, NA keys) count as "not equal".
    is_continuation = np.ones(len(df_lagged), dtype=bool)
    for column in maintained_columns:
        key = df_lagged[column]
        is_continuation &= key.eq(key.shift()).fillna(False).to_numpy(dtype=bool)

    # Position of each row inside its run: row number minus the row number
    # at which its run started.
    run_starts = np.flatnonzero(~is_continuation)
    run_id = np.cumsum(~is_continuation) - 1
    position_in_run = np.arange(len(df_lagged)) - run_starts[run_id]

    # remove all rows with less than the lag amount of historical data
    return df_lagged[position_in_run >= QUARTERS_OF_LAG]

# Lag both count columns; a series is identified by (citizen, age, sex, geo).
df_lagged = add_lagged_features(df, ["TOTAL_POS", "TOTAL_APPS"], ['citizen', 'age', 'sex', 'geo'], QUARTERS_OF_LAG)

df_lagged

Unnamed: 0,citizen,sex,age,geo,TIME_PERIOD,GENCONV,HUMSTAT,SUB_PROT,REJECTED,TOTAL_POS,TOTAL_APPS,TOTAL_POS - lag 1 quarter,TOTAL_POS - lag 2 quarters,TOTAL_POS - lag 3 quarters,TOTAL_POS - lag 4 quarters,TOTAL_APPS - lag 1 quarter,TOTAL_APPS - lag 2 quarters,TOTAL_APPS - lag 3 quarters,TOTAL_APPS - lag 4 quarters
4,AD,F,UNK,AT,2009-Q1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,AD,F,UNK,AT,2009-Q2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,AD,F,UNK,AT,2009-Q3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,AD,F,UNK,AT,2009-Q4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,AD,F,UNK,AT,2010-Q1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7221109,ZW,UNK,Y_LT14,UK,2019-Q3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7221110,ZW,UNK,Y_LT14,UK,2019-Q4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7221111,ZW,UNK,Y_LT14,UK,2020-Q1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7221112,ZW,UNK,Y_LT14,UK,2020-Q2,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Lagged Feature Testing

In [5]:
#***********************************************testing equivalence function****************************************************
def equivalent_dfs(df1, df2):
    """Return a truthy value when both frames hold equal values in every cell.

    Index labels are ignored: both frames are renumbered from zero before the
    element-wise comparison.
    """
    left = df1.reset_index(drop=True)
    right = df2.reset_index(drop=True)
    cellwise_equal = left == right
    return cellwise_equal.all().all()

#****************************************************lagging testing****************************************************
# Inline regression tests for add_lagged_features. Each tdfN_data literal is either an
# input fixture or the exact frame expected back for a given lag count.
print("[*] testing lagging function")
#simple tests
# One series ('AR') with five consecutive periods; with lag k the first k rows must vanish.
tdf0_data = [['AR', 1, 1], ['AR', 2, 2], ['AR', 3, 3], ['AR', 4, 4], ['AR', 5, 5]]
tdf0 = pd.DataFrame(data=tdf0_data, columns=['citizen', 'TIME_PERIOD', 'val'])
tdf1_data = [['AR', 2, 2, 1], ['AR', 3, 3, 2], ['AR', 4, 4, 3], ['AR', 5, 5, 4]]
tdf1 = pd.DataFrame(data=tdf1_data, columns=['citizen', 'TIME_PERIOD', 'val', 'val - lag 1 quarter'])
tdf2_data = [['AR', 3, 3, 2, 1], ['AR', 4, 4, 3, 2], ['AR', 5, 5, 4, 3]]
tdf2 = pd.DataFrame(data=tdf2_data, columns=['citizen', 'TIME_PERIOD', 'val', 'val - lag 1 quarter', 'val - lag 2 quarters'])
tdf3_data = [['AR', 4, 4, 3, 2, 1], ['AR', 5, 5, 4, 3, 2]]
tdf3 = pd.DataFrame(data=tdf3_data, columns=['citizen', 'TIME_PERIOD', 'val', 'val - lag 1 quarter', 'val - lag 2 quarters', 'val - lag 3 quarters'])

assert equivalent_dfs(add_lagged_features(tdf0, ["val"], ['citizen'], 1), 
                      tdf1), "basic lag test failed, shift=1"
assert equivalent_dfs(add_lagged_features(tdf0, ["val"], ['citizen'], 2), 
                      tdf2), "basic lag test failed, shift=2"
assert equivalent_dfs(add_lagged_features(tdf0, ["val"], ['citizen'], 3), 
                      tdf3), "basic lag test failed, shift=3"
print("\t[+] basic lag tests passed")

#more advanced tests
# Two series ('AU', 'NZ') covering the same periods: lags must not leak from AU into NZ.
tdf4_data = [['AU', 1, 1], ['AU', 2, 2], ['AU', 3, 3], ['AU', 4, 4], ['AU', 5, 5],
             ['NZ', 1, 11], ['NZ', 2, 12], ['NZ', 3, 13], ['NZ', 4, 14], ['NZ', 5, 15]]
tdf4 = pd.DataFrame(data=tdf4_data, columns=['citizen', 'TIME_PERIOD', 'val'])
tdf5_data = [['AU', 2, 2, 1], ['AU', 3, 3, 2], ['AU', 4, 4, 3], ['AU', 5, 5, 4],
             ['NZ', 2, 12, 11], ['NZ', 3, 13, 12], ['NZ', 4, 14, 13], ['NZ', 5, 15, 14]]
tdf5 = pd.DataFrame(data=tdf5_data, columns=['citizen', 'TIME_PERIOD', 'val', 'val - lag 1 quarter'])
tdf6_data = [['AU', 3, 3, 2, 1], ['AU', 4, 4, 3, 2], ['AU', 5, 5, 4, 3],
             ['NZ', 3, 13, 12, 11], ['NZ', 4, 14, 13, 12], ['NZ', 5, 15, 14, 13]]
tdf6 = pd.DataFrame(data=tdf6_data, columns=['citizen', 'TIME_PERIOD', 'val', 'val - lag 1 quarter', 'val - lag 2 quarters'])


assert equivalent_dfs(add_lagged_features(tdf4, ["val"], ['citizen'], 1), 
                      tdf5), "lag with shared times test failed, shift=1"
assert equivalent_dfs(add_lagged_features(tdf4, ["val"], ['citizen'], 2), 
                      tdf6), "lag with shared times test failed, shift=2"
print("\t[+] tests with shared times passed")

#very complicated tests
# Series that start at different periods and have different lengths. With lag 4 a series
# needs at least five rows to contribute anything, so NZ (4 rows) and WS (2 rows) disappear.
tdf7_data = [['AU', 1, 1], ['AU', 2, 2], ['AU', 3, 3], ['AU', 4, 4], ['AU', 5, 5],
             ['NZ', 2, 12], ['NZ', 3, 13], ['NZ', 4, 14], ['NZ', 5, 15]]
tdf7 = pd.DataFrame(data=tdf7_data, columns=['citizen', 'TIME_PERIOD', 'val'])
tdf8_data = [['AU', 2, 2, 1], ['AU', 3, 3, 2], ['AU', 4, 4, 3], ['AU', 5, 5, 4],
             ['NZ', 3, 13, 12], ['NZ', 4, 14, 13], ['NZ', 5, 15, 14]]
tdf8 = pd.DataFrame(data=tdf8_data, columns=['citizen', 'TIME_PERIOD', 'val', 'val - lag 1 quarter'])
tdf9_data = [['AU', 3, 3, 2, 1], ['AU', 4, 4, 3, 2], ['AU', 5, 5, 4, 3],
             ['NZ', 4, 14, 13, 12], ['NZ', 5, 15, 14, 13]]
tdf9 = pd.DataFrame(data=tdf9_data, columns=['citizen', 'TIME_PERIOD', 'val', 'val - lag 1 quarter', 'val - lag 2 quarters'])
tdf10_data = [['AU', 1, 1], ['AU', 2, 2], ['AU', 3, 3], ['AU', 4, 4], ['AU', 5, 5],
              ['NZ', 2, 12], ['NZ', 3, 13], ['NZ', 4, 14], ['NZ', 5, 15],
              ['FJ', 3, 23], ['FJ', 4, 24], ['FJ', 5, 25], ['FJ', 6, 26], ['FJ', 7, 27], ['FJ', 8, 28],
              ['WS', 4, 34], ['WS', 5, 35]]
tdf10 = pd.DataFrame(data=tdf10_data, columns=['citizen', 'TIME_PERIOD', 'new_val'])
tdf11_data = [['AU', 2, 2, 1], ['AU', 3, 3, 2], ['AU', 4, 4, 3], ['AU', 5, 5, 4],
              ['NZ', 3, 13, 12], ['NZ', 4, 14, 13], ['NZ', 5, 15, 14],
              ['FJ', 4, 24, 23], ['FJ', 5, 25, 24], ['FJ', 6, 26, 25], ['FJ', 7, 27, 26], ['FJ', 8, 28, 27],
              ['WS', 5, 35, 34]]
tdf11 = pd.DataFrame(data=tdf11_data, columns=['citizen', 'TIME_PERIOD', 'new_val', 'new_val - lag 1 quarter'])
tdf12_data = [['AU', 5, 5, 4, 3, 2, 1],
              ['FJ', 7, 27, 26, 25, 24, 23], ['FJ', 8, 28, 27, 26, 25, 24]]
tdf12 = pd.DataFrame(data=tdf12_data, columns=['citizen', 'TIME_PERIOD', 'new_val', 'new_val - lag 1 quarter', 'new_val - lag 2 quarters', 'new_val - lag 3 quarters', 'new_val - lag 4 quarters'])

assert equivalent_dfs(add_lagged_features(tdf7, ["val"], ['citizen'], 1), 
                      tdf8), "lag with 2 different start times test failed, shift=1"
assert equivalent_dfs(add_lagged_features(tdf7, ["val"], ['citizen'], 2), 
                      tdf9), "lag with 2 different start times test failed, shift=2"
assert equivalent_dfs(add_lagged_features(tdf10, ["new_val"], ['citizen'], 1), 
                      tdf11), "lag with many different start times test failed, shift=1"
assert equivalent_dfs(add_lagged_features(tdf10, ["new_val"], ['citizen'], 4), 
                      tdf12), "lag with many different start times test failed, shift=4"
print("\t[+] tests with multiple start and end dates passed")

# \x1b[42m ... \x1b[0m are ANSI escape codes that give the text a green background.
print("\t[+] \x1b[42mPASSED ALL\x1b[0m lagging tests")

[*] testing lagging function
	[+] basic lag tests passed
	[+] tests with shared times passed
	[+] tests with multiple start and end dates passed
	[+] [42mPASSED ALL[0m lagging tests


## 2. Dataset Splitting

In [6]:
#****************************************************splitting****************************************************
# Column the models are trained to predict.
TARGET_VAR = "TOTAL_POS"

y = df_lagged[TARGET_VAR]
# Drop the target and the per-outcome decision counts from the feature matrix.
# NOTE(review): the same-quarter TOTAL_APPS column stays in X -- confirm this is intended
# and not target leakage.
X = df_lagged.drop(['GENCONV', 'HUMSTAT', 'SUB_PROT', 'REJECTED', 'TOTAL_POS'], axis=1)
#PLAN:
#of 62 quarters...
#QUARTERS_OF_LAG are lost bc they wont have the needed lagged features
    
# Quarter labels that can still appear after lagging (the first QUARTERS_OF_LAG labels are
# skipped). The "YYYY-Qn" labels compare chronologically as plain strings.
new_quarters = [q for q in quarters if q >= quarters[QUARTERS_OF_LAG]]
# Index of the last entry of new_quarters; split boundaries are fractions of this index.
quarter_count = len(new_quarters) - 1

# Intended chronological shares of the train / validation / test windows.
TRAIN_PORTION = 0.6
VAL_PORTION = 0.2
TEST_PORTION = 0.2

#take out last portion of quarters for testing
#div_0 = new_quarters[0]
#div_1 = new_quarters[int(quarter_count * train_split)]
# div_2 is the first test quarter; div_3 is the last label and acts as an exclusive upper
# bound, so rows labelled with the final entry of new_quarters are not part of the test set.
div_2 = new_quarters[int(quarter_count * (1 - TEST_PORTION))]
div_3 = new_quarters[quarter_count]

#separate out test section
X_test = X[(div_2 <= X["TIME_PERIOD"]) & (X["TIME_PERIOD"] < div_3)]
y_test = y[(div_2 <= X["TIME_PERIOD"]) & (X["TIME_PERIOD"] < div_3)]

### Evaluation Functions (for cross-validation)

In [7]:
def RMSE(y_pred, y_true):
    """Root-mean-squared error between predictions and true values.

    Both arguments must support element-wise arithmetic (NumPy arrays or
    pandas Series); the squared-error total is divided by ``len(y_pred)``.
    """
    residuals = y_pred - y_true
    squared_error_total = (residuals ** 2).sum()
    return np.sqrt(squared_error_total / len(y_pred))

def BASELINE_RMSE(y_true):
    """Return the reference RMSE, currently the all-zero-prediction baseline."""
    baseline_score = ALL_ZERO_BASELINE(y_true)
    return baseline_score

def ALL_ZERO_BASELINE(y_true):
    """RMSE obtained by predicting 0 for every element of ``y_true``."""
    squared_total = (y_true ** 2).sum()
    return np.sqrt(squared_total / len(y_true))

def ALL_ONE_BASELINE(y_true):
    """RMSE obtained by predicting 1 for every element of ``y_true``."""
    residuals = 1 - y_true
    squared_total = (residuals ** 2).sum()
    return np.sqrt(squared_total / len(y_true))

def ALL_MEAN_BASELINE(y_train, y_true):
    """RMSE obtained by predicting the mean of ``y_train`` for every element of ``y_true``."""
    constant_prediction = y_train.mean()
    residuals = constant_prediction - y_true
    return np.sqrt((residuals ** 2).sum() / len(y_true))

### Helper Functions

In [8]:
def TIMESTAMP_STR():
    """Return the current local time as ``"H:MM on D-M-YYYY"``.

    Only the minute is zero-padded; hour, day and month are written as-is.
    """
    now = datetime.datetime.now()
    fields = {
        "hour": now.hour,
        "minute": now.minute,
        "day": now.day,
        "month": now.month,
        "year": now.year,
    }
    return "{hour}:{minute:02.0f} on {day}-{month}-{year}".format(**fields)

def DF_ALL_PREDICTED(model):
    """Return the rows of ``df`` that have features in ``X``, with predictions attached.

    ``X`` is derived from the lag-filtered frame, so it holds fewer rows than
    ``df``; assigning ``model.predict(X)`` straight onto ``df`` would fail on
    the length mismatch. The rows are therefore selected by ``X``'s index
    labels, which are carried over unchanged from ``df``. A copy is used so
    the notebook-level ``df`` is left untouched.

    Parameters
    ----------
    model : fitted estimator or pipeline exposing ``predict``.

    Returns
    -------
    pandas.DataFrame
        Copy of the matching ``df`` rows plus a ``TOTAL_APPS_PRED`` column.
    """
    # NOTE(review): the column is called TOTAL_APPS_PRED although the notebook's
    # target is TOTAL_POS -- confirm the intended name before relying on it.
    df_new = df.loc[X.index].copy()
    df_new["TOTAL_APPS_PRED"] = model.predict(X)
    return df_new

# Toggle for the timing messages printed by Status_Update.
UPDATE = True
def Status_Update(t, message):
    """Print how long a step took and return a timestamp for the next step.

    Parameters
    ----------
    t : float
        ``time.time()`` value taken when the step started.
    message : object
        Label for the step; converted with ``str``.

    Returns
    -------
    float
        The clock reading used for the elapsed-time message, so chained calls
        do not lose the time spent printing.
    """
    now = time.time()
    if UPDATE:
        print(str(message) + " \t-- in " + str(now - t) + "s")
    return now

### General Model-Training Function

In [11]:
def MLPipe_TimeSeries_RMSE(X, y, preprocessor, ML_algo, param_grid):
    """Grid-search ``ML_algo`` with expanding-window time-series validation.

    The number of folds is ``round(TRAIN_PORTION / VAL_PORTION)``. Fold ``i``
    (1-based) trains on the quarters from ``new_quarters[0]`` up to (but not
    including) the label at fraction ``i * VAL_PORTION`` of ``quarter_count``,
    and validates on the following ``VAL_PORTION`` slice. The folds are built
    once and reused for every hyperparameter set.

    Each hyperparameter set is scored by its mean validation RMSE over the
    folds. Because RMSE is never negative, a set is abandoned as soon as the
    sum of its fold scores exceeds ``best_mean * NUM_FOLDS``: its mean can no
    longer beat the best one found so far. Abandoned sets get a score of
    ``inf``.

    The pipeline fitted on the last fold of the best set is written with
    ``joblib.dump`` to ``../results/<algo>(<timestamp>).pkl`` each time a new
    best is found, and once more at the end.

    Relies on notebook-level names: ``TRAIN_PORTION``, ``VAL_PORTION``,
    ``new_quarters``, ``quarter_count``, ``TIMESTAMP_STR``, ``Status_Update``
    and ``RMSE``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing a ``TIME_PERIOD`` column.
    y : pandas.Series
        Target values, row-aligned with ``X``.
    preprocessor : transformer
        First pipeline step (the same object is reused for every fit).
    ML_algo : class
        Estimator class; instantiated as ``ML_algo(**params)``, with
        ``random_state`` added for everything except ``SVR``.
    param_grid : iterable of dict supporting integer indexing
        For example ``sklearn.model_selection.ParameterGrid``.

    Returns
    -------
    None
    """
    ALGO_NAME = ML_algo.__name__
    MODEL_NAME = "{algo}({time})".format(algo=ALGO_NAME, time=TIMESTAMP_STR())
    NUM_FOLDS = np.round(TRAIN_PORTION / VAL_PORTION).astype(int)
    RANDOM_STATE = 14
    path = "../results/" + MODEL_NAME + ".pkl"
    print("[!] looking at model: " + str(MODEL_NAME))

    print("\t[*] doing initial dataset splitting")

    # The folds do not depend on the hyperparameters, so slice the data once.
    X_trains = []
    y_trains = []
    X_vals = []
    y_vals = []
    periods = X["TIME_PERIOD"]
    for i in range(1, NUM_FOLDS + 1):
        train_portion = np.round(i * VAL_PORTION, 2)
        div_0 = new_quarters[0]
        div_1 = new_quarters[int(quarter_count * train_portion)]
        div_2 = new_quarters[int(quarter_count * (train_portion + VAL_PORTION))]

        in_train = (div_0 <= periods) & (periods < div_1)
        in_val = (div_1 <= periods) & (periods < div_2)
        X_trains.append(X[in_train])
        y_trains.append(y[in_train])
        X_vals.append(X[in_val])
        y_vals.append(y[in_val])
        print("\t\t[+] fold " + str(i))

    param_scores = []
    best_model = None  # only the current best pipeline is kept in memory
    for p in param_grid:
        T_PARAM_START = time.time()
        fold_scores = []
        pruned = False
        print("\t[*] looking at hyperparameters " + str(p))
        for i in range(1, NUM_FOLDS + 1):
            T_FOLD_START = time.time()

            # Early abandon: even perfect scores on the remaining folds could
            # not bring this set's mean below the best mean seen so far.
            if param_scores and fold_scores:
                best_param_score = param_scores[np.argmin(param_scores)]
                if ((best_param_score * NUM_FOLDS) < np.sum(fold_scores)):
                    print("[!] best average score I could get with " + str(np.sum(fold_scores)) + " score already is worse than a previous params score of " + str(best_param_score) + ", so giving up")
                    pruned = True
                    break

            mini_t = time.time()

            X_train = X_trains[i - 1]
            y_train = y_trains[i - 1]
            X_val = X_vals[i - 1]
            y_val = y_vals[i - 1]

            mini_t = Status_Update(mini_t, "split data")

            #make pipeline
            # SVR does not accept random_state; every other estimator used here does.
            if (ALGO_NAME == "SVR"):
                algo = ML_algo(**p)
            else:
                algo = ML_algo(**p, random_state = RANDOM_STATE)

            pipe = Pipeline(steps=[
                        ('preprocess', preprocessor),
                        ('model', algo)
                    ], verbose=True)

            mini_t = Status_Update(mini_t, "made pipeline")

            pipe.fit(X_train, y_train)
            mini_t = Status_Update(mini_t, "fit pipeline")

            y_pred = pipe.predict(X_val)
            score = RMSE(y_pred, y_val)

            fold_scores.append(score)
            T_FOLD_END = time.time()
            T_FOLD_ELAPSED = T_FOLD_END - T_FOLD_START
            mini_t = Status_Update(mini_t, "eval pipeline")
            print("\t\t[*] fold " + str(i) + " complete (test score of " + str(score) + ") -- in " + str(np.round(T_FOLD_ELAPSED, 3)) + "s")

        T_PARAM_END = time.time()
        T_PARAM_ELAPSED = T_PARAM_END - T_PARAM_START

        if pruned:
            # Keep param_scores index-aligned with param_grid; inf can never win.
            param_scores.append(np.inf)
            print("\t\t[-] params abandoned after " + str(len(fold_scores)) + " fold(s) -- in " + str(np.round(T_PARAM_ELAPSED, 3)) + "s")
            continue

        score = np.mean(fold_scores)
        param_scores.append(score)
        print("\t\t[+] final score for params of " + str(score) + " -- in " + str(np.round(T_PARAM_ELAPSED, 3)) + "s")

        if (np.argmin(param_scores) == (len(param_scores) - 1)):
            # pipe is the pipeline fitted on the last (largest) training fold.
            best_model = pipe
            print("\t\t[!] new best param configuration, so saving model to path")
            joblib.dump(best_model, path, compress = 1)
            print("\t\t\t[+] saved model to " + str(path))


    i_best = np.argmin(param_scores)
    best_score = param_scores[i_best]
    best_params = param_grid[i_best]
    print("\t[+] best param configuration of " + str(best_params) + " found with score " + str(best_score))
    joblib.dump(best_model, path, compress = 1)
    print("\t[+] saved model to " + str(path))

### Importing Models

In [9]:
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.svm import LinearSVR

#MODELS TO TEST:
#[IP] LASSO
#[T] RIDGE
#[Q] ELASTIC NET
#[IP] RF
#[] SVR
#[] XGBoost
#[] KNN??

#T: Trained
#IP: In Progress
#Q: Queued

### Training Linear Models

In [None]:
# NOTE: `preprocessor` is defined in the "Feature Scaling" cell near the end of the notebook;
# that cell must be run before this one.
LASSO_PARAM_GRID = ParameterGrid({
            'alpha': np.geomspace(0.01, 10000, 12), # 12 log-spaced values from 1e-2 to 1e4
            })

ELASTIC_PARAM_GRID = ParameterGrid({
            'alpha': np.geomspace(0.1, 10000, 12), # 12 log-spaced values from 1e-1 to 1e4
            'l1_ratio': np.linspace(0.2, 1, 9) # 0.2, 0.3, ..., 1.0
            })

#MLPipe_TimeSeries_RMSE(X, y, preprocessor, ElasticNet, ELASTIC_PARAM_GRID)

MLPipe_TimeSeries_RMSE(X, y, preprocessor, Lasso, LASSO_PARAM_GRID)

In [14]:
# First LinearSVR sweep: regularisation strength C from 0.1 to 100.
LINEARSVR_PARAM_GRID = ParameterGrid({
    #'gamma': [1e-3, 1e-1, 1e1, 1e3, 1e5], #don't need gamma for linear
    'C': [1e-1, 1e0, 1e1, 1e2],
    'dual': ["auto"]
})

MLPipe_TimeSeries_RMSE(X, y, preprocessor, LinearSVR, LINEARSVR_PARAM_GRID)

[!] looking at model: LinearSVR(13:54 on 3-12-2023)
	[*] doing initial dataset splitting
		[+] fold 1
		[+] fold 2
		[+] fold 3
	[*] looking at hyperparameters {'C': 0.1, 'dual': 'auto'}
split data 	-- in 2.7179718017578125e-05s
made pipeline 	-- in 0.0001919269561767578s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=   4.0s




[Pipeline] ............. (step 2 of 2) Processing model, total= 5.6min
fit pipeline 	-- in 337.15336298942566s
eval pipeline 	-- in 5.750520706176758s
		[*] fold 1 complete (test score of 4.7585265430888875) -- in 342.904s
split data 	-- in 6.9141387939453125e-06s
made pipeline 	-- in 0.00014901161193847656s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=   8.5s




[Pipeline] ............. (step 2 of 2) Processing model, total=12.8min
fit pipeline 	-- in 773.7149360179901s
eval pipeline 	-- in 6.197703123092651s
		[*] fold 2 complete (test score of 29.199649097801224) -- in 779.913s
split data 	-- in 1.6689300537109375e-06s
made pipeline 	-- in 0.00013208389282226562s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  14.3s




[Pipeline] ............. (step 2 of 2) Processing model, total=22.3min
fit pipeline 	-- in 1354.658942937851s
eval pipeline 	-- in 6.7380900382995605s
		[*] fold 3 complete (test score of 6.8515689171603755) -- in 1361.397s
		[+] final score for params of 13.60324818601683 -- in 2484.216s
		[!] new best param configuration, so saving model to path
			[+] saved model to ../results/LinearSVR(13:54 on 3-12-2023).pkl
	[*] looking at hyperparameters {'C': 1.0, 'dual': 'auto'}
split data 	-- in 1.2159347534179688e-05s
made pipeline 	-- in 0.0001010894775390625s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=   4.0s




[Pipeline] ............. (step 2 of 2) Processing model, total= 5.4min
fit pipeline 	-- in 328.12718296051025s
eval pipeline 	-- in 5.5861921310424805s
		[*] fold 1 complete (test score of 4.755031551385784) -- in 333.714s
split data 	-- in 1.0013580322265625e-05s
made pipeline 	-- in 0.00011205673217773438s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=   8.4s




[Pipeline] ............. (step 2 of 2) Processing model, total=12.5min
fit pipeline 	-- in 761.6263680458069s
eval pipeline 	-- in 5.644953012466431s
		[*] fold 2 complete (test score of 28.970044657679857) -- in 767.271s
split data 	-- in 1.9073486328125e-06s
made pipeline 	-- in 0.0001239776611328125s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  13.3s




[Pipeline] ............. (step 2 of 2) Processing model, total=20.5min
fit pipeline 	-- in 1245.1851398944855s
eval pipeline 	-- in 5.78561806678772s
		[*] fold 3 complete (test score of 6.836381830138532) -- in 1250.971s
		[+] final score for params of 13.520486013068059 -- in 2351.957s
		[!] new best param configuration, so saving model to path
			[+] saved model to ../results/LinearSVR(13:54 on 3-12-2023).pkl
	[*] looking at hyperparameters {'C': 10.0, 'dual': 'auto'}
split data 	-- in 5.0067901611328125e-06s
made pipeline 	-- in 2.8133392333984375e-05s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=   3.8s




[Pipeline] ............. (step 2 of 2) Processing model, total= 5.8min
fit pipeline 	-- in 353.29660987854004s
eval pipeline 	-- in 5.739134073257446s
		[*] fold 1 complete (test score of 4.455386002093071) -- in 359.036s
split data 	-- in 2.86102294921875e-06s
made pipeline 	-- in 0.00010800361633300781s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=   8.4s




[Pipeline] ............. (step 2 of 2) Processing model, total=12.5min
fit pipeline 	-- in 759.2692489624023s
eval pipeline 	-- in 5.650629758834839s
		[*] fold 2 complete (test score of 30.058329713681175) -- in 764.92s
split data 	-- in 5.0067901611328125e-06s
made pipeline 	-- in 9.012222290039062e-05s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  13.3s




[Pipeline] ............. (step 2 of 2) Processing model, total=20.9min
fit pipeline 	-- in 1269.294716835022s
eval pipeline 	-- in 5.786826133728027s
		[*] fold 3 complete (test score of 6.812597098811412) -- in 1275.082s
		[+] final score for params of 13.775437604861885 -- in 2399.039s
	[*] looking at hyperparameters {'C': 100.0, 'dual': 'auto'}
split data 	-- in 1.0967254638671875e-05s
made pipeline 	-- in 5.507469177246094e-05s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=   3.7s




[Pipeline] ............. (step 2 of 2) Processing model, total= 5.8min
fit pipeline 	-- in 350.72221088409424s
eval pipeline 	-- in 5.439251184463501s
		[*] fold 1 complete (test score of 3.9362070485442735) -- in 356.162s
split data 	-- in 7.152557373046875e-06s
made pipeline 	-- in 0.00012493133544921875s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=   8.2s




[Pipeline] ............. (step 2 of 2) Processing model, total=12.8min
fit pipeline 	-- in 778.549418926239s
eval pipeline 	-- in 5.476837873458862s
		[*] fold 2 complete (test score of 28.689300528697444) -- in 784.026s
split data 	-- in 9.5367431640625e-07s
made pipeline 	-- in 9.775161743164062e-05s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  12.7s




[Pipeline] ............. (step 2 of 2) Processing model, total=21.2min
fit pipeline 	-- in 1283.8037250041962s
eval pipeline 	-- in 5.636337995529175s
		[*] fold 3 complete (test score of 7.145846830300674) -- in 1289.44s
		[+] final score for params of 13.257118135847465 -- in 2429.629s
		[!] new best param configuration, so saving model to path
			[+] saved model to ../results/LinearSVR(13:54 on 3-12-2023).pkl
	[+] best param configuration of {'dual': 'auto', 'C': 100.0} found with score 13.257118135847465
	[+] saved model to ../results/LinearSVR(13:54 on 3-12-2023).pkl


In [15]:
# Second LinearSVR sweep: the previous run's best C (100) sat on the upper edge of its grid,
# so larger values are tried here. This reassigns LINEARSVR_PARAM_GRID.
LINEARSVR_PARAM_GRID = ParameterGrid({
    #'gamma': [1e-3, 1e-1, 1e1, 1e3, 1e5], #don't need gamma for linear
    'C': [1e3, 1e4, 1e5],
    'dual': ["auto"]
})

MLPipe_TimeSeries_RMSE(X, y, preprocessor, LinearSVR, LINEARSVR_PARAM_GRID)

[!] looking at model: LinearSVR(16:40 on 3-12-2023)
	[*] doing initial dataset splitting
		[+] fold 1
		[+] fold 2
		[+] fold 3
	[*] looking at hyperparameters {'C': 1000.0, 'dual': 'auto'}
split data 	-- in 1.0967254638671875e-05s
made pipeline 	-- in 4.601478576660156e-05s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=   4.2s




[Pipeline] ............. (step 2 of 2) Processing model, total= 5.9min
fit pipeline 	-- in 357.7044150829315s
eval pipeline 	-- in 5.449075937271118s
		[*] fold 1 complete (test score of 3.9504893728675814) -- in 363.154s
split data 	-- in 1.0967254638671875e-05s
made pipeline 	-- in 8.797645568847656e-05s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=   8.3s




[Pipeline] ............. (step 2 of 2) Processing model, total=12.8min
fit pipeline 	-- in 775.3218419551849s
eval pipeline 	-- in 3.7379658222198486s
		[*] fold 2 complete (test score of 27.281421128763885) -- in 779.06s
split data 	-- in 1.0013580322265625e-05s
made pipeline 	-- in 9.226799011230469e-05s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  10.9s




[Pipeline] ............. (step 2 of 2) Processing model, total=20.2min
fit pipeline 	-- in 1221.419251203537s
eval pipeline 	-- in 5.466487884521484s
		[*] fold 3 complete (test score of 7.689797311311569) -- in 1226.886s
		[+] final score for params of 12.973902604314347 -- in 2369.101s
		[!] new best param configuration, so saving model to path
			[+] saved model to ../results/LinearSVR(16:40 on 3-12-2023).pkl
	[*] looking at hyperparameters {'C': 10000.0, 'dual': 'auto'}
split data 	-- in 2.1457672119140625e-06s
made pipeline 	-- in 2.5272369384765625e-05s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=   3.7s




[Pipeline] ............. (step 2 of 2) Processing model, total= 5.8min
fit pipeline 	-- in 350.9046869277954s
eval pipeline 	-- in 5.519390821456909s
		[*] fold 1 complete (test score of 3.9504893728675814) -- in 356.424s
split data 	-- in 1.3113021850585938e-05s
made pipeline 	-- in 0.00012183189392089844s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=   8.2s




[Pipeline] ............. (step 2 of 2) Processing model, total=13.1min
fit pipeline 	-- in 793.3045499324799s
eval pipeline 	-- in 5.405851125717163s
		[*] fold 2 complete (test score of 27.281421128763885) -- in 798.711s
split data 	-- in 3.0994415283203125e-06s
made pipeline 	-- in 8.511543273925781e-05s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  13.1s




[Pipeline] ............. (step 2 of 2) Processing model, total=20.6min
fit pipeline 	-- in 1246.5140709877014s
eval pipeline 	-- in 5.521338939666748s
		[*] fold 3 complete (test score of 7.722587294744636) -- in 1252.036s
		[+] final score for params of 12.984832598792034 -- in 2407.172s
	[*] looking at hyperparameters {'C': 100000.0, 'dual': 'auto'}
split data 	-- in 1.9073486328125e-06s
made pipeline 	-- in 4.696846008300781e-05s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=   3.7s




[Pipeline] ............. (step 2 of 2) Processing model, total= 5.8min
fit pipeline 	-- in 350.0384418964386s
eval pipeline 	-- in 3.7425267696380615s
		[*] fold 1 complete (test score of 3.9504893728675814) -- in 353.781s
split data 	-- in 1.5735626220703125e-05s
made pipeline 	-- in 9.107589721679688e-05s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=   6.9s




[Pipeline] ............. (step 2 of 2) Processing model, total=12.7min
fit pipeline 	-- in 767.9597029685974s
eval pipeline 	-- in 5.039690017700195s
		[*] fold 2 complete (test score of 27.281421128763885) -- in 773.0s
split data 	-- in 1.0013580322265625e-05s
made pipeline 	-- in 7.295608520507812e-05s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  11.6s




[Pipeline] ............. (step 2 of 2) Processing model, total=21.2min
fit pipeline 	-- in 1286.1262011528015s
eval pipeline 	-- in 5.67920994758606s
		[*] fold 3 complete (test score of 7.722587294744636) -- in 1291.806s
		[+] final score for params of 12.984832598792034 -- in 2418.587s
	[+] best param configuration of {'dual': 'auto', 'C': 1000.0} found with score 12.973902604314347
	[+] saved model to ../results/LinearSVR(16:40 on 3-12-2023).pkl


In [22]:
# NOTE(review): this import lives outside the notebook's import cell; a fresh-kernel run
# only works if xgboost is installed in the environment.
import xgboost


# Only reg_alpha and reg_lambda vary (6 x 6 combinations); every other entry is a single value.
# NOTE(review): MLPipe_TimeSeries_RMSE also passes random_state to the estimator, so the
# "seed" entry may be redundant or conflict -- confirm against the installed xgboost version.
XGB_PARAM_GRID = ParameterGrid({
    "learning_rate": [0.03],
    "n_estimators": [1000],
    #"early_stopping_rounds": [50],
    "reg_alpha": [0e0, 1e-2, 1e-1, 1e0, 1e1, 1e2],
    "reg_lambda": [0e0, 1e-2, 1e-1, 1e0, 1e1, 1e2],
    #"max_depth": [1,3,10,30,100],
    "seed": [14],
    "colsample_bytree": [0.9],              
    "subsample": [0.66]
    })

MLPipe_TimeSeries_RMSE(X, y, preprocessor, xgboost.XGBRegressor, XGB_PARAM_GRID)

[!] looking at model: XGBRegressor(22:06 on 3-12-2023)
	[*] doing initial dataset splitting
		[+] fold 1
		[+] fold 2


In [None]:
# Ridge search spaces; the training call below is currently commented out.
RIDGE_PARAM_GRID = ParameterGrid({
            'alpha': np.geomspace(0.01, 10000, 12), # 12 log-spaced values from 1e-2 to 1e4
            })
RIDGE_PARAM_GRID_SMALL = ParameterGrid({
            'alpha': [0.1, 1, 10, 100], # coarse grid, one value per decade
            })

#MLPipe_TimeSeries_RMSE(X, y, preprocessor, Ridge, RIDGE_PARAM_GRID)

In [None]:
# ElasticNet search space. It must be a ParameterGrid: MLPipe_TimeSeries_RMSE iterates over
# param_grid and unpacks each item as keyword arguments, and iterating a plain dict would
# yield only the key strings ('alpha', 'l1_ratio').
ELASTIC_PARAM_GRID = ParameterGrid({
            'alpha': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100, 300, 1000], # roughly log-spaced (1-3-10 steps)
            'l1_ratio': np.linspace(0.2, 1, 9) # 0.2, 0.3, ..., 1.0
            })

MLPipe_TimeSeries_RMSE(X, y, preprocessor, ElasticNet, ELASTIC_PARAM_GRID)

### Training Non-Linear Models

In [19]:
# Random-forest search spaces: tree depth x fraction of features considered per split.
RF_PARAM_GRID = ParameterGrid({
            'max_depth': [1, 3, 4, 7, 10, 14, 21, 31], # roughly geometric steps
            'max_features': np.linspace(0.2, 1, 5) # 0.2, 0.4, ..., 1.0
            })
# Coarser alternative grid (not used by the call below).
RF_PARAM_GRID_SMALL = ParameterGrid({
            'max_depth': [1, 3, 7, 14, 31],
            'max_features': np.linspace(0.2, 1, 3) # 0.2, 0.6, 1.0
            })

MLPipe_TimeSeries_RMSE(X, y, preprocessor, RandomForestRegressor, RF_PARAM_GRID)

# ElasticNet search space. It must be a ParameterGrid: MLPipe_TimeSeries_RMSE iterates over
# param_grid and unpacks each item as keyword arguments, and iterating a plain dict would
# yield only the key strings ('alpha', 'l1_ratio').
ELASTIC_PARAM_GRID = ParameterGrid({
            'alpha': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100, 300, 1000], # roughly log-spaced (1-3-10 steps)
            'l1_ratio': np.linspace(0.2, 1, 9) # 0.2, 0.3, ..., 1.0
            })

MLPipe_TimeSeries_RMSE(X, y, preprocessor, ElasticNet, ELASTIC_PARAM_GRID)

In [None]:
# Draw a 30% sample of the test window. The seed makes the subset identical on every run.
X_test_partial = X_test.sample(np.floor((len(X_test) * 0.3)).astype(int), random_state=RANDOM_STATE)
y_test_partial = y_test.loc[X_test_partial.index]
y_test_partial

In [None]:
# Scratch copy of the fold-splitting loop from MLPipe_TimeSeries_RMSE, with the fold count
# hard-coded to 3. Running it rebinds the notebook-level div_0 / div_1 / div_2 names,
# including the div_2 that the test-set split assigned earlier.
X_trains = []
y_trains = []
X_vals = []
y_vals = []
for i in range(1, 3 + 1):
    train_portion = np.round(i * VAL_PORTION, 2)
    div_0 = new_quarters[0]
    div_1 = new_quarters[int(quarter_count * train_portion)]
    div_2 = new_quarters[int(quarter_count * (train_portion + VAL_PORTION))]

    X_trains.append(X[(div_0 <= X["TIME_PERIOD"]) & (X["TIME_PERIOD"] < div_1)])
    y_trains.append(y[(div_0 <= X["TIME_PERIOD"]) & (X["TIME_PERIOD"] < div_1)])
    X_vals.append(X[(div_1 <= X["TIME_PERIOD"]) & (X["TIME_PERIOD"] < div_2)])
    y_vals.append(y[(div_1 <= X["TIME_PERIOD"]) & (X["TIME_PERIOD"] < div_2)])

In [None]:
# NOTE(review): param_scores and param_grid are local variables of MLPipe_TimeSeries_RMSE;
# no visible cell assigns them at notebook level, so this scratch cell is expected to raise
# NameError on a fresh kernel.
i_best = np.argmin(param_scores)
param_scores[i_best]
param_grid[i_best]

In [None]:
# NOTE(review): ML_algo is only a parameter of MLPipe_TimeSeries_RMSE; no visible cell
# defines it at notebook level, so this scratch expression is expected to raise NameError.
ML_algo.name

In [None]:
# Load a previously saved Ridge pipeline and list its 40 largest coefficients by feature name.
# joblib.load unpickles the file, so only load model files you trust.
model = joblib.load("../results/Ridge(19:57 on 2-12-2023).pkl")
cfs = pd.DataFrame(model.named_steps["model"].coef_)
# Label each coefficient with the matching output feature name of the preprocessing step.
cfs.index = model.named_steps['preprocess'].get_feature_names_out()
cfs[0].sort_values(ascending=False).head(40)

In [10]:
#old split checking
def check_split_sizes(X, train, test, val):
    """Report whether a train/test/val split is disjoint in time and close to 60/20/20.

    Two checks are run. First, no ``TIME_PERIOD`` value may appear in more
    than one of the three subsets. Second, each subset's share of
    ``len(X)`` must lie within 0.02 of its target (0.6 / 0.2 / 0.2).
    Successes are printed, failures are raised as warnings, and a summary
    line is printed at the end. Nothing is returned.
    """
    print("[*] checking train test val split")
    fails = 0

    periods_train = set(train["TIME_PERIOD"])
    periods_test = set(test["TIME_PERIOD"])
    periods_val = set(val["TIME_PERIOD"])

    #check for TIME_PERIOD overlap
    overlapping = (periods_train & periods_test) | (periods_val & periods_test) | (periods_train & periods_val)
    if overlapping:
        warnings.warn('\t[-] overlap between train, test, or val time_periods')
        fails += 1
    else:
        print("\t[+] no overlap between train, test, or val TIME_PERIODS")

    #check for a fairly even 60/20/20 split
    ALLOWED_FRACTION_ERROR = 0.02
    total_rows = len(X)
    checks = [
        ('train', 0.6, len(train) / total_rows),
        ('test ', 0.2, len(test) / total_rows),
        ('val  ', 0.2, len(val) / total_rows),
    ]
    for label, target, share in checks:
        rounded_share = str(np.round(share, 3))
        if np.abs(share - target) < ALLOWED_FRACTION_ERROR:
            print("\t[+] " + label + " is " + rounded_share + " of datapoints which is within bounds of its " + str(target) + " target")
        else:
            warnings.warn("\t[-] " + label + " is " + rounded_share + " of datapoints which is out of bounds")
            fails += 1

    if fails:
        print("\t[?] \033[91mFAILED " + str(fails) + "\033[0m train test val split tests")
    else:
        print("\t[+] \x1b[42mPASSED ALL\x1b[0m train test val split tests")
        

#check_split_sizes(X, X_train, X_test, X_val)

## 3. Feature Scaling

In [13]:
#****************************************************feature scaling********************************************************

# NOTE: the training cells above pass `preprocessor` to MLPipe_TimeSeries_RMSE, so this cell
# has to be run before them even though it sits at the end of the notebook.

# Columns with a natural order, and the category order used to encode each of them.
ordinal_ftrs = ['age', 'TIME_PERIOD']
ordinal_cats = [['UNK','Y_LT14','Y14-17','Y18-34','Y35-64','Y_GE65'], quarters]
# The full `quarters` list (not new_quarters) is used so the encoded value still reflects
# where in the whole history a quarter lies.
onehot_ftrs = ['citizen', 'geo', 'sex']
#onehot_ftrs = ['geo', 'sex']
# No columns are min-max scaled at present; the transformer is kept as a placeholder.
minmax_ftrs = []
# Every column whose name contains 'TOTAL' is standardised.
std_ftrs = [a for a in X.columns.to_list() if 'TOTAL' in a]

# collect all the encoders
# handle_unknown='ignore' makes categories unseen during fit encode as all-zero rows.
preprocessor = ColumnTransformer(
    transformers=[
        ('ord', OrdinalEncoder(categories = ordinal_cats), ordinal_ftrs),
        ('onehot', OneHotEncoder(sparse_output=False,handle_unknown='ignore'), onehot_ftrs),
        ('minmax', MinMaxScaler(), minmax_ftrs),
        ('std', StandardScaler(), std_ftrs)])

#clf = Pipeline(steps=[('preprocessor', preprocessor)]) # for now we only preprocess 
                                                       # later on we will add other steps here

#X_train_prep = clf.fit_transform(X_train)
#X_val_prep = clf.transform(X_val)
#X_test_prep = clf.transform(X_test)

#print(X_train.shape)
#print(X_train_prep.shape)
#print(X_train_prep)
#X_prep