# Splitting, Preprocessing and Model Development
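This notebook is used for loading the final dataset, building lagged features from past application counts, separating the features from the target variable, and preprocessing (encoding/scaling) the features ahead of model development.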


### Declaring Imports

In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import ticker
from matplotlib.colors import LogNorm, Normalize
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

### Color Palette & Typeface Sizing

In [22]:
# Named hex colors shared across the notebook.
YELLOW = '#F2DC5D'
GREEN = '#9BC53D'
DARK_GREEN = '#597222'
RED = '#C3423F'
LIGHT_BLUE = '#2596BE'
GRAY = '#666666'

# Size constants (named for axis, title and description text) and the default
# figure size, which is a 10x6 canvas scaled by 2/3.
AXIS_SIZE = 12
TITLE_SIZE = 16
DESCRIPTION_SIZE = 9
FIGURE_SIZE = (10*2/3,6*2/3)

# Fixed seed value; presumably meant for the train/test split and any
# stochastic model later on -- it is not used in the cells visible here.
RANDOM_STATE = 14

In [8]:
#****************************************************import dataset****************************************************
# Explicit dtypes: identifier columns are read as nullable strings, counts as
# nullable integers and the rate as a nullable float.
column_dtypes = {
    'citizen': 'string',
    'sex': 'string',
    'age': 'string',
    'decision': 'string',
    'geo': 'string',
    'TIME_PERIOD': 'string',
    'GENCONV': 'Int64',
    'HUMSTAT': 'Int64',
    'SUB_PROT': 'Int64',
    'REJECTED': 'Int64',
    'TOTAL_POS': 'Int64',
    'TOTAL_APPS': 'Int64',
    'POS_RATE': 'Float64',
}

# Only the literal text 'nan' is parsed as missing; pandas' default NA markers
# are switched off (presumably so short codes such as 'NA' stay as text --
# confirm against the data dictionary).
df = pd.read_csv(
    '../data/final.csv',
    dtype=column_dtypes,
    keep_default_na=False,
    na_values=['nan'],
)

##remove partial 2023-Q3 Data
is_complete_quarter = df["TIME_PERIOD"] != "2023-Q3"
df = df.loc[is_complete_quarter]

df

Unnamed: 0,citizen,sex,age,geo,TIME_PERIOD,GENCONV,HUMSTAT,SUB_PROT,REJECTED,TOTAL_POS,TOTAL_APPS
0,AD,F,UNK,AT,2008-Q1,0,0,0,0,0,0
1,AD,F,UNK,AT,2008-Q2,0,0,0,0,0,0
2,AD,F,UNK,AT,2008-Q3,0,0,0,0,0,0
3,AD,F,UNK,AT,2008-Q4,0,0,0,0,0,0
4,AD,F,UNK,AT,2009-Q1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
7221109,ZW,UNK,Y_LT14,UK,2019-Q3,0,0,0,0,0,0
7221110,ZW,UNK,Y_LT14,UK,2019-Q4,0,0,0,0,0,0
7221111,ZW,UNK,Y_LT14,UK,2020-Q1,0,0,0,0,0,0
7221112,ZW,UNK,Y_LT14,UK,2020-Q2,0,0,0,0,0,0


In [114]:
#****************************************************re-sort dataframe****************************************************

# Sort by the series key columns first and by quarter last, so the rows of
# each (citizen, sex, age, geo) combination sit together in period order.
sort_order = [
    'citizen',
    'sex',
    'age',
    'geo',
    'TIME_PERIOD',
]
df = df.sort_values(sort_order)

#*********************************************create sequential list of quarters****************************************************

# Ordered labels "2008-Q1", "2008-Q2", ..., "2023-Q4": four per year, oldest first.
quarters = [
    str(year) + "-Q" + str(quarter_no)
    for year in range(2008, 2024)
    for quarter_no in (1, 2, 3, 4)
]

#****************************************************lagged features****************************************************

QUARTERS_OF_LAG = 10

def lagged_features(target_var, lag_count, unit, groups=None):
    """Build a DataFrame of lagged copies of a series.

    Column ``i`` (1-based) holds ``target_var`` shifted down by ``i`` rows and
    is named ``"<name> - lag <i> <unit>"``, with an ``"s"`` appended to the
    unit for every lag greater than one.

    Parameters
    ----------
    target_var : pd.Series
        Series to lag. Its ``name`` is used as the column-name prefix and its
        index is preserved in the result.
    lag_count : int
        Number of lag columns to create (lags ``1 .. lag_count``).
    unit : str
        Singular label of the time step, e.g. ``"quarter"``.
    groups : optional
        Anything accepted by ``Series.groupby`` (e.g. a Series, or a list of
        Series aligned with ``target_var``). When given, shifting is done
        within each group so values never cross from one group into the next.
        When omitted, the series is shifted positionally as a whole.

    Returns
    -------
    pd.DataFrame
        One column per lag, indexed like ``target_var``. With
        ``lag_count <= 0`` the frame has no columns.
    """
    name = target_var.name
    # Plain Series and SeriesGroupBy both expose .shift(periods), so the loop
    # below does not need to know which one it is dealing with.
    source = target_var if groups is None else target_var.groupby(groups, sort=False)

    shifted = {}
    for i in range(1, lag_count + 1):
        plural = "" if i == 1 else "s"
        shifted[f"{name} - lag {i} {unit}{plural}"] = source.shift(i)

    if not shifted:
        # pd.concat refuses an empty collection; return an aligned, column-less frame.
        return pd.DataFrame(index=target_var.index)

    # Concatenate once at the end instead of re-copying a growing frame per lag.
    return pd.concat(shifted, axis=1)

#introduce lag
# Lag columns of TOTAL_APPS are attached next to the original columns.
# NOTE(review): the shift runs positionally over the whole sorted frame, so a
# row's lags are only its own series' history if every (citizen, sex, age, geo)
# series is contiguous from the first quarter onward -- confirm for this data.
total_apps_lags = lagged_features(df["TOTAL_APPS"], QUARTERS_OF_LAG, "quarter")
df_lagged = pd.concat([df, total_apps_lags], axis=1)

#remove all rows with less than the lag amount of historical data
# "YYYY-Qn" labels compare correctly as plain strings.
first_quarter_with_full_lags = quarters[QUARTERS_OF_LAG]
has_full_history = df_lagged["TIME_PERIOD"] >= first_quarter_with_full_lags
df_lagged = df_lagged[has_full_history]
df_lagged

Unnamed: 0,citizen,sex,age,geo,TIME_PERIOD,GENCONV,HUMSTAT,SUB_PROT,REJECTED,TOTAL_POS,...,TOTAL_APPS - lag 1 quarter,TOTAL_APPS - lag 2 quarters,TOTAL_APPS - lag 3 quarters,TOTAL_APPS - lag 4 quarters,TOTAL_APPS - lag 5 quarters,TOTAL_APPS - lag 6 quarters,TOTAL_APPS - lag 7 quarters,TOTAL_APPS - lag 8 quarters,TOTAL_APPS - lag 9 quarters,TOTAL_APPS - lag 10 quarters
10,AD,F,UNK,AT,2010-Q3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11,AD,F,UNK,AT,2010-Q4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,AD,F,UNK,AT,2011-Q1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13,AD,F,UNK,AT,2011-Q2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,AD,F,UNK,AT,2011-Q3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7221109,ZW,UNK,Y_LT14,UK,2019-Q3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7221110,ZW,UNK,Y_LT14,UK,2019-Q4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7221111,ZW,UNK,Y_LT14,UK,2020-Q1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7221112,ZW,UNK,Y_LT14,UK,2020-Q2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [113]:
#****************************************************splitting****************************************************
TARGET_VAR = "REJECTED"

# The decision-count columns (the target among them) are removed from the
# feature frame; every other column of df_lagged stays in X.
decision_count_cols = ['GENCONV', 'HUMSTAT', 'SUB_PROT', 'REJECTED', 'TOTAL_POS']

X = df_lagged.drop(columns=decision_count_cols)
y = df_lagged[TARGET_VAR]


In [18]:
#****************************************************feature scaling****************************************************

# Column groups, one per encoder/scaler. The two scaler groups are empty for now.
ordinal_ftrs = ['age', 'TIME_PERIOD']
onehot_ftrs = ['citizen', 'geo', 'sex']
minmax_ftrs = []
std_ftrs = []

# Explicit category order for each ordinal feature, listed in the same order
# as ordinal_ftrs: age brackets first, then the sequential quarter labels.
age_brackets = ['UNK','Y_LT14','Y14-17','Y18-34','Y35-64','Y_GE65']
ordinal_cats = [age_brackets, quarters]

# collect all the encoders
# NOTE(review): no `remainder` is passed, so ColumnTransformer's default
# applies; columns of X that are in none of the groups above (TOTAL_APPS and
# the lag columns) are presumably dropped from X_prep -- confirm this is intended.
ordinal_encoder = OrdinalEncoder(categories=ordinal_cats)
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
transformer_specs = [
    ('ord', ordinal_encoder, ordinal_ftrs),
    ('onehot', onehot_encoder, onehot_ftrs),
    ('minmax', MinMaxScaler(), minmax_ftrs),
    ('std', StandardScaler(), std_ftrs),
]
preprocessor = ColumnTransformer(transformers=transformer_specs)

# for now we only preprocess; later on we will add other steps here
clf = Pipeline(steps=[('preprocessor', preprocessor)])

# The preprocessor is currently fitted on the full feature frame.
# TODO: once train/val/test splits exist, fit on the training split only:
#   X_train_prep = clf.fit_transform(X_train)
#   X_val_prep = clf.transform(X_val)
#   X_test_prep = clf.transform(X_test)
X_prep = clf.fit_transform(X)
X_prep

array([[ 0.,  0.,  1., ...,  1.,  0.,  0.],
       [ 0.,  1.,  1., ...,  1.,  0.,  0.],
       [ 0.,  2.,  1., ...,  1.,  0.,  0.],
       ...,
       [ 1., 48.,  0., ...,  0.,  0.,  1.],
       [ 1., 49.,  0., ...,  0.,  0.,  1.],
       [ 1., 50.,  0., ...,  0.,  0.,  1.]])