In [1]:
import os
import sys
# Put the directory two levels up on the import path (once), presumably so that the
# local `interpolation` package imported below can be found.
module_path = os.path.abspath('../..')
if module_path not in sys.path:
    sys.path.append(module_path)
# Reload imported modules automatically before each cell runs, so edits to local modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from interpolation import CensusData, sequences, BlockInterpolator, CentroidInterpolator, archive, interpolation
from sklearn.pipeline import Pipeline
from category_encoders.target_encoder import TargetEncoder
from xgboost import XGBClassifier
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestClassifier

In [2]:
#Change these values to rerun the notebook
filled_1850 = pd.read_csv("../../data/dwelling_filled_sum_1850_mn_v01.csv")
cd_1850 = pd.read_csv("../../data/cd_1850_XYupdate.csv") #For calculating centroids

# Map the raw column names of the dwelling file onto the names used in the rest of the notebook
column_renames = {
    "CENSUS_WARD_NUM": "Ward_Num",
    "CENSUS_SERIALP": "CENSUS_SERIALB",
    "BLOCK_NUM": "block_num",
    "CENSUS_REEL_HOUSEHOLD": "CENSUS_REEL",
    "CENSUS_SEX": "CENSUS_SEXB",
    "CENSUS_PAGENO_HOUSEHOLD": "CENSUS_PAGENUM",
    "CENSUS_RACE": "CENSUS_RACEB",
    "CD_X": "cd_X",
    "CD_Y": "cd_Y",
}
filled_1850 = filled_1850.rename(columns=column_renames)

# Column names handed to CensusData and used for the centroid calculation
ward_col = "Ward_Num"
dwelling_col = "dwelling_id"
dwelling_col_num = "CENSUS_DWELLING_NUM"
block_col = "block_num"
x_col = "cd_X"
y_col = "cd_Y"

#### With both fixed length and distance sequences

In [3]:
# Create a single clustering estimator and hand the same one to every model below, so that
# all experiments are tested against the same clustering algorithm.
# Fitting it ahead of time will not work for every kind of clustering: agglomerative
# clustering, for example, has no predict method.
clust_algo = AgglomerativeClustering(n_clusters=5)

In [4]:
# Wrap the 1850 dwelling table using the column names configured above, then request
# both the fixed length and the distance sequences.
census_data = CensusData(
    filled_1850,
    ward_col=ward_col,
    dwelling_col=dwelling_col,
    block_col=block_col,
    x_col=x_col,
    y_col=y_col,
)
census_data.apply_sequencing(fixed=True, distance=True)

In [5]:
# Feature columns for the experiment that uses both fixed length and distance sequences
all_cols = [
    "CENSUS_SERIALB", "CENSUS_AGE", "CENSUS_PAGENUM", "CENSUS_SEXB", "CENSUS_RACEB",
    "sequence_id", "CENSUS_DWELLING_SIZE", "CENSUS_SEQ_NUM", "CENSUS_LABFORCE", "CENSUS_LINE",
    "CENSUS_MARST", "CENSUS_NAMEFRST", "CENSUS_NAMELAST", "CENSUS_OCCSTR", "CENSUS_IMPREL",
    "CENSUS_ID", "sequence_order_enum", "CENSUS_DWELLING_SEQ", "fixed_seq",
]
# Every feature column is target encoded (the encoder gets its own copy of the list)
# before being passed to the boosted classifier.
block_boost = Pipeline([
    ('preprocess', TargetEncoder(cols=list(all_cols))),
    ('classifier', XGBClassifier(colsample_bytree=0.6, max_depth=2, n_estimators=80)),
])

In [6]:
# Centroid of every block, keyed first by ward and then by block, built from the
# x/y columns of cd_1850.
block_centroids = {
    ward: {
        block: interpolation.make_centroid(block_rows[x_col], block_rows[y_col])
        for block, block_rows in ward_rows.groupby(block_col)
    }
    for ward, ward_rows in cd_1850.groupby(ward_col)
}
# 10 is presumably the ward being interpolated -- confirm against CentroidInterpolator
interpolate_sequences = CentroidInterpolator(
    census_data, 10, block_boost, all_cols, clust_algo, block_centroids
)

In [7]:
# Run the clustering step, take a stratified train/test split, then train and test the model.
# The scores are read back from train_score / test_score in the next cell.
interpolate_sequences.apply_clustering()
train,test = interpolate_sequences.stratified_train_test()
interpolate_sequences.train_test_model(train, test)

In [8]:
# Scores stored on the interpolator (presumably set by train_test_model above) for the
# fixed length + distance sequence experiment.
print("Training score:",interpolate_sequences.train_score)
print("Test score:", interpolate_sequences.test_score)

Training score: 1.0
Test score: 0.5167037861915368


#### With only fixed length sequences

In [9]:
# Re-run sequencing on the shared census_data object with only the fixed length option enabled
census_data.apply_sequencing(fixed=True)
all_cols = [
    "CENSUS_SERIALB", "CENSUS_AGE", "CENSUS_PAGENUM", "CENSUS_SEXB", "CENSUS_RACEB",
    "CENSUS_DWELLING_SIZE", "CENSUS_SEQ_NUM", "CENSUS_LABFORCE", "CENSUS_LINE", "CENSUS_MARST",
    "CENSUS_NAMEFRST", "CENSUS_NAMELAST", "CENSUS_OCCSTR", "CENSUS_IMPREL",
    "CENSUS_DWELLING_SEQ", "fixed_seq",
]
# All feature columns are target encoded; the encoder receives its own copy of the list
block_boost = Pipeline([
    ('preprocess', TargetEncoder(cols=list(all_cols))),
    ('classifier', XGBClassifier(colsample_bytree=0.6, max_depth=2, n_estimators=80)),
])
interpolate_fixed = CentroidInterpolator(
    census_data, 10, block_boost, all_cols, clust_algo, block_centroids
)

In [10]:
# Run the clustering step, take a stratified train/test split, then train and test the model.
# The scores are read back from train_score / test_score in the next cell.
interpolate_fixed.apply_clustering()
train,test = interpolate_fixed.stratified_train_test()
interpolate_fixed.train_test_model(train, test)

In [11]:
# Scores stored on the interpolator for the fixed length only experiment
print("Training score:",interpolate_fixed.train_score)
print("Test score:", interpolate_fixed.test_score)

Training score: 1.0
Test score: 0.2865627319970304


#### With only distance-based sequences

In [12]:
# Re-run sequencing on the shared census_data object with only the distance option enabled
census_data.apply_sequencing(distance=True)
all_cols = [
    "CENSUS_SERIALB", "CENSUS_AGE", "CENSUS_PAGENUM", "CENSUS_SEXB", "CENSUS_RACEB",
    "sequence_id", "CENSUS_DWELLING_SIZE", "CENSUS_SEQ_NUM", "CENSUS_LABFORCE", "CENSUS_LINE",
    "CENSUS_MARST", "CENSUS_NAMEFRST", "CENSUS_NAMELAST", "CENSUS_OCCSTR", "CENSUS_IMPREL",
    "sequence_order_enum", "CENSUS_DWELLING_SEQ",
]
# NOTE(review): n_estimators is 50 from here on but 80 in the two experiments above,
# so the scores are not strictly like-for-like.
block_boost = Pipeline([
    ('preprocess', TargetEncoder(cols=list(all_cols))),
    ('classifier', XGBClassifier(colsample_bytree=0.6, max_depth=2, n_estimators=50)),
])

In [13]:
# Build the interpolator for the distance only experiment, then cluster, split and
# train/test; the scores are printed in the next cell.
interpolate_distance = CentroidInterpolator(
    census_data, 10, block_boost, all_cols, clust_algo, block_centroids
)
interpolate_distance.apply_clustering()
train, test = interpolate_distance.stratified_train_test()
interpolate_distance.train_test_model(train, test)

In [14]:
# Scores stored on the interpolator for the distance only experiment
print("Training score:",interpolate_distance.train_score)
print("Test score:", interpolate_distance.test_score)

Training score: 1.0
Test score: 0.5077951002227171


#### Without sequences

In [15]:
# Put the shared census_data object into its no-sequence configuration for this experiment
# (presumably this discards the sequence columns added earlier -- confirm in the interpolation module)
census_data.no_seq()

In [16]:
# Feature columns without any sequence derived columns
all_cols = [
    "CENSUS_SERIALB", "CENSUS_AGE", "CENSUS_PAGENUM", "CENSUS_SEXB", "CENSUS_RACEB",
    "CENSUS_DWELLING_SIZE", "CENSUS_SEQ_NUM", "CENSUS_LABFORCE", "CENSUS_LINE", "CENSUS_MARST",
    "CENSUS_NAMEFRST", "CENSUS_NAMELAST", "CENSUS_OCCSTR", "CENSUS_IMPREL",
    "CENSUS_DWELLING_SEQ",
]
# All feature columns are target encoded; the encoder receives its own copy of the list
block_boost = Pipeline([
    ('preprocess', TargetEncoder(cols=list(all_cols))),
    ('classifier', XGBClassifier(colsample_bytree=0.6, max_depth=2, n_estimators=50)),
])

In [17]:
# Build the interpolator for the no-sequence experiment, then cluster, split and
# train/test; the scores are printed in the next cell.
interpolate_noseq = CentroidInterpolator(
    census_data, 10, block_boost, all_cols, clust_algo, block_centroids
)
interpolate_noseq.apply_clustering()
train, test = interpolate_noseq.stratified_train_test()
interpolate_noseq.train_test_model(train, test)

In [18]:
# Scores stored on the interpolator for the experiment without sequences
print("Training score:",interpolate_noseq.train_score)
print("Test score:", interpolate_noseq.test_score)

Training score: 1.0
Test score: 0.281365998515219


#### With dwelling_col_num sequence id only

In [19]:
# Base feature columns plus the dwelling based sequence id
all_cols = [
    "CENSUS_SERIALB", "CENSUS_AGE", "CENSUS_PAGENUM", "CENSUS_SEXB", "CENSUS_RACEB",
    "CENSUS_DWELLING_SIZE", "CENSUS_SEQ_NUM", "CENSUS_LABFORCE", "CENSUS_LINE", "CENSUS_MARST",
    "CENSUS_NAMEFRST", "CENSUS_NAMELAST", "CENSUS_OCCSTR", "CENSUS_IMPREL",
    "CENSUS_DWELLING_SEQ", "dwelling_seq_id",
]
# All feature columns are target encoded; the encoder receives its own copy of the list
block_boost = Pipeline([
    ('preprocess', TargetEncoder(cols=list(all_cols))),
    ('classifier', XGBClassifier(colsample_bytree=0.6, max_depth=2, n_estimators=50)),
])

In [20]:
# Re-run sequencing with only the dwelling option enabled, then build the interpolator,
# cluster, split and train/test; the scores are printed in the next cell.
census_data.apply_sequencing(dwelling=True)
interpolate_dwell = CentroidInterpolator(
    census_data, 10, block_boost, all_cols, clust_algo, block_centroids
)
interpolate_dwell.apply_clustering()
train, test = interpolate_dwell.stratified_train_test()
interpolate_dwell.train_test_model(train, test)

In [21]:
# Scores stored on the interpolator for the dwelling sequence id only experiment
print("Training score:",interpolate_dwell.train_score)
print("Test score:", interpolate_dwell.test_score)

Training score: 1.0
Test score: 0.27839643652561247


#### With dwelling_col_num sequence id and distance sequences

In [22]:
#I think dwelling num would need to be preprocessed
# Base feature columns plus the distance sequence columns and the dwelling based sequence id
all_cols = [
    "CENSUS_SERIALB", "CENSUS_AGE", "CENSUS_PAGENUM", "CENSUS_SEXB", "CENSUS_RACEB",
    "CENSUS_DWELLING_SIZE", "CENSUS_SEQ_NUM", "CENSUS_LABFORCE", "CENSUS_LINE", "CENSUS_MARST",
    "CENSUS_NAMEFRST", "CENSUS_NAMELAST", "CENSUS_OCCSTR", "CENSUS_IMPREL",
    "CENSUS_DWELLING_SEQ", "sequence_order_enum", "sequence_id", "dwelling_seq_id",
]
# All feature columns are target encoded; the encoder receives its own copy of the list
block_boost = Pipeline([
    ('preprocess', TargetEncoder(cols=list(all_cols))),
    ('classifier', XGBClassifier(colsample_bytree=0.6, max_depth=2, n_estimators=50)),
])

In [23]:
# Re-run sequencing with the distance and dwelling options enabled, then build the
# interpolator, cluster, split and train/test; the scores are printed in the next cell.
census_data.apply_sequencing(distance=True, dwelling=True)
interpolate_dwell = CentroidInterpolator(
    census_data, 10, block_boost, all_cols, clust_algo, block_centroids
)
interpolate_dwell.apply_clustering()
train, test = interpolate_dwell.stratified_train_test()
interpolate_dwell.train_test_model(train, test)

In [24]:
# Scores stored on the interpolator for the dwelling sequence id + distance sequence experiment
print("Training score:",interpolate_dwell.train_score)
print("Test score:", interpolate_dwell.test_score)

Training score: 1.0
Test score: 0.5285820341499629


#### Random Forest

In [25]:
# Same feature columns as the dwelling + distance sequence experiment, but with a random
# forest in place of the boosted classifier.
all_cols = [
    "CENSUS_SERIALB", "CENSUS_AGE", "CENSUS_PAGENUM", "CENSUS_SEXB", "CENSUS_RACEB",
    "CENSUS_DWELLING_SIZE", "CENSUS_SEQ_NUM", "CENSUS_LABFORCE", "CENSUS_LINE", "CENSUS_MARST",
    "CENSUS_NAMEFRST", "CENSUS_NAMELAST", "CENSUS_OCCSTR", "CENSUS_IMPREL",
    "CENSUS_DWELLING_SEQ", "sequence_order_enum", "sequence_id", "dwelling_seq_id",
]
# random_state is pinned so that the forest, and therefore the reported score, is the same
# on every Restart & Run All; without it each run draws different bootstrap samples and
# feature subsets.
rf = Pipeline([
    ('preprocess', TargetEncoder(cols=list(all_cols))),
    ('classifier', RandomForestClassifier(n_estimators=400, random_state=0)),
])

In [26]:
# census_data is used as the previous experiment left it, i.e. after
# apply_sequencing(distance = True, dwelling = True).
interpolate_rf = CentroidInterpolator(
    census_data, 10, rf, all_cols, clust_algo, block_centroids
)
interpolate_rf.apply_clustering()
train, test = interpolate_rf.stratified_train_test()
interpolate_rf.train_test_model(train, test)

In [27]:
# Scores stored on the interpolator for the random forest experiment
print("Training score:",interpolate_rf.train_score)
print("Test score:", interpolate_rf.test_score)

Training score: 1.0
Test score: 0.5085374907201188


#### With CENSUS_DWELLING_NUM

In [28]:
# One row per (ward, dwelling) pair, holding the first non-null value of every other column.
# NOTE(review): dwellings_1850 is not referenced by any later cell visible in this notebook.
dwellings_1850 = filled_1850.groupby([ward_col, dwelling_col], as_index = False).first()

In [29]:
#Ordinal encoding (leaving as numeric)
all_cols = ["CENSUS_DWELLING_NUM","CENSUS_SERIALB", "CENSUS_AGE", "CENSUS_PAGENUM", "CENSUS_SEXB", "CENSUS_RACEB","CENSUS_DWELLING_SIZE","CENSUS_SEQ_NUM", "CENSUS_LABFORCE", "CENSUS_LINE", "CENSUS_MARST", "CENSUS_NAMEFRST", "CENSUS_NAMELAST", "CENSUS_OCCSTR", "CENSUS_IMPREL",'CENSUS_DWELLING_SEQ', "sequence_order_enum", "sequence_id", "dwelling_seq_id"]
ordinal_encode = make_column_transformer((OrdinalEncoder(), ['CENSUS_DWELLING_NUM']))
block_boost_ord = Pipeline([('ordinal encode', ordinal_encode),
    ('preprocess', TargetEncoder(cols = ["CENSUS_SERIALB", "CENSUS_AGE", "CENSUS_PAGENUM", "CENSUS_SEXB", "CENSUS_RACEB", "CENSUS_DWELLING_SIZE","CENSUS_SEQ_NUM", "CENSUS_LABFORCE", "CENSUS_LINE", "CENSUS_MARST", "CENSUS_NAMEFRST", "CENSUS_NAMELAST", "CENSUS_OCCSTR", "CENSUS_IMPREL",'CENSUS_DWELLING_SEQ', "sequence_order_enum", "sequence_id", "dwelling_seq_id"])),
                        ('classifier', XGBClassifier(colsample_bytree = 0.6, max_depth = 2, n_estimators = 50))])

In [30]:
census_data.apply_sequencing(distance = True, dwelling = True)
interpolate_dwell_ord = CentroidInterpolator(census_data, 10, block_boost, all_cols, clust_algo, block_centroids)
interpolate_dwell_ord.apply_clustering()
train,test = interpolate_dwell_ord.stratified_train_test()
interpolate_dwell_ord.train_test_model(train, test)

In [31]:
# Scores stored on the interpolator for the experiment that keeps CENSUS_DWELLING_NUM numeric
print("Training score:",interpolate_dwell_ord.train_score)
print("Test score:", interpolate_dwell_ord.test_score)

Training score: 1.0
Test score: 0.5241276911655531


In [32]:
#target encoding
# Here CENSUS_DWELLING_NUM is target encoded together with all the other feature columns
all_cols = [
    "CENSUS_DWELLING_NUM",
    "CENSUS_SERIALB", "CENSUS_AGE", "CENSUS_PAGENUM", "CENSUS_SEXB", "CENSUS_RACEB",
    "CENSUS_DWELLING_SIZE", "CENSUS_SEQ_NUM", "CENSUS_LABFORCE", "CENSUS_LINE", "CENSUS_MARST",
    "CENSUS_NAMEFRST", "CENSUS_NAMELAST", "CENSUS_OCCSTR", "CENSUS_IMPREL",
    "CENSUS_DWELLING_SEQ", "sequence_order_enum", "sequence_id", "dwelling_seq_id",
]
block_boost = Pipeline([
    ('preprocess', TargetEncoder(cols=list(all_cols))),
    ('classifier', XGBClassifier(colsample_bytree=0.6, max_depth=2, n_estimators=50)),
])

In [33]:
# Re-run sequencing with the dwelling and distance options enabled, then build the
# interpolator, cluster, split and train/test; the scores are printed in the next cell.
census_data.apply_sequencing(dwelling=True, distance=True)
interpolate_dwell_ord_enc = CentroidInterpolator(
    census_data, 10, block_boost, all_cols, clust_algo, block_centroids
)
interpolate_dwell_ord_enc.apply_clustering()
train, test = interpolate_dwell_ord_enc.stratified_train_test()
interpolate_dwell_ord_enc.train_test_model(train, test)

In [34]:
# Scores stored on the interpolator for the experiment that target encodes CENSUS_DWELLING_NUM
print("Training score:",interpolate_dwell_ord_enc.train_score)
print("Test score:", interpolate_dwell_ord_enc.test_score)

Training score: 1.0
Test score: 0.38158871566443947


This indicates that using distance-based sequences and dwelling-based sequences, with target encoding and without order within sequence, works best. Next, we'll experiment with incorporating the enumerator name and date.

### Explore Enumeration and Date

In [35]:
# Enumeration details (enumerator number, enumeration date, Ancestry reference) from the ward 10 file
enumerators = pd.read_csv("../../data/census_1850_enumerationDetail_mn_ward10.csv")

In [36]:
# Inspect column names, dtypes and non-null counts of the enumeration table
enumerators.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 553 entries, 0 to 552
Data columns (total 7 columns):
CENSUS_PAGENO_HOUSEHOLD    553 non-null int64
CENSUS_PAGENNO             553 non-null int64
CENSUS_WARD_NUM            553 non-null int64
CENSUS_REEL_HOUSEHOLD      553 non-null int64
CENSUS_ENUMERATOR_NUM      553 non-null int64
CENSUS_ENUMERATOR_DATE     553 non-null object
ANCESTRY_REF               553 non-null object
dtypes: int64(5), object(2)
memory usage: 30.4+ KB


In [37]:
# Keep only ward 10 and attach the enumeration details by page number; the left join keeps
# every ward 10 row even when no enumeration record matches.
# NOTE(review): the join key is the page number alone -- confirm page numbers are unique
# in `enumerators`, otherwise rows will be duplicated by the merge.
in_ward10 = filled_1850[ward_col] == 10
ward10 = filled_1850[in_ward10]
ward10_enumerators = pd.merge(
    ward10,
    enumerators,
    how="left",
    left_on="CENSUS_PAGENUM",
    right_on="CENSUS_PAGENO_HOUSEHOLD",
)

In [38]:
# Wrap the ward 10 table that carries the enumeration details, then request the distance
# and dwelling sequences.
census_enumerators = CensusData(
    ward10_enumerators,
    ward_col=ward_col,
    dwelling_col=dwelling_col,
    block_col=block_col,
    x_col=x_col,
    y_col=y_col,
)
census_enumerators.apply_sequencing(distance=True, dwelling=True)

In [39]:
# Dwelling + distance sequence features plus the three enumeration columns
all_cols_enum = [
    "CENSUS_SERIALB", "CENSUS_AGE", "CENSUS_PAGENUM", "CENSUS_SEXB", "CENSUS_RACEB",
    "CENSUS_DWELLING_SIZE", "CENSUS_SEQ_NUM", "CENSUS_LABFORCE", "CENSUS_LINE", "CENSUS_MARST",
    "CENSUS_NAMEFRST", "CENSUS_NAMELAST", "CENSUS_OCCSTR", "CENSUS_IMPREL",
    "CENSUS_DWELLING_SEQ", "sequence_order_enum", "sequence_id", "dwelling_seq_id",
    "CENSUS_ENUMERATOR_NUM", "CENSUS_ENUMERATOR_DATE", "ANCESTRY_REF",
]
# All feature columns are target encoded; the encoder receives its own copy of the list
block_boost_enum = Pipeline([
    ('preprocess', TargetEncoder(cols=list(all_cols_enum))),
    ('classifier', XGBClassifier(colsample_bytree=0.6, max_depth=2, n_estimators=50)),
])

In [40]:
# Build the interpolator on the enumeration-enriched ward 10 data, then cluster, split
# and train/test; the scores are printed in the next cell.
interpolate_enum = CentroidInterpolator(
    census_enumerators, 10, block_boost_enum, all_cols_enum, clust_algo, block_centroids
)
interpolate_enum.apply_clustering()
train, test = interpolate_enum.stratified_train_test()
interpolate_enum.train_test_model(train, test)

In [41]:
# Scores stored on the interpolator for the enumeration columns + distance/dwelling sequence experiment
print("Training score:",interpolate_enum.train_score)
print("Test score:", interpolate_enum.test_score)

Training score: 1.0
Test score: 0.5285820341499629


#### Let's try without any sequences but with the enumeration data

In [42]:
# Start again from the enumeration-enriched ward 10 table, this time in the no-sequence configuration
census_enumerators = CensusData(
    ward10_enumerators,
    ward_col=ward_col,
    dwelling_col=dwelling_col,
    block_col=block_col,
    x_col=x_col,
    y_col=y_col,
)
census_enumerators.no_seq()

In [43]:
# Base feature columns plus the three enumeration columns, with no sequence derived columns
all_cols_enum = [
    "CENSUS_SERIALB", "CENSUS_AGE", "CENSUS_PAGENUM", "CENSUS_SEXB", "CENSUS_RACEB",
    "CENSUS_DWELLING_SIZE", "CENSUS_SEQ_NUM", "CENSUS_LABFORCE", "CENSUS_LINE", "CENSUS_MARST",
    "CENSUS_NAMEFRST", "CENSUS_NAMELAST", "CENSUS_OCCSTR", "CENSUS_IMPREL",
    "CENSUS_DWELLING_SEQ",
    "CENSUS_ENUMERATOR_NUM", "CENSUS_ENUMERATOR_DATE", "ANCESTRY_REF",
]
# All feature columns are target encoded; the encoder receives its own copy of the list
block_boost_enum = Pipeline([
    ('preprocess', TargetEncoder(cols=list(all_cols_enum))),
    ('classifier', XGBClassifier(colsample_bytree=0.6, max_depth=2, n_estimators=50)),
])

In [44]:
# Build the interpolator for the enumeration columns without sequences, then cluster,
# split and train/test; the scores are printed in the next cell.
interpolate_enum = CentroidInterpolator(
    census_enumerators, 10, block_boost_enum, all_cols_enum, clust_algo, block_centroids
)
interpolate_enum.apply_clustering()
train, test = interpolate_enum.stratified_train_test()
interpolate_enum.train_test_model(train, test)

In [45]:
# Scores stored on the interpolator for the enumeration columns without sequences
print("Training score:",interpolate_enum.train_score)
print("Test score:", interpolate_enum.test_score)

Training score: 1.0
Test score: 0.2925018559762435


#### Let's create a sequence id from enumerator number and date, and use only that as a sequence

In [46]:
# Fresh CensusData on the enumeration-enriched ward 10 table, with only the enumerator
# sequencing option enabled.
census_enumerators_seq = CensusData(
    ward10_enumerators,
    ward_col=ward_col,
    dwelling_col=dwelling_col,
    block_col=block_col,
    x_col=x_col,
    y_col=y_col,
)
census_enumerators_seq.apply_sequencing(enumerator=True)

In [47]:
# Base feature columns, the three enumeration columns and the enumerator sequence column
all_cols_enum = [
    "CENSUS_SERIALB", "CENSUS_AGE", "CENSUS_PAGENUM", "CENSUS_SEXB", "CENSUS_RACEB",
    "CENSUS_DWELLING_SIZE", "CENSUS_SEQ_NUM", "CENSUS_LABFORCE", "CENSUS_LINE", "CENSUS_MARST",
    "CENSUS_NAMEFRST", "CENSUS_NAMELAST", "CENSUS_OCCSTR", "CENSUS_IMPREL",
    "CENSUS_DWELLING_SEQ",
    "CENSUS_ENUMERATOR_NUM", "CENSUS_ENUMERATOR_DATE", "ANCESTRY_REF", "enum_seq",
]
# All feature columns are target encoded; the encoder receives its own copy of the list
block_boost_enum = Pipeline([
    ('preprocess', TargetEncoder(cols=list(all_cols_enum))),
    ('classifier', XGBClassifier(colsample_bytree=0.6, max_depth=2, n_estimators=50)),
])

In [48]:
# Build the interpolator for the enumerator sequence experiment, then cluster, split
# and train/test; the scores are printed in the next cell.
interpolate_enum = CentroidInterpolator(
    census_enumerators_seq, 10, block_boost_enum, all_cols_enum, clust_algo, block_centroids
)
interpolate_enum.apply_clustering()
train, test = interpolate_enum.stratified_train_test()
interpolate_enum.train_test_model(train, test)

In [49]:
# Scores stored on the interpolator for the enumerator sequence only experiment
print("Training score:",interpolate_enum.train_score)
print("Test score:", interpolate_enum.test_score)

Training score: 1.0
Test score: 0.2991833704528582


#### Let's try including distance based sequence id, and dwelling based sequences too

In [50]:
# Fresh CensusData on the enumeration-enriched ward 10 table, with the enumerator,
# distance and dwelling sequencing options all enabled.
census_enumerators_seq = CensusData(
    ward10_enumerators,
    ward_col=ward_col,
    dwelling_col=dwelling_col,
    block_col=block_col,
    x_col=x_col,
    y_col=y_col,
)
census_enumerators_seq.apply_sequencing(enumerator=True, distance=True, dwelling=True)

In [51]:
# Dwelling + distance sequence features, the three enumeration columns and the enumerator sequence column
all_cols_enum = [
    "CENSUS_SERIALB", "CENSUS_AGE", "CENSUS_PAGENUM", "CENSUS_SEXB", "CENSUS_RACEB",
    "CENSUS_DWELLING_SIZE", "CENSUS_SEQ_NUM", "CENSUS_LABFORCE", "CENSUS_LINE", "CENSUS_MARST",
    "CENSUS_NAMEFRST", "CENSUS_NAMELAST", "CENSUS_OCCSTR", "CENSUS_IMPREL",
    "CENSUS_DWELLING_SEQ", "sequence_order_enum", "sequence_id", "dwelling_seq_id",
    "CENSUS_ENUMERATOR_NUM", "CENSUS_ENUMERATOR_DATE", "ANCESTRY_REF", "enum_seq",
]
# All feature columns are target encoded; the encoder receives its own copy of the list
block_boost_enum = Pipeline([
    ('preprocess', TargetEncoder(cols=list(all_cols_enum))),
    ('classifier', XGBClassifier(colsample_bytree=0.6, max_depth=2, n_estimators=50)),
])

In [52]:
# Build the interpolator for the enumerator + distance + dwelling sequence experiment,
# then cluster, split and train/test; the scores are printed in the next cell.
interpolate_enum = CentroidInterpolator(
    census_enumerators_seq, 10, block_boost_enum, all_cols_enum, clust_algo, block_centroids
)
interpolate_enum.apply_clustering()
train, test = interpolate_enum.stratified_train_test()
interpolate_enum.train_test_model(train, test)

In [53]:
# Scores stored on the interpolator for the enumerator + distance + dwelling sequence experiment
print("Training score:",interpolate_enum.train_score)
print("Test score:", interpolate_enum.test_score)

Training score: 1.0
Test score: 0.5144766146993318


That made things slightly worse (test score 0.514, versus 0.529 with the distance and dwelling sequences but no enumerator sequence), but it is unclear why.

#### Let's try incorporating distances into the enumerator and date based sequences

In [54]:
# Fresh CensusData on the enumeration-enriched ward 10 table, with the enumerator-distance,
# dwelling, fixed length and distance sequencing options all enabled.
census_enum_seq = CensusData(
    ward10_enumerators,
    ward_col=ward_col,
    dwelling_col=dwelling_col,
    block_col=block_col,
    x_col=x_col,
    y_col=y_col,
)
census_enum_seq.apply_sequencing(enumerator_dist=True, dwelling=True, fixed=True, distance=True)

In [55]:
# Base feature columns plus the distance, dwelling, fixed length and enumerator-distance sequence columns
all_cols = [
    "CENSUS_SERIALB", "CENSUS_AGE", "CENSUS_PAGENUM", "CENSUS_SEXB", "CENSUS_RACEB",
    "sequence_id", "CENSUS_DWELLING_SIZE", "CENSUS_SEQ_NUM", "CENSUS_LABFORCE", "CENSUS_LINE",
    "CENSUS_MARST", "CENSUS_NAMEFRST", "CENSUS_NAMELAST", "CENSUS_OCCSTR", "CENSUS_IMPREL",
    "sequence_order_enum", "CENSUS_DWELLING_SEQ", "dwelling_seq_id", "fixed_seq",
    "enum_dist_id", "enum_dist_order",
]
# All feature columns are target encoded; the encoder receives its own copy of the list
block_boost = Pipeline([
    ('preprocess', TargetEncoder(cols=list(all_cols))),
    ('classifier', XGBClassifier(colsample_bytree=0.6, max_depth=2, n_estimators=50)),
])

In [56]:
# Build the interpolator for the enumerator-distance sequence experiment, then cluster,
# split and train/test; the scores are printed in the next cell.
# Note that this rebinds interpolate_sequences, which the first experiment also used.
interpolate_sequences = CentroidInterpolator(
    census_enum_seq, 10, block_boost, all_cols, clust_algo, block_centroids
)
interpolate_sequences.apply_clustering()
train, test = interpolate_sequences.stratified_train_test()
interpolate_sequences.train_test_model(train, test)

In [57]:
# Scores stored on the interpolator for the enumerator-distance sequence experiment
print("Training score:",interpolate_sequences.train_score)
print("Test score:", interpolate_sequences.test_score)

Training score: 1.0
Test score: 0.5241276911655531
