* Fall 2020
* Reproducing `interpolation_v02.ipynb`
* Working on disambiguated ward 10 census
* Everything is reproducible

## Set up

In [5]:
import os
import sys
# Make the directory two levels above the working directory importable
# (presumably where the project-local `interpolation` package lives).
# The membership check keeps re-running this cell from adding duplicates.
module_path = os.path.abspath('../..')
if module_path not in sys.path:
    sys.path.append(module_path)
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from interpolation import CensusData, sequences, BlockInterpolator, CentroidInterpolator, archive, interpolation
from sklearn.pipeline import Pipeline
from category_encoders.target_encoder import TargetEncoder
from xgboost import XGBClassifier
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import matplotlib.pyplot as plt 
from sklearn.linear_model import LogisticRegression, RidgeClassifier

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
## Output from disambiguation
# Input tables; the paths are relative to the directory the notebook runs in.
filled_1850 = pd.read_csv("../../data/dwelling_filled_sum_1850_mn_v02.csv")
cd_1850 = pd.read_csv("../../data/cd_1850_mn_20200918.csv") # Source of the x/y coordinates used for calculating block centroids
enumerators = pd.read_csv("../../data/census_1850_enumerationDetail_mn_ward10.csv")
# Column names used by the cells below.
# Ward column of the census table.
ward_col = "CENSUS_WARD_NUM"
# Ward column of cd_1850.
cd_ward_col = "CD_WARD_NUM"
# Dwelling identifier columns of the census table.
dwelling_col = "dwelling_id"
dwelling_col_num = "CENSUS_DWELLING_NUM"
# Block identifier and coordinate columns (present in both filled_1850 and cd_1850).
block_col = "CD_BLOCK_NUM"
x_col = "CD_X"
y_col = "CD_Y"
# Page-number column as spelled in the enumeration-detail file (double "N");
# the census table's own page column is spelled CENSUS_PAGENUM.
pagenum = "CENSUS_PAGENNO"


In [7]:
# Preview the first rows of the disambiguation output.
filled_1850.head()

Unnamed: 0,CD_BLOCK_NUM,CD_H_ADDRESS,CD_X,CD_Y,CENSUS_AGE,CENSUS_CITY,CENSUS_DWELLING_NUM,CENSUS_DWELLING_SEQ,CENSUS_DWELLING_SIZE,CENSUS_FIRST_NAME,...,CENSUS_OCCUPATION,CENSUS_PAGENUM,CENSUS_RACE,CENSUS_REEL,CENSUS_SEQ_NUM,CENSUS_SERIAL,CENSUS_WARD_NUM,dwelling_id,spatial_weight,spatial_weight_sum
0,mn1850_01_74.0091_40.7028,25 SOUTH ST,-74.009004,40.702638,49,MANHATTAN,1.0,1.0,84,JOHN M,...,INN KEEPER,70,120,534,1,2044262,1,1,,
1,mn1850_01_74.0091_40.7028,25 SOUTH ST,-74.009004,40.702638,36,MANHATTAN,1.0,1.0,84,MARY A,...,,70,120,534,2,2044263,1,1,,
2,mn1850_01_74.0091_40.7028,25 SOUTH ST,-74.009004,40.702638,12,MANHATTAN,1.0,1.0,84,MARY D,...,,70,120,534,3,2044264,1,1,,
3,mn1850_01_74.0091_40.7028,25 SOUTH ST,-74.009004,40.702638,40,MANHATTAN,1.0,1.0,84,JAMES C,...,COMB MERCHANT,70,120,534,4,2044265,1,1,,
4,mn1850_01_74.0091_40.7028,25 SOUTH ST,-74.009004,40.702638,28,MANHATTAN,1.0,1.0,84,CAROLINE E,...,,70,120,534,5,2044266,1,1,,


In [8]:
# Preview the first rows of the table that supplies the block coordinates.
cd_1850.head()

Unnamed: 0,OBJECTID,CD_RECORD_ID,CD_INDEX,CD_RAW,CD_OCCUPATION,CD_OCCUPATION_STD,CD_LAST_NAME,CD_FIRST_NAME,CD_MIDDLE_NAME,CD_H_ADDRESS,CD_H_HOUSE_NUMBER,CD_H_STREET_NAME,CD_H_CITY,CD_WARD_NUM,CD_BLOCK_NUM,CD_X,CD_Y
0,1,MN_1850_ADDISON_ACKLEY_1,1,"Ackley Addison, clerk, 47 Broadway, h. 39 Pearl",clerk,clerk,Ackley,Addison,,39 PEARL ST,39,PEARL,,1,mn1850_01_74.0124_40.7034,-74.012269,40.703363
1,3,MN_1850_FREDERICK_ANDERSON_6,6,"Anderson Frederick 11. com. mer. 13 Front, h. ...",com mer,com mer,Anderson,Frederick,11,77 BARROW ST,77,BARROW,,9,mn1850_09_74.0026_40.7323,-74.002268,40.732722
2,4,MN_1850_PETER_ANTHES_8,8,"Anthes Peter, barber 34 Av. C",barber,barbers,Anthes,Peter,,34 C AVE,34,C AVE,,11,mn1850_11_73.9792_40.7214,-73.980422,40.721839
3,5,MN_1850_&_ANTHES_9,9,"Anthes & Eckardt, barbers, 34 Av. C",barbers,barber,Anthes,&,Eckardt,34 C AVE,34,C AVE,,11,mn1850_11_73.9792_40.7214,-73.980422,40.721839
4,6,MN_1850_JOHN_APPELL_10,10,"Appell John, tailor, 40 Av. C",tailor,tailor,Appell,John,,40 C AVE,40,C AVE,,11,mn1850_11_73.9792_40.7214,-73.980322,40.721976


In [9]:
# Preview the first rows of the ward 10 enumeration-detail table.
enumerators.head()

Unnamed: 0,CENSUS_PAGENO_HOUSEHOLD,CENSUS_PAGENNO,CENSUS_WARD_NUM,CENSUS_REEL_HOUSEHOLD,CENSUS_ENUMERATOR_NUM,CENSUS_ENUMERATOR_DATE,ANCESTRY_REF
0,60,6,10,545,1,1850-07-15,4202063_6
1,70,7,10,545,1,1850-07-15,4202063_7
2,80,8,10,545,1,1850-07-15,4202063_8
3,90,9,10,545,1,1850-07-15,4202063_9
4,100,10,10,545,1,1850-07-15,4202063_10


## Code Rerun

In [10]:
## dict: ward >> block >> block centroid
# Nested lookup built from cd_1850: the outer key is the ward, the inner key
# is the block, and each value is whatever interpolation.make_centroid returns
# for that block's x and y coordinate columns.
block_centroids = {
    ward: {
        block: interpolation.make_centroid(block_rows[x_col], block_rows[y_col])
        for block, block_rows in ward_rows.groupby(block_col)
    }
    for ward, ward_rows in cd_1850.groupby(cd_ward_col)
}

In [11]:
## Add enumerators info to disambiguated census data
# Keep only the ward 10 census rows, then left-join the enumeration details so
# that every census row survives the merge. The census page number
# (CENSUS_PAGENUM) is matched against CENSUS_PAGENO_HOUSEHOLD. The enumerators'
# own ward column is dropped first; both tables carry CENSUS_WARD_NUM, and
# keeping it would yield suffixed duplicate ward columns after the merge.
ward10 = filled_1850[filled_1850[ward_col] == 10]
ward10_enumerators = pd.merge(
    ward10,
    enumerators.drop(columns=["CENSUS_WARD_NUM"]),
    how="left",
    left_on="CENSUS_PAGENUM",
    right_on="CENSUS_PAGENO_HOUSEHOLD",
)

In [12]:
## Generate sequences using enumerators info and non-address census info
# Wrap the merged ward 10 table in CensusData, telling it which columns hold
# the ward, dwelling id, block id, coordinates and page number.
census_enum_seq = CensusData(
    ward10_enumerators,
    ward_col=ward_col,
    dwelling_col=dwelling_col,
    block_col=block_col,
    x_col=x_col,
    y_col=y_col,
    pagenum=pagenum,
)
# Switch on all four sequencing options in a single call.
# NOTE(review): presumably this is what adds the sequence-related columns that
# the feature list relies on later -- confirm against CensusData.
census_enum_seq.apply_sequencing(
    enumerator_dist=True,
    dwelling=True,
    fixed=True,
    distance=True,
)

In [13]:
# Feature columns handed to the target encoder and to the interpolator.
# The order of the original list is preserved.
# NOTE(review): the list contains identifier-like fields (CENSUS_SERIAL,
# first/last name); confirm they are intended as predictors.
all_cols = [
    "CENSUS_SERIAL",
    "CENSUS_AGE",
    "CENSUS_PAGENUM",
    "CENSUS_GENDER",
    "CENSUS_RACE",
    "sequence_id",
    "CENSUS_DWELLING_SIZE",
    "CENSUS_SEQ_NUM",
    "CENSUS_LABFORCE",
    "CENSUS_LINE",
    "CENSUS_MARST",
    "CENSUS_FIRST_NAME",
    "CENSUS_LAST_NAME",
    "CENSUS_OCCUPATION",
    "CENSUS_IMPREL",
    "sequence_order_enum",
    "CENSUS_DWELLING_SEQ",
    "dwelling_seq_id",
    "fixed_seq",
    "enum_dist_id",
    "enum_dist_order",
]

# Two-step model: target-encode every feature column, then classify with a
# small gradient-boosted ensemble (50 trees of depth 2, 60% of the columns
# sampled per tree).
block_boost = Pipeline(
    steps=[
        ("preprocess", TargetEncoder(cols=all_cols)),
        (
            "classifier",
            XGBClassifier(
                n_estimators=50,
                max_depth=2,
                colsample_bytree=0.6,
            ),
        ),
    ]
)

In [14]:
# Build the centroid interpolator for ward 10 from the sequenced census data,
# the model pipeline, the feature list, an initial 5-cluster KMeans and the
# ward >> block >> centroid lookup.
interpolate_sequences = CentroidInterpolator(
    census_enum_seq,
    10,
    block_boost,
    all_cols,
    KMeans(n_clusters=5),
    block_centroids,
)
# kmeans_best returns a (score, model) pair; the returned model is then set as
# the clustering algorithm before clustering is applied.
# NOTE(review): the meaning of the argument 5 is not visible here -- the
# printed log ("n is 5 ...") suggests a cluster count; confirm.
score, model = interpolate_sequences.kmeans_best(5)
interpolate_sequences.set_clustering_algo(model)
interpolate_sequences.apply_clustering(algo_fit=True)
# interpolate_sequences.clustervis(kmeans = True)

n is 5 and it's the 0th iteration
n is 5 and it's the 50th iteration


In [25]:
# Hold-out evaluation: split the data with the interpolator's stratified
# train/test split, fit and evaluate the model on that split, then report the
# scores read back from the interpolator (presumably set by train_test_model).
train, test = interpolate_sequences.stratified_train_test()
interpolate_sequences.train_test_model(train, test)
print("Training score:",interpolate_sequences.train_score)
print("Test score:", interpolate_sequences.test_score)

Training score: 1.0
Test score: 0.7680608365019012


In [28]:
## Cross validation
# Reads the same train_score / test_score attributes as the hold-out cell.
# They are wrapped in np.array and averaged here, so after
# cross_validate_model they presumably hold one value per fold -- TODO confirm.
interpolate_sequences.cross_validate_model()
print('Cross Validation')
print("avg Training score:", np.array(interpolate_sequences.train_score).mean())
print("avg Test score:", np.array(interpolate_sequences.test_score).mean())

Cross Validation
avg Training score: 1.0
avg Test score: 0.7193473630802691


In [19]:
## training data with all columns used in the training
# Shows the first rows of the training split, restricted to the columns named
# by the interpolator's feature_names attribute.
train.loc[:, interpolate_sequences.feature_names].head()

Unnamed: 0,CENSUS_SERIAL,CENSUS_AGE,CENSUS_PAGENUM,CENSUS_GENDER,CENSUS_RACE,sequence_id,CENSUS_DWELLING_SIZE,CENSUS_SEQ_NUM,CENSUS_LABFORCE,CENSUS_LINE,...,CENSUS_FIRST_NAME,CENSUS_LAST_NAME,CENSUS_OCCUPATION,CENSUS_IMPREL,sequence_order_enum,CENSUS_DWELLING_SEQ,dwelling_seq_id,fixed_seq,enum_dist_id,enum_dist_order
14,2085898,26,60,1,120,2.0,4,1,2,150,...,JOSEPH S,WOOD,FRUIT DEALER,1,1.0,4.0,0.0,0,2.0,1.0
15,2085898,17,60,2,120,2.0,4,2,0,150,...,ENUS B,WOOD,,2,1.0,4.0,0.0,0,2.0,1.0
16,2085898,7,60,1,120,2.0,4,3,0,150,...,THOMAS,NETHERWAY,,10,1.0,4.0,0.0,0,2.0,1.0
17,2085898,30,60,2,120,2.0,4,4,0,150,...,JULIA A,SUMMERS,,7,1.0,4.0,0.0,0,2.0,1.0
26,2085900,56,60,1,120,11.0,9,1,2,270,...,CHRISTOPHER,GOETS,MERCHANT,1,2.0,6.0,0.0,0,11.0,2.0


In [21]:
## y of training data
# Print the name of the target column stored on the interpolator, then display
# that column of the training split.
print(interpolate_sequences.y)
train[interpolate_sequences.y]

cluster


14       0
15       0
16       0
17       0
26       3
        ..
23084    3
23085    3
23086    3
23087    3
23088    3
Name: cluster, Length: 11500, dtype: int64