## supplementary run to `EXPLORE similarity labels with vs without sequence order.ipybn`

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from interpolation import CensusData, sequences, BlockInterpolator, CentroidInterpolator, archive, interpolation
from sklearn.pipeline import Pipeline
from category_encoders.target_encoder import TargetEncoder
from xgboost import XGBClassifier
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

from kmodes.kmodes import KModes
from kmodes.kprototypes import KPrototypes
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


## read in data

In [2]:
filled_1850 = pd.read_csv("../../data/dwelling_filled_sum_1850_mn_v02.csv")
cd_1850 = pd.read_csv("../../data/cd_1850_mn_20200918.csv") #For calculating centroids
# enumerators = pd.read_csv("../../data/census_1850_enumerationDetail_mn_union.csv")
enumerators = pd.read_csv("../../data/census_1850_enumerationDetail_mn_ward10.csv")

ward_col = "CENSUS_WARD_NUM"
dwelling_col = "dwelling_id"
block_col = "CD_BLOCK_NUM"
cd_ward_col = "CD_WARD_NUM"
cd_block_col = "CD_BLOCK_NUM"
dwelling_num_col = "CENSUS_DWELLING_NUM"
cd_address = "CD_H_ADDRESS"
pagenum = "CENSUS_PAGENUM"
x_col = "CD_X"
y_col = "CD_Y"

In [3]:
cd_1850.columns

Index(['OBJECTID', 'CD_RECORD_ID', 'CD_INDEX', 'CD_RAW', 'CD_OCCUPATION',
       'CD_OCCUPATION_STD', 'CD_LAST_NAME', 'CD_FIRST_NAME', 'CD_MIDDLE_NAME',
       'CD_H_ADDRESS', 'CD_H_HOUSE_NUMBER', 'CD_H_STREET_NAME', 'CD_H_CITY',
       'CD_WARD_NUM', 'CD_BLOCK_NUM', 'CD_X', 'CD_Y'],
      dtype='object')

In [4]:
#set clustering algo -- this is to make sure that the same clustering algorithm is used
#for each model, to ensure that testing is accurate
#note fitting before hand won't work for all types of clustering, for example, with agglomerative
#clustering this isn't going to work because the algorithm doesn't have a predict method
block_centroids = {ward:{block:interpolation.make_centroid(df_block[x_col], df_block[y_col]) for block, df_block in df.groupby(cd_block_col)} for ward,df in cd_1850.groupby(cd_ward_col)}

In [5]:
ward10 = filled_1850[filled_1850[ward_col] == 10]
ward10_enumerators = ward10.merge(enumerators,  how = "left", left_on= [ward_col, "CENSUS_PAGENUM"], 
                                  right_on = [ward_col, "CENSUS_PAGENO_HOUSEHOLD"])

## With sequence order

In [6]:
### with sequence order in similarity check 

census_enum_seq_order = CensusData(ward10_enumerators, ward_col=ward_col, 
                             dwelling_col=dwelling_col, block_col =  block_col, 
                             x_col = x_col, y_col = y_col, pagenum = pagenum)
census_enum_seq_order.apply_sequencing(d=0.1, enumerator_dist = True, 
                                 dwelling = True, fixed = True, distance = True)

gamma_list = [0.1, 1, 5, 10, 20, 25, 40, 50]
n_cluster_list = [30,40, 50]
result_train_n_pro = {}
result_test_n_pro = {}

for k in n_cluster_list:
    
    
    result_train_g = {}
    result_test_g = {}  
    model_g = {}
    ## runn= gamma
    for g in gamma_list:
        
        kpro_model = KPrototypes(n_clusters=k, init = "random", n_init = 1, gamma=g)
        cate_similar_cols = ["sequence_id", "dwelling_seq_id", "fixed_seq", "enum_dist_id"]
        cont_similar_cols = ['sequence_order_enum']
        try:
            census_enum_seq_order.apply_similarity(kpro_model=kpro_model, cate_sim_columns=cate_similar_cols, 
                                             cont_sim_columns=cont_similar_cols)
        except:
            print(f'failed at n={k} and g={g}')
            result_train_g[g] = np.nan
            result_test_g[g] = np.nan
            break
            
        all_cols = ['CENSUS_SERIAL', 'CENSUS_AGE', 'CENSUS_PAGENUM', 'CENSUS_GENDER', 'CENSUS_RACE',
                    "sequence_id", "CENSUS_DWELLING_SIZE","CENSUS_SEQ_NUM", "CENSUS_LABFORCE", "CENSUS_LINE", 
                    "CENSUS_MARST", 'CENSUS_FIRST_NAME', 'CENSUS_LAST_NAME', 'CENSUS_OCCUPATION', 'CENSUS_IMPREL',
                    "sequence_order_enum", 'CENSUS_DWELLING_SEQ', "dwelling_seq_id", "fixed_seq", "enum_dist_id", 
                    "enum_dist_order"]

        transformer = ColumnTransformer(transformers=[('target', TargetEncoder(), all_cols), 
                                                      ('onehot', OneHotEncoder(handle_unknown='ignore'), ['similarity_label'])])
        # define pipeline
        pipeline = Pipeline(steps=[('preprocess', transformer), 
                                   ('classifier',XGBClassifier(colsample_bytree = 0.6, max_depth = 5, 
                                                               n_estimators = 30))])
        interpolate_sequences_order = CentroidInterpolator(census_enum_seq_order, 10, pipeline, all_cols + ['similarity_label'],
                                                     KMeans(5), block_centroids)
        score, model = interpolate_sequences_order.kmeans_best(5)
        interpolate_sequences_order.set_clustering_algo(model)
        interpolate_sequences_order.apply_clustering(algo_fit = True)
        # interpolate_sequences.clustervis(kmeans = True)

        interpolate_sequences_order.cross_validate_model(k=5)
    #     print("avg Training score:", np.array(interpolate_sequences.train_score).mean())
    #     print('Test score:', interpolate_sequences.test_score)
    #     print("avg Test score:", np.array(interpolate_sequences.test_score).mean())
        result_train_g[g] = np.array(interpolate_sequences_order.train_score).mean()
        result_test_g[g] = np.array(interpolate_sequences_order.test_score).mean()  
    
    result_train_n_pro[k] = result_train_g
    result_test_n_pro[k] = result_test_g

d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
n is 5 and it's the 10th iteration
n is 5 and it's the 10th iteration
n is 5 and it's the 10th iteration
failed at n=30 and g=10
n is 5 and it's the 10th iteration
n is 5 and it's the 10th iteration
failed at n=40 and g=5
failed at n=50 and g=0.1


In [9]:
import pickle

a = (result_train_n_pro, result_test_n_pro)

with open('../../result_2.pickle', 'wb') as handle:
    pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

In [10]:
### with sequence order in similarity check 

census_enum_seq_order = CensusData(ward10_enumerators, ward_col=ward_col, 
                             dwelling_col=dwelling_col, block_col =  block_col, 
                             x_col = x_col, y_col = y_col, pagenum = pagenum)
census_enum_seq_order.apply_sequencing(d=0.1, enumerator_dist = True, 
                                 dwelling = True, fixed = True, distance = True)

gamma_list = [0.1, 1, 5, 10, 20, 25, 40, 50]
n_cluster_list = [20]
result_train_n_pro = {}
result_test_n_pro = {}

for k in n_cluster_list:
    
    
    result_train_g = {}
    result_test_g = {}  
    model_g = {}
    ## runn= gamma
    for g in gamma_list:
        
        kpro_model = KPrototypes(n_clusters=k, init = "random", n_init = 1, gamma=g)
        cate_similar_cols = ["sequence_id", "dwelling_seq_id", "fixed_seq", "enum_dist_id"]
        cont_similar_cols = ['sequence_order_enum']
        try:
            census_enum_seq_order.apply_similarity(kpro_model=kpro_model, cate_sim_columns=cate_similar_cols, 
                                             cont_sim_columns=cont_similar_cols)
        except:
            print(f'failed at n={k} and g={g}')
            result_train_g[g] = np.nan
            result_test_g[g] = np.nan
            break
            
        all_cols = ['CENSUS_SERIAL', 'CENSUS_AGE', 'CENSUS_PAGENUM', 'CENSUS_GENDER', 'CENSUS_RACE',
                    "sequence_id", "CENSUS_DWELLING_SIZE","CENSUS_SEQ_NUM", "CENSUS_LABFORCE", "CENSUS_LINE", 
                    "CENSUS_MARST", 'CENSUS_FIRST_NAME', 'CENSUS_LAST_NAME', 'CENSUS_OCCUPATION', 'CENSUS_IMPREL',
                    "sequence_order_enum", 'CENSUS_DWELLING_SEQ', "dwelling_seq_id", "fixed_seq", "enum_dist_id", 
                    "enum_dist_order"]

        transformer = ColumnTransformer(transformers=[('target', TargetEncoder(), all_cols), 
                                                      ('onehot', OneHotEncoder(handle_unknown='ignore'), ['similarity_label'])])
        # define pipeline
        pipeline = Pipeline(steps=[('preprocess', transformer), 
                                   ('classifier',XGBClassifier(colsample_bytree = 0.6, max_depth = 5, 
                                                               n_estimators = 30))])
        interpolate_sequences_order = CentroidInterpolator(census_enum_seq_order, 10, pipeline, all_cols + ['similarity_label'],
                                                     KMeans(5), block_centroids)
        score, model = interpolate_sequences_order.kmeans_best(5)
        interpolate_sequences_order.set_clustering_algo(model)
        interpolate_sequences_order.apply_clustering(algo_fit = True)
        # interpolate_sequences.clustervis(kmeans = True)

        interpolate_sequences_order.cross_validate_model(k=5)
    #     print("avg Training score:", np.array(interpolate_sequences.train_score).mean())
    #     print('Test score:', interpolate_sequences.test_score)
    #     print("avg Test score:", np.array(interpolate_sequences.test_score).mean())
        result_train_g[g] = np.array(interpolate_sequences_order.train_score).mean()
        result_test_g[g] = np.array(interpolate_sequences_order.test_score).mean()  
    
    result_train_n_pro[k] = result_train_g
    result_test_n_pro[k] = result_test_g

d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
d:  0.1
n is 5 and it's the 10th iteration
n is 5 and it's the 10th iteration
n is 5 and it's the 10th iteration
n is 5 and it's the 10th iteration
n is 5 and it's the 10th iteration
n is 5 and it's the 10th iteration
n is 5 and it's the 10th iteration
n is 5 and it's the 10th iteration


In [11]:
import pickle

a = (result_train_n_pro, result_test_n_pro)

with open('../../result_3.pickle', 'wb') as handle:
    pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)
    