In [1]:
%reload_ext autoreload
%autoreload 2

import pandas as pd
import copy
import numpy as np
import scipy as sp
from functools import partial
import json
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

from utils.data_loader import *
from utils.analysis_utils import *
from utils.null_simulator import *
from NullAnalysis import *
from NullImputer import *
from NullPredictor import *

results = json.load(open('results.json'))

In [2]:
dataset = ACSEmploymentDataset(state=['GA'], year=2018, with_nulls=False, optimize=False)
dataset.X_data

Downloading data for 2018 1-Year person survey for GA...


Unnamed: 0,AGEP,SCHL,MAR,RELP,DIS,ESP,CIT,MIG,MIL,ANC,NATIVITY,DEAR,DEYE,DREM,SEX,RAC1P
0,51,13.0,5,16,2,0.0,1,3.0,4.0,1,1,2,2,2.0,1,2
1,56,16.0,3,16,1,0.0,1,1.0,4.0,4,1,2,1,2.0,2,1
2,23,20.0,5,17,1,0.0,1,1.0,4.0,4,1,2,2,1.0,2,2
3,43,17.0,1,16,2,0.0,1,1.0,4.0,1,1,2,2,2.0,1,2
4,20,19.0,5,16,2,0.0,1,1.0,4.0,1,1,2,2,2.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100850,51,21.0,1,1,2,0.0,1,1.0,4.0,3,1,2,2,2.0,2,1
100851,18,16.0,5,2,2,0.0,1,1.0,4.0,3,1,2,2,2.0,2,1
100852,46,21.0,5,0,2,0.0,1,3.0,4.0,3,1,2,2,2.0,2,2
100853,48,20.0,4,0,1,0.0,1,1.0,4.0,1,1,2,2,2.0,2,2


In [3]:
dataset.columns_with_nulls

[]

In [23]:
corrupted_data = nulls_simulator(dataset.X_data, 'DIS', 'DIS', [1], 0.75, nan_value=np.nan)
corrupted_data_ = nulls_simulator(corrupted_data, 'DIS', 'DIS', [2], 0.25, nan_value=np.nan)

In [24]:
# Sanity check: Making sure indexes are consistent between the original and corrupted dataset
corrupted_data_.index.to_list().sort() == dataset.X_data.index.to_list().sort()

True

In [25]:
corrupted_data_.isna().sum()

AGEP            0
SCHL            0
MAR             0
RELP            0
DIS         32509
ESP             0
CIT             0
MIG             0
MIL             0
ANC             0
NATIVITY        0
DEAR            0
DEYE            0
DREM            0
SEX             0
RAC1P           0
dtype: int64

In [26]:
corrupted_data_.DIS.value_counts()

2.0    64698
1.0     3648
Name: DIS, dtype: int64

In [27]:
dataset.X_data.DIS.value_counts()

2    86264
1    14591
Name: DIS, dtype: int64

In [28]:
dataset_with_nulls = copy.deepcopy(dataset)
dataset_with_nulls.update_X_data(corrupted_data_)

In [29]:
dataset_with_nulls.columns_with_nulls

['DIS']

In [30]:
baseline = NullAnalysis(dataset, ['SEX','RAC1P'], [1,1])
data_with_nulls = NullAnalysis(dataset_with_nulls, ['SEX','RAC1P'], [1,1])

In [31]:
X_train, y_train, X_test, y_test, X_val, y_val = data_with_nulls.create_train_test_val_split(SEED=111)
X_train.shape, X_test.shape, X_val.shape

((60513, 16), (20171, 16), (20171, 16))

In [32]:
X_train_base, X_test_base, y_train_base, y_test_base, X_val_base, y_val_base = baseline.set_train_test_val_data_by_index(data_with_nulls.X_train.index, data_with_nulls.X_test.index, data_with_nulls.X_val.index)
X_train_base.shape, X_test_base.shape, X_val_base.shape

((60513, 16), (60513,), (20171, 16))