In [8]:
import numpy as np
import pandas as pd

In [152]:
# Generate train dataframe with missing values

# Initialise parameters
n_rows = 10
nominal_categories = {'stanage': 0.3, 'burbage': 0.2, 'almscliff': 0.2, 'froggatt': 0.15, 'blacknor':0.15}

# Generate dataframe
np.random.seed(0)
train = pd.DataFrame({
    'numeric_1': np.random.choice(100, n_rows), 
    'numeric_2': np.random.choice(10, n_rows, replace=False), 
    'numeric_3': np.random.choice(22, n_rows, replace=False), 
    'numeric_4': np.random.choice(5, n_rows), 
    'nominal': np.random.choice(list(nominal_categories.keys()), n_rows, replace=True, p=list(nominal_categories.values())), 
})
np.random.seed(0)
train = train.mask((np.random.random(size=train.shape) > 0.75))
train['bool'] = [True, True, np.nan, True, False, np.nan, False, True, True, np.nan]

# Non-consecutive index for extra test (simulate result after sklearn train-test split)
np.random.seed(0)
train.index = np.random.choice(100, train.shape[0], replace=False)

In [153]:
train

Unnamed: 0,numeric_1,numeric_2,numeric_3,numeric_4,nominal,bool
26,44.0,9.0,4.0,0.0,burbage,True
86,47.0,7.0,,,stanage,True
2,,3.0,7.0,,stanage,
55,67.0,5.0,,,,True
75,,,19.0,,blacknor,False
93,9.0,2.0,,2.0,burbage,
16,83.0,,10.0,4.0,almscliff,False
73,21.0,0.0,14.0,,blacknor,True
54,36.0,8.0,11.0,0.0,stanage,True
95,87.0,6.0,1.0,0.0,blacknor,


In [188]:
bool_elements = {True: 0.6, False: 0.4}

In [194]:
t = list(np.random.choice(list(bool_elements.keys()), n_rows, replace=True, p=list(bool_elements.values())))

In [195]:
t

[False, True, True, False, True, True, True, True, False, False]

In [216]:
mask = list(np.random.random(size=len(t)) > 0.75)

In [217]:
[(x, m) for (x, m) in zip(t, mask)]

[(False, False),
 (True, False),
 (True, True),
 (False, False),
 (True, False),
 (True, True),
 (True, True),
 (True, False),
 (False, False),
 (False, False)]

In [221]:
[(x*m or (np.nan)*~m) for (x, m) in zip(t, mask)]

[nan, nan, True, nan, nan, True, True, nan, nan, nan]

In [219]:
[~m for (x, m) in zip(t, mask)]

[True, True, False, True, True, False, False, True, True, True]

In [193]:
t

array([ True,  True,  True,  True, False, False, False,  True,  True,
        True])

In [178]:
t[[2, 3]]

array([ True,  True])

In [None]:
np.no

In [156]:
list(bool_elements.values())

[0.3, 0.2, 0.2]

In [148]:
# Generate test dataframe with missing values

# Initialise parameters
n_rows = 8
nominal_categories = {'stanage': 0.3, 'burbage': 0.2, 'almscliff': 0.2, 'froggatt': 0.15, 'blacknor':0.15}

# Generate dataframe
np.random.seed(0)
test = pd.DataFrame({
    'numeric_1': np.random.choice(100, n_rows), 
    'numeric_2': np.random.choice(10, n_rows, replace=False), 
    'numeric_3': np.random.choice(22, n_rows, replace=False), 
    'numeric_4': np.random.choice(5, n_rows), 
    'nominal': np.random.choice(list(nominal_categories.keys()), n_rows, replace=True, p=list(nominal_categories.values())), 
})
np.random.seed(0)
test = test.mask((np.random.random(size=test.shape) > 0.75))
test['bool'] = [True, True, np.nan, True, False, np.nan, False, True]

# Non-consecutive index for extra test (simulate result after sklearn test-test split)
np.random.seed(0)
test.index = np.random.choice(100, test.shape[0], replace=False)

In [149]:
test

Unnamed: 0,numeric_1,numeric_2,numeric_3,numeric_4,nominal,bool
26,44.0,3.0,19.0,3.0,burbage,True
86,47.0,1.0,,,stanage,True
2,,9.0,16.0,,froggatt,
55,67.0,8.0,,,,True
75,,,1.0,,stanage,False
93,9.0,0.0,,0.0,burbage,
16,83.0,,10.0,2.0,stanage,False
73,21.0,7.0,2.0,,stanage,True


In [116]:
# Generate test dataframe with missing values
np.random.seed(1)
test = pd.DataFrame({
    'numeric_1': np.random.choice(100, 5), 
    'numeric_2': np.random.choice(10, 5, replace=False), 
    'numeric_3': np.random.choice(22, 5, replace=False), 
    'numeric_4': np.random.choice(5, 5), 
    'nominal': ['curbar', 'almscliff', 'stanage', 'stanage', 'wen_zawn'], 
})

np.random.seed(1)
test = test.mask((np.random.random(size=test.shape) > 0.5))

test['bool'] = [np.nan, True, np.nan, True, np.nan]

# Non-consecutive index for extra test (simulate result after sklearn test-test split)
np.random.seed(1)
test.index = np.random.choice(100, 5, replace=False)

In [117]:
test

Unnamed: 0,numeric_1,numeric_2,numeric_3,numeric_4,nominal,bool
80,37.0,,15.0,1.0,curbar,
84,12.0,7.0,8.0,1.0,,True
33,72.0,,16.0,,stanage,
81,,2.0,,4.0,stanage,True
93,,,19.0,,,
