In [51]:
import numpy as np
import pandas as pd

def _generate_X_train_1():

    # Generate X_train dataframe with missing values

    # Initialise parameters
    n_rows = 10
    nominal_categories = {'stanage': 0.3, 'burbage': 0.2, 'almscliff': 0.2, 'froggatt': 0.15, 'blacknor':0.15}

    # Generate dataframe
    np.random.seed(0)
    X_train = pd.DataFrame({
        'numeric_1': np.random.choice(100, n_rows), 
        'numeric_2': np.random.choice(10, n_rows, replace=False), 
        'numeric_3': np.random.choice(22, n_rows, replace=False), 
        'numeric_4': np.random.choice(5, n_rows), 
        'nominal': np.random.choice(list(nominal_categories.keys()), n_rows, replace=True, p=list(nominal_categories.values())), 
    })
    np.random.seed(0)
    X_train = X_train.mask((np.random.random(size=X_train.shape) > 0.75))

    # Add boolean (this doesn't work before mask applied as this turns bool dtype to float if NaNs exist)
    bool_elements = {True: 0.6, False: 0.4}
    np.random.seed(0)
    bool_list = list(np.random.choice(list(bool_elements.keys()), n_rows, replace=True, p=list(bool_elements.values())))
    np.random.seed(0)
    mask = list(np.random.random(size=len(bool_list)) > 0.5)
    X_train['bool'] = [(x if m else np.nan) for (x, m) in zip(bool_list, mask)]

    # Add timestamp column
    timestamps = pd.date_range(start=pd.datetime(2019, 1, 6), end=pd.datetime(2020, 1, 20), periods=n_rows)
    timestamps = pd.Series(timestamps.values).sample(frac=1, random_state=0).values
    X_train['timestamp'] = timestamps
    
    # Non-consecutive index for extra test (simulate result after sklearn X_train-test split)
    np.random.seed(0)
    X_train.index = np.random.choice(100, X_train.shape[0], replace=False)
    
    return X_train

# Run the generator once; the returned frame is discarded here because this
# call is not the final expression of the cell.
_generate_X_train_1()

def _generate_X_test_1():

    # Generate X_test dataframe with missing values

    # Initialise parameters
    n_rows = 10
    nominal_categories = {'stanage': 0.2, 'burbage': 0.2, 'almscliff': 0.2, 'wen_zawn': 0.2, 'hoy':0.2}

    # Generate dataframe
    np.random.seed(1)
    X_test = pd.DataFrame({
        'numeric_1': np.random.choice(100, n_rows), 
        'numeric_2': np.random.choice(10, n_rows, replace=False), 
        'numeric_3': np.random.choice(22, n_rows, replace=False), 
        'numeric_4': np.random.choice(5, n_rows), 
        'nominal': np.random.choice(list(nominal_categories.keys()), n_rows, replace=True, p=list(nominal_categories.values())), 
    })
    np.random.seed(1)
    X_test = X_test.mask((np.random.random(size=X_test.shape) > 0.6))

    # Add boolean (this doesn't work before mask applied as this turns bool dtype to float if NaNs exist)
    bool_elements = {True: 0.6, False: 0.4}
    np.random.seed(1)
    bool_list = list(np.random.choice(list(bool_elements.keys()), n_rows, replace=True, p=list(bool_elements.values())))
    np.random.seed(1)
    mask = list(np.random.random(size=len(bool_list)) > 0.25)
    X_test['bool'] = [(x if m else np.nan) for (x, m) in zip(bool_list, mask)]

    # Add timestamp column
    timestamps = pd.date_range(start=pd.datetime(2015, 1, 6), end=pd.datetime(2020, 1, 30), periods=n_rows)
    timestamps = pd.Series(timestamps.values).sample(frac=1, random_state=1).values
    X_test['timestamp'] = timestamps
    
    # Non-consecutive index for extra test (simulate result after sklearn X_test-test split)
    np.random.seed(1)
    X_test.index = np.random.choice(100, X_test.shape[0], replace=False)
    
    return X_test

# Run the generator once; the returned frame is discarded here because this
# call is not the final expression of the cell.
_generate_X_test_1()

def test_data_1():
    """Return the generated train/test frames keyed by 'X_train' and 'X_test'."""
    train_frame = _generate_X_train_1()
    test_frame = _generate_X_test_1()
    return dict(X_train=train_frame, X_test=test_frame)

In [52]:
def test_data_1():
    """Return the generated train/test frames keyed by 'X_train' and 'X_test'.

    Note: this re-declares the function of the same name from the previous
    cell; this later definition is the one in effect afterwards.
    """
    train_frame = _generate_X_train_1()
    test_frame = _generate_X_test_1()
    return dict(X_train=train_frame, X_test=test_frame)

In [53]:
# Build both fixture frames once; `d` maps 'X_train' / 'X_test' to DataFrames.
d = test_data_1()

In [54]:
# Display the generated training frame.
d['X_train']

Unnamed: 0,numeric_1,numeric_2,numeric_3,numeric_4,nominal,bool,timestamp
26,44.0,9.0,4.0,0.0,burbage,True,2019-03-31 05:20:00
86,47.0,7.0,,,stanage,False,2019-12-08 21:20:00
2,,3.0,7.0,,stanage,False,2019-06-23 10:40:00
55,67.0,5.0,,,,True,2020-01-20 00:00:00
75,,,19.0,,blacknor,,2019-02-17 02:40:00
93,9.0,2.0,,2.0,burbage,False,2019-09-15 16:00:00
16,83.0,,10.0,4.0,almscliff,,2019-10-27 18:40:00
73,21.0,0.0,14.0,,blacknor,False,2019-05-12 08:00:00
54,36.0,8.0,11.0,0.0,stanage,False,2019-01-06 00:00:00
95,87.0,6.0,1.0,0.0,blacknor,,2019-08-04 13:20:00


In [55]:
# Display the generated test frame.
d['X_test']

Unnamed: 0,numeric_1,numeric_2,numeric_3,numeric_4,nominal,bool,timestamp
80,37.0,,3.0,2.0,stanage,True,2016-02-21 02:40:00
84,12.0,5.0,2.0,1.0,wen_zawn,False,2020-01-30 00:00:00
33,72.0,,5.0,,almscliff,,2018-05-23 08:00:00
81,,0.0,10.0,3.0,stanage,True,2017-04-07 05:20:00
93,,,11.0,,,,2015-01-06 00:00:00
17,,4.0,8.0,1.0,,,2016-09-13 16:00:00
36,79.0,2.0,,3.0,,,2015-07-30 13:20:00
82,64.0,,,4.0,,True,2018-12-14 21:20:00
69,,,1.0,,burbage,True,2019-07-08 10:40:00
65,1.0,,12.0,1.0,stanage,True,2017-10-29 18:40:00
