In [1]:
# Install (or upgrade to) the latest gretel-synthetics release into the runtime.
# NOTE(review): the version is unpinned; the recorded output below resolved 0.17.0 —
# consider pinning that version so re-runs are reproducible.
!pip install -U gretel-synthetics

Collecting gretel-synthetics
  Downloading gretel_synthetics-0.17.0-py3-none-any.whl (63 kB)
[?25l[K     |█████▏                          | 10 kB 22.6 MB/s eta 0:00:01[K     |██████████▍                     | 20 kB 20.9 MB/s eta 0:00:01[K     |███████████████▌                | 30 kB 10.5 MB/s eta 0:00:01[K     |████████████████████▊           | 40 kB 8.4 MB/s eta 0:00:01[K     |██████████████████████████      | 51 kB 5.4 MB/s eta 0:00:01[K     |███████████████████████████████ | 61 kB 5.5 MB/s eta 0:00:01[K     |████████████████████████████████| 63 kB 1.4 MB/s 
Collecting loky==2.8.0
  Downloading loky-2.8.0-py2.py3-none-any.whl (65 kB)
[K     |████████████████████████████████| 65 kB 4.2 MB/s 
[?25hCollecting sentencepiece==0.1.91
  Downloading sentencepiece-0.1.91-cp37-cp37m-manylinux1_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 35.9 MB/s 
[?25hCollecting tensorflow-privacy==0.5.1
  Downloading tensorflow_privacy-0.5.1-py3-none-any.whl (149 k

In [2]:
import pandas as pd
from gretel_synthetics.batch import DataFrameBatch

# Load the USAdultIncome14K CSV from its public S3 URL into a DataFrame.
# The same URL is passed as `input_data_path` in the training config further down.
source_df = pd.read_csv("https://gretel-public-website.s3-us-west-2.amazonaws.com/tests/synthetics/data/USAdultIncome14K.csv")

In [3]:
# Sanity check: display the (rows, columns) of the loaded frame.
source_df.shape


(14000, 15)

In [4]:

from pathlib import Path

from gretel_synthetics.config import TensorFlowConfig
from gretel_synthetics.tokenizers import CharTokenizerTrainer
from gretel_synthetics.train import train

# NOTE(review): `checkpoint_dir` is NOT passed to the config below (which uses
# Path.cwd() / 'checkpoints'); it is only consumed later by
# DataFrameBatch(mode="read", checkpoint_dir=checkpoint_dir).
checkpoint_dir = str(Path.cwd() / "test-model-2")

# Configuration for a single (non-batched) model trained with dp=True; the
# training log reports the optimizer running "in differentially private mode".
config = TensorFlowConfig(
    gen_lines=1000,
    max_lines=1e5,  # NOTE(review): float literal; presumably an integer line cap — confirm
    dp=True,
    predict_batch_size=1,
    rnn_units=256,
    batch_size=16,
    epochs=3,
    learning_rate=0.0015,
    dp_noise_multiplier=0.2,
    dp_l2_norm_clip=1.0,
    dropout_rate=0.5,
    dp_microbatches=1,
    reset_states=False,
    overwrite=True,
    # Model artifacts go here; get_privacy_guarantees() later reads
    # model_history.csv from this same directory via config.checkpoint_dir.
    checkpoint_dir=(Path.cwd() / 'checkpoints').as_posix(),
    input_data_path='https://gretel-public-website.s3-us-west-2.amazonaws.com/tests/synthetics/data/USAdultIncome14K.csv'
)

# Initialize the character-level tokenizer trainer from the same config
tokenizer = CharTokenizerTrainer(config=config)

# Train the model. Slow: in the recorded run the log timestamps span roughly
# 11:35 -> 13:10 for the 3 epochs.
train(config, tokenizer)

2022-03-31 11:35:00,112 : MainThread : INFO : Loading input data from https://gretel-public-website.s3-us-west-2.amazonaws.com/tests/synthetics/data/USAdultIncome14K.csv
2022-03-31 11:35:03,558 : MainThread : INFO : Tokenizing input data
100%|██████████| 14001/14001 [00:00<00:00, 37597.00it/s]
2022-03-31 11:35:03,950 : MainThread : INFO : Shuffling input data
2022-03-31 11:35:07,924 : MainThread : INFO : Creating validation dataset
2022-03-31 11:35:07,997 : MainThread : INFO : Creating training dataset
2022-03-31 11:35:08,045 : MainThread : INFO : Initializing synthetic model
2022-03-31 11:35:08,979 : MainThread : INFO : Using keras.optimizers.RMSprop optimizer in differentially private mode


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (16, None, 256)           17152     
                                                                 
 dropout (Dropout)           (16, None, 256)           0         
                                                                 
 lstm (LSTM)                 (16, None, 256)           525312    
                                                                 
 dropout_1 (Dropout)         (16, None, 256)           0         
                                                                 
 lstm_1 (LSTM)               (16, None, 256)           525312    
                                                                 
 dropout_2 (Dropout)         (16, None, 256)           0         
                                                                 
 dense (Dense)               (16, None, 67)            1







Epoch 2/3
Epoch 3/3


2022-03-31 13:10:37,533 : MainThread : INFO : Saving model history to model_history.csv
2022-03-31 13:10:37,543 : MainThread : INFO : Saving model to /content/checkpoints/synthetic


In [5]:
from collections import Counter
import datetime
import pandas as pd
import json

from gretel_synthetics.generate import generate_text


# extract training params
def get_privacy_guarantees():
    """Return the epsilon/delta recorded for the best row of the training history.

    Reads ``model_history.csv`` from ``config.checkpoint_dir`` (``config`` is the
    notebook-level training config) and takes the first row whose ``best``
    column equals 1.

    Returns:
        dict: ``{"epsilon": float, "delta": float}``. Values are built-in
        floats (not numpy scalars) so the result can be passed to
        ``json.dumps`` regardless of the dtype pandas inferred for the columns.

    Raises:
        IndexError: if no row in the history file has ``best == 1``.
        KeyError: if the ``best``, ``epsilon`` or ``delta`` column is missing.
    """
    history_path = f"{config.checkpoint_dir}/model_history.csv"
    history = pd.read_csv(history_path)

    # Filter once and reuse the selection for both values.
    best_rows = history.loc[history["best"] == 1, ["epsilon", "delta"]]
    if best_rows.empty:
        # Same exception type the bare `.values[0]` lookup produced, but with
        # a message that says what is actually wrong.
        raise IndexError(f"no row with best == 1 in {history_path}")

    best = best_rows.iloc[0]
    # Cast to built-in float: an integer-valued column would otherwise yield
    # numpy.int64, which json.dumps cannot serialize.
    return {
        "epsilon": float(best["epsilon"]),
        "delta": float(best["delta"]),
    }

# Build a validator
def validate_record(line):
    """Check that a comma-delimited line is a well-formed 4-field record.

    The line is split on ``","`` and must yield exactly four fields: the
    fourth must parse as a ``%Y-%m-%d`` date and the first three as integers.

    NOTE(review): no visible cell passes this validator to a generator, and the
    4-field layout does not match the 15-column ``source_df`` loaded above —
    confirm whether it is still needed.

    Args:
        line (str): one comma-delimited record.

    Returns:
        None. Validity is signalled by the absence of an exception.

    Raises:
        ValueError: if the field count is not 4 (message ``'record not valid'``)
            or if a field fails ``int`` / ``strptime`` parsing.
    """
    fields = line.split(",")
    if len(fields) != 4:
        # ValueError is what int()/strptime raise for bad fields, and it is
        # still an Exception subclass, so `except Exception` callers keep working.
        raise ValueError('record not valid')

    datetime.datetime.strptime(fields[3], '%Y-%m-%d')
    int(fields[2])
    int(fields[1])
    int(fields[0])


# Print the differential privacy epsilon and delta values returned by
# get_privacy_guarantees() as indented JSON.
print(json.dumps(get_privacy_guarantees(), indent=2))


{
  "epsilon": 72.6647,
  "delta": 0.0
}


In [None]:
# Per the log output, this generates a training DataFrame and CSV for each batch.
# NOTE(review): `batcher` is not created in any cell visible in this notebook —
# presumably a DataFrameBatch built from source_df in a removed cell; it must be
# defined before this cell for Restart & Run All to work.
batcher.create_training_data()

2022-03-29 11:38:48,688 : MainThread : INFO : Generating training DF and CSV for batch 0
2022-03-29 11:38:48,732 : MainThread : INFO : Generating training DF and CSV for batch 1
2022-03-29 11:38:48,774 : MainThread : INFO : Generating training DF and CSV for batch 2


In [None]:
# Train every batch in turn; the log shows a SentencePiece tokenizer and a model
# being built per batch directory. Slow: the recorded run's timestamps span
# roughly 11:38 -> 12:30 for the three batches.
batcher.train_all_batches()

2022-03-29 11:38:51,382 : MainThread : INFO : Loading SentencePieceTokenizerTrainer
2022-03-29 11:38:51,384 : MainThread : INFO : Loading input data from /content/test-model-2/batch_0/train.csv
2022-03-29 11:38:51,426 : MainThread : INFO : Training SentencePiece tokenizer
2022-03-29 11:38:51,946 : MainThread : INFO : Loading tokenizer from: m.model
2022-03-29 11:38:51,959 : MainThread : INFO : Tokenizer model vocabulary size: 5536 tokens
2022-03-29 11:38:51,964 : MainThread : INFO : Mapping first line of training data

'30<d>?<d>157289<d>11th<d>7<n>'
 ---- sample tokens mapped to pieces ---- > 
▁3, 0, <d>, ?, <d>, 15728, 9, <d>, 11, th, <d>, 7, <n>

2022-03-29 11:38:51,965 : MainThread : INFO : Mapping first line of training data

'30<d>?<d>157289<d>11th<d>7<n>'
 ---- sample tokens mapped to int ---- > 
25, 47, 4, 55, 4, 1140, 13, 4, 50, 41, 4, 34, 3

2022-03-29 11:38:52,074 : MainThread : INFO : Tokenizing input data
100%|██████████| 14000/14000 [00:00<00:00, 87544.83it/s]
2022-03-29 

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (64, None, 256)           1417216   
                                                                 
 dropout (Dropout)           (64, None, 256)           0         
                                                                 
 lstm (LSTM)                 (64, None, 256)           525312    
                                                                 
 dropout_1 (Dropout)         (64, None, 256)           0         
                                                                 
 lstm_1 (LSTM)               (64, None, 256)           525312    
                                                                 
 dropout_2 (Dropout)         (64, None, 256)           0         
                                                                 
 dense (Dense)               (64, None, 5536)          1

2022-03-29 12:00:13,633 : MainThread : INFO : Saving model history to model_history.csv
2022-03-29 12:00:13,641 : MainThread : INFO : Saving model to /content/test-model-2/batch_0/synthetic
2022-03-29 12:00:13,645 : MainThread : INFO : Loading SentencePieceTokenizerTrainer
2022-03-29 12:00:13,650 : MainThread : INFO : Loading input data from /content/test-model-2/batch_1/train.csv
2022-03-29 12:00:13,684 : MainThread : INFO : Training SentencePiece tokenizer
2022-03-29 12:00:14,062 : MainThread : INFO : Loading tokenizer from: m.model
2022-03-29 12:00:14,067 : MainThread : INFO : Tokenizer model vocabulary size: 118 tokens
2022-03-29 12:00:14,070 : MainThread : INFO : Mapping first line of training data

'Never-married<d>?<d>Unmarried<d>White<d>Male<n>'
 ---- sample tokens mapped to pieces ---- > 
▁, N, e, ve, r, -, m, ar, ri, ed, <d>, ?, <d>, U, n, m, a, r, r, i, e, d, <d>, W, hi, t, e, <d>, M, ale, <n>

2022-03-29 12:00:14,072 : MainThread : INFO : Mapping first line of training data

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (64, None, 256)           30208     
                                                                 
 dropout_3 (Dropout)         (64, None, 256)           0         
                                                                 
 lstm_2 (LSTM)               (64, None, 256)           525312    
                                                                 
 dropout_4 (Dropout)         (64, None, 256)           0         
                                                                 
 lstm_3 (LSTM)               (64, None, 256)           525312    
                                                                 
 dropout_5 (Dropout)         (64, None, 256)           0         
                                                                 
 dense_1 (Dense)             (64, None, 118)          

2022-03-29 12:19:29,483 : MainThread : INFO : Saving model history to model_history.csv
2022-03-29 12:19:29,489 : MainThread : INFO : Saving model to /content/test-model-2/batch_1/synthetic
2022-03-29 12:19:29,494 : MainThread : INFO : Loading SentencePieceTokenizerTrainer
2022-03-29 12:19:29,495 : MainThread : INFO : Loading input data from /content/test-model-2/batch_2/train.csv
2022-03-29 12:19:29,534 : MainThread : INFO : Training SentencePiece tokenizer
2022-03-29 12:19:29,771 : MainThread : INFO : Loading tokenizer from: m.model
2022-03-29 12:19:29,777 : MainThread : INFO : Tokenizer model vocabulary size: 265 tokens
2022-03-29 12:19:29,779 : MainThread : INFO : Mapping first line of training data

'0<d>0<d>40<d>United-States<d><=50K<n>'
 ---- sample tokens mapped to pieces ---- > 
▁, 0, <d>, 0, <d>, 4, 0, <d>, U, ni, te, d, -, S, t, a, te, s, <d>, <, =, 50, K, <n>

2022-03-29 12:19:29,782 : MainThread : INFO : Mapping first line of training data

'0<d>0<d>40<d>United-States<d><=

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (64, None, 256)           67840     
                                                                 
 dropout_6 (Dropout)         (64, None, 256)           0         
                                                                 
 lstm_4 (LSTM)               (64, None, 256)           525312    
                                                                 
 dropout_7 (Dropout)         (64, None, 256)           0         
                                                                 
 lstm_5 (LSTM)               (64, None, 256)           525312    
                                                                 
 dropout_8 (Dropout)         (64, None, 256)           0         
                                                                 
 dense_2 (Dense)             (64, None, 265)          

2022-03-29 12:30:41,590 : MainThread : INFO : Saving model history to model_history.csv
2022-03-29 12:30:41,594 : MainThread : INFO : Saving model to /content/test-model-2/batch_2/synthetic


In [None]:

# Request 2000 generated lines from every batch. `status` is displayed in a later
# cell as a dict mapping batch index -> GenerationSummary.
status = batcher.generate_all_batch_lines(num_lines=2000)

Valid record count :   0%|          | 0/2000 [00:00<?, ?it/s]

Invalid record count :   0%|          | 0/1000 [00:00<?, ?it/s]

2022-03-29 12:37:47,431 : MainThread : INFO : Loading tokenizer from: m.model
2022-03-29 12:37:47,470 : MainThread : INFO : Tokenizer model vocabulary size: 5536 tokens
2022-03-29 12:37:47,477 : MainThread : INFO : Mapping first line of training data

'30<d>?<d>157289<d>11th<d>7<n>'
 ---- sample tokens mapped to pieces ---- > 
▁, 3, 0, <d>, ?, <d>, 1, 5, 7, 2, 89, <d>, 11, t, h, <d>, 7, <n>

2022-03-29 12:37:47,485 : MainThread : INFO : Mapping first line of training data

'30<d>?<d>157289<d>11th<d>7<n>'
 ---- sample tokens mapped to int ---- > 
25, 47, 4, 55, 4, 1140, 13, 4, 50, 41, 4, 34, 3



Valid record count :   0%|          | 0/2000 [00:00<?, ?it/s]

Invalid record count :   0%|          | 0/1000 [00:00<?, ?it/s]

2022-03-29 12:38:18,229 : MainThread : INFO : Loading tokenizer from: m.model
2022-03-29 12:38:18,236 : MainThread : INFO : Tokenizer model vocabulary size: 118 tokens
2022-03-29 12:38:18,239 : MainThread : INFO : Mapping first line of training data

'Never-married<d>?<d>Unmarried<d>White<d>Male<n>'
 ---- sample tokens mapped to pieces ---- > 
▁, N, e, v, er, -, m, a, rried, <d>, ?, <d>, U, n, m, arried, <d>, W, h, i, te, <d>, Ma, le, <n>

2022-03-29 12:38:18,242 : MainThread : INFO : Mapping first line of training data

'Never-married<d>?<d>Unmarried<d>White<d>Male<n>'
 ---- sample tokens mapped to int ---- > 
7, 14, 5, 54, 13, 6, 12, 33, 4, 80, 4, 69, 29, 12, 33, 4, 9, 8, 10, 4, 11, 17, 3



Valid record count :   0%|          | 0/2000 [00:00<?, ?it/s]

Invalid record count :   0%|          | 0/1000 [00:00<?, ?it/s]

2022-03-29 12:38:50,553 : MainThread : INFO : Loading tokenizer from: m.model
2022-03-29 12:38:50,561 : MainThread : INFO : Tokenizer model vocabulary size: 265 tokens
2022-03-29 12:38:50,564 : MainThread : INFO : Mapping first line of training data

'0<d>0<d>40<d>United-States<d><=50K<n>'
 ---- sample tokens mapped to pieces ---- > 
▁, 0, <d>, 0, <, d, >, 40, <d>, U, n, i, te, d, -, S, ta, t, e, s, <, d, >, <, =, 50, K, <n>

2022-03-29 12:38:50,566 : MainThread : INFO : Mapping first line of training data

'0<d>0<d>40<d>United-States<d><=50K<n>'
 ---- sample tokens mapped to int ---- > 
9, 5, 4, 5, 4, 19, 4, 14, 15, 6, 13, 10, 12, 18, 6, 11, 4, 16, 17, 7, 8, 3



In [None]:
# Inspect the raw generated CSV text buffered for batch 2.
# NOTE(review): this displays the entire buffer; consider slicing the string to
# keep the notebook output small.
batcher.batches[2].gen_data_stream.getvalue()

'capital_gain,capital_loss,hours_per_week,native_country,income_bracket\n0,0,40,Iran,<=50K\n0,0,40,England,>50K\n0,0,40,Guatemala,<=50K\n0,0,20,Mexico,<=50K\n0,0,50,Germany,<=50K\n0,0,40,United-States,>50K\n0,0,40,United-States,>50K\n0,0,40,United-States,>50K\n0,0,5,United-States,>50K\n0,0,40,United-States,>50K\n0,0,33,United-States,>50K\n0,0,60,United-States,>50K\n0,0,50,United-States,>50K\n0,0,20,United-States,>50K\n0,0,40,United-States,>50K\n0,0,50,United-States,<=50K\n0,0,43,United-States,<=50K\n5013,0,16,United-States,>50K\n0,0,20,United-States,<=50K\n0,0,60,United-States,<=50K\n0,0,55,United-States,<=50K\n0,0,4,United-States,<=50K\n0,0,30,United-States,<=50K\n0,0,60,United-States,<=50K\n0,0,35,United-States,<=50K\n0,0,40,United-States,<=50K\n0,0,60,United-States,<=50K\n0,0,40,United-States,<=50K\n0,0,40,United-States,<=50K\n0,0,46,United-States,<=50K\n0,1590,60,United-States,>50K\n0,0,35,United-States,<=50K\n0,0,50,United-States,<=50K\n0,0,12,United-States,<=50K\n0,1887,40,United

In [None]:
# Show the per-batch generation summaries. In the recorded run batch 0 is marked
# is_valid=False (724 valid lines against 1000 invalid).
status

{0: GenerationSummary(valid_lines=724, invalid_lines=1000, is_valid=False),
 1: GenerationSummary(valid_lines=2000, invalid_lines=21, is_valid=True),
 2: GenerationSummary(valid_lines=2000, invalid_lines=23, is_valid=True)}

In [None]:
# Display the generated records of batch 0 as a DataFrame. The recorded output
# contains implausible `age` values (e.g. 314, 47314035), so inspect before use.
batcher.batch_to_df(0)

Unnamed: 0,age,workclass,fnlwgt,education,education_num
0,19,?,2348,11th,7
1,17,Private,207675,12th,8
2,17,?,193438,HS-grad,9
3,314,Private,105047,11th,7
4,19,Private,20660,HS-grad,9
...,...,...,...,...,...
719,47314035,State-gov,67006,Bachelors,13
720,61817,Private,244605,Some-college,10
721,424401,State-gov,345195,Assoc-acdm,12
722,119,Self-emp-not-inc,182555,HS-grad,9


In [None]:
# Display the generated columns of all batches combined into one DataFrame. In the
# recorded output the trailing rows have empty values in batch 0's columns
# (age ... education_num), since batch 0 produced fewer valid lines.
batcher.batches_to_df()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,19,?,2348,11th,7.0,Married-civ-spouse,Sales,Husband,White,Male,0,0,40,Iran,<=50K
1,17,Private,207675,12th,8.0,Divorced,Sales,Own-child,White,Male,0,0,40,England,>50K
2,17,?,193438,HS-grad,9.0,Married-civ-spouse,Sales,Husband,White,Male,0,0,40,Guatemala,<=50K
3,314,Private,105047,11th,7.0,Never-married,?,Unmarried,White,Female,0,0,20,Mexico,<=50K
4,19,Private,20660,HS-grad,9.0,Married-civ-spouse,Other-service,Husband,White,Male,0,0,50,Germany,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,,,,,,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,Philippines,<=50K
1996,,,,,,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,>50K
1997,,,,,,Never-married,Craft-repair,Unmarried,White,Male,0,0,20,United-States,>50K
1998,,,,,,Married-civ-spouse,Adm-clerical,Wife,Black,Female,0,0,40,United-States,>50K


In [None]:
# Re-open the batches stored under `checkpoint_dir` in read mode. Per the log,
# this finds the existing batches (3 in the recorded run) and runs a short
# generation test against each underlying model.
read_batch = DataFrameBatch(mode="read", checkpoint_dir=checkpoint_dir)

2022-03-29 12:43:07,294 : MainThread : INFO : Looking for and loading batch data...
2022-03-29 12:43:07,300 : MainThread : INFO : Found and loaded 3 batches
2022-03-29 12:43:07,303 : MainThread : INFO : Validating underlying models exist via generation test...


Valid record count :   0%|          | 0/1 [00:00<?, ?it/s]

Invalid record count :   0%|          | 0/1000 [00:00<?, ?it/s]

2022-03-29 12:43:07,419 : MainThread : INFO : Loading tokenizer from: m.model
2022-03-29 12:43:07,440 : MainThread : INFO : Tokenizer model vocabulary size: 5536 tokens
2022-03-29 12:43:07,444 : MainThread : INFO : Mapping first line of training data

'30<d>?<d>157289<d>11th<d>7<n>'
 ---- sample tokens mapped to pieces ---- > 
▁, 3, 0, <d>, ?, <d>, 15728, 9, <d>, 1, 1, th, <d>, 7, <n>

2022-03-29 12:43:07,446 : MainThread : INFO : Mapping first line of training data

'30<d>?<d>157289<d>11th<d>7<n>'
 ---- sample tokens mapped to int ---- > 
25, 47, 4, 55, 4, 1140, 13, 4, 50, 41, 4, 34, 3

2022-03-29 12:43:08,508 : MainThread : INFO : Using keras.optimizers.RMSprop optimizer


Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (64, None, 256)           1417216   
                                                                 
 dropout_9 (Dropout)         (64, None, 256)           0         
                                                                 
 lstm_6 (LSTM)               (64, None, 256)           525312    
                                                                 
 dropout_10 (Dropout)        (64, None, 256)           0         
                                                                 
 lstm_7 (LSTM)               (64, None, 256)           525312    
                                                                 
 dropout_11 (Dropout)        (64, None, 256)           0         
                                                                 
 dense_3 (Dense)             (64, None, 5536)         

Valid record count :   0%|          | 0/1 [00:00<?, ?it/s]

Invalid record count :   0%|          | 0/1000 [00:00<?, ?it/s]

2022-03-29 12:43:11,453 : MainThread : INFO : Loading tokenizer from: m.model
2022-03-29 12:43:11,458 : MainThread : INFO : Tokenizer model vocabulary size: 118 tokens
2022-03-29 12:43:11,462 : MainThread : INFO : Mapping first line of training data

'Never-married<d>?<d>Unmarried<d>White<d>Male<n>'
 ---- sample tokens mapped to pieces ---- > 
▁, N, e, v, er, -, m, a, rried, <d>, ?, <d>, U, n, m, arried, <d>, W, hi, t, e, <d>, M, ale, <n>

2022-03-29 12:43:11,465 : MainThread : INFO : Mapping first line of training data

'Never-married<d>?<d>Unmarried<d>White<d>Male<n>'
 ---- sample tokens mapped to int ---- > 
7, 14, 5, 54, 13, 6, 12, 33, 4, 80, 4, 69, 29, 12, 33, 4, 9, 8, 10, 4, 11, 17, 3

2022-03-29 12:43:12,049 : MainThread : INFO : Using keras.optimizers.RMSprop optimizer


Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (64, None, 256)           30208     
                                                                 
 dropout_12 (Dropout)        (64, None, 256)           0         
                                                                 
 lstm_8 (LSTM)               (64, None, 256)           525312    
                                                                 
 dropout_13 (Dropout)        (64, None, 256)           0         
                                                                 
 lstm_9 (LSTM)               (64, None, 256)           525312    
                                                                 
 dropout_14 (Dropout)        (64, None, 256)           0         
                                                                 
 dense_4 (Dense)             (64, None, 118)          

Valid record count :   0%|          | 0/1 [00:00<?, ?it/s]

Invalid record count :   0%|          | 0/1000 [00:00<?, ?it/s]

2022-03-29 12:43:14,820 : MainThread : INFO : Loading tokenizer from: m.model
2022-03-29 12:43:14,828 : MainThread : INFO : Tokenizer model vocabulary size: 265 tokens
2022-03-29 12:43:14,830 : MainThread : INFO : Mapping first line of training data

'0<d>0<d>40<d>United-States<d><=50K<n>'
 ---- sample tokens mapped to pieces ---- > 
▁, 0, <d>, 0, <d>, 4, 0, <, d, >, U, ni, te, d, -, S, t, a, te, s, <, d, >, <, =, 5, 0, K, <n>

2022-03-29 12:43:14,833 : MainThread : INFO : Mapping first line of training data

'0<d>0<d>40<d>United-States<d><=50K<n>'
 ---- sample tokens mapped to int ---- > 
9, 5, 4, 5, 4, 19, 4, 14, 15, 6, 13, 10, 12, 18, 6, 11, 4, 16, 17, 7, 8, 3

2022-03-29 12:43:15,471 : MainThread : INFO : Using keras.optimizers.RMSprop optimizer


Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (64, None, 256)           67840     
                                                                 
 dropout_15 (Dropout)        (64, None, 256)           0         
                                                                 
 lstm_10 (LSTM)              (64, None, 256)           525312    
                                                                 
 dropout_16 (Dropout)        (64, None, 256)           0         
                                                                 
 lstm_11 (LSTM)              (64, None, 256)           525312    
                                                                 
 dropout_17 (Dropout)        (64, None, 256)           0         
                                                                 
 dense_5 (Dense)             (64, None, 265)          

In [None]:
# Request 5 generated lines from each reloaded batch; the cell output is the
# dict of per-batch GenerationSummary results.
read_batch.generate_all_batch_lines(num_lines=5)

Valid record count :   0%|          | 0/5 [00:00<?, ?it/s]

Invalid record count :   0%|          | 0/1000 [00:00<?, ?it/s]

2022-03-29 12:45:19,648 : MainThread : INFO : Loading tokenizer from: m.model
2022-03-29 12:45:19,678 : MainThread : INFO : Tokenizer model vocabulary size: 5536 tokens
2022-03-29 12:45:19,680 : MainThread : INFO : Mapping first line of training data

'30<d>?<d>157289<d>11th<d>7<n>'
 ---- sample tokens mapped to pieces ---- > 
▁, 30, <d>, ?, <d>, 1, 572, 89, <d>, 11, th, <d>, 7, <n>

2022-03-29 12:45:19,682 : MainThread : INFO : Mapping first line of training data

'30<d>?<d>157289<d>11th<d>7<n>'
 ---- sample tokens mapped to int ---- > 
25, 47, 4, 55, 4, 1140, 13, 4, 50, 41, 4, 34, 3

2022-03-29 12:45:20,937 : MainThread : INFO : Using keras.optimizers.RMSprop optimizer


Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (64, None, 256)           1417216   
                                                                 
 dropout_18 (Dropout)        (64, None, 256)           0         
                                                                 
 lstm_12 (LSTM)              (64, None, 256)           525312    
                                                                 
 dropout_19 (Dropout)        (64, None, 256)           0         
                                                                 
 lstm_13 (LSTM)              (64, None, 256)           525312    
                                                                 
 dropout_20 (Dropout)        (64, None, 256)           0         
                                                                 
 dense_6 (Dense)             (64, None, 5536)         

Valid record count :   0%|          | 0/5 [00:00<?, ?it/s]

Invalid record count :   0%|          | 0/1000 [00:00<?, ?it/s]

2022-03-29 12:45:24,498 : MainThread : INFO : Loading tokenizer from: m.model
2022-03-29 12:45:24,506 : MainThread : INFO : Tokenizer model vocabulary size: 118 tokens
2022-03-29 12:45:24,513 : MainThread : INFO : Mapping first line of training data

'Never-married<d>?<d>Unmarried<d>White<d>Male<n>'
 ---- sample tokens mapped to pieces ---- > 
▁, N, e, v, er, -, m, a, rried, <d>, ?, <d>, U, n, m, ar, ri, ed, <d>, W, hi, t, e, <d>, Ma, le, <n>

2022-03-29 12:45:24,517 : MainThread : INFO : Mapping first line of training data

'Never-married<d>?<d>Unmarried<d>White<d>Male<n>'
 ---- sample tokens mapped to int ---- > 
7, 14, 5, 54, 13, 6, 12, 33, 4, 80, 4, 69, 29, 12, 33, 4, 9, 8, 10, 4, 11, 17, 3

2022-03-29 12:45:25,115 : MainThread : INFO : Using keras.optimizers.RMSprop optimizer


Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (64, None, 256)           30208     
                                                                 
 dropout_21 (Dropout)        (64, None, 256)           0         
                                                                 
 lstm_14 (LSTM)              (64, None, 256)           525312    
                                                                 
 dropout_22 (Dropout)        (64, None, 256)           0         
                                                                 
 lstm_15 (LSTM)              (64, None, 256)           525312    
                                                                 
 dropout_23 (Dropout)        (64, None, 256)           0         
                                                                 
 dense_7 (Dense)             (64, None, 118)          

Valid record count :   0%|          | 0/5 [00:00<?, ?it/s]

Invalid record count :   0%|          | 0/1000 [00:00<?, ?it/s]

2022-03-29 12:45:27,417 : MainThread : INFO : Loading tokenizer from: m.model
2022-03-29 12:45:27,422 : MainThread : INFO : Tokenizer model vocabulary size: 265 tokens
2022-03-29 12:45:27,424 : MainThread : INFO : Mapping first line of training data

'0<d>0<d>40<d>United-States<d><=50K<n>'
 ---- sample tokens mapped to pieces ---- > 
▁, 0, <d>, 0, <d>, 40, <d>, U, ni, te, d, -, S, ta, te, s, <d>, <, =, 5, 0, K, <n>

2022-03-29 12:45:27,426 : MainThread : INFO : Mapping first line of training data

'0<d>0<d>40<d>United-States<d><=50K<n>'
 ---- sample tokens mapped to int ---- > 
9, 5, 4, 5, 4, 19, 4, 14, 15, 6, 13, 10, 12, 18, 6, 11, 4, 16, 17, 7, 8, 3

2022-03-29 12:45:28,052 : MainThread : INFO : Using keras.optimizers.RMSprop optimizer


Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (64, None, 256)           67840     
                                                                 
 dropout_24 (Dropout)        (64, None, 256)           0         
                                                                 
 lstm_16 (LSTM)              (64, None, 256)           525312    
                                                                 
 dropout_25 (Dropout)        (64, None, 256)           0         
                                                                 
 lstm_17 (LSTM)              (64, None, 256)           525312    
                                                                 
 dropout_26 (Dropout)        (64, None, 256)           0         
                                                                 
 dense_8 (Dense)             (64, None, 265)          

{0: GenerationSummary(valid_lines=5, invalid_lines=32, is_valid=True),
 1: GenerationSummary(valid_lines=5, invalid_lines=2, is_valid=True),
 2: GenerationSummary(valid_lines=5, invalid_lines=0, is_valid=True)}

In [None]:
# Display the lines generated by the reloaded batches as one combined DataFrame.
read_batch.batches_to_df()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,17,Private,158921,11th,7,Married-civ-spouse,?,Husband,Black,Male,0,0,40,Japan,>50K
1,19,Private,160671,11th,7,Married-civ-spouse,Sales,Husband,White,Male,0,0,60,Canada,<=50K
2,59172,?,163484,Bachelors,13,Never-married,Sales,Unmarried,White,Male,0,0,40,China,<=50K
3,19,Private,160703,Doctorate,16,Married-civ-spouse,Sales,Husband,White,Male,0,0,40,Ecuador,<=50K
4,19,Private,117505,7th-8th,4,Never-married,?,Own-child,White,Male,0,0,65,Mexico,<=50K
