## This is the tf dataset section

Contains snippets of code I might want to refer back to in the future. The topic is tf.data dataset operations and TFRecord functions for TensorFlow 2.

In [53]:
# Defining functions and classes used to load the dataset from its TFRecord file

def parse_example(example_proto, feature_description):
    '''
    Decodes one serialized tf.Example and rebuilds the per-ticker ragged document features.

    For every ticker in the module-level `tickers`, the flat 'docs_<ticker>/vals' and
    'docs_<ticker>/lens' entries are combined into one tf.RaggedTensor stored under
    'docs_<ticker>', and the two flat entries are removed from the result.

    :param example_proto: serialized tf.Example to decode
    :param feature_description: dict, passed to tf.io.parse_single_example to describe the stored features

    ---> dict, of parsed features with one ragged 'docs_<ticker>' entry per ticker
    '''
    parsed = tf.io.parse_single_example(example_proto, feature_description)

    for ticker in tickers:
        # pop() hands back the flat values / row lengths and drops the redundant key in one step
        flat_vals = parsed.pop('docs_{}/vals'.format(ticker))
        row_lens = parsed.pop('docs_{}/lens'.format(ticker))
        parsed['_'.join(('docs', ticker))] = tf.RaggedTensor.from_row_lengths(
            flat_vals.values, row_lengths=row_lens.values)

    return parsed

In [54]:
# Loading the Dataset

# Loading the raw dataset from the TFRecord file (elements are still undecoded records at this point)
dataset = tf.data.TFRecordDataset(os.path.join(path_to_data, 'dataset.tfrecord'))
# Loading the dataset's feature_description
# NOTE: pickle.load can execute arbitrary code, so only load a feature_description file written by this project
with open(os.path.join(path_to_data, 'dataset_feature_description.pickle'), 'rb') as f:
    feature_description = pickle.load(f)
# Decoding the raw dataset using the dataset's feature_description
dataset = dataset.map(lambda example_proto: parse_example(example_proto, feature_description))

2. Splitting the dataset by stock ticker

In [55]:
def split(example, features, ticker):
    '''
    Picks one ticker's features out of a combined example and drops the ticker suffix from their names.

    :param example: dict, keyed by '<feature_name>_<ticker>'
    :param features: iterable of strings, the feature names to keep
    :param ticker: string, ticker whose features are selected

    ---> dict, mapping each name in :param features: to example['<feature_name>_<ticker>']
    '''
    per_ticker = {}
    for feature_name in features:
        per_ticker[feature_name] = example['_'.join([feature_name, ticker])]
    return per_ticker

# One dataset per ticker, each keeping only that ticker's log returns and documents under un-suffixed names.
# NOTE(review): the lambda closes over the comprehension variable `t`; this presumably works because
# `dataset.map` evaluates the lambda while the comprehension is still on that `t` — confirm, or bind it with `t=t`.
datasets = [dataset.map(lambda ex: split(ex, ['log_adj_daily_returns', 'docs'], t)) for t in tickers]

3. Reshaping datasets

In [56]:
# Defining functions and classes used to reshape datasets

def make_window_dataset(ds, window_size, shift=1, stride=1):
    '''
    Turns a dataset of feature dicts into a dataset of windows over those features.

    :param ds: tf.data.Dataset, whose elements are dicts of features
    :param window_size: int, number of elements per window
    :param shift: int, forwarded to Dataset.window
    :param stride: int, forwarded to Dataset.window

    ---> tf.data.Dataset, of dicts with the same keys, each value batched over one window
    '''
    windowed = ds.window(window_size, shift=shift, stride=stride)

    per_feature = {}
    for key in windowed.element_spec.keys():
        # every window carries one nested dataset per feature; batching it collapses the window
        # into a single element, and drop_remainder=True discards windows shorter than window_size
        per_feature[key] = windowed.flat_map(
            lambda window: window[key].batch(window_size, drop_remainder=True))

    return tf.data.Dataset.zip(per_feature)

def extract_labels(timeslice, label_features):
    '''
    Splits a window of features into model inputs and labels.

    For every feature the last step of the window is cut off; for features named in
    :param label_features: that last step becomes the label.

    :param timeslice: dict, mapping feature names to sliceable windows (the input dict is not modified)
    :param label_features: string or collection of strings, name(s) of the features to use as labels

    ---> (dict, dict), the windows without their last step, and the last step of each label feature
    '''
    # A bare string would turn the membership test below into a substring test
    # (e.g. 'docs' in 'docs_WFC' is True), so a single name is wrapped in a list first.
    if isinstance(label_features, str):
        label_features = [label_features]

    features = {}
    labels = {}

    for feature_key, feature_timeslice in timeslice.items():
        if feature_key in label_features:
            labels[feature_key] = feature_timeslice[-1]
        features[feature_key] = feature_timeslice[:-1]

    return (features, labels)


def to_time_series(ds, label_features, window_size, steps_to_pred=1, num_of_preds=1):
    '''
    Converts a dataset of feature dicts into (features, labels) time-series elements.

    :param ds: tf.data.Dataset, of feature dicts
    :param label_features: name(s) of the features used as labels, forwarded to extract_labels
    :param window_size: int, number of timesteps kept as model input per element
    :param steps_to_pred: accepted but not used by this implementation
    :param num_of_preds: accepted but not used by this implementation

    ---> tf.data.Dataset, where each element is the pair returned by extract_labels
    '''
    # windows are one step longer than window_size: the extra final step supplies the labels
    windows_with_label_step = make_window_dataset(ds, window_size=window_size + 1)

    def _to_features_and_labels(window):
        return extract_labels(window, label_features)

    return windows_with_label_step.map(_to_features_and_labels)

def sample_documents(sample):
    '''
    Draws one document uniformly at random from the documents contained in :param sample:.

    :param sample: ragged tensor whose `.values` rows are treated as the pool of documents
                   (presumably a tf.RaggedTensor of documents per timestep; verify against caller)

    ---> the selected row of the pool, or an empty tf.int64 tensor when the pool has no rows
    '''
    doc_pool = sample.values
    pool_size = doc_pool.nrows()

    if pool_size != 0:
        # random row index drawn with maxval=pool_size
        row_index = tf.random.uniform([1], maxval=pool_size, dtype=tf.int64)[0]
        picked_doc = doc_pool[row_index]
    else:
        picked_doc = tf.constant([], dtype=tf.int64)

    return picked_doc

def select_doc(features, labels):
    '''
    Replaces every ragged (document) feature by one sampled document repeated for each timestep.

    :param features: dict, of feature windows; entries that are tf.RaggedTensor are overwritten in place
    :param labels: dict, of label values

    ---> tuple, the features dict followed by each label value unpacked as its own element
    '''
    for fname in list(features):
        current = features[fname]
        timesteps = current.shape[0]
        # only document features are ragged; every other feature is left as it is
        if isinstance(current, tf.RaggedTensor):
            chosen_doc = sample_documents(current)
            features[fname] = tf.stack([chosen_doc] * timesteps)

    return (features, *labels.values())

def filter_fn(f, l):
    '''
    Predicate for Dataset.filter: true when the second dimension of f['docs'] is non-zero.

    :param f: dict, of features containing a 'docs' entry
    :param l: label(s) of the element, not inspected
    '''
    docs_dims = tf.shape(f['docs'])
    return tf.math.not_equal(docs_dims[1], 0)

def reshape(dataset, window_size, label_name):
    '''
    Prepares one dataset for training: windows it, selects a document per window, drops empty windows.

    :param dataset: tf.data.Dataset, of feature dicts
    :param window_size: int, number of timesteps per element
    :param label_name: name of the feature used as label, forwarded to to_time_series

    ---> tf.data.Dataset, of the elements produced by select_doc that pass filter_fn
    '''
    return (to_time_series(dataset, label_name, window_size=window_size)
            .map(select_doc)       # one sampled document per window, repeated for each timestep
            .filter(filter_fn))    # keep only elements whose document feature is non-empty

In [57]:
# Reshaping every per-ticker dataset with the same window size and label feature
reshaped_datasets = [reshape(ticker_dataset, TIMESTEPS, 'log_adj_daily_returns')
                     for ticker_dataset in datasets]

4. Concatenating datasets, and shuffling dataset

In [58]:
# Chaining the reshaped per-ticker datasets into one dataset, then shuffling it with a 1000-element buffer.
# reshuffle_each_iteration=False keeps the shuffled order the same on every pass; the shard-based
# train/validation/test split further down presumably depends on that to keep its folds disjoint.
dataset = reduce(lambda a, b: a.concatenate(b), reshaped_datasets).shuffle(1000, reshuffle_each_iteration=False)

5. Splitting dataset into train, validation, and test datasets

In [59]:
# Defining Functions and Classes for splitting datasets into train, validation, and test datasets

def k_folds(dataset, k):
    '''
    Splits :param dataset: into :param k: folds by sharding it.

    :param dataset: tf.data.Dataset, dataset to split into k folds
    :param k: int, number of folds to split :param dataset: into

    ---> list, of k tf.data.Dataset objects, where fold i is dataset.shard(k, i)
    '''
    folds = []
    for fold_index in range(k):
        folds.append(dataset.shard(k, fold_index))
    return folds

def train_test_split(dataset, train_size):
    '''
    Splits :param dataset: into a train dataset and a test dataset.

    :param train_size: is approximated by a fraction x/k. The dataset is split into k folds with
    k_folds; the first x folds are concatenated into the train dataset and the remaining k - x
    folds into the test dataset.

    :param dataset: tf.data.Dataset, to split into train and test datasets
    :param train_size: float strictly between 0 and 1, proportion of :param dataset: to put into train dataset

    ---> (tf.data.Dataset, tf.data.Dataset), representing train, test datasets

    Raises ValueError when :param train_size: does not reduce to a fraction strictly between 0 and 1.
    '''
    ratio = Fraction(train_size).limit_denominator()
    # With a ratio of 0 or 1 one side would get no folds and reduce() would fail on an
    # empty sequence, so the bad argument is rejected here with a clear message instead.
    if not 0 < ratio < 1:
        raise ValueError('train_size must be strictly between 0 and 1, got {!r}'.format(train_size))

    x, k = ratio.numerator, ratio.denominator
    folds = k_folds(dataset, k)
    train = reduce(lambda a, b: a.concatenate(b), folds[:x])
    test = reduce(lambda a, b: a.concatenate(b), folds[x:])
    return train, test

For our models we will reserve 60% of the dataset for training, 20% for validation, and 20% for testing.

In [60]:
# Splitting our dataset into train, validation, test datasets

# Creating datasets: 60% for training, then the remaining part halved into validation and test
train_dataset, test_val_dataset = train_test_split(dataset, train_size=0.6)
val_dataset, test_dataset = train_test_split(test_val_dataset, train_size=0.5)

# Prepping datasets for modeling: every split gets the same shuffle buffer and padded batching
train_dataset, val_dataset, test_dataset = (
    split_dataset.shuffle(10).padded_batch(
        BATCH_SIZE,
        padded_shapes=({'log_adj_daily_returns': [TIMESTEPS,],
                        'docs': [TIMESTEPS, None]}, []))
    for split_dataset in (train_dataset, val_dataset, test_dataset)
)



In [111]:


# Experimental model cell: embeds the WFC documents of every timestep with a shared word embedding
# and a shared LSTM, joins the result with the log returns and feeds a second LSTM over the timesteps.
# NOTE(review): the output recorded for this cell is `IndexError: list assignment index out of range`,
# so the cell does not currently run to completion; which statement raises is not recorded.
feature_names = ['log_adj_daily_returns', 'docs_WFC', 'docs_JPM', 'docs_C', 'docs_BAC']

# destroying already made graph nodes in the tensorflow backend
# NOTE(review): no statement follows the comment above, so nothing is cleared here — TODO confirm what was intended





# number of timesteps per input window used by the Input layers below
timesteps = 5




# document embedding Model


# vocabulary handed to the Word_Embedding layer below (Word_Embedding is defined outside this cell)
with open(os.path.join(path_to_data, 'vocab.json')) as f:
    vocab = json.load(f)

#document = keras.Input(shape=(None,), name='document')
#word_embedding = Word_Embedding(vocab, trainable=False)(document)
#document_embedding = layers.LSTM(400)(word_embedding)
#document_embedder = keras.Model(document, document_embedding, name='document_embedder')
#print(document_embedder.summary())

#keras.utils.plot_model(document_embedder, 'document_embedder.png', show_shapes=True)



# Inputs
# only the WFC documents and the log returns are wired in; the other tickers are commented out
input_docs_WFC = keras.Input(shape=(timesteps, None), name='docs_WFC')
#input_docs_JPM = keras.Input(shape=(timesteps, None), name='docs_JPM')
#input_docs_BAC = keras.Input(shape=(timesteps, None), name='docs_BAC')
#input_docs_C = keras.Input(shape=(timesteps, None), name='docs_C')
input_log_returns = keras.Input(shape=(timesteps,), name='log_adj_daily_returns')

# Splitting docs_WFC input into its individual timesteps
timesteps_layer = [input_docs_WFC[:, t] for t in range(timesteps)]

# Flattening Documents Dimension for each timestep (cause I don't know how to deal with the extra dimension for the LSTM)
#flattened_timesteps_layer = [tf.reshape(timestep, [-1, tf.reduce_prod(tf.shape(timestep)[1:])]) for timestep in timesteps_layer]

# Word Embedding Layer
# a single layer instance is applied to every timestep, so all timesteps share the same embedding
word_embedding = Word_Embedding(vocab, init='glove', trainable=True, mask_zero=True)
word_embedding_layer = [word_embedding(timestep) for timestep in timesteps_layer]


# Document Embedding Layer
# likewise one LSTM instance is reused for every timestep
document_embedding = layers.LSTM(100)
doc_embedding_layer = [document_embedding(timestep) for timestep in word_embedding_layer]

# Preparing Inputs for Time Series
# log returns get a trailing axis and the per-timestep document embeddings are stacked along axis 1,
# so that both can be concatenated into one sequence of per-timestep feature vectors
num_features = tf.expand_dims(input_log_returns, -1)
doc_features = tf.stack(doc_embedding_layer, axis=1)
ts_input = layers.Concatenate()([doc_features, num_features])



# Time Series Component
time_series = layers.LSTM(100)(ts_input)

# Output
output = layers.Dense(1)(time_series)

# Creating Model
test_model = keras.Model({'docs_WFC': input_docs_WFC, 'log_adj_daily_returns': input_log_returns}, output, name='test_model')

# Compiling Model
test_model.compile(optimizer='adam', loss=tf.keras.losses.MeanSquaredError())





# Training Model
#test_ds = tsds.batch(1).repeat()


#test_model.fit_generator(test_ds, epochs=3, steps_per_epoch=100)
# training is commented out above; the cell only renders the model graph to 'test.png'
keras.utils.plot_model(test_model, 'test.png', show_shapes=True)

IndexError: list assignment index out of range

In [7]:
# Small probe: build a range dataset and look at the dtype of its element_spec
d = tf.data.Dataset.range(10)
# the output recorded for this cell is tf.int64
d.element_spec.dtype


tf.int64

### Text Embedding

In [4]:


def preprocess_pm(raw_df):
    '''
    Preprocessing raw_df into the shape raw_df should have been after coming out of the fetch process, as well as
    normalizing the documents, adding log returns and building the vocabulary from the normalized documents.

    :param raw_df: DataFrame handed to `reshape`; the reshaped frame must provide an 'adjusted_close'
                   column and one 'docs_<ticker>' column per entry of the module-level `tickers`

    ---> the reshaped DataFrame with updated document lists and the added 'log_adj_close' and
         'log_adj_daily_returns' columns; rows without a 'log_adj_daily_returns' value are dropped
    '''

    # Reshaping DataFrame
    # NOTE(review): `reshape` is called with a single argument here, so it must be a DataFrame
    # reshaping helper and not the three-argument tf.data `reshape` defined elsewhere in this notebook.
    reshaped_df = reshape(raw_df)

    doc_columns = ['_'.join(['docs', t]) for t in tickers]

    # Normalizing and updating documents
    def update_doclist(s):
        '''Normalizes every document listed in json string s and returns the json list of the new paths.'''
        updated_doclist = []
        for docpath in json.loads(s):
            # normalized copies are saved in a 'normalized' folder next to the original document
            save_point = os.path.join(os.path.split(docpath)[0], 'normalized')
            updated_doclist.append(normalize_save_document(docpath, save_point))

        return json.dumps(updated_doclist)

    for column in doc_columns:
        reshaped_df[column] = reshaped_df[column].map(update_doclist)

    # Preprocessing numerical data
    reshaped_df['log_adj_close'] = np.log(reshaped_df['adjusted_close'])
    # difference to the NEXT row; presumably rows are ordered newest first so that this is the
    # return relative to the previous day — verify against the ordering produced by `reshape`
    reshaped_df['log_adj_daily_returns'] = reshaped_df['log_adj_close'] - reshaped_df['log_adj_close'].shift(-1)
    reshaped_df.dropna(subset=['log_adj_daily_returns'], inplace=True)

    # Building vocabulary json file
    # build_vocab is called purely for its side effect, so the documents are walked with plain
    # loops instead of a Series.map whose result would be thrown away
    for column in doc_columns:
        for doclist_string in reshaped_df[column]:
            for docpath in json.loads(doclist_string):
                with open(docpath, 'r') as f:
                    doc = f.read()

                build_vocab(doc)

    # Encoding documents based off of vocabulary json file


    return reshaped_df

In [5]:
# Writing tfrecords

# Names of the features that are serialized into every tf.Example by the writer loop in this cell
feature_names = ['log_adj_daily_returns', 'docs_WFC', 'docs_JPM', 'docs_C', 'docs_BAC']
# Vocabulary used by unpack_doclist to encode each word of a document (presumably word -> integer id; verify)
with open(os.path.join(path_to_data, 'vocab.json'), 'r') as f:
    vocab = json.load(f)


# The following functions can be used to convert a value to a type compatible
# with tf.Example.

def _bytes_feature(value):
    '''Wraps a string / byte value in a tf.train.Feature holding a one-element bytes_list.'''
    eager_tensor_type = type(tf.constant(0))
    if isinstance(value, eager_tensor_type):
        # BytesList won't unpack a string from an EagerTensor.
        value = value.numpy()
    wrapped = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=wrapped)

def _float_feature(value):
    '''Wraps a float / double value in a tf.train.Feature holding a one-element float_list.'''
    wrapped = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=wrapped)

def _int64_feature(value):
    '''Wraps a bool / enum / int / uint value in a tf.train.Feature holding a one-element int64_list.'''
    wrapped = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=wrapped)

def serialize_example(sample, feature_names):
    '''
    Builds a tf.train.Example from the entries of :param sample: named in :param feature_names:.

    A float value is stored as a one-element float_list under its own name. A list value is treated
    as a list of documents and stored as two int64 lists: '<name>/vals' with all words of all
    documents in order, and '<name>/lens' with the length of each document. Values of any other
    type are skipped.

    :param sample: mapping from feature name to value
    :param feature_names: iterable of strings, the features to serialize

    ---> tf.train.Example
    '''
    feature = {}

    for feature_name in feature_names:
        value = sample[feature_name]

        if isinstance(value, float):
            # a single number
            feature[feature_name] = tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

        elif isinstance(value, list):
            # a list of documents: flatten the words and remember where each document ends
            lens = [len(doc) for doc in value]
            values = []
            for doc in value:
                values.extend(doc)
            feature[feature_name + '/vals'] = tf.train.Feature(int64_list=tf.train.Int64List(value=values))
            feature[feature_name + '/lens'] = tf.train.Feature(int64_list=tf.train.Int64List(value=lens))

    return tf.train.Example(features=tf.train.Features(feature=feature))
        


def unpack_doclist(doclist_string):
    '''
    Loads and encodes every document listed in a json formatted list of file paths.

    Each file is read as text, split on whitespace, and every word is replaced by its entry in the
    module-level `vocab` (a word missing from `vocab` raises KeyError).

    :param doclist_string: string, json formatted list of paths to text documents

    ---> list, with one list of vocab entries per document
    '''
    encoded_docs = []
    for filename in json.loads(doclist_string):
        with open(filename, 'r') as f:
            words = f.read().split()
        encoded_docs.append([vocab[word] for word in words])
    return encoded_docs


# Writing the rows at positions 300 down to 0 of raw_df_fixed to a test TFRecord file
record_file = 'test.tfrecords'
with tf.io.TFRecordWriter(record_file) as writer:
    # rows are visited from position 300 down to position 0, i.e. in reverse positional order
    # NOTE(review): 300 is hard-coded, so raw_df_fixed needs at least 301 rows for iloc to succeed
    for i in range(300, -1, -1):
        # deep copy so that unpacking the documents below does not write back into raw_df_fixed
        row = raw_df_fixed.iloc[i].copy(deep=True)
        # Unpacking Text Features
        row[['_'.join(['docs', t]) for t in tickers]] = row[['_'.join(['docs', t]) for t in tickers]].map(unpack_doclist)
        # Serializing Example to disk
        example = serialize_example(row, feature_names)
        writer.write(example.SerializeToString())
    


Preparing our data involves:
1. Loading the dataset from the TFRecord file
2. Splitting the dataset by stock ticker
3. Reshaping each dataset to prepare it for training:
    1. Windowing the dataset so each element produces a time series of features along with their corresponding label
    2. Sampling the document feature for the document that will represent the specific window's document and cloning that document for each timestep in our defined window size
    3. Filtering our dataset to include only elements with a document feature
4. Concatenating the reshaped datasets together, and shuffling the dataset
5. Splitting the dataset into train, validation, and test datasets

1. Loading the dataset from TFRecord file

### Writing Dataset to TFRecords

After successfully preprocessing our dataset, we next write it to a TFRecord file (https://www.tensorflow.org/tutorials/load_data/tfrecord), a binary file format that is read efficiently by the TensorFlow framework. 

In [3]:
# Defining Functions and Classes used to write TFRecord files

def serialize_example(sample, feature_names):
    '''
    Maps :param sample: to a tf.train.Example together with the feature_description needed to parse it
    back, where the list feature_names determines which subset of :param sample:'s keys are used.

    :param sample: mapping from feature name to the value that feature takes for this data sample
    :param feature_names: list, of strings, a subset of sample's keys, these are the features we will
                          be considering for analysis

    ---> (tf.train.Example, dict), the serializable data sample and a dict mapping every stored
         feature name to its tf.io.FixedLenFeature / tf.io.VarLenFeature description

    Raises ValueError when a value is neither a float, an int, nor a list of documents made of ints.
    '''

    feature = {}
    feature_description = {}

    for feature_name in feature_names:
        value = sample[feature_name]

        # float number -> one-element float_list
        if isinstance(value, float):
            feature[feature_name] = tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
            feature_description[feature_name] = tf.io.FixedLenFeature([], tf.float32)
            continue

        # list of documents -> all words flattened under '/vals', document lengths under '/lens'
        if isinstance(value, list) and all(isinstance(word, int) for doc in value for word in doc):
            vals_key = feature_name + '/vals'
            lens_key = feature_name + '/lens'
            lens = [len(doc) for doc in value]
            values = []
            for doc in value:
                values.extend(doc)
            feature[vals_key] = tf.train.Feature(int64_list=tf.train.Int64List(value=values))
            feature[lens_key] = tf.train.Feature(int64_list=tf.train.Int64List(value=lens))
            feature_description[vals_key] = tf.io.VarLenFeature(dtype=tf.int64)
            feature_description[lens_key] = tf.io.VarLenFeature(dtype=tf.int64)
            continue

        # integer number -> one-element int64_list
        if isinstance(value, int):
            feature[feature_name] = tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
            feature_description[feature_name] = tf.io.FixedLenFeature([], tf.int64)
            continue

        # Feature doesn't fit any of the tf example types
        raise ValueError('Value of Feature does not fit any of the tf.train.Feature serializable types')

    return tf.train.Example(features=tf.train.Features(feature=feature)), feature_description


def write_tfrecord(df, feature_names, filename='dataset.tfrecord'):
    '''
    Writes TFRecord file named :param filename: to the directory `path_to_data`, and generates the
    corresponding feature_description dictionary mapping a sample's feature name to the data type
    description of that feature.

    Rows are written from the last row of :param df: to the first. The returned feature_description
    is the one produced for the last row written; an empty :param df: yields an empty dict.

    :param df: pd.DataFrame, containing data and references to data that needs to be written to disk
    :param feature_names: list of strings, a subset of the names of columns of df, that represents which subset
                          of features from our preprocessed DataFrame that will be considered for modeling
    :param filename: string, name of TFRecord file

    ---> dict, of names of features mapping to tf.io.VarLenFeature and tf.io.FixedLenFeature objects
    '''

    def unpack_doclist(doclist_string):
        '''
        Takes a json format string that when loaded contains a list of paths to encoded document pickle files, and
        returns a list of the objects loaded from these pickle files.

        :param doclist_string: string, json formatted, contains a list of paths to encoded document pickle files

        ---> list, of encoded documents
        '''

        def load_file(filename):
            '''
            Takes a path to a pickle file, and returns the loaded object.

            :param filename: string, path to pickle file

            ---> python object loaded from the file located at :param filename:
            '''

            # NOTE: pickle.load can execute arbitrary code; only point this at files written by this project
            with open(filename, 'rb') as f:
                doc = pickle.load(f)
            return doc

        return list(map(load_file, json.loads(doclist_string)))

    doc_columns = ['_'.join(['docs', t]) for t in tickers]
    # Defined up front so that an empty df returns an empty description instead of
    # failing with UnboundLocalError at the return statement.
    feature_description = {}

    with tf.io.TFRecordWriter(os.path.join(path_to_data, filename)) as writer:
        print('Writing TFRecord file to: {}'.format(os.path.join(path_to_data, filename)))
        for i in range(len(df)-1, -1, -1):
            # deep copy so that unpacking the documents does not write back into df
            row = df.iloc[i].copy(deep=True)
            # Unpacking Text Features
            row[doc_columns] = row[doc_columns].map(unpack_doclist)
            # Serializing Example to disk
            example, feature_description = serialize_example(row, feature_names)
            writer.write(example.SerializeToString())
    print('Finished writing TFRecord file.')

    return feature_description

In [4]:
# Writing TFRecord file

# Defining the subset of features from our preprocessed DataFrame that we will be using for modeling
feature_names = ['_'.join([feature, t]) for feature in ['log_adj_daily_returns', 'docs'] for t in tickers]
print('The features we will be considering for analysis are: {}'.format(', '.join(feature_names)))
print()
# Loading the preprocessed DataFrame from disk if it has not been instantiated
try:
    preprocessed_df
# only an undefined name means "not instantiated yet"; a bare except would also hide
# unrelated errors and keyboard interrupts
except NameError:
    preprocessed_df = pd.read_csv(os.path.join(path_to_data, 'preprocessed.csv'), parse_dates=['timestamp'])
# Writing the TFRecord file
feature_description = write_tfrecord(preprocessed_df, feature_names)
# Writing the TFRecord's feature_description object to disk
with open(os.path.join(path_to_data, 'dataset_feature_description.pickle'), 'wb') as f:
    pickle.dump(feature_description, f)

The features we will be considering for analysis are: log_adj_daily_returns_WFC, log_adj_daily_returns_JPM, log_adj_daily_returns_BAC, log_adj_daily_returns_C, docs_WFC, docs_JPM, docs_BAC, docs_C

Writing TFRecord file to: /media/Data/Programs/FinTech/data/dataset.tfrecord
Finished writing TFRecord file.


For this notebook we will be using only the bare minimum of features for modeling, i.e. our text features listed in the docs columns and the feature we are trying to predict in the time series.