In [4]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import sys
import tensorflow as tf
from tensorflow.python.lib.io import file_io
from tensorflow.python.keras.preprocessing import sequence
import numpy as np
import msgpack
from io import BytesIO
import logging
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import (ADASYN, RandomOverSampler, SMOTE)

In [39]:
def _read_into_buffer(path):
    """Read the whole file at `path` as bytes and wrap it in an in-memory stream."""
    return BytesIO(file_io.read_file_to_string(path, binary_mode=True))


# Paths of the serialized inputs (despite the `_dir` suffix, both are files).
data_dir = 'wikimedia-personal-attacks-data.bin'
embeddings_dir = 'wikimedia-personal-attacks-embeddings.npy'

logging.info('Loading data.')

# Array stored in the .npy embeddings file.
f = _read_into_buffer(embeddings_dir)
vocab = np.load(f)

# msgpack-encoded records holding the features and labels.
f = _read_into_buffer(data_dir)
data = msgpack.unpack(f, raw=False)


def prepare_data(raw_data, max_len=400, test_size=0.20, seed=None):
    """Turn raw records into padded, split and class-balanced arrays.

    Each record's ``'idx'`` entry (an iterable of iterables) is flattened
    into one list per document, labels are encoded and binarized, the data
    is split into train/test parts, every document is padded or truncated
    to ``max_len`` entries, and the training part is randomly oversampled.

    Args:
        raw_data: Sequence of mappings, each with an ``'idx'`` and a
            ``'label'`` key.
            NOTE(review): ``'idx'`` presumably holds per-sentence lists of
            word indices - confirm against the preprocessing script.
        max_len: Number of entries every padded document has. Longer
            documents are cut at the end, shorter ones are padded at the
            end with 0.
        test_size: Passed to ``train_test_split``.
        seed: Optional random state for both the split and the oversampler.
            With the default ``None`` every call gives a different result.

    Returns:
        Tuple ``(x_train, x_len_train, y_train, x_test, x_len_test, y_test)``.

        * ``x_train`` / ``x_test``: padded documents, ``max_len`` columns.
        * ``x_len_train`` / ``x_len_test``: length of each document before
          padding/truncation, row-aligned with ``x_train`` / ``x_test``.
          These are the original lengths, so they can exceed ``max_len``.
          NOTE(review): clip to ``max_len`` downstream if they are used as
          sequence lengths for the padded arrays.
        * ``y_train``: oversampled training labels; 1-D when the binarizer
          produced a single column, otherwise one row per sample.
        * ``y_test``: binarized test labels as returned by the split
          (2-D, not oversampled).

    Raises:
        ValueError: If ``raw_data`` is empty.
    """
    logging.info('Converting data to arrays.')

    if len(raw_data) == 0:
        raise ValueError('prepare_data() needs at least one record.')

    # Flatten each record's nested 'idx' lists into one flat list per doc.
    flat_docs = []
    labels = []
    for record in raw_data:
        flat_docs.append([item for sublist in record['idx'] for item in sublist])
        labels.append(record['label'])

    # Blank line ahead of the report below (keeps the printed output stable).
    print()

    # Label encoder.
    #   Encode labels with value between 0 and n_classes-1,
    #   so for example 1 to 5 star ratings become 0 to 4.
    le = LabelEncoder()
    y = le.fit_transform(labels)

    # Binarize labels in one-vs-all fashion if three or more classes;
    # with two classes this yields a single column.
    lb = LabelBinarizer()
    y_bin = lb.fit_transform(y)

    # Build a 1-D object array with one Python list per document. Filling it
    # element by element avoids np.asarray() on ragged input, which raises on
    # current NumPy and silently becomes 2-D when all docs share one length.
    docs = np.empty(len(flat_docs), dtype=object)
    for position, tokens in enumerate(flat_docs):
        docs[position] = tokens

    print('docs:', docs.shape)
    print('y_bin', y_bin.shape)

    x_train, x_test, y_train, y_test = train_test_split(docs, y_bin,
                                                        test_size=test_size,
                                                        random_state=seed)

    # Value used for padding elements.
    pad_id = 0

    # Pad (or truncate) every doc at its end to exactly max_len entries.
    x_train_padded = sequence.pad_sequences(x_train,
                                            maxlen=max_len,
                                            truncating='post',
                                            padding='post',
                                            value=pad_id)

    x_test_padded = sequence.pad_sequences(x_test,
                                           maxlen=max_len,
                                           truncating='post',
                                           padding='post',
                                           value=pad_id)

    print('Unpadded length of first training doc:\t', len(x_train[0]))
    print('Unpadded length of second training doc:\t', len(x_train[1]))
    print('Padded len of first doc:\t', len(x_train_padded[0]))
    print('Padded len of second doc:\t', len(x_train_padded[1]))
    print('x_train shape:\t\t\t', x_train_padded.shape)
    print('x_test shape:\t\t\t', x_test_padded.shape)
    print()
    print(len(x_train) + len(x_test), 'documents each of length',
          max_len, '.')

    # Store the lengths the docs had before truncation/padding.
    x_len_train = np.array([len(x) for x in x_train])
    x_len_test = np.array([len(x) for x in x_test])
    print('Length of original, unpadded train docs:', x_len_train)
    print('Length of original, unpadded test docs:', x_len_test)

    print('x_train_padded.shape:', x_train_padded.shape)

    # Oversample training data to compensate for unbalanced labels.
    sampler = RandomOverSampler(random_state=seed)
    # The sampler needs one target per sample: flatten the single label
    # column, or collapse one-hot rows back to class ids when there are
    # several columns (flattening those would give n * k targets).
    one_hot = y_train.ndim == 2 and y_train.shape[1] > 1
    sampler_target = y_train.argmax(axis=1) if one_hot else y_train.flatten()
    x_train, y_resampled = sampler.fit_resample(x_train_padded, sampler_target)

    # Rows of the resampled data are the original rows picked by these
    # indices; apply the same selection to everything that is row-aligned
    # with x_train so lengths and labels stay in step with the documents.
    picked = sampler.sample_indices_
    x_len_train = x_len_train[picked]
    y_train = y_train[picked] if one_hot else y_resampled

    assert len(x_train) == len(x_len_train) == len(y_train), \
        'oversampled documents, lengths and labels are out of step'

    print('x_train_padded_resampled.shape:',x_train.shape)
    print('y_bin_resampled.shape:',y_train.shape)

    # TODO Add data loader to create train/test splits.
    # Turn this into a dataset class.

    # TODO Use sklearn to make train/test split?
    return x_train, x_len_train, y_train, x_test_padded, x_len_test, y_test

In [40]:
# Run the preparation pipeline on the loaded records and unpack its result:
# padded documents, original document lengths and labels, first for the
# training part, then for the test part.
(x_train, x_len_train, y_train,
 x_test, x_len_test, y_test) = prepare_data(data)


docs: (115864,)
y_bin (115864, 1)
Unpadded length of first training doc:	 299
Unpadded length of second training doc:	 41
Padded len of first doc:	 400
Padded len of second doc:	 400
x_train shape:			 (92691, 400)
x_test shape:			 (23173, 400)

115864 documents each of length 400 .
Length of original, unpadded train docs: [299  41  61 ...  26  96  56]
Length of original, unpadded test docs: [20 51 25 ... 59  3 31]
x_train_padded.shape: (92691, 400)
x_train_padded_resampled.shape: (162710, 400)
y_bin_resampled.shape: (162710,)
