In [1]:
from keras.preprocessing import sequence
from keras.preprocessing import text
import random
from datetime import datetime 

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import pandas as pd

In [3]:
# Vectorization parameter: limit on the number of features (the top 20K).
# Passed to the tokenizer as its `num_words` argument.
TOP_K = 20000


In [4]:
# Upper bound on the length of a vectorized text sequence. The padded length
# is capped at this value, so longer sequences are truncated.
MAX_SEQUENCE_LENGTH = 500


In [5]:
def sequence_vectorize(train_texts, train_labels, seed=None):
    """Splits texts/labels into train/validation sets and vectorizes the texts.

    The samples are shuffled and split 80/20 into training and validation
    subsets. Each text is then converted to a fixed-length sequence of word
    indices (1 text = 1 sequence vector). The vocabulary is built from the
    training subset only.

    Texts and labels are reordered with one shared permutation, so every text
    keeps its own label. The input lists themselves are left unmodified.

    # Arguments
        train_texts: list, text strings.
        train_labels: list, labels; train_labels[i] belongs to train_texts[i].
        seed: optional seed for the shuffle (e.g. an int). With the default
            (None) the split differs from run to run; pass a value to
            reproduce a split.

    # Returns
        x_train, x_val, y_train, y_val, word_index: padded training and
            validation sequences, the matching label lists, and the
            tokenizer's word index dictionary.

    # Raises
        ValueError: if texts and labels differ in length, or if there are
            too few samples for the training split to be non-empty.
    """
    sample_count = len(train_texts)
    if sample_count != len(train_labels):
        raise ValueError(
            'train_texts and train_labels must have the same length, '
            'got %d and %d.' % (sample_count, len(train_labels)))

    train_count = int(0.8 * sample_count)
    if train_count == 0:
        raise ValueError(
            'Need at least 2 samples to build a training split, got %d.'
            % sample_count)

    # Split training and validation data.
    # Shuffle a list of indices once and apply it to both texts and labels;
    # shuffling the two lists separately would pair texts with wrong labels.
    # A private Random instance leaves the global `random` state untouched.
    rng = random.Random(seed)
    order = list(range(sample_count))
    rng.shuffle(order)
    train_order, val_order = order[:train_count], order[train_count:]

    x_train = [train_texts[i] for i in train_order]
    x_val = [train_texts[i] for i in val_order]
    y_train = [train_labels[i] for i in train_order]
    y_val = [train_labels[i] for i in val_order]

    # Create vocabulary with training texts.
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(x_train)

    # Vectorize training and validation texts.
    x_train = tokenizer.texts_to_sequences(x_train)
    x_val = tokenizer.texts_to_sequences(x_val)

    # Longest training sequence, capped at MAX_SEQUENCE_LENGTH.
    max_length = min(max(len(seq) for seq in x_train), MAX_SEQUENCE_LENGTH)

    # Fix sequence length to max value. Sequences shorter than the length are
    # padded in the beginning and sequences longer are truncated
    # at the beginning.
    x_train = sequence.pad_sequences(x_train, maxlen=max_length)
    x_val = sequence.pad_sequences(x_val, maxlen=max_length)
    return x_train, x_val, y_train, y_val, tokenizer.word_index

In [6]:
# Read cleaned.csv into a dataframe.
df = pd.read_csv("cleaned.csv")
# Empty placeholders for the texts (X) and the labels (Y).
X = []
Y = []

In [7]:
# Texts: every entry of the comment_text column converted to str, in row order.
X = [str(comment) for comment in df['comment_text']]
# Labels: one 6-tuple per row, in the column order
# toxic, severe_toxic, obscene, threat, insult, identity_hate.
Y = list(zip(*(
    df[label] for label in
    ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')
)))

In [8]:
# Split and vectorize the data; the result is the tuple
# (x_train, x_val, y_train, y_val, word_index).
vals = sequence_vectorize(train_texts=X, train_labels=Y)

In [9]:
# Show the first text in X.
X[0]

'spam notice please add inappropriate external links wikipedia wikipedia mere directory links used advertising promotion inappropriate links include limited links personal web sites links web sites affiliated links exist attract visitors web site promote product see external links guideline spam policies explanations links considered appropriate feel link added article please discuss rather readding see welcome page learn wikipedia thank'

In [10]:
# Show the first label tuple in Y.
Y[0]

(0, 0, 0, 0, 0, 0)

In [13]:
# Show the first label tuple of the training split (vals[2] is y_train).
vals[2][0]

(0, 0, 0, 0, 0, 0)

In [15]:
import pickle

# Serialize `vals` to disk. Despite the ".h5" extension, the file written
# here is a pickle, so it has to be read back with pickle.load.
with open('train_data.h5', 'wb+') as out_file:
    pickle.Pickler(out_file).dump(vals)