In [4]:
import os
import numpy as np
import pandas as pd
import data_helpers
import pickle
from data_helpers import TrainValTensorBoard
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Input, Embedding, Activation, Flatten, Dense, Concatenate
from keras.layers import Conv1D, MaxPooling1D, Dropout, LSTM
from keras.models import Model
from keras.callbacks import CSVLogger
from data_helpers import BPE

In [2]:
#==================Preprocess===================

# Read the cleaned tweet CSV, using its first column as the row index
csv = '../data/twitter/clean_tweet_char.csv'
df = pd.read_csv(csv, index_col=0)
print(df.head())

# Drop every row that contains a missing value, then show the class balance
df = df.dropna()
print(df['target'].value_counts())

# Tweet strings as the inputs, one-hot encoded targets as the labels
x_text = df['text'].values
y = to_categorical(df['target'].values)

  mask |= (ar1 == a)


                                                text  target
0    - awww, that's a bummer.  you shoulda got da...       0
1  is upset that he can't update his facebook by ...       0
2   i dived many times for the ball. managed to s...       0
3    my whole body feels itchy and like its on fire        0
4   no, it's not behaving at all. i'm mad. why am...       0
0    799997
1    799995
Name: target, dtype: int64


In [5]:
# Convert subword to index, function version
def subword2index(texts, vocab):
    """Map whitespace-separated subword strings to lists of vocabulary indices.

    Args:
        texts: iterable of strings; each string is split on whitespace into
            subword tokens.
        vocab: dict mapping token -> integer index.

    Returns:
        A list with one list of indices per input string, in input order.

    Raises:
        KeyError: if a token is missing from ``vocab`` and ``vocab`` contains
            neither an ``'<unk>'`` nor an ``'unk'`` entry to fall back on.
    """
    # The vocabulary's unknown-token marker is '<unk>'. Looking up the bare
    # string 'unk' either fails or hits an ordinary subword piece, so prefer
    # '<unk>' and keep 'unk' only as a fallback for vocabularies that use it.
    unk_index = vocab.get('<unk>', vocab.get('unk'))

    sentences = []
    for text in texts:
        one_line = []
        for token in text.split():
            if token in vocab:
                one_line.append(vocab[token])
            elif unk_index is not None:
                one_line.append(unk_index)
            else:
                raise KeyError(
                    "token %r is not in vocab and vocab has no '<unk>' entry" % token
                )
        sentences.append(one_line)
    return sentences


# replace all digits with 0
import re

# Raw string literal: '\d' inside a plain string is an invalid escape sequence
# that newer Python versions warn about; r'\d' is the same pattern without it.
train_texts = [re.sub(r'\d', '0', s) for s in x_text]

# replace all URLs with <url>
url_reg = r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b'
train_texts = [re.sub(url_reg, '<url>', s) for s in train_texts]

# Convert string to subword, this process may take several minutes
bpe = BPE("./pre-trained-model/en.wiki.bpe.op25000.vocab")
train_texts = [bpe.encode(s) for s in train_texts]

# Build vocab, {token: index}; indices start at 1 so that 0 is never assigned to a token
vocab = {token: i + 1 for i, token in enumerate(bpe.words)}

# Convert the encoded texts to lists of vocabulary indices
train_sentences = subword2index(train_texts, vocab)

In [8]:
# Report how many subword indices each encoded sentence contains
length = list(map(len, train_sentences))
longest = max(length)
shortest = min(length)
average = sum(length)/len(length)
print('The max length is: ', longest)
print('The min length is: ', shortest)
print('The average length is: ', average)

The max length is:  364
The min length is:  1
The average length is:  20.11691870959355


In [9]:
# Padding: bring every index sequence to a fixed length of 364
# (the maximum subword length printed by the length-statistics cell),
# appending the pad value after the real tokens.
from keras.preprocessing.sequence import pad_sequences
train_data = pad_sequences(
    train_sentences,
    padding='post',
    maxlen=364,
)

In [12]:
# Shuffle inputs and labels with one shared permutation; the fixed seed
# makes the ordering repeatable
np.random.seed(42)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled, y_shuffled = train_data[shuffle_indices], y[shuffle_indices]

# The first 90% of the shuffled rows are used for training, the rest are held out
training_rate = 0.9
train_len = int(len(y) * training_rate)
x_train, x_test = x_shuffled[:train_len], x_shuffled[train_len:]
y_train, y_test = y_shuffled[:train_len], y_shuffled[train_len:]
print('Training data size is: ', x_train.shape)
print('Validation data size is: ', x_test.shape)

Training data size is:  (1439992, 364)
Validation data size is:  (160000, 364)


In [14]:
# List the files in the Twitter data directory (IPython `ls` shell alias, not plain Python)
ls ../data/twitter/

clean_tweet.csv
clean_tweet_char.csv
[31mtraining.1600000.processed.noemoticon.csv[m[m*


In [16]:
# Write the train/test split to a single .npz archive, one named array per key.
# Note: despite its name, data_dir holds the path of the archive file itself.
data_dir = '../data/twitter/preprocessed_dataset.npz'

np.savez(
    data_dir,
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)

In [18]:
# Peek at the first five tokens of the vocabulary (dict iteration yields keys)
for i, items in zip(range(5), vocab):
    print(items)

<unk>
<s>
</s>
▁t
▁a


In [None]:
# NOTE(review): this cell repeats the BPE load-and-encode step from the preprocessing
# cell earlier in the notebook, where train_texts was already reassigned to the encoded
# output. Run in the same session, it would presumably encode the already-encoded text
# a second time; run after a kernel restart (resuming from the saved .npz), train_texts
# does not exist yet. Confirm whether only `bpe` is actually needed from this cell.
bpe = BPE("./pre-trained-model/en.wiki.bpe.op25000.vocab")
train_texts = [bpe.encode(s) for s in train_texts]


In [None]:
# Reload the train/test arrays from the saved .npz archive
dataset = np.load('../data/twitter/preprocessed_dataset.npz')

x_train, y_train, x_test, y_test = (
    dataset[name] for name in ('x_train', 'y_train', 'x_test', 'y_test')
)

In [None]:
# Embedding Initialization
from gensim.models import KeyedVectors
from keras.layers import Embedding

# Pre-trained subword vectors stored in binary word2vec format
model = KeyedVectors.load_word2vec_format("./pre-trained-model/en.wiki.bpe.op25000.d50.w2v.bin", binary=True)

input_size = 364
embedding_dim = 50
# One row per vocab index plus row 0, which no token uses (vocab indices start at 1)
embedding_weights = np.zeros((len(vocab) + 1, embedding_dim)) # (25001, 50)

for subword, i in vocab.items():
    # Membership is tested on the KeyedVectors object itself: this works on
    # gensim 3.x and 4.x alike, whereas the `.vocab` attribute was removed in 4.0.
    # Subwords without a pre-trained vector keep their all-zero row.
    if subword in model:
        embedding_weights[i] = model[subword]

# Embedding layer seeded with the pre-trained weight matrix
embedding_layer = Embedding(len(vocab)+1,
                            embedding_dim,
                            weights=[embedding_weights],
                            input_length=input_size)