In [1]:
import os
import numpy as np
import pandas as pd
import data_helpers
import pickle
from data_helpers import TrainValTensorBoard
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Input, Embedding, Activation, Flatten, Dense, Concatenate
from keras.layers import Conv1D, MaxPooling1D, Dropout
from keras.models import Model


Using TensorFlow backend.


In [2]:
# ================== Preprocess ===================

# Read the tweet CSV; its first column is used as the row index.
csv = '../data/twitter/clean_tweet_char.csv'
df = pd.read_csv(csv, index_col=0)
print(df.head())

# Raw tweet strings, and targets converted to a categorical (one-hot) matrix.
x_text = df['text'].values
y = to_categorical(df['target'].values)

                                                text  target
0    - awww, that's a bummer.  you shoulda got da...       0
1  is upset that he can't update his facebook by ...       0
2   i dived many times for the ball. managed to s...       0
3    my whole body feels itchy and like its on fire        0
4   no, it's not behaving at all. i'm mad. why am...       0


  mask |= (ar1 == a)


In [5]:
# Select every row in which at least one column is null, then display it.
null_row_mask = df.isnull().any(axis=1)
non_df = df.loc[null_row_mask]
non_df

Unnamed: 0,text,target
504887,,0
737161,,0
761827,,0
1046539,,1
1055283,,1
1153737,,1
1337856,,1
1428143,,1


In [10]:
# Drop every row that contains a null value, then show the class balance.
df = df.dropna(axis=0, how='any')
print(df['target'].value_counts())

# Rebuild the text array and the categorical labels from the filtered frame.
x_text = df['text'].values
y = to_categorical(df['target'].values)

0    799997
1    799995
Name: target, dtype: int64


In [11]:
# Length of each entry in x_text; str() makes len() safe for non-string entries.
length = list(map(len, map(str, x_text)))

longest, shortest = max(length), min(length)
mean_length = sum(length) / len(length)

print('The max length is: ', longest)
print('The min length is: ', shortest)
print('The average length is: ', mean_length)

The max length is:  366
The min length is:  1
The average length is:  67.4089039195196


In [12]:
# Sanity check: print any float entry left in x_text
# (presumably a NaN text that survived the cleanup -- nothing printed means none).
for sent in x_text:
    if not isinstance(sent, float):
        continue
    print(sent)

In [14]:
# Character-level tokenizer: no vocabulary size cap, 'UNK' as the
# out-of-vocabulary token. Fitted on the raw texts.
tk = Tokenizer(
    num_words=None,
    char_level=True,
    oov_token='UNK',
)
tk.fit_on_texts(x_text)

# If a predefined character list is available, use it to replace tk.word_index.
# Otherwise, skip the section below and keep the vocabulary learned by fit_on_texts.

# -----------------------Skip part start--------------------------
# Build a fixed vocabulary: every character maps to its 1-based position
# in `alphabet`.
alphabet = "abcdefghijklmnopqrstuvwxyz0123456789 ,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
char_dict = dict(zip(alphabet, range(1, len(alphabet) + 1)))

# Swap the fitted vocabulary for a copy of the fixed one, then register the
# OOV token ('UNK') under the next free index after the largest existing one.
tk.word_index = dict(char_dict)
tk.word_index.update({tk.oov_token: 1 + max(char_dict.values())})
# -----------------------Skip part end----------------------------

# Encode each text as a sequence of indices from tk.word_index.
sequences = tk.texts_to_sequences(x_text)

# Length statistics of the encoded sequences.
length = list(map(len, sequences))
longest, shortest = max(length), min(length)
mean_length = sum(length) / len(length)

print('The max length is: ', longest)
print('The min length is: ', shortest)
print('The average length is: ', mean_length)


The max length is:  366
The min length is:  1
The average length is:  67.4089039195196


In [17]:
# Display the token-to-index mapping currently held by the tokenizer.
tk.word_index

{' ': 37,
 '!': 41,
 '"': 45,
 '#': 51,
 '$': 52,
 '%': 53,
 '&': 55,
 "'": 44,
 '(': 64,
 ')': 65,
 '*': 56,
 '+': 59,
 ',': 38,
 '-': 60,
 '.': 40,
 '/': 46,
 '0': 27,
 '1': 28,
 '2': 29,
 '3': 30,
 '4': 31,
 '5': 32,
 '6': 33,
 '7': 34,
 '8': 35,
 '9': 36,
 ':': 43,
 ';': 39,
 '<': 62,
 '=': 61,
 '>': 63,
 '?': 42,
 '@': 50,
 'UNK': 70,
 '[': 66,
 '\\': 47,
 ']': 67,
 '^': 54,
 '_': 49,
 '`': 58,
 'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '{': 68,
 '|': 48,
 '}': 69,
 '~': 57}

In [16]:
# Show the first raw text followed by its encoded index sequence.
print(x_text[0], sequences[0], sep='\n')

  - awww, that's a bummer.  you shoulda got david carr of third day to do it. ;d
[37, 37, 60, 37, 1, 23, 23, 23, 38, 37, 20, 8, 1, 20, 44, 19, 37, 1, 37, 2, 21, 13, 13, 5, 18, 40, 37, 37, 25, 15, 21, 37, 19, 8, 15, 21, 12, 4, 1, 37, 7, 15, 20, 37, 4, 1, 22, 9, 4, 37, 3, 1, 18, 18, 37, 15, 6, 37, 20, 8, 9, 18, 4, 37, 4, 1, 25, 37, 20, 15, 37, 4, 15, 37, 9, 20, 40, 37, 39, 4]
