In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['training.1600000.processed.noemoticon.csv']


In [2]:
!pip install -q tensorflow==2.0.0-beta1

# Import Libraries

In [3]:
import tensorflow as tf
import matplotlib.pyplot as plt

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
print(tf.__version__)

2.0.0-beta1


# Define Parameters

In [5]:
train_split = 0.9
VOCAB_SIZE = 10000
EPOCHS = 10
PADDING = 'post'
TRUNC = 'post'
MAXLEN = 1024
EMB_DIM = 16
BATCH_SIZE = 256

# Load Data

In [6]:
sentences = []
labels = []

df = pd.read_csv('../input/training.1600000.processed.noemoticon.csv', 
                 header=None, 
                 names=['target', 'ids', 'date', 'flag', 'user', 'text'],
                 encoding='latin-1')

sentences = df['text']
labels = np.array(df['target']==4, dtype=int)

print(sentences[0])
print(labels[0])
print(len(sentences))
print(len(labels))
print (np.unique(labels))

@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D
0
1600000
1600000
[0 1]


# Split data

In [7]:
split = int(train_split * len(sentences))

train_sents = sentences[:split]
val_sents = sentences[split:]

train_labels = labels[:split]
val_labels = labels[split:]

# Sentences to Sequences

In [8]:
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(train_sents)
word_index = tokenizer.word_index

train_seqs = tokenizer.texts_to_sequences(train_sents)
train_seqs = pad_sequences(train_seqs, padding=PADDING, truncating=TRUNC, maxlen=MAXLEN)
val_seqs = tokenizer.texts_to_sequences(val_sents)
val_seqs = pad_sequences(val_seqs, padding=PADDING, truncating=TRUNC, maxlen=MAXLEN)

print(train_seqs.shape)
print(train_labels.shape)
print(val_seqs.shape)
print(val_labels.shape)

(1440000, 1024)
(1440000,)
(160000, 1024)
(160000,)


# Define Model

In [9]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, EMB_DIM, input_length=MAXLEN),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(32, activation='tanh'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile Model

In [10]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1024, 16)          160000    
_________________________________________________________________
bidirectional (Bidirectional (None, 1024, 128)         41472     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                41216     
_________________________________________________________________
dense (Dense)                (None, 32)                2080      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 244,801
Trainable params: 244,801
Non-trainable params: 0
_________________________________________________________________


# Train model

In [11]:
history = model.fit(train_seqs,
                    train_labels,
                    epochs=EPOCHS,
                    batch_size=BATCH_SIZE,
                    validation_data=(val_seqs, val_labels),
                    verbose=1)

Train on 1440000 samples, validate on 160000 samples
Epoch 1/10
   1792/1440000 [..............................] - ETA: 14:37:48 - loss: 0.6905 - acc: 0.5273

KeyboardInterrupt: 

# Plot Results

In [None]:
acc = history.history['acc']
val_acc = history.history['val_acc']

loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(EPOCHS)

plt.plot(epochs, acc, 'r')
plt.plot(epochs, val_acc, 'b')
plt.xlabel('EPOCHS')
plt.ylabel('Accuracies')
plt.legend('Train Acc', 'Val Acc')
plt.figure()

plt.plot(epochs, loss, 'r')
plt.plot(epochs, val_loss, 'b')
plt.xlabel('EPOCHS')
plt.ylabel('Losses')
plt.legend('Train Loss', 'Val Loss')
plt.figure()