In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import os, re
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
import nltk

In [2]:
# Load the dataset from a CSV file in the current working directory.
df = pd.read_csv('data.csv')
# Peek at the raw values of the third row to see what one record looks like.
df.iloc[2].values

array([1,
       '@ChargerJenn Thx for answering so quick,I was afraid I was gonna crash twitter with all the spamming I did 2 RR..sorry bout that '],
      dtype=object)

## Data Cleaning
* Removing punctuation
* Removing numbers

In [3]:
# Pull the `polarity` column (used as the label further down) and the raw
# `text` column out of the frame as arrays.
targets = df["polarity"].values
text = df['text'].values

In [4]:
# Materialize NLTK's English word list as a NumPy array of strings.
# NOTE(review): presumably requires the NLTK `words` corpus to be available locally.
english_words = np.array(list(nltk.corpus.words.words('en')))
english_words

array(['A', 'a', 'aa', ..., 'zythum', 'Zyzomys', 'Zyzzogeton'],
      dtype='<U24')

In [5]:
# Keep one raw sample (index 2) around for inspection.
test = text[2]
test

'@ChargerJenn Thx for answering so quick,I was afraid I was gonna crash twitter with all the spamming I did 2 RR..sorry bout that '

In [6]:
def clean_sent(sent):
    """Reduce a sentence to lowercase ASCII letters separated by single spaces.

    Every character outside ``a-z`` / ``A-Z`` (digits, punctuation, symbols,
    non-ASCII letters) is replaced by a space before whitespace is collapsed.

    Args:
        sent: Raw input string.

    Returns:
        The cleaned, lowercased string without leading or trailing spaces.
    """
    # Step 1: anything that is not an ASCII letter becomes a separator.
    letters_only = re.sub(r'[^a-zA-Z]', ' ', sent)
    # Step 2: squeeze runs of whitespace down to one space.
    collapsed = re.sub(r'\s+', ' ', letters_only)
    return collapsed.lower().strip()

In [7]:
# Run `clean_sent` over every raw text, keeping the original order.
cleaned_sents = [clean_sent(raw_sent) for raw_sent in text]

In [8]:
# Spot-check the first five cleaned sentences.
cleaned_sents[:5]

['kconsidder you never tweet',
 'sick today coding from the couch',
 'chargerjenn thx for answering so quick i was afraid i was gonna crash twitter with all the spamming i did rr sorry bout that',
 'wii fit says i ve lost pounds since last time',
 'mrkinetik not a thing i don t really have a life']

In [9]:
# Sanity check: the number of cleaned sentences and the number of targets should match.
len(cleaned_sents), len(targets)

(500000, 500000)

In [10]:
from collections import Counter

In [11]:
# Class balance: how many samples carry each polarity value.
Counter(targets)

Counter({0: 250275, 1: 249725})

### Create ``Word Vectors``

> Find number of unique words

In [12]:
# Tally how often each token occurs across the whole cleaned corpus.
counter = Counter(
    token
    for sentence in cleaned_sents
    for token in word_tokenize(sentence)
)

In [13]:
# Vocabulary size = number of distinct tokens counted over the full corpus.
# Reused below as the tokenizer's `num_words` and the embedding's `input_dim`.
unique_words = len(counter)
unique_words

272544

In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

> Splitting `test` and `train` sets.

In [15]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    cleaned_sents,
    targets,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Look at the first two training sentences after the split.
X_train[:2]

['bengrossman utoh they moved you up an hour bummer i can t get down there that fast break a leg',
 'nite folk i only got hours of sleep last night so i might sleep before jimmyfallon is over maybe']

> Create a `word_index` from the `train` data.

In [17]:
# Build the word -> integer index from the training texts only.
# `num_words` is set to the vocabulary size counted over the full corpus, and
# "<OOV>" is the token used for words not covered by the fitted vocabulary.
tokenizer = Tokenizer(num_words=unique_words, oov_token="<OOV>" )
tokenizer.fit_on_texts(X_train)
tokenizer

<keras_preprocessing.text.Tokenizer at 0x1b71e519a90>

In [18]:
# Mapping of word -> integer index learned by the tokenizer.
word_indices = tokenizer.word_index

> Creating a `decord_text` function that decodes token indices back into words.

In [19]:
# Invert the tokenizer vocabulary (word -> index) into index -> word so that
# integer sequences can be turned back into text. A dict comprehension builds
# the mapping directly, without first materializing a set of (index, word)
# tuples.
word_indices_reversed = {index: word for word, index in word_indices.items()}

In [20]:
def decord_text(sent):
    """Turn a sequence of token indices back into a space-separated string.

    Index 0 is skipped: it is the fill value ``pad_sequences`` inserts by
    default and the Keras tokenizer never assigns it to a word, so it has no
    entry in ``word_indices_reversed``. Without the skip, decoding a padded
    row raises ``KeyError``.

    Args:
        sent: Iterable of integer token indices (a token list or a padded row).

    Returns:
        The decoded words joined by single spaces.

    Raises:
        KeyError: If a non-zero index is missing from ``word_indices_reversed``.
    """
    return " ".join(word_indices_reversed[i] for i in sent if i != 0)

> Creating `word_tokens` from the `corpora`.

In [41]:
# Convert each cleaned sentence into a list of integer word indices using the
# tokenizer fitted on the training texts.
X_train_tokens = tokenizer.texts_to_sequences(X_train)
X_test_tokens = tokenizer.texts_to_sequences(X_test)

In [22]:
# Show the integer encoding of the first training sentence.
print(X_train_tokens[0])

[75395, 75396, 77, 1383, 9, 32, 97, 378, 1210, 2, 31, 14, 36, 164, 71, 18, 724, 539, 5, 1276]


> Decoding the first `sent`.

In [23]:
# Round-trip check: decode the first encoded training sentence back into text.
decord_text(X_train_tokens[0])

'bengrossman utoh they moved you up an hour bummer i can t get down there that fast break a leg'

> Pad the `sequences`.

In [42]:
# Force every token sequence to one fixed length: longer sequences are cut at
# the end, shorter ones are filled at the end.
max_words_in_a_sent = 100
X_train_tokens_padded = pad_sequences(
    X_train_tokens,
    maxlen=max_words_in_a_sent,
    padding='post',
    truncating='post',
)
X_test_tokens_padded = pad_sequences(
    X_test_tokens,
    maxlen=max_words_in_a_sent,
    padding='post',
    truncating='post',
)

In [26]:
# Verify the padding: every padded row must have length `max_words_in_a_sent`.
# The padded array is inspected here; the raw `X_train_tokens` lists still
# have their original, varying lengths.
for sent in X_train_tokens_padded[:5]:
    print(len(sent), end=', ')

100, 100, 100, 100, 100, 

> Creating a `Neural` Network.

```python

        [EmbeddingLayer]
               |-------
            [LSTM]     | Bidirectional (backward = LSTM)
               |<------       
            [LSTM]     | Bidirectional (backward = GRU)
               |-------
            [LSTM] 
               |
    [GlobalAveragePooling1D]
               |
            [Dense]
               |
            [Dense]
               |
            [Dense]
            /    \
       (polite)  (rude)
            
            
```

In [28]:
import tensorflow.keras as keras

In [50]:
# Recurrent halves of the two bidirectional blocks. Each one returns the full
# sequence so the following recurrent layer still receives a 3-D tensor; the
# backward halves read the sequence in reverse (`go_backwards=True`).
forward_layer_1 = keras.layers.LSTM(
    128,
    dropout=0.5,
    return_sequences=True,
    name="forward_layer_1",
)
backward_layer_1 = keras.layers.LSTM(
    128,
    activation='relu',
    dropout=0.5,
    return_sequences=True,
    go_backwards=True,
    name="backward_layer_1",
)
forward_layer_2 = keras.layers.LSTM(
    64,
    dropout=0.5,
    return_sequences=True,
    name="forward_layer_2",
)
# The second block pairs a forward LSTM with a backward GRU.
backward_layer_2 = keras.layers.GRU(
    64,
    activation='relu',
    dropout=0.5,
    return_sequences=True,
    go_backwards=True,
    name="backward_layer_2",
)

# Assemble the network layer by layer:
# embedding -> 2 x bidirectional -> LSTM -> dense head with a sigmoid output.
model = keras.Sequential(name="model_sequential")
model.add(
    keras.layers.Embedding(
        input_dim=unique_words,
        output_dim=64,
        input_length=max_words_in_a_sent,
    )
)
model.add(
    keras.layers.Bidirectional(
        forward_layer_1,
        backward_layer=backward_layer_1,
        name="bidirectional_layer_1",
    )
)
model.add(
    keras.layers.Bidirectional(
        forward_layer_2,
        backward_layer=backward_layer_2,
        name="bidirectional_layer_2",
    )
)
# Last recurrent layer returns only its final state, i.e. a (batch, 32) tensor.
model.add(keras.layers.LSTM(32, name="lstm_last_layer"))
# The input here is already 2-D, so Flatten leaves its shape unchanged.
model.add(keras.layers.Flatten(name="flatten_layer"))
model.add(keras.layers.Dense(64, activation='tanh', name="dense_layer_1"))
# Single sigmoid unit: binary polarity prediction.
model.add(keras.layers.Dense(1, activation='sigmoid', name="output_dense_layer"))

model.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.binary_crossentropy,
    metrics=["acc"],
)
model.summary()

Model: "model_sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 100, 64)           17442816  
_________________________________________________________________
bidirectional_layer_1 (Bidir (None, 100, 256)          197632    
_________________________________________________________________
bidirectional_layer_2 (Bidir (None, 100, 128)          144000    
_________________________________________________________________
lstm_last_layer (LSTM)       (None, 32)                20608     
_________________________________________________________________
flatten_layer (Flatten)      (None, 32)                0         
_________________________________________________________________
dense_layer_1 (Dense)        (None, 64)                2112      
_________________________________________________________________
output_dense_layer (Dense)   (None, 1)            

In [44]:
# Confirm the number of training labels before fitting.
y_train.shape

(400000,)

In [None]:
# Stop once validation loss has not improved for 3 consecutive epochs and roll
# the model back to the weights of its best epoch. Without
# `restore_best_weights=True` the model would keep the weights from the last
# epoch that ran, which is `patience` epochs past the best one.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
    verbose=0,
)
# `validation_split=.2` holds out a fraction of the training arrays for validation.
history = model.fit(
    X_train_tokens_padded,
    y_train,
    epochs=10,
    batch_size=128,
    validation_split=.2,
    callbacks=[early_stopping],
    verbose=1,
)

Epoch 1/10

In [None]:
class BidirectionaLayer(keras.layers.Layer):
    """Placeholder for a custom Keras layer; no computation is implemented yet."""

    def __init__(self):
        super().__init__()
        
    def call(self, x):
        # Stub: the input is ignored and None is returned implicitly.
        pass

In [None]:
class Net(keras.Model):
    """Empty `keras.Model` subclass; placeholder with no layers or logic yet."""
    pass