In [1]:
import tensorflow as tf
from tensorflow.keras.layers import GRU, LSTM, Dense, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

import string
import re
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('/home/tim/Datasets/twitter4000.csv')
data.sample(5)

Unnamed: 0,twitts,sentiment
93,does any1 have the spousal abuse number? I jus...,0
2117,"@BigSithewineguy Hey there. Looks like, on BBR...",1
839,is heartbroken,0
841,I dont wanna leave San Diego but I pack anyw...,0
609,Tonight at Mission Street Food: Trumpet mushro...,0


In [3]:
data.iloc[3333]

twitts       Unsuccessfully tried to get my friend's car ke...
sentiment                                                    1
Name: 3333, dtype: object

In [4]:
%time
def clean_text(txt):
    contraction_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", 
                        "could've": "could have", "couldn't": "could not", "didn't": "did not", 
                        "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not",
                        "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", 
                        "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is", 
                        "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have",
                        "I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", 
                        "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have",
                        "isn't": "is not", "it'd": "it would", "it'd've": "it would have", 
                        "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us",
                        "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not",
                        "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", 
                        "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
                        "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have",
                        "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
                        "she'd": "she would", "she'd've": "she would have", "she'll": "she will", 
                        "she'll've": "she will have", "she's": "she is", "should've": "should have", 
                        "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
                        "so's": "so as", "this's": "this is","that'd": "that would", 
                        "that'd've": "that would have", "that's": "that is", "there'd": "there would",
                        "there'd've": "there would have", "there's": "there is", "here's": "here is",
                        "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
                        "they'll've": "they will have", "they're": "they are", "they've": "they have",
                        "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have",
                        "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", 
                        "weren't": "were not", "what'll": "what will", "what'll've": "what will have", 
                        "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is",
                        "when've": "when have", "where'd": "where did", "where's": "where is",
                        "where've": "where have", "who'll": "who will", "who'll've": "who will have",
                        "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have",
                        "will've": "will have", "won't": "will not", "won't've": "will not have",
                        "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", 
                        "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have",
                        "y'all're": "you all are","y'all've": "you all have","you'd": "you would", 
                        "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
                        "you're": "you are", "you've": "you have"}
        
    def _get_contractions(contraction_dict):
        contraction_re = re.compile('(%s)' % '|'.join(contraction_dict.keys()))
        return contraction_dict, contraction_re
    
    def replace_contractions(text):
        contractions, contractions_re = _get_contractions(contraction_dict)
        def replace(match):
            return contractions[match.group(0)]
        return contractions_re.sub(replace, text)
    
    txt = replace_contractions(txt)
    
    txt = "".join([char for char in txt if char not in string.punctuation])
    txt = re.sub('[0-9]+', '', txt)
    
    words = word_tokenize(txt)
    
#    stop_words = set(stopwords.words('english'))
#    words = [w for w in words if not w in stop_words]
    
    words = [word for word in words if word.isalpha()]
    
    cleaned_text = ' '.join(words)
    return cleaned_text

data['cleaned_text'] = data['twitts'].apply(lambda txt: clean_text(txt))

CPU times: user 11 µs, sys: 2 µs, total: 13 µs
Wall time: 25 µs


In [5]:
data.sample(10)

Unnamed: 0,twitts,sentiment,cleaned_text
293,arghhh pissed off that essendon lost,0,arghhh pissed off that essendon lost
2876,@peterfacinelli ..watching son playing soccer....,1,peterfacinelli watching son playing soccerin t...
1894,why doesn't my pic show up when I tweetsearch?,0,why does not my pic show up when I tweetsearch
3988,@TiaMowry CW sux for dropping the show. Thank...,1,TiaMowry CW sux for dropping the show Thanks G...
83,@marcusjroberts working on a Sunday sucks don...,0,marcusjroberts working on a Sunday sucks do no...
1203,It's hot. Kinda wishing I did this. Bummer. G...,0,Its hot Kinda wishing I did this Bummer Got ta...
560,I hate hiccups I've had them all day.,0,I hate hiccups I have had them all day
3774,Outta wk in 10 mins &amp; Running home cus my ...,1,Outta wk in mins amp Running home cus my Bros ...
3697,@JasonNegron My team didn't *pouting* I was ho...,1,JasonNegron My team did not pouting I was hopi...
860,yeah. several years ago. miss him every day ...,0,yeah several years ago miss him every day but ...


In [6]:
x_train, x_test, y_train, y_test = train_test_split(data['cleaned_text'], data['sentiment'], test_size=0.20, shuffle=True)

In [7]:
max_len = x_train.apply(lambda x: len(x)).max()
max_len

135

In [8]:
max_words = 3000
tokenizer = Tokenizer(num_words = max_words)
tokenizer.fit_on_texts(x_train)

x_train_seq = tokenizer.texts_to_sequences(x_train)
x_test_seq = tokenizer.texts_to_sequences(x_test)

x_train_padded = pad_sequences(x_train_seq, maxlen = max_len)
x_test_padded = pad_sequences(x_test_seq, maxlen = max_len)

In [9]:
word_index = tokenizer.word_index
len(word_index)

8516

In [10]:
print(x_train[:1])
print(x_train_seq[:1])
print(x_train_padded[:1])

3769    With sister arianne httpplurkcompxkesi
Name: cleaned_text, dtype: object
[[625, 443, 2428, 2429]]
[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0  625  443 2428 2429]]


In [11]:
%time
embedding_vectors = {}
with open('/home/tim/trained/glove/glove.6B.50d.txt', 'r', encoding='utf-8') as file:
    for row in file:
        values = row.split(' ')
        word = values[0]
        weights = np.asarray([float(val) for val in values[1:]])
        embedding_vectors[word] = weights

print(len(embedding_vectors))

CPU times: user 11 µs, sys: 1e+03 ns, total: 12 µs
Wall time: 22.6 µs
400000


In [12]:
emb_dim = 50
if max_words is not None:
    vocab_len = max_words
else:
    vocab_len = len(word_index)+1
    
embedding_matrix = np.zeros((vocab_len, emb_dim))
oov_count = 0
oov_words = []
for word, idx in word_index.items():
    if idx < vocab_len:
        embedding_vector = embedding_vectors.get(word)
        if embedding_vector is not None:
            embedding_matrix[idx] = embedding_vector
        else:
            oov_count +=1
            oov_words.append(word)
            
print(oov_words[0:5])        

['hahaha', 'lmao', 'bday', 'mileycyrus', 'tommcfly']


In [13]:
print(oov_count, 'out of', vocab_len, 'words were OOV')

375 out of 3000 words were OOV


In [14]:
model = Sequential()
model.add(Embedding(vocab_len, emb_dim, trainable=True, weights= [embedding_matrix]))
model.add(LSTM(8))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation= 'sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics= ['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 50)          150000    
_________________________________________________________________
lstm (LSTM)                  (None, 8)                 1888      
_________________________________________________________________
dense (Dense)                (None, 16)                144       
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 152,049
Trainable params: 152,049
Non-trainable params: 0
_________________________________________________________________


In [15]:
model.fit(x_train_padded, np.asarray(y_train), epochs=5, validation_data=(x_test_padded, np.asarray(y_test)))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7efc651887f0>

In [19]:
model.save('/home/tim/trained/imdbglove')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: /home/tim/trained/imdbglove/assets


In [80]:
print(x_test.head(60))

205     charbrum deliver weekends nickbev place order ...
2075                 mileycyrus hated lady gaga ow looove
477     well six months trying get pregnant still go h...
2446                          accelerate ps hang sometime
2662                      iamthecrime awww change I liked
1577    Kind dealbreaker u discover hot guy u know kin...
3444    CYHSYtheband I shall dance I really want satan...
3956    nlowenlsu watch next week I even remind make s...
670     A random BEAUTIFUL baby gave hug shopping So p...
3448    enjoyed Plymouth uni Summer Ball last nite awe...
263                                 therealtommyg LIARRRR
3698             chrisaffair Warped tour Our summer dates
3803                               YoungCash Awwwww sweet
3059    want change hair againn looove add hair ate tw...
2161    Just got great feedback negotiation nice disco...
1547                     Didnt good day feeling kinda sad
640     pod doesnt enough space better start saving mo...
1248          

In [93]:
test = (x_test_padded[205])
print(test)

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    4
  255    2  509 1149  651  192  339  370 1578]


In [89]:
x_test[205]

'charbrum deliver weekends nickbev place order take tea'

In [96]:
pre = model.predict((test))
len(pre)
pre

array([[0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.52215976],
       [0.522

In [106]:
sample_text = ['the movie was good, i will recommend to watch']
tokenizer.fit_on_texts(sample_text)
seq = tokenizer.texts_to_sequences(sample_text)
pad = pad_sequences(seq, maxlen=135)
predictions = model.predict(pad)
predictions

array([[0.7282256]], dtype=float32)