In [2]:
import matplotlib.pyplot as plt
import os
import re
import string
import tensorflow as tf
import pandas as pd
import numpy as np
import time

from tensorflow import keras

In [3]:
df = pd.read_csv("training.1600000.processed.noemoticon.csv", encoding='latin-1', usecols=[0,5])

In [185]:
df.rename(columns={df.columns[0]: 'Sentiment', df.columns[1]: 'Content'}, inplace=True)

In [186]:
df.sample(5)

Unnamed: 0,Sentiment,Content
1461514,4,YESSS. @DavidArchie is finally following @Tomm...
269176,0,i really hated the mtv music awards ouugh
1208341,4,@apogeum I can totally tell its her eyes idky
319381,0,Add sore neck to the headache. Three conferenc...
24826,0,So I need to finish a book in 12 hours.


In [187]:
#testing code
df = df.sample(10000)

In [188]:
df = df.sort_values(by='Content')

In [189]:
print((df['Sentiment']==0).sum(), (df['Sentiment']==4).sum())

5123 4877


In [190]:
print(df['Sentiment'].unique())

[0 4]


In [191]:
# Preprocessing
import re
import string

def remove_URL(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"", text)

# https://stackoverflow.com/questions/34293875/how-to-remove-punctuation-marks-from-a-string-in-python-3-x-using-translate/34294022
def remove_punct(text):
    translator = str.maketrans("", "", string.punctuation)
    return text.translate(translator)

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [192]:
pattern = re.compile(r"https?://(\S+|www)\.\S+")
for t in df.Content:
    matches = pattern.findall(t)
    for match in matches:
        print(t)
        print(match)
        print(pattern.sub(r"", t))
    if len(matches) > 0:
        break

 am suddenly v. v. sad and tired. :\ also - can anyone think of great hero anthems? counterpart to heroine's: http://snurl.com/kixxt
snurl
 am suddenly v. v. sad and tired. :\ also - can anyone think of great hero anthems? counterpart to heroine's: 


In [193]:
df["Content"] = df.Content.map(remove_URL) # map(lambda x: remove_URL(x))
df["Content"] = df.Content.map(remove_punct)

In [194]:
# remove stopwords
# pip install nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Stop Words: A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine
# has been programmed to ignore, both when indexing entries for searching and when retrieving them 
# as the result of a search query.
stop = set(stopwords.words("english"))

# https://stackoverflow.com/questions/5486337/how-to-remove-stop-words-using-nltk-or-python
def remove_stopwords(text):
    filtered_words = [word.lower() for word in text.split() if word.lower() not in stop]
    return " ".join(filtered_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Brian\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [195]:
df["Content"] = df.Content.map(remove_stopwords)

In [196]:
from collections import Counter

# Count unique words
def counter_word(text_col):
    count = Counter()
    for text in text_col.values:
        for word in text.split():
            count[word] += 1
    return count


counter = counter_word(df.Content)

In [197]:
num_unique_words = len(counter)

In [198]:
# Split dataset into training and validation set
train_size = int(df.shape[0] * 0.8)

train_df = df[:train_size]
val_df = df[train_size:]

# split text and labels
train_sentences = train_df.Content.to_numpy()
train_labels = train_df.Sentiment.to_numpy()
val_sentences = val_df.Content.to_numpy()
val_labels = val_df.Sentiment.to_numpy()

In [199]:
train_sentences.shape, val_sentences.shape

((8000,), (2000,))

In [200]:
# Tokenize
from tensorflow.keras.preprocessing.text import Tokenizer

# vectorize a text corpus by turning each text into a sequence of integers
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(train_sentences) # fit only to training

In [201]:
# each word has unique index
word_index = tokenizer.word_index

In [202]:
train_sequences = tokenizer.texts_to_sequences(train_sentences)
val_sequences = tokenizer.texts_to_sequences(val_sentences)

In [203]:
print(train_sentences[10:15])
print(train_sequences[10:15])

['shoulda known got suspicous mf talking buying new chain game last night today wtf lol'
 'spoke soon' 'think shrunk fav boxer briefs oh well ill new ones 2moro'
 'jon amp kate dont let stupid tabloids ruin something thats lasted 10 years'
 'kinda depressed showing emotions ahh fuck']
[[1474, 1278, 16, 4644, 4645, 422, 1475, 35, 2170, 195, 38, 36, 10, 767, 14], [2914, 86], [23, 4646, 677, 2915, 4647, 34, 22, 31, 35, 375, 2171], [1279, 30, 1476, 6, 142, 289, 4648, 2172, 108, 28, 2173, 365, 281], [215, 1129, 1280, 4649, 512, 472]]


In [204]:
# Pad the sequences to have the same length
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Max number of words in a sequence
max_length = 20

train_padded = pad_sequences(train_sequences, maxlen=max_length, padding="post", truncating="post")
val_padded = pad_sequences(val_sequences, maxlen=max_length, padding="post", truncating="post")
train_padded.shape, val_padded.shape

((8000, 20), (2000, 20))

In [205]:
train_padded[100]

array([  14, 4791,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0])

In [206]:
print(train_sentences[100])
print(train_sequences[100])
print(train_padded[100])

lol waddup
[14, 4791]
[  14 4791    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0]


In [207]:
# Check reversing the indices

# flip (key, value)
reverse_word_index = dict([(idx, word) for (word, idx) in word_index.items()])

In [208]:
def decode(sequence):
    return " ".join([reverse_word_index.get(idx, "?") for idx in sequence])

In [209]:
decoded_text = decode(train_sequences[100])

print(train_sequences[100])
print(decoded_text)

[14, 4791]
lol waddup


In [225]:
# Create LSTM model
from tensorflow.keras import layers

# Embedding: https://www.tensorflow.org/tutorials/text/word_embeddings
# Turns positive integers (indexes) into dense vectors of fixed size. (other approach could be one-hot-encoding)

# Word embeddings give us a way to use an efficient, dense representation in which similar words have 
# a similar encoding. Importantly, you do not have to specify this encoding by hand. An embedding is a 
# dense vector of floating point values (the length of the vector is a parameter you specify).

model = keras.models.Sequential()
model.add(layers.Embedding(num_unique_words, 32, input_length=max_length))

# The layer will take as input an integer matrix of size (batch, input_length),
# and the largest integer (i.e. word index) in the input should be no larger than num_words (vocabulary size).
# Now model.output_shape is (None, input_length, 32), where `None` is the batch dimension.


model.add(layers.LSTM(64, dropout=0.1))
model.add(layers.Dense(1, activation="sigmoid"))

model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 20, 32)            616256    
                                                                 
 lstm_9 (LSTM)               (None, 64)                24832     
                                                                 
 dense_10 (Dense)            (None, 1)                 65        
                                                                 
Total params: 641,153
Trainable params: 641,153
Non-trainable params: 0
_________________________________________________________________


In [226]:
loss = keras.losses.BinaryCrossentropy(from_logits=False)
optim = keras.optimizers.Adam(lr=0.001)
metrics = ["accuracy"]

model.compile(loss=loss, optimizer=optim, metrics=metrics)

In [227]:
model.fit(train_padded, train_labels, epochs=20, validation_data=(val_padded, val_labels), verbose=1)

Epoch 1/20

KeyboardInterrupt: 

In [215]:
predictions = model.predict(train_padded)
predictions = [4 if p > 0.5 else 0 for p in predictions]



In [216]:
print(train_sentences[100:120])

print(train_labels[100:120])
print(predictions[100:120])

['lol waddup' 'shakes fist air brown' 'sigh finished book want sequel'
 'cries beyond fucking nervous speech'
 'finally saw revolutionary road saying wasnt depressing kate leo never disappoint beautiful'
 'sigh anything people fishing bloody dalaran fountain need 3 coins'
 'sigh work tuesday 4 days toiling mill'
 'sings fckin best best ever u aint gotta ask twice u lt3 share like last slice'
 '300 followers 2 weeksgoosh'
 'ryanseacrest workout seems well manyou even seeem taller lately ï¿½jermaineï¿½'
 'got pool think im sunburnt'
 'listening metro station shake remixquite good' 'summer'
 'nothing like dragonforce feel fervently fortuitous fb'
 'gt lunch soon yay rainbow appears'
 'debycats washed 3 cars today 10 real awesome'
 'coz wanna live life rich famous lss'
 'need revise chemisty fun feel like shit stirred thoroughly bucket ive even let vamp wars slip'
 'got sunshine' 'dropped piizza pool']
[4 0 0 0 4 0 0 4 4 4 0 4 4 4 4 4 4 0 4 0]
[4, 4, 0, 0, 4, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 