In [None]:
from nltk.tokenize import TweetTokenizer, RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import re
import pandas as pd
import pickle
import nltk


In [None]:

# Fetch the NLTK stop-word corpus that the stop-word removal cell reads.
nltk.download('stopwords')


In [None]:

# Load the tweet dataset from the CSV file in the working directory.
# Column labels are supplied via `names=`, so pandas treats every line of the
# file (including the first) as a data row.
DATASET_ENCODING = "ISO-8859-1"
DATASET_COLUMNS  = ["sentiment", "ids", "date", "flag", "user", "tweet"]
df = pd.read_csv('./training.1600000.processed.noemoticon.csv', delimiter=',', encoding=DATASET_ENCODING , names=DATASET_COLUMNS)


In [None]:
df.head()

In [None]:
# Class balance check: number of rows carrying each sentiment label.
print((df.sentiment == 4).sum())  # Pos
print((df.sentiment == 0).sum())  # Neg


In [None]:
# Preprocessing
import re
import string


def remove_URL(text):
    """Return ``text`` with every URL-like substring deleted.

    A URL here is anything that starts with ``http://`` or ``https://``, or
    with ``www.``, and runs up to the next whitespace character.
    """
    return re.sub(r"https?://\S+|www\.\S+", "", text)


def remove_punct(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # A translation table mapping a code point to None deletes that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

def remove_mention(text):
    """Remove Twitter-style ``@username`` mentions from ``text``.

    Handles may contain underscores as well as ASCII letters and digits, so the
    underscore is part of the matched character class; without it a mention
    such as ``@john_doe`` would leave ``_doe`` behind in the tweet.

    Args:
        text: Tweet text to clean.

    Returns:
        ``text`` with every ``@`` that is followed by one or more handle
        characters removed together with those characters. Surrounding
        whitespace is left untouched; a lone ``@`` is kept.
    """
    return re.sub(r"@[A-Za-z0-9_]+", "", text)


def stem_tweets(tweet):
    """Stem every whitespace-separated token of ``tweet``.

    The stemmed tokens are joined back together with single spaces. The
    module-level ``stemmer`` object is looked up when the function is called,
    so it must have been created before the first call.
    """
    return ' '.join(map(stemmer.stem, tweet.split()))


In [None]:
# Uncomment the next line to work on the first 100 rows only while iterating.
# df = df.head(100)
# Row counts per sentiment label for the frame that the following cells use.
print((df.sentiment == 4).sum())  # Pos
print((df.sentiment == 0).sum())  # Neg


In [None]:
# Clean the tweet text: strip @mentions first, then URLs, then punctuation.
df["tweet"] = (
    df.tweet
    .map(remove_mention)
    .map(remove_URL)
    .map(remove_punct)
)
df.head()


In [None]:
# remove stopwords
from nltk.corpus import stopwords

# English stop words, stored as a set so the `not in stop` checks in
# remove_stopwords are hash lookups.
stop = set(stopwords.words("english"))

def remove_stopwords(text):
    """Lower-case ``text`` word by word and drop the words contained in ``stop``.

    ``text`` is split on whitespace; the surviving words are returned
    lower-cased and joined with single spaces.
    """
    kept = []
    for word in text.split():
        lowered = word.lower()
        if lowered not in stop:
            kept.append(lowered)
    return " ".join(kept)


In [None]:
stop

In [None]:
# Lower-case every tweet and drop its stop words, then show the resulting column.
df["tweet"] = df.tweet.map(remove_stopwords)
df.tweet


In [None]:
from collections import Counter

# Count unique words


def counter_word(text_col):
    """Tally how often each whitespace-separated word occurs in ``text_col``.

    ``text_col.values`` is iterated and every element is split with
    ``str.split()``. The result is a ``Counter`` mapping word -> occurrences.
    """
    return Counter(
        word
        for text in text_col.values
        for word in text.split()
    )


# Word frequencies over the tweet column in its current (cleaned) state.
counter = counter_word(df.tweet)

# Number of distinct words.
len(counter)

In [None]:
# Show only the most frequent entries; the full Counter holds one entry per
# distinct word and would flood the cell output.
counter.most_common(20)

In [None]:
counter.most_common(5)


In [None]:
# Number of distinct words in `counter`; used further down to size the
# tokenizer vocabulary and the embedding layer.
# NOTE(review): `counter` is built before the stemming cell runs, so this is
# presumably an upper bound on the post-stemming vocabulary — confirm.
num_unique_words = len(counter)



In [None]:
from nltk.stem.snowball import SnowballStemmer

# English Snowball stemmer; `stem_tweets` looks this name up at call time.
stemmer = SnowballStemmer("english")

# Replace every tweet by its stemmed form and preview the result.
df['tweet'] = df.tweet.map(stem_tweets)
df.head()

In [None]:
# Split dataset into training and validation set.
#
# The rows are shuffled (fixed seed, so re-runs give the same split) before the
# positional 80/20 cut. Slicing in file order would make the class mix of each
# part depend entirely on how the CSV is ordered — presumably it is grouped by
# sentiment, which would leave the tail slice with a single class.
shuffled_df = df.sample(frac=1, random_state=42)
train_size = int(shuffled_df.shape[0] * 0.8)

train_df = shuffled_df[:train_size]
val_df = shuffled_df[train_size:]

# Split text and labels.
# The raw labels are 0 (negative) and 4 (positive). The model defined further
# down ends in one sigmoid unit trained with binary cross-entropy, which needs
# 0/1 targets, so 4 is mapped to 1 and everything else to 0.
train_sentences = train_df.tweet.to_numpy()
train_labels = (train_df.sentiment == 4).astype(int).to_numpy()
val_sentences = val_df.tweet.to_numpy()
val_labels = (val_df.sentiment == 4).astype(int).to_numpy()


In [None]:
train_sentences.shape, val_sentences.shape


In [None]:
# Tokenize
from tensorflow.keras.preprocessing.text import Tokenizer

# Vectorize a text corpus by turning each text into a sequence of integers.
# `num_words` is set to the vocabulary size counted earlier.
tokenizer = Tokenizer(num_words=num_unique_words)
# Fit on the training split only, so validation tweets do not shape the vocabulary.
tokenizer.fit_on_texts(train_sentences)  # fit only to training


In [None]:
# Mapping word -> integer id held by the fitted tokenizer; each word has a unique index.
word_index = tokenizer.word_index


In [None]:
# Preview a handful of entries only; the full mapping has one entry per
# vocabulary word and would flood the cell output.
dict(list(word_index.items())[:10])

In [None]:
# Convert every tweet of both splits into a list of integer word ids, using
# the tokenizer that was fitted on the training sentences.
train_sequences = tokenizer.texts_to_sequences(train_sentences)
val_sequences = tokenizer.texts_to_sequences(val_sentences)


In [None]:
print(train_sentences[0:5])
print(train_sequences[0:5])


In [None]:
# Pad the sequences to have the same length
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Every sequence is brought to exactly this many word ids.
max_length = 20

# Shared settings: short sequences are padded and long ones cut at the end ("post").
_pad_options = dict(maxlen=max_length, padding="post", truncating="post")
train_padded = pad_sequences(train_sequences, **_pad_options)
val_padded = pad_sequences(val_sequences, **_pad_options)
train_padded.shape, val_padded.shape


In [None]:
train_padded[10]


In [None]:
print(train_sentences[10])
print(train_sequences[10])
print(train_padded[10])


In [None]:
# Check reversing the indices

# Invert word_index so that an integer id maps back to its word.
reverse_word_index = {idx: word for word, idx in word_index.items()}


In [None]:
# Preview a handful of entries only; the full mapping has one entry per
# vocabulary word and would flood the cell output.
dict(list(reverse_word_index.items())[:10])

In [None]:
def decode(sequence):
    """Translate a sequence of word ids back into a space-joined string.

    Each id is looked up in ``reverse_word_index``; ids that are missing from
    it are rendered as ``"?"``.
    """
    words = []
    for idx in sequence:
        words.append(reverse_word_index.get(idx, "?"))
    return " ".join(words)


In [None]:
# Round-trip check: decode one encoded training tweet back into words.
decoded_text = decode(train_sequences[10])

# Print the id sequence next to its decoded text for comparison.
print(train_sequences[10])
print(decoded_text)


In [None]:
# Create LSTM model
from tensorflow.keras import layers
# Take `keras` from TensorFlow as well, so that the Sequential container and the
# layers added to it come from the same Keras implementation.
from tensorflow import keras

# Embedding: https://www.tensorflow.org/tutorials/text/word_embeddings
# Turns positive integers (indexes) into dense vectors of fixed size. (other approach could be one-hot-encoding)

# Word embeddings give us a way to use an efficient, dense representation in which similar words have
# a similar encoding. Importantly, you do not have to specify this encoding by hand. An embedding is a
# dense vector of floating point values (the length of the vector is a parameter you specify).

model = keras.models.Sequential()
model.add(layers.Embedding(num_unique_words, 32, input_length=max_length))

# The layer takes as input an integer matrix of size (batch, input_length).
# Every integer (i.e. word index) in the input must be strictly smaller than the
# layer's first argument (the vocabulary size).
# Now model.output_shape is (None, input_length, 32), where `None` is the batch dimension.


# 64-unit LSTM over the embedded sequence, followed by a single sigmoid unit
# that outputs one value in [0, 1] per tweet.
model.add(layers.LSTM(64, dropout=0.1))
model.add(layers.Dense(1, activation="sigmoid"))

model.summary()


In [None]:
from tensorflow import keras

# Binary cross-entropy on probabilities (the model's last layer already applies
# a sigmoid, hence from_logits=False).
loss = keras.losses.BinaryCrossentropy(from_logits=False)
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
optim = keras.optimizers.Adam(learning_rate=0.001)
metrics = ["accuracy"]

model.compile(loss=loss, optimizer=optim, metrics=metrics)


In [None]:
# Train for 3 epochs; the validation split is evaluated alongside training.
model.fit(train_padded, train_labels, epochs=3,
          validation_data=(val_padded, val_labels))


In [None]:
# The model ends in a sigmoid unit, so `predict` yields one value in [0, 1] per
# sample. Threshold at 0.5 to obtain hard 0/1 class ids; a cut-off above 1
# could never be exceeded and would map every sample to 0.
probabilities = model.predict(train_padded)
predictions = [1 if p > 0.5 else 0 for p in probabilities.ravel()]


In [None]:
# Eyeball ten training examples: their text, their labels, and the predicted classes.
print(train_sentences[10:20])

print(train_labels[10:20])
print(predictions[10:20])
