In [30]:
## Natural Language Processing Introduction
# Preparing text data training
import tensorflow as tf
import tensorflow_datasets as tfds
from bs4 import BeautifulSoup ## for removing html tags
from tensorflow.keras.preprocessing.text import Tokenizer ## For tokenization
import string
import csv
## NLP workflow
# Import Data
# Remove punctuations
# Remove stop words
# Split into training and test sets
# Train tokenizer on the train data, Tokenization is encoding the words into numbers
# Pad as required , This involves truncating or enlongating sentences so that all the data is the same length (so they can all pass through the NN the same)
# Reading from tensorflow datasets
stopwords = ['a','the', 'and', 'of', 'to', 'is', 'in', 'it', 'this', 'i', 'that', 'was', 'as', 'with', 'for', 'movie', 'but', 'film', 'on', 'not', 'are', 'you', 'his', 'have', 'be', 'he', 'one', 'its', 'at', 'all', 'by', 'an', 'they', 'who', 'from', 'like', 'so', 'her', 'or', 'just', 'about', 'has', 'out', 'if', 'some', 'what']
table = str.maketrans('', '', string.punctuation)
imdb_sentences = []
train_data = tfds.as_numpy(tfds.load('imdb_reviews', split="train"))
for item in train_data:
    sentence = str(item['text'].decode('UTF-8').lower()) # Decode text
    soup = BeautifulSoup(sentence) # Remove html tags with beautiful soup
    sentence = soup.get_text()
    words = sentence.split() # Split sentences into words
    filtered_sentence = ""
    for word in words:
        word = word.translate(table) # Remove Punctuations
        if word not in stopwords:
            filtered_sentence = filtered_sentence + word + " "
    imdb_sentences.append(filtered_sentence)
tokenizer = Tokenizer(num_words=25000) # Define tokenizer
tokenizer.fit_on_texts(imdb_sentences)
sequences = tokenizer.texts_to_sequences(imdb_sentences)
print(list(tokenizer.word_index)[:20])   

  soup = BeautifulSoup(sentence) # Remove html tags with beautiful soup


['there', 'good', 'more', 'very', 'when', 'she', 'even', 'up', 'no', 'would', 'my', 'which', 'only', 'time', 'really', 'story', 'their', 'were', 'had', 'see']


In [21]:
# First iteration to find stopwords
print(list(tokenizer.word_index)[:45])

['the', 'and', 'of', 'to', 'is', 'in', 'it', 'this', 'i', 'that', 'was', 'as', 'with', 'for', 'movie', 'but', 'film', 'on', 'not', 'are', 'you', 'his', 'have', 'be', 'he', 'one', 'its', 'at', 'all', 'by', 'an', 'they', 'who', 'from', 'like', 'so', 'her', 'or', 'just', 'about', 'has', 'out', 'if', 'some', 'what']


In [48]:
# Reading from csv file 
sentences=[]
labels=[]
with open('sentiment_analysis.csv', newline="", encoding="utf-8") as csvfile:
     reader = csv.reader(csvfile, delimiter=",")
     next(reader) # Skip header file
     for row in reader:
         # 0 id, 1 label, 2 tweet
         labels.append(int(row[1]))
         sentence = row[2].lower()
         sentence = sentence.replace(",", " , ")
         sentence = sentence.replace(".", " . ")
         sentence = sentence.replace("-", " - ")
         sentence = sentence.replace("/", " / ")
         soup = BeautifulSoup(sentence)
         sentence = soup.get_text()
         words = sentence.split()
         filtered_sentence = ""
         for word in words:
             word = word.translate(table)
             if word not in stopwords:
                 filtered_sentence = filtered_sentence + word + " "
         sentences.append(filtered_sentence)

  soup = BeautifulSoup(sentence)


In [50]:
training_size = int(len(sentences) * 0.8)

training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]

In [56]:
from collections import Counter

# Flatten all words into one list
all_words = [word for sentence in training_sentences for word in sentence.split()]
word_counts = Counter(all_words)

print(len(word_counts))   # total unique words
print(word_counts.most_common(20))   # top 20 frequent words
# to make sure there are no more stop words 
# the unique words count is to fit our tokenizer

20455
[('iphone', 2988), ('com', 2734), ('http', 2729), ('apple', 2359), ('p', 2143), ('my', 1851), ('instagram', 1756), ('samsung', 1151), ('twitter', 937), ('new', 919), ('me', 881), ('https', 827), ('phone', 786), ('am', 688), ('sony', 661), ('instagr', 628), ('…', 584), ('follow', 578), ('www', 514), ('ipad', 409)]


In [64]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

vocab_size = 20455
max_length = 10
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok) # Define tokenizer
tokenizer.fit_on_texts(training_sentences) # Fit
word_index = tokenizer.word_index

# Tokenize and then pad sentences
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

print(list(word_index)[:30])

['<OOV>', 'iphone', 'com', 'http', 'apple', 'p', 'my', 'instagram', 'samsung', 'twitter', 'new', 'me', 'https', 'phone', 'am', 'sony', 'instagr', '…', 'follow', 'www', 'ipad', 'pic', 'love', 'life', 'android', 'now', 'your', 'rt', 'day', 'ly']
