<a href="https://colab.research.google.com/github/Ashahet1/Google-Colab/blob/main/AIMLCodersBook/Chapter_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natural Language Processing
**Encoding Language into Numbers.** For example, take the sentence “I love my dog.” You could encode that with the numbers [1, 2, 3, 4]. If you then wanted to encode “I love my cat.” it could be [1, 2, 3, 5]. You’ve already gotten to the point where you can tell that the sentences have a similar meaning because they’re similar numerically—[1, 2, 3, 4] looks a lot like [1, 2, 3, 5].

This process is called *tokenization*

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
# First corpus: two near-identical sentences that differ only in their last
# word, plus casing and a comma in the second one.
sentences = ['i love my dog', 'I, love my cat']

# Fitting scans the corpus and assigns an integer index to each distinct word.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(word_index)

# Second corpus: the same two sentences plus a question, to show that the
# punctuation ('?' and ',') does not end up inside the indexed words.
sentences1 = ['i love my dog', 'I, love my cat', 'Is it love?']

# A fresh tokenizer is fitted, so the index reflects only this second corpus.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences1)
word_index = tokenizer.word_index
print(word_index)

# Turning sentences into sequences: the original two sentences are encoded
# with the vocabulary learned from the three-sentence corpus.
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)


{'i': 1, 'love': 2, 'my': 3, 'dog': 4, 'cat': 5}
{'love': 1, 'i': 2, 'my': 3, 'dog': 4, 'cat': 5, 'is': 6, 'it': 7}
[[2, 1, 3, 4], [2, 1, 3, 5]]


# Using out-of-vocabulary tokens

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
# Small corpus for the out-of-vocabulary (OOV) demonstration.
sentences = ['Today is a snowy day', 'Will it be rainy tomorrow?']

# Passing oov_token adds the "<OOV>" placeholder to the word index alongside
# the words found in the corpus.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=100)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(word_index)

# Turning sentences into sequences. These are the same sentences the
# tokenizer was fitted on, so every word has its own index.
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)



{'<OOV>': 1, 'today': 2, 'is': 3, 'a': 4, 'snowy': 5, 'day': 6, 'will': 7, 'it': 8, 'be': 9, 'rainy': 10, 'tomorrow': 11}
[[2, 3, 4, 5, 6], [7, 8, 9, 10, 11]]


# Understanding padding
When training neural networks you typically need all your data to be in the same shape. Just as images have to be reformatted to the same width and height, text faces the same issue—once you’ve tokenized your words and converted your sentences into sequences, they can all be different lengths. To get them to be the same size and shape, you can use ***padding***.

sentences = [
    'Today is a sunny day',
    'Today is a rainy day',
    'Is it sunny today?',
    'I really enjoyed walking in the snow today'
]

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
# Four sentences of different lengths, so the resulting sequences need padding.
sentences = [
    'Today is a sunny day',
    'Today is a rainy day',
    'Is it sunny today?',
    'I really enjoyed walking in the snow today',
]

tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(word_index)

# Turning sentences into sequences (one list of word indices per sentence).
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

# Default padding: shorter rows get leading zeros up to the longest sequence.
padded = pad_sequences(sequences)
print(padded)

# Post padding: the zeros are appended after the words instead.
padded1 = pad_sequences(sequences, padding='post')
print(padded1)

# Post padding with a fixed width of 6. Truncation is left at its default,
# so the sequence longer than 6 loses its leading entries.
padded2 = pad_sequences(sequences, maxlen=6, padding='post')
print(padded2)

{'today': 1, 'is': 2, 'a': 3, 'sunny': 4, 'day': 5, 'rainy': 6, 'it': 7, 'i': 8, 'really': 9, 'enjoyed': 10, 'walking': 11, 'in': 12, 'the': 13, 'snow': 14}
[[1, 2, 3, 4, 5], [1, 2, 3, 6, 5], [2, 7, 4, 1], [8, 9, 10, 11, 12, 13, 14, 1]]
[[ 0  0  0  1  2  3  4  5]
 [ 0  0  0  1  2  3  6  5]
 [ 0  0  0  0  2  7  4  1]
 [ 8  9 10 11 12 13 14  1]]
[[ 1  2  3  4  5  0  0  0]
 [ 1  2  3  6  5  0  0  0]
 [ 2  7  4  1  0  0  0  0]
 [ 8  9 10 11 12 13 14  1]]
[[ 1  2  3  4  5  0]
 [ 1  2  3  6  5  0]
 [ 2  7  4  1  0  0]
 [10 11 12 13 14  1]]


# Removing Stopwords and Cleaning Text
The first step in cleaning text is to strip out HTML tags. Fortunately, there’s a library called BeautifulSoup that makes this straightforward.

In [None]:
from bs4 import BeautifulSoup

# Sample corpus to clean. It is bound to `sentences` because that is the name
# joined below; binding it to `sentence` made this cell depend on a
# `sentences` variable left over from an earlier cell, which fails on a
# fresh kernel.
sentences = [
    'Today is a sunny day',
    'Today is a rainy day',
    'Is it sunny today?',
    'I really enjoyed walking in the snow today'
]
# Join the list of sentences into a single string
text = ' '.join(sentences)
# The parser is named explicitly so the result does not depend on which
# parsers happen to be installed.
soup = BeautifulSoup(text, 'html.parser')
# get_text() returns the text with any markup removed.
sentence = soup.get_text()
print(sentence)

Today is a sunny day Today is a rainy day Is it sunny today? I really enjoyed walking in the snow today


# Working with Real Data Sources
TensorFlow Datasets provides access to several text-based datasets. Here we’ll explore `imdb_reviews`, a dataset of 50,000 labeled movie reviews from the Internet Movie Database (IMDb), each of which is labeled as positive or negative in sentiment.

In [None]:
import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow import keras
# Loading the data
imdb_senetnce = []
train_data = tfds.as_numpy(tfds.load('imdb_reviews', split='train'))
for item in train_data:
    # as_numpy yields each review's text as a bytes object. Calling str() on
    # bytes produces the "b'...'" repr (leading b, quotes, backslash escapes),
    # which would leak into the vocabulary, so decode to real text instead.
    imdb_senetnce.append(item['text'].decode('utf-8'))

# Build the vocabulary over all training reviews; num_words caps how many of
# the indexed words are used when converting texts to sequences.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=25000)
tokenizer.fit_on_texts(imdb_senetnce)
sequences = tokenizer.texts_to_sequences(imdb_senetnce)

print(tokenizer.word_index)

Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.Z3LAAT_1.0.0/imdb_reviews-train.tfrecor…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.Z3LAAT_1.0.0/imdb_reviews-test.tfrecord…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.Z3LAAT_1.0.0/imdb_reviews-unsupervised.…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


# Getting Text from CSV Files
Sentiment Analysis

In [20]:
from bs4 import BeautifulSoup
import string

# Stopwords to drop from the text. Contractions are stored without their
# apostrophes ("hes", "theyre", ...), matching words that have already had
# punctuation removed with `table` below. The list is produced by splitting a
# whitespace-separated block; order and duplicates ("were" appears twice)
# are kept exactly.
stopwords = """
a about above after again against all am an and any are as at
be because been before being below between both but by could did do
does doing down during each few for from further had has have having
he hed hes her here heres hers herself him himself his how
hows i id ill im ive if in into is it its itself
lets me more most my myself nor of on once only or other ought
our ours ourselves out over own same she shed shell shes should
so some such than that thats the their theirs them themselves then
there theres these they theyd theyll theyre theyve this those through
to too under until up very was we wed well were weve were
what whats when whens where wheres which while who whos whom why
whys with would you youd youll youre youve your yours yourself
yourselves
""".split()

# Translation table that deletes every character in string.punctuation when
# passed to str.translate().
table = str.maketrans(dict.fromkeys(string.punctuation))

In [22]:
import csv
import requests

# Download the file
url = 'https://storage.googleapis.com/learning-datasets/binary-emotion.csv'
local_file = '/tmp/binary-emotion.csv'  # Specify the desired local path

# A timeout stops the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=60)
# Check if the download was successful
response.raise_for_status()  # Will raise an exception for bad status codes (4xx, 5xx)

with open(local_file, 'wb') as file:
    file.write(response.content)

# Parallel lists: labels[i] is the integer label for sentences[i].
sentences = []
labels = []
# newline='' is the csv module's documented way to open files, so line
# endings inside quoted fields reach the reader untouched.
with open(local_file, encoding='UTF-8', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=",")
    for row in reader:
        # Column 0 holds the label, column 1 the raw text.
        labels.append(int(row[0]))
        sentence = row[1].lower()
        # Surround these punctuation marks with spaces so they split off as
        # separate tokens instead of gluing two words together.
        sentence = sentence.replace(",", " , ")
        sentence = sentence.replace(".", " . ")
        sentence = sentence.replace("-", " - ")
        sentence = sentence.replace("/", " / ")
        # Strip any HTML markup. The parser is named explicitly so bs4 does
        # not have to guess one (which depends on what is installed).
        soup = BeautifulSoup(sentence, 'html.parser')
        sentence = soup.get_text()
        words = sentence.split()
        filtered_sentence = ""
        for word in words:
            # Remove punctuation first so the word can match the
            # apostrophe-free entries in `stopwords`.
            word = word.translate(table)
            if word not in stopwords:
                filtered_sentence = filtered_sentence + word + " "
        sentences.append(filtered_sentence)
print(len(labels))
print(len(sentences))

  soup = BeautifulSoup(sentence)


35327
35327


In [23]:
# Creating training and test subsets
# The cleaned corpus is a list of sentences with a parallel list of labels;
# both are cut at the same index so each sentence stays aligned with its label.

training_size = 28000

# The first `training_size` items are used for training, the rest for testing.
training_sentences, testing_sentences = sentences[:training_size], sentences[training_size:]
training_labels, testing_labels = labels[:training_size], labels[training_size:]


In [24]:
# Settings for tokenizing and padding.
vocab_size = 20000      # passed to the Tokenizer as num_words
embedding_dim = 32      # not used in this cell
max_length = 10         # every padded sequence ends up with this length
trunc_type = 'post'     # where pad_sequences truncates longer sequences
padding_type = 'post'   # where pad_sequences adds zeros to shorter sequences
oov_tok = "<OOV>"

# The vocabulary is learned from the training sentences only.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Encode both splits with that vocabulary...
training_sequences, testing_sequences = (
    tokenizer.texts_to_sequences(corpus)
    for corpus in (training_sentences, testing_sentences)
)

# ...then pad/truncate both to the same fixed length.
training_padded, testing_padded = (
    pad_sequences(seqs, padding=padding_type, truncating=trunc_type, maxlen=max_length)
    for seqs in (training_sequences, testing_sequences)
)

In [26]:
# Length of the first training example before and after padding, one per line,
# followed by the full vocabulary.
print(len(training_sequences[0]), len(training_padded[0]), sep="\n")
print(word_index)

8
10


A smaller dataset that’s stored in JSON and a lot of fun to work with is the News Headlines Dataset for Sarcasm Detection by Rishabh Misra, available on Kaggle. The next cell downloads and pre-processes the sarcasm headlines.

In [29]:
import json
import requests

# Download the file
url = 'https://storage.googleapis.com/learning-datasets/sarcasm.json'
local_file = '/tmp/sarcasm.json'  # Specify the desired local path

# A timeout stops the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=60)
response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
with open(local_file, 'wb') as file:  # Use 'wb' to write binary data
    file.write(response.content)

# The encoding is given explicitly so reading does not depend on the
# platform's default text encoding.
with open(local_file, 'r', encoding='utf-8') as file:
    datastore = json.load(file)

# Parallel lists: entry i of each list comes from the same headline record.
sentences = []
labels = []
urls = []

for item in datastore:
    sentence = item['headline'].lower()
    # Surround these punctuation marks with spaces so they split off as
    # separate tokens instead of gluing two words together.
    sentence = sentence.replace(",", " , ")
    sentence = sentence.replace(".", " . ")
    sentence = sentence.replace("-", " - ")
    sentence = sentence.replace("/", " / ")
    # Strip any HTML markup. The parser is named explicitly so bs4 does not
    # have to guess one (which depends on what is installed).
    soup = BeautifulSoup(sentence, 'html.parser')
    sentence = soup.get_text()
    words = sentence.split()
    filtered_sentence = ""
    for word in words:
        # Remove punctuation first so the word can match the apostrophe-free
        # entries in `stopwords`.
        word = word.translate(table)
        if word not in stopwords:
            filtered_sentence = filtered_sentence + word + " "
    sentences.append(filtered_sentence)
    labels.append(item['is_sarcastic'])
    urls.append(item['article_link'])
print(len(sentences))
print(len(labels))
print(len(urls))


  soup = BeautifulSoup(sentence)


26709
26709
26709


In [30]:
# Cut the headlines and their labels at the same index so they stay aligned:
# the first `training_size` items are for training, the remainder for testing.
training_size = 20000
training_sentences, testing_sentences = sentences[:training_size], sentences[training_size:]
training_labels, testing_labels = labels[:training_size], labels[training_size:]

In [32]:
# Settings for tokenizing and padding.
vocab_size = 20000
embedding_dim = 32
max_length = 10
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"

# The vocabulary is learned from the training headlines only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

training_sequences = tokenizer.texts_to_sequences(training_sentences)
# Apply the settings defined above. Previously max_length, trunc_type and
# padding_type were declared but ignored, so `padded` was as wide as the
# longest headline instead of max_length.
padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
print(word_index)


