The computer cannot directly understand a text because it does not have a structure (Unstructured), for processing some tags need to be added to the text.

*Text(Unstructured)* --> *TOKENIZATION* --> *Stemming* / *Lemmatization* --> *Parts of Speech Tagging* --> *N.E.R(Name Entity Recognition)* --> *Processed text(Structured)*

> Encoding letters v/s Encoding Words (Wats better?)

**Tokenization and Sequencing**
-

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer


**Tokenization**

The tokenizer doesnt is not case sensitive and it also ignores any punctuations

In [47]:
f=open("book.txt","r")
d=f.read()
sentences=d.split("\n")

tokenizer= Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)      # This tokenizer instance, is fit onto the given sentences.

word_dict= tokenizer.word_index
print(f"number of distinct tokens: {list(word_dict.values())[-1]}\n")
print(word_dict,"\n")

sequences= tokenizer.texts_to_sequences(sentences)
print(sequences)

number of distinct tokens: 98

{'the': 1, 'and': 2, 'of': 3, 'a': 4, 'harry': 5, 'potter': 6, 'series': 7, 'wizard': 8, 'in': 9, 'fantasy': 10, 'novels': 11, 'by': 12, 'british': 13, 'all': 14, 'school': 15, 'story': 16, 'united': 17, 'many': 18, 'includes': 19, 'is': 20, 'seven': 21, 'written': 22, 'author': 23, 'j': 24, 'k': 25, 'rowling': 26, 'chronicle': 27, 'lives': 28, 'young': 29, 'his': 30, 'friends': 31, 'ron': 32, 'weasley': 33, 'hermione': 34, 'granger': 35, 'whom': 36, 'are': 37, 'students': 38, 'at': 39, 'hogwarts': 40, 'witchcraft': 41, 'wizardry': 42, 'main': 43, 'arc': 44, 'concerns': 45, "harry's": 46, 'conflict': 47, 'with': 48, 'lord': 49, 'voldemort': 50, 'dark': 51, 'who': 52, 'intends': 53, 'to': 54, 'become': 55, 'immortal': 56, 'overthrow': 57, 'governing': 58, 'body': 59, 'known': 60, 'as': 61, 'ministry': 62, 'magic': 63, 'subjugate': 64, 'wizards': 65, 'muggles': 66, 'non': 67, 'magical': 68, 'people': 69, 'was': 70, 'originally': 71, 'published': 72, 'englis

**Sequencing**

In [None]:
sentences= ["I LOVE my dog",
            "i love my dog",
            "I love my cat",
            "I love my dog!!"]

tokenizer= Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)      # This tokenizer instance, is fit onto the given sentences.

print(f"number of distinct tokens: {list(word_dict.values())[-1]}")

word_dict= tokenizer.word_index  # Extract tokens
print(word_dict)

sequences= tokenizer.texts_to_sequences(sentences)   # Convert to sequences
print(sequences)

number of distinct tokens: 98
{'i': 1, 'love': 2, 'my': 3, 'dog': 4, 'cat': 5}
[[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 5], [1, 2, 3, 4]]


In [None]:
test_data= ['I realy love my dog', 'my dog loves eating']

# I am using the above generated tokenizer to extract sequences from test_data
test_sqn= tokenizer.texts_to_sequences(test_data)

# Observe how a 5 word sentences turn into 4 word sentences, so some data is lost
print(word_dict)
print(test_sqn)

{'i': 1, 'love': 2, 'my': 3, 'dog': 4, 'cat': 5}
[[1, 2, 3, 4], [3, 4]]


In [None]:
sentences= ["I LOVE my dog",
            "i love my dog",
            "I love my cat",
            "I love my dog!!"]

# OOV stands for "Out-Of-Volcablury", this property will generate token for unseen token and hences preserve the sentence.
tokenizer= Tokenizer(num_words=100, oov_token="<OOV>")  
tokenizer.fit_on_texts(sentences)    


# Here, we can see that the very first token is <OOV> and this will be used for all the instances of unseen tokens in test_data.
word_dict= tokenizer.word_index  
sequences= tokenizer.texts_to_sequences(sentences)   
print(word_dict)

test_data= ['I realy love my dog', 'my dog loves eating']
test_sqn= tokenizer.texts_to_sequences(test_data)
print(test_sqn)

number of distinct tokens: 6
{'<OOV>': 1, 'i': 2, 'love': 3, 'my': 4, 'dog': 5, 'cat': 6}
[[2, 1, 3, 4, 5], [4, 5, 1, 1]]


In order to feed this Data into a NN, the size of the sentences should be the same we will pad the sentences, so we pad them with zero, similar to how we used to do with Images.

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

sentences= ["I LOVE my dog",
            "i love my dog!!",
            "I love my cat",
            "I love my dog and he loves me too"]

tokenizer= Tokenizer(num_words=100, oov_token="<OOV>")  
tokenizer.fit_on_texts(sentences)    

word_dict= tokenizer.word_index  
sequences= tokenizer.texts_to_sequences(sentences)  
print(word_dict,"\n") 
print(sequences, "\n")

# pad_sequences(< sequences >, < padding=(pre/post) >, < truncating=(pre/post), < maxlen=(number) >)
# truncating parameter is used if we use maxlen, where to truncate the sentences it len(sentence)>maxlen.
padded_sqn= pad_sequences(sequences, padding='pre')
print(padded_sqn)

{'<OOV>': 1, 'i': 2, 'love': 3, 'my': 4, 'dog': 5, 'cat': 6, 'and': 7, 'he': 8, 'loves': 9, 'me': 10, 'too': 11} 

[[2, 3, 4, 5], [2, 3, 4, 5], [2, 3, 4, 6], [2, 3, 4, 5, 7, 8, 9, 10, 11]] 

[[ 0  0  0  0  0  2  3  4  5]
 [ 0  0  0  0  0  2  3  4  5]
 [ 0  0  0  0  0  2  3  4  6]
 [ 2  3  4  5  7  8  9 10 11]]


**Sentiment Anlysis of Sarcasm**
-


Loading Dataset

In [2]:
import json

f=open("Sarcasm_Headlines_Dataset_v2.json","r")

sentences= []
labels= []
urls= []

for line in f:
    item=json.loads(line)
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])
    urls.append(item['article_link'])

In [3]:
print(len(sentences))
print(sentences[0])
print(labels[0])

28619
thirtysomething scientists unveil doomsday clock of hair loss
1


Generating Training and Test data

In [4]:
from sklearn.model_selection import train_test_split as tts
training_sentences, testing_sentences, train_label, test_label= tts(sentences, labels, test_size=0.2, random_state=42)

print(len(training_sentences))
print(len(train_label))
print(len(testing_sentences))
print(len(test_label))

22895
22895
5724
5724


In [5]:
input_dim= 10000
embedding_dim= 16
max_len= 100

Initilizing training tokenizer

1. After loading the dataset, initiate the tokenizer and fit it only on the Training set( training_sentences).
2. Create the sequences using the tokenizer and the training_sentence.
3. Th3e labels and sequences must be converted to npmpy array for the sequences tro be processed 

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer= Tokenizer(num_words=input_dim, oov_token="<OOV>")
tokenizer.fit_on_texts(training_sentences)

word_dict= tokenizer.word_index
print(f"No. of tokens: {list(word_dict.values())[-1]}")

sequences= tokenizer.texts_to_sequences(training_sentences)
padded_sqn= pad_sequences(sequences, maxlen=max_len, padding="post", truncating="post")

print(padded_sqn[0])
print(padded_sqn.shape)

No. of tokens: 27770
[  27   13  109  638   17  781   67 4774    5   43 1939    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]
(22895, 100)


Tokenizing Testing data

In [7]:
test_sequences= tokenizer.texts_to_sequences(testing_sentences)
padded_sqn_test= pad_sequences(test_sequences, maxlen=max_len, padding="post", truncating="post")

In [8]:
import numpy as np

padded_sqn= np.array(padded_sqn)
train_label= np.array(train_label)
padded_sqn_test= np.array(padded_sqn_test)
test_label= np.array(test_label)

In [9]:
from tensorflow.keras import layers, models

model=models.Sequential([
    layers.Embedding(input_dim, embedding_dim, input_length= max_len),
    layers.GlobalAveragePooling1D(),
    layers.Dense(24, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])



In [10]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [11]:
history= model.fit(padded_sqn, train_label, epochs=30, validation_data=(padded_sqn_test, test_label), verbose=1)
model.save("Sarcasm_detector.keras")

Epoch 1/30
[1m716/716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.5428 - loss: 0.6880 - val_accuracy: 0.7790 - val_loss: 0.5919
Epoch 2/30
[1m716/716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7585 - loss: 0.5305 - val_accuracy: 0.8129 - val_loss: 0.4222
Epoch 3/30
[1m716/716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8283 - loss: 0.3926 - val_accuracy: 0.8307 - val_loss: 0.3827
Epoch 4/30
[1m716/716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8577 - loss: 0.3351 - val_accuracy: 0.8125 - val_loss: 0.3978
Epoch 5/30
[1m716/716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8678 - loss: 0.3039 - val_accuracy: 0.8457 - val_loss: 0.3453
Epoch 6/30
[1m716/716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8891 - loss: 0.2731 - val_accuracy: 0.8513 - val_loss: 0.3372
Epoch 7/30
[1m716/716[0m 

In [22]:
sentence = ["a rainy day sun shines the best"]
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen=max_len, padding="post", truncating="post")
print(model.predict(padded))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[[0.00449917]]
