<a href="https://colab.research.google.com/github/Cerabbite/Easy-conalng/blob/main/Word.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

For each word in the top 10,000 most used English, Spanish, Japanese and Chinese words, there will be 100–1,000 sample sentences containing that word (written using the English alphabet). Using these sentences, this script is going to work out how related each word is to every other word.

In [23]:
# Import libraries
#!pip install keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import json
import numpy as np

In [15]:
# Dataset to python
# The file is parsed one line at a time: each line holds one standalone JSON
# object. UTF-8 is requested explicitly so decoding does not depend on the
# platform's default encoding.
with open('Sarcasm_Headlines_Dataset_v2.json', 'r', encoding='utf-8') as f:
  # Iterate the file lazily instead of materialising every raw line first.
  # Blank lines (e.g. a trailing newline at end of file) are skipped because
  # json.loads() raises JSONDecodeError on an empty document.
  datastore = [json.loads(line) for line in f if line.strip()]

# Split the records into parallel lists: index i of each list refers to the
# same record of `datastore`.
sentences = []
labels = []
urls = []
for item in datastore:
  sentences.append(item['headline'])
  labels.append(item['is_sarcastic'])
  urls.append(item['article_link'])

In [16]:
# Variables
# Settings shared by the preprocessing, model and inference cells.

# Tokenizer / vocabulary
vocab_size = 10_000            # vocabulary size given to the tokenizer and the embedding layer
oov_tok = "<OOV>"              # placeholder token handed to the tokenizer as its OOV token

# Sequence shaping
max_length = 100               # every encoded sentence is padded/truncated to this length
padding_type = trunc_type = 'post'   # both padding and truncation use the 'post' mode

# Model / data split
embedding_dim = 16             # width of each learned word vector
training_size = 20_000         # index at which the data is cut into train and test parts

In [17]:
# Separate training from testing
# The first `training_size` records form the training set; everything after
# that index is held out for testing. Sentences and labels are cut at the same
# index so they stay aligned.
training_sentences, testing_sentences = sentences[:training_size], sentences[training_size:]
training_labels, testing_labels = labels[:training_size], labels[training_size:]

In [18]:
# Tokenization
# For a quick experiment the tokenizer can be fitted on a few toy sentences
# instead, e.g. 'I love my dog', 'I love my cat', 'You love my dog!'.

# The vocabulary is fitted on the training sentences only; `oov_tok` is the
# placeholder token configured for words the tokenizer does not know.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Encode each sentence as a list of integer word ids, then pad/truncate every
# list to `max_length` entries so the result is one rectangular array.
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(
    training_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# Sanity check: the first encoded headline and the overall array shape.
print(training_padded[0])
print(training_padded.shape)

[   1  325 3169 5817 2489    3  655  993    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]
(20000, 100)


In [19]:
# Test
# Encode the held-out sentences with the already-fitted tokenizer and the same
# padding/truncation settings that were used for the training sentences.
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(
    testing_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# Sanity check: the first encoded test headline and the overall array shape.
print(testing_padded[0])
print(testing_padded.shape)

[  83 4338    1    6 2186  137  625  181   31   28 3008   56    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]
(8619, 100)


In [24]:
# Neural network
# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and batch shuffling are repeatable from run to run (as far as
# the hardware backend allows).
random_seed = 42
tf.keras.utils.set_random_seed(random_seed)

# Keras is fed NumPy arrays; the label collections are plain Python lists up
# to this point, so they are converted here together with the padded inputs.
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

# Binary classifier: word ids -> learned word vectors -> average over the
# sentence -> small dense layer -> single sigmoid output.
model = tf.keras.Sequential([
    # The input shape is declared explicitly instead of through the Embedding
    # layer's `input_length` argument, which newer Keras releases deprecate.
    tf.keras.Input(shape=(max_length,)),
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# NOTE(review): in the recorded run the validation loss is lowest around
# epoch 4 and rises afterwards while the training loss keeps falling, so 30
# epochs probably overfits -- consider fewer epochs or early stopping.
num_epochs = 30

history = model.fit(
    training_padded,
    training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    verbose=2,
)

Epoch 1/30
625/625 - 3s - loss: 0.6506 - accuracy: 0.6266 - val_loss: 0.5229 - val_accuracy: 0.8107 - 3s/epoch - 5ms/step
Epoch 2/30
625/625 - 2s - loss: 0.4106 - accuracy: 0.8388 - val_loss: 0.3703 - val_accuracy: 0.8413 - 2s/epoch - 3ms/step
Epoch 3/30
625/625 - 3s - loss: 0.3078 - accuracy: 0.8758 - val_loss: 0.3485 - val_accuracy: 0.8460 - 3s/epoch - 4ms/step
Epoch 4/30
625/625 - 2s - loss: 0.2561 - accuracy: 0.9003 - val_loss: 0.3345 - val_accuracy: 0.8536 - 2s/epoch - 4ms/step
Epoch 5/30
625/625 - 2s - loss: 0.2222 - accuracy: 0.9119 - val_loss: 0.3386 - val_accuracy: 0.8507 - 2s/epoch - 4ms/step
Epoch 6/30
625/625 - 3s - loss: 0.1956 - accuracy: 0.9263 - val_loss: 0.3411 - val_accuracy: 0.8545 - 3s/epoch - 4ms/step
Epoch 7/30
625/625 - 2s - loss: 0.1731 - accuracy: 0.9370 - val_loss: 0.3570 - val_accuracy: 0.8500 - 2s/epoch - 4ms/step
Epoch 8/30
625/625 - 2s - loss: 0.1552 - accuracy: 0.9443 - val_loss: 0.3705 - val_accuracy: 0.8500 - 2s/epoch - 4ms/step
Epoch 9/30
625/625 - 2s 

In [26]:
# Use trained model
# Two unseen headlines to score with the trained classifier.
sentence = [
    "Granny starting to fear spiders in the garden might be real",
    "the weather today is bright and sunny",
]

# New text goes through the same tokenizer and padding settings as the
# training data so the model receives inputs of the length it was trained on.
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(
    sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# One sigmoid output per input sentence; the model was trained against the
# `is_sarcastic` labels, so values near 1 correspond to that class.
print(model.predict(padded))

[[9.881681e-01]
 [2.386909e-05]]
