<a href="https://colab.research.google.com/github/Cerabbite/Easy-conalng/blob/main/Word.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

For each of the 10,000 most-used English, Spanish, Japanese and Chinese words, there will be 100–1,000 sample sentences containing that word (written using the English alphabet). Using these sentences, this script works out how closely related the words are.

In [None]:
# Import libraries
#!pip install keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import json
import numpy as np

In [None]:
# Dataset to python
# The file holds one JSON object per line, so each line is parsed on its own
# instead of handing the whole file to json.load().
# - The encoding is pinned so decoding does not depend on the platform's
#   default locale (assumes the file is UTF-8, the standard JSON encoding).
# - Blank lines (e.g. a trailing newline at the end of the file) are skipped
#   rather than crashing json.loads().
with open('Sarcasm_Headlines_Dataset_v2 - Copy.json', 'r', encoding='utf-8') as f:
  datastore = [json.loads(line) for line in f if line.strip()]

# Split the records into parallel lists: index i of each list refers to the
# same record in `datastore`.
sentences = [item['headline'] for item in datastore]
labels = [item['is_sarcastic'] for item in datastore]
urls = [item['article_link'] for item in datastore]

In [None]:
# Variables
# Settings shared by the preprocessing and model cells, grouped by purpose.

# -- Vocabulary --
vocab_size = 15000     # passed to Tokenizer(num_words=...) and to the Embedding layer
oov_tok = "<OOV>"      # token the Tokenizer substitutes for out-of-vocabulary words

# -- Sequence shaping (arguments to pad_sequences) --
max_length = 100       # every sequence is padded/truncated to this many entries
padding_type = 'post'  # pad at the end of the sequence
trunc_type = 'post'    # truncate at the end of the sequence

# -- Model / data split --
embedding_dim = 16     # output dimension of the Embedding layer
training_size = 20000  # number of leading examples used for training

In [None]:
# Separate training from testing
# The first `training_size` examples are used for training; everything after
# them is held out. Sentences and labels are sliced at the same position so
# they stay aligned.
training_sentences, testing_sentences = sentences[:training_size], sentences[training_size:]
training_labels, testing_labels = labels[:training_size], labels[training_size:]

In [None]:
# Tokenization
# The vocabulary is fitted on the training sentences only, so words that occur
# solely in the held-out sentences are encoded as the OOV token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index  # word -> integer id mapping built by fit_on_texts

# Encode each training sentence as a list of integer ids, then pad/truncate
# every list to exactly `max_length` entries to get a rectangular array.
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(
    training_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# Sanity check: first encoded sentence and the overall array shape.
print(training_padded[0])
print(training_padded.shape)

[  328 12776   799  3405  2404    47   389  2214 12777     6  2614  8863
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
(20000, 100)


In [None]:
# Test
# Encode the held-out sentences with the tokenizer that was fitted on the
# training data, then pad/truncate them with the same settings as the
# training set.
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(
    testing_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# First encoded sentence, then the array shape, one per line.
print(testing_padded[0], testing_padded.shape, sep="\n")

[    1  1100  6663  9423    30 11505  2439     5   519   109     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
(6711, 100)


In [None]:
# Train neural network

# Seed Python's `random`, NumPy and TensorFlow in one call so weight
# initialisation and batch shuffling are repeatable when this cell is re-run.
# (Some GPU ops are non-deterministic, so tiny differences may remain.)
seed = 42
tf.keras.utils.set_random_seed(seed)

# The labels are plain Python lists at this point; convert everything passed
# to model.fit to NumPy arrays.
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

# Small bag-of-embeddings classifier: embed each token id, average the
# embeddings over the sequence, then a hidden layer and a single sigmoid unit
# for the binary label.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

num_epochs = 50  # upper bound; early stopping normally ends training sooner

# A previous run of this cell showed val_loss bottoming out around epoch 5 and
# rising afterwards while the training loss kept falling, i.e. the model
# overfits when trained for all 50 epochs. Stop once val_loss has not improved
# for `patience` epochs and roll back to the best weights seen.
# NOTE(review): the held-out set doubles as the early-stopping signal here, so
# the reported val_accuracy is slightly optimistic; carve out a separate
# validation split if an unbiased test score is needed.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

history = model.fit(
    training_padded,
    training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    callbacks=[early_stopping],
    verbose=2,
)

Epoch 1/50
625/625 - 4s - loss: 0.6730 - accuracy: 0.5688 - val_loss: 0.6205 - val_accuracy: 0.6005 - 4s/epoch - 6ms/step
Epoch 2/50
625/625 - 3s - loss: 0.4559 - accuracy: 0.8154 - val_loss: 0.3896 - val_accuracy: 0.8431 - 3s/epoch - 5ms/step
Epoch 3/50
625/625 - 3s - loss: 0.3073 - accuracy: 0.8820 - val_loss: 0.3590 - val_accuracy: 0.8464 - 3s/epoch - 5ms/step
Epoch 4/50
625/625 - 3s - loss: 0.2479 - accuracy: 0.9042 - val_loss: 0.3717 - val_accuracy: 0.8349 - 3s/epoch - 5ms/step
Epoch 5/50
625/625 - 3s - loss: 0.2079 - accuracy: 0.9223 - val_loss: 0.3433 - val_accuracy: 0.8581 - 3s/epoch - 5ms/step
Epoch 6/50
625/625 - 3s - loss: 0.1766 - accuracy: 0.9348 - val_loss: 0.3721 - val_accuracy: 0.8470 - 3s/epoch - 5ms/step
Epoch 7/50
625/625 - 3s - loss: 0.1498 - accuracy: 0.9478 - val_loss: 0.3634 - val_accuracy: 0.8574 - 3s/epoch - 5ms/step
Epoch 8/50
625/625 - 3s - loss: 0.1291 - accuracy: 0.9564 - val_loss: 0.3847 - val_accuracy: 0.8534 - 3s/epoch - 5ms/step
Epoch 9/50
625/625 - 3s 

In [None]:
# Use trained model
# Hand-written example sentences to score with the trained classifier.
sentence = [
    "Granny starting to fear spiders in the garden might be real",
    "the weather today is bright and sunny",
    "That's just what I needed today!",
    "Well, what a surprise.",
    "Really, Sherlock? No! You are clever.",
    "Today I had a job interview.",
    "He went to grab some snacks.",
]

# Apply the same preprocessing as for the training data: encode with the
# fitted tokenizer, then pad/truncate to `max_length`.
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(
    sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# One row per input sentence: the output of the model's final sigmoid unit,
# where values near 1 correspond to the `is_sarcastic` label used in training.
predictions = model.predict(padded)
print(predictions)

[[9.9978399e-01]
 [2.6290691e-06]
 [9.9676037e-01]
 [1.3801695e-06]
 [1.7547780e-10]
 [9.6333427e-11]
 [4.0342513e-01]]
