In [1]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import time

In [2]:
# https://www.kaggle.com/c/nlp-getting-started : NLP Disaster Tweets
# df = pd.read_csv("train.csv")
df = pd.read_csv("./data/data.csv")


In [3]:
df.shape

(200, 3)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,Tweets,label
0,0,ไปเป็นประเทศที่แข็งแกร่ง ได้รับการยอมรับจากนาน...,0
1,1,ผมได้ตัดสินใจอย่างแน่วแน่ที่จะปรับเปลี่ยนวิธีก...,1
2,2,New Normal 3 – “ทำงานเชิงรุก” นายกรัฐมนตรีจะมี...,1
3,3,New Normal 2 – “ประเมินผลงานภาครัฐ โดยผู้มีส่ว...,0
4,4,New Normal 1 – “ผนึกทุกภาคส่วนร่วมวางอนาคตประเ...,1


In [5]:
print((df.label == 1).sum()) # Disaster
print((df.label == 0).sum()) # No Disaster

101
99


In [6]:
# Preprocessing
import re
import string

def remove_URL(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"", text)

# https://stackoverflow.com/questions/34293875/how-to-remove-punctuation-marks-from-a-string-in-python-3-x-using-translate/34294022
def remove_punct(text):
    translator = str.maketrans("", "", string.punctuation)
    return text.translate(translator)

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [7]:
pattern = re.compile(r"https?://(\S+|www)\.\S+")
for t in df.Tweets:
    matches = pattern.findall(t)
    for match in matches:
        print(t)
        print(match)
        print(pattern.sub(r"", t))
    if len(matches) > 0:
        break

ผมขอให้สถาบันของท่านมีความมั่นคง พร้อมช่วยกันดูแลและเดินหน้าประเทศไทยของเราให้ผ่านสถานการณ์นี้ไปให้ได้ด้วยกันทั้งประเทศ  https://t.co/5MMxfJN8nj
t
ผมขอให้สถาบันของท่านมีความมั่นคง พร้อมช่วยกันดูแลและเดินหน้าประเทศไทยของเราให้ผ่านสถานการณ์นี้ไปให้ได้ด้วยกันทั้งประเทศ  


In [8]:
df["Tweets"] = df.Tweets.map(remove_URL) # map(lambda x: remove_URL(x))
df["Tweets"] = df.Tweets.map(remove_punct)

In [9]:
# remove stopwords
# pip install nltk
from pythainlp.corpus import stopwords


# Stop Words: A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine
# has been programmed to ignore, both when indexing entries for searching and when retrieving them 
# as the result of a search query.
stop = stopwords.words("thai")

# https://stackoverflow.com/questions/5486337/how-to-remove-stop-words-using-nltk-or-python
def remove_stopwords(text):
    filtered_words = [word.lower() for word in text.split() if word.lower() not in stop]
    return " ".join(filtered_words)


In [10]:
df["Tweets"] = df.Tweets.map(remove_stopwords)

In [11]:
from collections import Counter
from pythainlp.tokenize import word_tokenize
# Count unique words
def counter_word(text_col):
    count = Counter()
    for text in text_col.values:
        for word in word_tokenize(text):
            if word != ' ':
                count[word] += 1
    return count


counter = counter_word(df.Tweets)


In [12]:
len(counter)

1407

In [13]:
counter.most_common(5)


[('และ', 256), ('ที่', 206), ('ผม', 178), ('ของ', 136), ('ใน', 126)]

In [14]:
num_unique_words = len(counter)

In [15]:
# Split dataset into training and validation set
train_size = int(df.shape[0] * 0.8)

train_df = df[:train_size]
val_df = df[train_size:]

# split text and labels
train_sentences = train_df.Tweets.to_numpy()
train_labels = train_df.label.to_numpy()
val_sentences = val_df.Tweets.to_numpy()
val_labels = val_df.label.to_numpy()

In [16]:
train_sentences.shape, val_sentences.shape

((160,), (40,))

In [17]:
# Tokenize
from tensorflow.keras.preprocessing.text import Tokenizer

# vectorize a text corpus by turning each text into a sequence of integers
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(train_sentences) # fit only to training

In [18]:
# each word has unique index
word_index = tokenizer.word_index

In [19]:
train_sequences = tokenizer.texts_to_sequences(train_sentences)
val_sequences = tokenizer.texts_to_sequences(val_sentences)

In [20]:
print(train_sentences[10:15])
print(train_sequences[10:15])

['ผมเดินทางไปพบกับสมาคมธนาคารไทย ได้รับทราบข้อมูลและรับฟังข้อเสนอต่างๆ จากผู้บริหารระดับสูงของธนาคารพาณิชย์ที่เป็นสมาชิกของสมาคม ซึ่งเป็นการพูดคุยที่มีประโยชน์มาก'
 'the court will appoint professionals to supervise its rehabilitation and restructuring in a professional way i hope that we may see again an airline that thais can be proud about and which can contribute to the prosperity of thailand'
 'under this courtsupervised rehabilitation process thai airways may continue to fly and its staff still be employed but without the government putting in more money importantly it will now be able to start a muchdelayed restructuring'
 'let’s think about why we have thai airways thai airways exists to build our country’s reputation and support the prosperity of thais for that it needs to stand on its own feet and compete globally that is the basis on which i made my decision'
 'that’s why i must save the people’s money for future programmes that directly help them survive and then rebuild th

In [21]:
from tensorflow.keras.callbacks import TensorBoard
import datetime
NAME = "Emocial-LSTM-{}".format(int(time.time()))
tensorboard = TensorBoard(log_dir=os.path.join(
    "logs",
    "fit",
    datetime.datetime.now().strftime("%Y%m%d-%H%M%S"),
)
)

In [22]:
# Pad the sequences to have the same length
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Max number of words in a sequence
max_length = 20

train_padded = pad_sequences(train_sequences, maxlen=max_length, padding="post", truncating="post")
val_padded = pad_sequences(val_sequences, maxlen=max_length, padding="post", truncating="post")
train_padded.shape, val_padded.shape

((160, 20), (40, 20))

In [23]:
train_padded[10]

array([132, 133, 134, 135,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0])

In [24]:
print(train_sentences[10])
print(train_sequences[10])
print(train_padded[10])

ผมเดินทางไปพบกับสมาคมธนาคารไทย ได้รับทราบข้อมูลและรับฟังข้อเสนอต่างๆ จากผู้บริหารระดับสูงของธนาคารพาณิชย์ที่เป็นสมาชิกของสมาคม ซึ่งเป็นการพูดคุยที่มีประโยชน์มาก
[132, 133, 134, 135]
[132 133 134 135   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0]


In [25]:
# Check reversing the indices

# flip (key, value)
reverse_word_index = dict([(idx, word) for (word, idx) in word_index.items()])

In [26]:
def decode(sequence):
    return " ".join([reverse_word_index.get(idx, "?") for idx in sequence])

In [27]:
decoded_text = decode(train_sequences[10])

print(train_sequences[10])
print(decoded_text)

[132, 133, 134, 135]
ผมเดินทางไปพบกับสมาคมธนาคารไทย ได้รับทราบข้อมูลและรับฟังข้อเสนอต่างๆ จากผู้บริหารระดับสูงของธนาคารพาณิชย์ที่เป็นสมาชิกของสมาคม ซึ่งเป็นการพูดคุยที่มีประโยชน์มาก


In [28]:
# Create LSTM model
from tensorflow.keras import layers

# Embedding: https://www.tensorflow.org/tutorials/text/word_embeddings
# Turns positive integers (indexes) into dense vectors of fixed size. (other approach could be one-hot-encoding)

# Word embeddings give us a way to use an efficient, dense representation in which similar words have 
# a similar encoding. Importantly, you do not have to specify this encoding by hand. An embedding is a 
# dense vector of floating point values (the length of the vector is a parameter you specify).

model = keras.models.Sequential()
model.add(layers.Embedding(num_unique_words, 32, input_length=max_length))

# The layer will take as input an integer matrix of size (batch, input_length),
# and the largest integer (i.e. word index) in the input should be no larger than num_words (vocabulary size).
# Now model.output_shape is (None, input_length, 32), where `None` is the batch dimension.


model.add(layers.LSTM(128, dropout=0.1))
model.add(layers.Dense(1, activation="sigmoid"))



model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 32)            45024     
_________________________________________________________________
lstm (LSTM)                  (None, 128)               82432     
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 127,585
Trainable params: 127,585
Non-trainable params: 0
_________________________________________________________________


In [29]:
loss = keras.losses.BinaryCrossentropy(from_logits=False)
optim = keras.optimizers.Adam(lr=0.001)
metrics = ["accuracy"]

model.compile(loss=loss, optimizer=optim, metrics=metrics)

In [30]:
model.fit(train_padded, train_labels, epochs=30, validation_data=(val_padded, val_labels),  callbacks=[tensorboard])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x20e94842788>

In [31]:
predictions = model.predict(train_padded)
predictions = [1 if p > 0.5 else 0 for p in predictions]

In [32]:
print(train_sentences[10:20])

print(train_labels[10:20])
print(predictions[10:20])

['ผมเดินทางไปพบกับสมาคมธนาคารไทย ได้รับทราบข้อมูลและรับฟังข้อเสนอต่างๆ จากผู้บริหารระดับสูงของธนาคารพาณิชย์ที่เป็นสมาชิกของสมาคม ซึ่งเป็นการพูดคุยที่มีประโยชน์มาก'
 'the court will appoint professionals to supervise its rehabilitation and restructuring in a professional way i hope that we may see again an airline that thais can be proud about and which can contribute to the prosperity of thailand'
 'under this courtsupervised rehabilitation process thai airways may continue to fly and its staff still be employed but without the government putting in more money importantly it will now be able to start a muchdelayed restructuring'
 'let’s think about why we have thai airways thai airways exists to build our country’s reputation and support the prosperity of thais for that it needs to stand on its own feet and compete globally that is the basis on which i made my decision'
 'that’s why i must save the people’s money for future programmes that directly help them survive and then rebuild th

In [33]:
model.save("th-model.h5", include_optimizer=False)

In [34]:
model = keras.models.load_model("th-model.h5")



In [35]:
model.compile(loss=loss, optimizer=optim, metrics=metrics)

In [36]:
def review_encode(s):
	encoded = [1]

	for word in s:
		if word.lower() in word_index:
			encoded.append(word_index[word.lower()])
		else:
			encoded.append(2)

	return encoded

In [37]:
with open("../test.txt", encoding="utf-8") as f:
	for line in f.readlines():
		nline = line.replace(",", "").replace(".", "").replace("(", "").replace(")", "").replace(":", "").replace("\"","").strip().split(" ")
		encode = review_encode(nline)
		encode = keras.preprocessing.sequence.pad_sequences([encode], padding="post", maxlen=20) # make the data 250 words long
		predict = model.predict(encode)
		print(line)
		print(encode)
		print(round(float(predict[0])))
        


FileNotFoundError: [Errno 2] No such file or directory: '../test.txt'

In [39]:
from kerastuner.tuners import RandomSearch
from kerastuner.engine.hyperparameters import HyperParameter
from tensorflow.keras.layers.experimental import preprocessing
import kerastuner
LOG_DIR = f"tune/{int(time.time())}"
def build_model(hp):
    model = keras.models.Sequential()
    model.add(layers.Embedding(num_unique_words, 32, input_length=max_length))

# The layer will take as input an integer matrix of size (batch, input_length),
# and the largest integer (i.e. word index) in the input should be no larger than num_words (vocabulary size).
# Now model.output_shape is (None, input_length, 32), where `None` is the batch dimension.


    model.add(layers.LSTM(hp.Int("input_units", min_value=32, max_value=256, step=32), dropout=0.1, return_sequences=True))
    model.add(layers.Dense(1, activation="sigmoid"))
    # compile
    loss = keras.losses.BinaryCrossentropy(from_logits=False)
    optim = keras.optimizers.Adam(lr=0.001)
    metrics = ["accuracy"]

    model.compile(loss=loss, optimizer=optim, metrics=metrics)
    return model

# model = build_model()
# model.fit(train_padded, train_labels, epochs=100, validation_data=(val_padded, val_labels), verbose=2)
tuner = RandomSearch(build_model, objective=kerastuner.Objective("val_accuracy", direction="max"), max_trials=1, executions_per_trial=1, directory=LOG_DIR)
tuner.search_space_summary()
tuner.search(x=train_padded,y=train_labels, epochs=30,validation_data=(val_padded, val_labels))

Epoch 1/30


InvalidArgumentError:  Incompatible shapes: [32,1] vs. [32,20,128]
	 [[node gradient_tape/binary_crossentropy/mul_1/BroadcastGradientArgs (defined at d:\SK work\nsc2021\venv\lib\site-packages\kerastuner\engine\multi_execution_tuner.py:95) ]] [Op:__inference_train_function_33727]

Function call stack:
train_function
