In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import tokenizer_from_json

import json

import hyperparams as hp

2024-05-23 15:07:11.512932: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
### Read the training set into a DataFrame
clean = pd.read_csv(filepath_or_buffer='train.csv')

In [3]:
### Load fitted tokenizer
# JSON is UTF-8 by specification; pass the encoding explicitly so the
# vocabulary is decoded the same way regardless of the platform's locale.
with open('tokenizer.json', encoding='utf-8') as file:
    data = json.load(file)

# The file handle is no longer needed once the payload is parsed, so the
# tokenizer is rebuilt outside the `with` block.
# NOTE(review): `tokenizer_from_json` receives whatever `json.load` returned;
# presumably the file holds the output of `tokenizer.to_json()` written via
# `json.dump` -- confirm against the notebook that produced the file.
tokenizer = tokenizer_from_json(data)

In [4]:
### Turn each text into a sequence of token ids, padded to hp.max_length

X = pad_sequences(
    tokenizer.texts_to_sequences(clean['text'].values),
    maxlen=hp.max_length,
)
y = clean['target']

# Hold out 20% of the rows for validation; stratify=y is passed so the split
# is stratified on the target column, and random_state pins the shuffle.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [7]:
### Build and compile the LSTM classifier

# Seed the Python, NumPy and TensorFlow RNGs before any weights are created so
# that weight initialisation and batch shuffling are repeatable across
# "Restart & Run All" -- consistent with the fixed random_state=42 already
# used for the train/validation split.
keras.utils.set_random_seed(42)

model = keras.Sequential([
    # Token ids -> dense vectors of size hp.embedding_dim.
    keras.layers.Embedding(hp.vocab_size, hp.embedding_dim, input_length = hp.max_length),
    # Recurrent layer with 32 units; only its final output is passed on.
    keras.layers.LSTM(32),
    keras.layers.Dense(units=32, activation='relu'),
    # Single sigmoid unit: one probability-like score per input text.
    keras.layers.Dense(units=1, activation='sigmoid')
])

model.summary()

# Binary cross-entropy pairs with the single sigmoid output above.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 32)           480000    
                                                                 
 lstm_1 (LSTM)               (None, 32)                8320      
                                                                 
 dense_2 (Dense)             (None, 32)                1056      
                                                                 
 dense_3 (Dense)             (None, 1)                 33        
                                                                 
Total params: 489,409
Trainable params: 489,409
Non-trainable params: 0
_________________________________________________________________


2024-05-23 15:09:24.145208: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-05-23 15:09:24.148423: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-05-23 15:09:24.150492: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [8]:
### Train the model, evaluating on the validation split after each epoch
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=hp.batch_size,
    epochs=hp.epochs,
    validation_data=(X_valid, y_valid),
)

Epoch 1/10


2024-05-23 15:09:28.747534: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-05-23 15:09:28.749441: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-05-23 15:09:28.751379: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2024-05-23 15:09:58.671567: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-05-23 15:09:58.673040: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-05-23 15:09:58.674200: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [9]:
### Persist the trained model to disk
model.save(filepath='lstm.keras')

In [None]:
# NOTE: disabled manual sanity check (kept commented out, never executed).
# It tokenizes two hard-coded sample sentences, pads them to hp.max_length,
# runs model.predict on each and prints the token ids and the predictions.
# data1 = tokenizer.texts_to_sequences(['i wanna slit your throat and fuck the wound'])
# data2 = tokenizer.texts_to_sequences(['i love chicken stew'])
# data_pad1 = pad_sequences(data1, maxlen = hp.max_length)
# data_pad2 = pad_sequences(data2, maxlen = hp.max_length)

# print(data1, data2)

# p1 = model.predict(data_pad1)
# p2 = model.predict(data_pad2)

# print(p1, p2)

In [None]:
# NOTE: disabled interactive demo (kept commented out, never executed).
# It prompts for three messages via input(), scores each with model.predict
# and prints 'positive' when the score is above 0.5, otherwise 'negative'.
# for i in range(3): 
#     text = input('message: ')

#     data = tokenizer.texts_to_sequences([text])
#     padded = pad_sequences(data, maxlen = hp.max_length)

#     p = model.predict(padded)[0][0]

#     print(f'{p} --> ', end='')
          
#     if p > 0.5:
#         print('positive')
#     else:
#         print('negative')