In [None]:
import shutil
from datetime import datetime

import numpy as np
import pandas as pd
import sklearn, os, pickle
import tensorflow as tf
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tensorflow import keras
from tensorflow.keras import callbacks

In [None]:
# Load the e-commerce dataset from the current working directory and give the
# two columns fixed names.
# NOTE(review): read_csv treats the first row as a header by default, and the
# names are then overwritten below; if the CSV has no header row, its first
# record is silently lost — confirm against the file.
csv_path = f"{os.getcwd()}/ecommerceDataset.csv"
df = pd.read_csv(csv_path)
df = df.set_axis(["category", "item"], axis=1)

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# Structural overview of the loaded frame: shape, dtypes, summary statistics
# and a single example row.
print("Shape of the data: ", df.shape)
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called on its own line; wrapping it in print() would emit a stray
# "None" after the heading.
print("Data info:")
df.info()
print("Data description:\n", df.describe().transpose())
print("Example data:\n", df.head(1))

In [None]:
# Count missing values per column and fully duplicated rows, then print both
# separated by blank lines.
missing_per_column = df.isna().sum()
duplicate_row_count = df.duplicated().sum()

print(missing_per_column)
print("\n\n")
print(duplicate_row_count)

In [None]:
# Print the Python type of every "item" value that is not exactly a str
# (one line per offending value).
non_string_types = [type(value) for value in df["item"] if type(value) is not str]
for found_type in non_string_types:
    print(found_type)

In [None]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how="any")

In [None]:
# Distinct category labels present in the data.
categories_list = df["category"].unique()

# How often each item text occurs (repeated texts show up with counts > 1).
item_frequencies = df["item"].value_counts()
print(item_frequencies)

In [None]:
# Keep only the first occurrence of each fully duplicated row, then show how
# many rows remain per category.
df_no_duplicates = df.drop_duplicates(keep="first")
category_counts = df_no_duplicates["category"].value_counts()
print(category_counts)

In [None]:
# Pull the model inputs (item text) and targets (category) out of the
# de-duplicated frame as raw value arrays.
features, labels = (
    df_no_duplicates[column].values for column in ("item", "category")
)

In [None]:
# Integer-encode the category labels.
label_encoder = LabelEncoder()
label_encoder.fit(labels)
labels_encoded = label_encoder.transform(labels)

# One-hot encode the same labels; OneHotEncoder needs a 2-D column, hence the
# reshape to (-1, 1).
labels_column = labels.reshape(-1, 1)
labels_oh_encoder = OneHotEncoder()
labels_oh_encoded = labels_oh_encoder.fit(labels_column).transform(labels_column)

In [None]:
# Fixed seed so the 80/20 split is reproducible across runs.
seed = 42
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels_encoded,
    train_size=0.8,
    random_state=seed,
)

In [None]:
# Text-preprocessing and embedding hyperparameters:
#   vocab_size    - size of the tokenizer vocabulary / embedding table
#   max_length    - length every token sequence is padded or truncated to
#   embedding_dim - width of each embedding vector
vocab_size, max_length, embedding_dim = 5000, 200, 64

# Placeholder token used for words outside the vocabulary.
oov_token = "<OOV>"

In [None]:
# Build a whitespace-splitting tokenizer with an out-of-vocabulary placeholder
# and fit it on the training texts only.
tokenizer = keras.preprocessing.text.Tokenizer(
    num_words=vocab_size,
    split=" ",
    oov_token=oov_token,
)
tokenizer.fit_on_texts(X_train)

In [None]:
# Word -> integer id mapping learned by the tokenizer; show its first ten entries.
word_index = tokenizer.word_index
first_entries = list(word_index.items())[:10]
print(dict(first_entries))

In [None]:
# Convert the train and test texts into lists of integer token ids.
X_train_tokens, X_test_tokens = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [None]:
# Pad / truncate every sequence at its end so all have length max_length.
pad_options = dict(maxlen=max_length, padding="post", truncating="post")
X_train_padded = keras.preprocessing.sequence.pad_sequences(X_train_tokens, **pad_options)
X_test_padded = keras.preprocessing.sequence.pad_sequences(X_test_tokens, **pad_options)

In [None]:
# Inverse of word_index: integer id -> word, used to turn sequences back into text.
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_tokens(tokens):
    """Turn a sequence of token ids back into a space-separated string.

    Each id is looked up in the module-level ``reverse_word_index``; ids that
    are not in that mapping are rendered as "?".

    Args:
        tokens: Iterable of integer token ids.

    Returns:
        The decoded words joined by single spaces ("" for an empty input).
    """
    words = (reverse_word_index.get(token_id, "?") for token_id in tokens)
    return " ".join(words)

# Show one padded training sequence next to its decoded text as a sanity check.
sample_sequence = X_train_padded[2]
print(sample_sequence)
print("------------")
print(decode_tokens(sample_sequence))

In [None]:
# Classifier: embedding -> bidirectional LSTM -> dense ReLU -> softmax with
# one output unit per distinct label.
num_classes = len(np.unique(labels))
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim),
    keras.layers.Bidirectional(keras.layers.LSTM(48)),
    keras.layers.Dense(48, activation="relu"),
    keras.layers.Dense(num_classes, activation="softmax"),
])
model.summary()

In [None]:
# The sparse categorical loss takes integer class ids as targets, so no
# one-hot encoding of the labels is needed.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Upper bound on training epochs; early stopping may end the run sooner.
max_epoch = 20

# Each run logs to its own timestamped TensorBoard directory under the
# current working directory.
run_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
logpath = os.path.join(os.getcwd(), 'tensorboard_log', run_stamp)

# Both callbacks come from the same `callbacks` module imported at the top.
tb = callbacks.TensorBoard(logpath)
# Stop once the monitored metric has not improved for 3 consecutive epochs.
early_stopping = callbacks.EarlyStopping(patience=3)

# NOTE(review): the held-out test split doubles as validation data here, so
# early stopping is tuned on the same samples later used for the final report.
history = model.fit(
    X_train_padded,
    y_train,
    validation_data=(X_test_padded, y_test),
    epochs=max_epoch,
    callbacks=[tb, early_stopping],
)

In [None]:
# Training vs. validation loss per epoch.
plt.figure()
for history_key in ("loss", "val_loss"):
    plt.plot(history.history[history_key])
plt.legend(["Training Loss", "Validation Loss"])
plt.show()

In [None]:
# Training vs. validation accuracy per epoch.
plt.figure()
for history_key in ("accuracy", "val_accuracy"):
    plt.plot(history.history[history_key])
plt.legend(["Training Accuracy", "Validation Accuracy"])
plt.show()

In [None]:
# Per-class probabilities for each test sample, reduced to the index of the
# most probable class.
prediction = model.predict(X_test_padded)
prediction_index = np.argmax(prediction, axis=1)

# Report per-class metrics with the original category names instead of the
# bare integer codes. `labels` is passed explicitly so the report rows line up
# with `target_names` even if some class is absent from the test split.
# classification_report is imported by name in the import cell rather than
# reached through `sklearn.metrics`, which a bare `import sklearn` does not
# guarantee to load.
class_names = [str(name) for name in label_encoder.classes_]
report = classification_report(
    y_test,
    prediction_index,
    labels=np.arange(len(class_names)),
    target_names=class_names,
)
print(report)

In [None]:
# Directory layout under the current working directory:
#   saved_models/latest   - copy of the most recent export
#   saved_models/model<N> - one numbered folder per export
models_root = os.path.join(os.getcwd(), "saved_models")
latest_folder = os.path.join(models_root, "latest")

# makedirs creates saved_models/ and saved_models/latest in one call and is a
# no-op when they already exist, replacing the separate exists()/mkdir() pairs.
os.makedirs(latest_folder, exist_ok=True)

# Pick the first model<N> folder name that is not taken yet.
num = 1
saved_model_folder = os.path.join(models_root, f"model{num}")
while os.path.exists(saved_model_folder):
    num += 1
    saved_model_folder = os.path.join(models_root, f"model{num}")

# Plain mkdir on purpose: this folder must be new, so a clash should raise.
os.mkdir(saved_model_folder)

In [None]:
# Persist the fitted tokenizer with pickle.
# NOTE(review): the file has a .json extension but holds a binary pickle, not
# JSON; the name is kept as-is because loaders elsewhere may depend on it.
tokenizer_path = f"{saved_model_folder}/tokenizer.json"
with open(tokenizer_path, "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [None]:
# Persist the fitted label encoder with pickle.
# NOTE(review): the file has a .json extension but holds a binary pickle, not
# JSON; the name is kept as-is because loaders elsewhere may depend on it.
label_encoder_path = f"{saved_model_folder}/label_encoder.json"
with open(label_encoder_path, "wb") as label_encoder_file:
    pickle.dump(label_encoder, label_encoder_file)

In [None]:
# Write the trained network into this run's export folder as an HDF5 (.h5) file.
model_path = f"{saved_model_folder}/saved_model.h5"
model.save(model_path)

In [None]:
# Mirror every file of this run's export folder into saved_models/latest,
# overwriting files of the same name left by earlier runs.
# (shutil is imported in the notebook's import cell.)
for filename in os.listdir(saved_model_folder):
    source_path = os.path.join(saved_model_folder, filename)
    target_path = os.path.join(latest_folder, filename)
    shutil.copy(source_path, target_path)