In [1]:
from keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras import regularizers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LeakyReLU, Dropout
import tensorflow as tf
import scipy.sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import string
from nltk.stem.snowball import SnowballStemmer
from gensim.models import Doc2Vec
import math
import gensim
import gensim.downloader as api
import pandas as pd
import numpy as np
from keras.utils import np_utils
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import random


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ruben\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_csv('train.csv', delimiter='\t')

In [3]:
import gensim.downloader as api
import pickle

def tokenize(text: str, stopwords: set):
    text = text.replace("\n", " ")
    text = text.replace("\t", " ")

    text = text.translate(str.maketrans('', '', string.punctuation))

    return [i for i in text.split(" ") if i not in stopwords]


def join_name_description(name: str, desc: str):
    name = name.lower()

    if pd.isna(desc):
        return name

    desc = desc.lower()

    if desc == "no description yet":
        return name

    return name + " " + desc


def stem(words: list, stemmer: SnowballStemmer):
    words = [stemmer.stem(word) for word in words]
    return " ".join(words)


def build_dataset(df):
    df = df[df["item_condition_id"].notna()]
    df = df[df["price"].notna()]
    df = df[df["category_name"].notna()]
    df = df.drop("train_id", axis=1)

    categories = df["category_name"].str.split("/")
    first_categories = categories.apply(
        lambda x: x[0]).astype("category").cat.codes
    second_categories = categories.apply(
        lambda x: x[1]).astype("category").cat.codes

    df["first_cat"] = first_categories
    df["second_cat"] = second_categories

    df = df.drop("category_name", axis=1)
    price = df["price"]
    df = df.drop("price", axis=1)

    X, X_test, Y, Y_test = train_test_split(
        df, price, test_size=0.3, stratify=df["first_cat"], shuffle=True)
    X_train, X_val, Y_train, Y_val = train_test_split(
        X, Y, test_size=0.2, stratify=X["first_cat"], shuffle=True)

    return X_train, X_val, X_test, Y_train, Y_val, Y_test


def one_hot_encode(dataset):
    res = pd.DataFrame(np_utils.to_categorical(
        dataset["item_condition_id"] - 1, 5))
    res = res.set_axis(["item_condition_" + str(x)
                       for x in range(5)], axis=1, copy=True)

    firstcat_onehot = np_utils.to_categorical(dataset["first_cat"])
    secondcat_onehot = np_utils.to_categorical(dataset["second_cat"])

    tmp = pd.DataFrame(firstcat_onehot)
    tmp = tmp.set_axis(["first_cat_" + str(x)
                       for x in range(10)], axis=1, copy=True)
    res = pd.concat([res, tmp], axis=1)

    num_sec_cat = len(dataset["second_cat"].unique())
    tmp = pd.DataFrame(secondcat_onehot)
    tmp = tmp.set_axis(["second_cat_" + str(x)
                       for x in range(num_sec_cat)], axis=1, copy=True)
    res = pd.concat([res, tmp], axis=1)

    return res


def get_stemmed_text(dataset):
    names = dataset["name"]
    descriptions = dataset["item_description"]

    combined = names.combine(descriptions, join_name_description)

    stop = set(stopwords.words('english'))
    tokenized = combined.apply(lambda x: tokenize(x, stop))

    stemmer = SnowballStemmer("english")
    return tokenized.apply(lambda x: stem(x, stemmer))


def tagged_document(list_of_list_of_words):
    for i, list_of_words in enumerate(list_of_list_of_words):
        yield gensim.models.doc2vec.TaggedDocument(list_of_words, [i])

def train_doc2vec(text_data):   
  print('start training doc2vec')
  data_for_training = list(tagged_document(text_data))
  model = gensim.models.doc2vec.Doc2Vec(
      vector_size=500, min_count=2, epochs=30)
  model.build_vocab(data_for_training)
  model.train(data_for_training, total_examples=model.corpus_count,
              epochs=model.epochs)
  print('end training doc2vec')
  # print(model.infer_vector(
  #     ['violent', 'means', 'to', 'destroy', 'the', 'organization']))
  return model

def preprocess(train, val, test):
    # train set
    X_train = one_hot_encode(train)
    X_train["shipping"] = train["shipping"]
    X_train["shipping"] = X_train["shipping"].fillna(0.5)
    stemmed_train = get_stemmed_text(train)

    # validation set
    X_val = one_hot_encode(val)
    X_val["shipping"] = val["shipping"]
    X_val["shipping"] = X_val["shipping"].fillna(0.5)
    stemmed_val = get_stemmed_text(val)

    # test set
    X_test = one_hot_encode(test)
    X_test["shipping"] = test["shipping"]
    X_test["shipping"] = X_test["shipping"].fillna(0.5)
    stemmed_test = get_stemmed_text(test)

    # embedding-------------------------------------------------
    
    #-----------------------------------------------------------
    

    # tr_text_df = pd.DataFrame.sparse.from_spmatrix(tr_text)

    # n_features = tr_text_df.shape[1]
    # features_names = ["text_" + str(x) for x in range(n_features)]

    # tr_text_df = tr_text_df.set_axis(features_names, axis=1, copy=True)

    # val_text_df = pd.DataFrame.sparse.from_spmatrix(
    #     val_text).set_axis(features_names, axis=1, copy=True)
    # te_text_df = pd.DataFrame.sparse.from_spmatrix(
    #     te_text).set_axis(features_names, axis=1, copy=True)

    # X_train = pd.concat([X_train, tr_text], axis=1)
    # X_val = pd.concat([X_val, val_text], axis=1)
    # X_test = pd.concat([X_test, te_text], axis=1)

    return X_train, X_val, X_test, stemmed_train, stemmed_test, stemmed_val


In [4]:
X_train, X_val, X_test, Y_train, Y_val, Y_test = build_dataset(df)


In [6]:
type(Y_train)
Y_train = Y_train.apply(lambda x: np.log(x+0.1))
Y_test = Y_test.apply(lambda x: np.log(x+0.1))
Y_val = Y_val.apply(lambda x: np.log(x+0.1))


In [35]:
type(Y_train)

pandas.core.series.Series

In [21]:
X_train, X_val, X_test, stemmed_train, stemmed_test, stemmed_val = preprocess(
    X_train, X_val, X_test)


In [22]:
stemmed_train = stemmed_train.to_numpy()
stemmed_test = stemmed_test.to_numpy()
stemmed_val = stemmed_val.to_numpy()

stemmed_train = [x.split(' ') for x in stemmed_train]
stemmed_test = [x.split(' ') for x in stemmed_test]
stemmed_val = [x.split(' ') for x in stemmed_val]

In [23]:
doc2vecmodel = train_doc2vec(stemmed_train)
with open('./ProcessedData/doc2vec.pkl', 'wb') as f:
    pickle.dump(f, doc2vecmodel)



start training doc2vec
end training doc2vec


TypeError: file must have a 'write' attribute

In [25]:
tr_text = [doc2vecmodel.infer_vector(x) for x in stemmed_train]
val_text = [doc2vecmodel.infer_vector(x) for x in stemmed_val]
te_text = [doc2vecmodel.infer_vector(x) for x in stemmed_test]


In [24]:
with open('./ProcessedData/doc2vec.pkl', 'wb') as f:
  pickle.dump(doc2vecmodel, f)


In [27]:
X_train = np.concatenate([X_train, tr_text], axis=1)
X_val = np.concatenate([X_val, val_text], axis=1)
X_test = np.concatenate([X_test, te_text], axis=1)


In [None]:
import pickle
with open('./ProcessedData/x_train_doc2vec.pkl','wb') as f:
  pickle.dump(X_train, f)
with open('./ProcessedData/x_val_doc2vec.pkl', 'wb') as f:
  pickle.dump(X_val, f)
with open('./ProcessedData/x_test_doc2vec.pkl', 'wb') as f:
  pickle.dump(X_test, f)


In [7]:
import pickle
# with open('./ProcessedData/x_train_doc2vec.pkl', 'rb') as f:
#   X_train = pickle.load(f)
# with open('./ProcessedData/x_val_doc2vec.pkl', 'rb') as f:
#   X_val = pickle.load(f)
with open('./ProcessedData/x_test_doc2vec.pkl', 'rb') as f:
  X_test = pickle.load(f)



In [6]:
X_train.shape


(826676, 629)

In [7]:
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, Y_train)).batch(512)

train_dataset.save('./ProcessedData/tf_train_dataset_doc2vec')


In [8]:
validation_dataset = tf.data.Dataset.from_tensor_slices(
    (X_val, Y_val)).batch(512)

validation_dataset.save('./ProcessedData/tf_validation_dataset_doc2vec')


In [8]:
test_dataset = tf.data.Dataset.from_tensor_slices(
    (X_test, Y_test)).batch(512)

test_dataset.save('./ProcessedData/tf_test_dataset_doc2vec')


In [9]:
train_dataset = tf.data.Dataset.load(
    './ProcessedData/tf_train_dataset_doc2vec')
validation_dataset = tf.data.Dataset.load(
    './ProcessedData/tf_validation_dataset_doc2vec')
test_dataset = tf.data.Dataset.load(
    './ProcessedData/tf_test_dataset_doc2vec')


In [None]:
train_dataset = tf.data.Dataset.from_tensor_slices(
    (X_train, Y_train)).batch(512)

train_dataset.save('./ProcessedData/tf_train_dataset_doc2vec')


In [None]:
train_dataset = tf.data.Dataset.from_tensor_slices(
    (X_train, Y_train)).batch(512)

tf.data.experimental.save(
    train_dataset, './ProcessedData/tf_train_dataset_doc2vec')


In [35]:
cherrypicked_seed = 42

input = Input(shape=(629,))
emb1 = tf.keras.layers.Embedding(20, 10, input_length=1)(input)  # EMBEDDING 1
gru1 = tf.keras.layers.GRU(64, return_sequences=True)(emb1)
flat1 = tf.keras.layers.Flatten()(gru1)


hidden3 = Dense(256, activation=LeakyReLU(alpha=0.3))(flat1)

# gru2 = tf.keras.layers.GRU(32, return_sequences=True)(flat2)
# flat2 = tf.keras.layersFlatten()(gru2)
hidden4 = Dense(128, activation=LeakyReLU(alpha=0.3))(hidden3)
dropout4 = Dropout(0.1, seed=cherrypicked_seed)(hidden4)

hidden5 = Dense(64, activation=LeakyReLU(alpha=0.3))(dropout4)
dropout5 = Dropout(0.1, seed=cherrypicked_seed)(hidden5)

hidden6 = Dense(32, activation=LeakyReLU(alpha=0.3))(dropout5)
dropout6 = Dropout(0.1, seed=cherrypicked_seed)(hidden6)

output = Dense(1, activation="linear")(dropout6)

model = Model(input, output)

opt = tf.keras.optimizers.Adam(clipnorm=1)
model.compile(optimizer=opt, loss="mean_squared_error", metrics='mse')
model.summary()

fBestModel = 'best_model_word2vec.h5'
early_stop = EarlyStopping(monitor='val_mse', mode="min", patience=15,
                           min_delta=0.01, verbose=1, restore_best_weights=True)
best_model = ModelCheckpoint(
    fBestModel, verbose=1, save_best_only=True, monitor='val_mse', mode="min")

# logdir = os.path.join("logs", "dropout_model")
# tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)



Model: "model_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_16 (InputLayer)       [(None, 629)]             0         
                                                                 
 embedding_7 (Embedding)     (None, 629, 10)           200       
                                                                 
 gru_9 (GRU)                 (None, 629, 64)           14592     
                                                                 
 flatten_6 (Flatten)         (None, 40256)             0         
                                                                 
 dense_97 (Dense)            (None, 256)               10305792  
                                                                 
 dense_98 (Dense)            (None, 128)               32896     
                                                                 
 dropout_37 (Dropout)        (None, 128)               0  

In [36]:
batch_size = 512
n_workers = 12

model.fit(
    train_dataset,
    epochs=1000,
    batch_size=batch_size,
    validation_data=validation_dataset,
    callbacks=[best_model, early_stop],
    verbose=1,  
)


Epoch 1/1000
Epoch 1: val_mse improved from inf to 0.63289, saving model to best_model_word2vec.h5
Epoch 2/1000
Epoch 2: val_mse improved from 0.63289 to 0.63239, saving model to best_model_word2vec.h5
Epoch 3/1000
Epoch 3: val_mse did not improve from 0.63239
Epoch 4/1000
Epoch 4: val_mse improved from 0.63239 to 0.63206, saving model to best_model_word2vec.h5
Epoch 5/1000
Epoch 5: val_mse improved from 0.63206 to 0.63193, saving model to best_model_word2vec.h5
Epoch 6/1000
Epoch 6: val_mse did not improve from 0.63193
Epoch 7/1000
Epoch 7: val_mse did not improve from 0.63193
Epoch 8/1000

KeyboardInterrupt: 