Do the essential imports in the cell below.

In [1]:
import nltk
import string
import re
import numpy as np
import pandas as pd

In the next cell we read the training and testing data for our complete dataset.

In [2]:
train = pd.read_csv('D:\\Classes\\train.tsv', sep='\t')
test = pd.read_csv('D:\\Classes\\test.tsv', sep='\t')

Since we noticed a lot of our data is missing in the brand names column , we fill out the missing values with the string 'missing'.

In [3]:
train['brand_name'] = train['brand_name'].fillna('missing').astype(str)

# 2 LAYER NEURAL NETWORK MODEL:

We will train this model with tensorflow as the backend.

In [4]:
import time
import tensorflow as tf

#from lr_schedule import _cosine_decay_restarts, _exponential_decay
#from sklearn.metrics import rmse
#from nn_module import embed, encode, attend
#from nn_module import word_dropout
#from nn_module import dense_block, resnet_block
#from optimizer import LazyPowerSignOptimizer, LazyAddSignOptimizer, LazyAMSGradOptimizer, LazyNadamOptimizer
#from utils import _makedirs
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dropout, Dense, BatchNormalization, Activation, concatenate, GRU, Embedding, Flatten
from keras.models import Model
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping
from keras import backend as K

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [5]:
class XNN(object):
    def __init__(self, params, target_scaler, logger):
        self.params = params
        self.target_scaler = target_scaler
        self.logger = logger
        _makedirs(self.params["model_dir"], force=True)
        self._init_graph()
        self.gvars_state_list = []

        # 14
        self.bias = 0.01228477
        self.weights = [
            0.00599607, 0.02999416, 0.05985384, 0.20137787, 0.03178938, 0.04612812,
            0.05384821, 0.10121514, 0.05915169, 0.05521121, 0.06448063, 0.0944233,
            0.08306157, 0.11769992
        ]
        self.weights = np.array(self.weights).reshape(-1, 1)

 Find root mean square logarithmic error for validation

In [6]:

def find_rmsle(h, y):
    return np.sqrt(np.square(np.log(h + 1) - np.log(y + 1)).mean())

Preprocessing missing values

In [7]:
def preprocess_missing(dataset):
    dataset.category_name.fillna(value="missing", inplace=True)
    dataset.brand_name.fillna(value="missing", inplace=True)
    dataset.item_description.fillna(value="missing", inplace=True)
    return dataset

Preprocessing categorical values

In [8]:
def preprocess_categories(train, test):
    le = LabelEncoder()

    le.fit(np.hstack([train.category_name, test.category_name]))
    train.category_name = le.transform(train.category_name)
    test.category_name = le.transform(test.category_name)

    le.fit(np.hstack([train.brand_name, test.brand_name]))
    train.brand_name = le.transform(train.brand_name)
    test.brand_name = le.transform(test.brand_name)

Tokenizing raw item descriptions

In [9]:
def preprocess_raw_inputs(train, test):
    raw_text = np.hstack([train.item_description.str.lower(), train.name.str.lower()])

    tok_raw = Tokenizer()
    tok_raw.fit_on_texts(raw_text)

    train["seq_item_description"] = tok_raw.texts_to_sequences(train.item_description.str.lower())
    test["seq_item_description"] = tok_raw.texts_to_sequences(test.item_description.str.lower())
    train["seq_name"] = tok_raw.texts_to_sequences(train.name.str.lower())
    test["seq_name"] = tok_raw.texts_to_sequences(test.name.str.lower())

Padding the data below.

In [10]:
def get_keras_data(dataset):
    X = {'name': pad_sequences(dataset.seq_name, maxlen=MAX_NAME_SEQ),
         'item_desc': pad_sequences(dataset.seq_item_description, maxlen=MAX_ITEM_DESC_SEQ),
         'brand_name': np.array(dataset.brand_name),
         'category_name': np.array(dataset.category_name),
         'item_condition': np.array(dataset.item_condition_id),
         'num_vars': np.array(dataset[["shipping"]])}
    return X

Save model states.

In [11]:
def get_callbacks(filepath, patience=2):
    es = EarlyStopping('val_loss', patience=patience, mode="min")
    msave = ModelCheckpoint(filepath, save_best_only=True)
    return [es, msave]

Another RMSLE(Root mean square Logarithmic error) calculation.

In [12]:
def rmsle_cust(y_true, y_pred):
    first_log = K.log(K.clip(y_pred, K.epsilon(), None) + 1.)
    second_log = K.log(K.clip(y_true, K.epsilon(), None) + 1.)
    return K.sqrt(K.mean(K.square(first_log - second_log), axis=-1))

# RNN model definition:--

In [13]:

def create_RNN_model():
    # Inputs
    name = Input(shape=[X_train["name"].shape[1]], name="name")
    item_desc = Input(shape=[X_train["item_desc"].shape[1]], name="item_desc")
    brand_name = Input(shape=[1], name="brand_name")
    category_name = Input(shape=[1], name="category_name")
    item_condition = Input(shape=[1], name="item_condition")
    num_vars = Input(shape=[X_train["num_vars"].shape[1]], name="num_vars")

    # Embeddings layers
    emb_name = Embedding(MAX_TEXT, 50)(name)
    emb_item_desc = Embedding(MAX_TEXT, 50)(item_desc)
    emb_brand_name = Embedding(MAX_BRAND, 10)(brand_name)
    emb_category_name = Embedding(MAX_CATEGORY, 10)(category_name)
    emb_item_condition = Embedding(MAX_CONDITION, 5)(item_condition)

    # RNN layers
    rnn_layer1 = GRU(16)(emb_item_desc)
    rnn_layer2 = GRU(8)(emb_name)

    # Main layer
    main_l = concatenate([
        Flatten()(emb_brand_name),
        Flatten()(emb_category_name),
        Flatten()(emb_item_condition),
        rnn_layer1,
        rnn_layer2,
        num_vars])

    main_l = Dropout(0.1)(Dense(128)(main_l))
    main_l = Dropout(0.1)(Dense(64)(main_l))

    # Output layer
    output = Dense(1, activation="linear")(main_l)

    # Compile model
    model = Model([name, item_desc, brand_name, category_name, item_condition, num_vars], output)
    model.compile(loss="mse", optimizer="adam", metrics=["mae", rmsle_cust])

    return model

In [14]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

Preprocessing the training and testing data.

In [15]:
train = preprocess_missing(train)
test = preprocess_missing(test)

preprocess_categories(train, test)
preprocess_raw_inputs(train, test)

Analyzing sequences in the training and testing data:

In [16]:

max_name_seq = np.max([np.max(train.seq_name.apply(lambda x: len(x))),
                       np.max(test.seq_name.apply(lambda x: len(x)))])

max_seq_item_description = np.max([np.max(train.seq_item_description.apply(lambda x: len(x))),
                                   np.max(test.seq_item_description.apply(lambda x: len(x)))])

In [None]:
Selecting maximal values based on the original work.

In [17]:
MAX_NAME_SEQ = 10
MAX_ITEM_DESC_SEQ = 75
MAX_TEXT = np.max([np.max(train.seq_name.max()),
                   np.max(test.seq_name.max()),
                   np.max(train.seq_item_description.max()),
                   np.max(test.seq_item_description.max())])+2
MAX_CATEGORY = np.max([train.category_name.max(),
                       test.category_name.max()])+1
MAX_BRAND = np.max([train.brand_name.max(),
                    test.brand_name.max()])+1
MAX_CONDITION = np.max([train.item_condition_id.max(),
                        test.item_condition_id.max()])+1

In [18]:
# Scaling values
train["target"] = np.log(train.price+1)
target_scaler = MinMaxScaler(feature_range=(-1, 1))
train["target"] = target_scaler.fit_transform(train.target.values.reshape(-1, 1))
pd.DataFrame(train.target).hist()

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x0000019503E68630>]],
      dtype=object)

In [19]:
from sklearn.cross_validation import train_test_split



In [20]:
# Split training data for crossval
dtrain, dvalid = train_test_split(train, random_state=123, train_size=0.99)

In [21]:
# Pad data for Keras
X_train = get_keras_data(dtrain)
X_valid = get_keras_data(dvalid)
X_test = get_keras_data(test)

In [22]:
# Create model
model = create_RNN_model()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
brand_name (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
category_name (InputLayer)      (None, 1)            0                                            
__________________________________________________________________________________________________
item_condition (InputLayer)     (None, 1)            0                                            
__________________________________________________________________________________________________
item_desc (InputLayer)          (None, 75)           0                                            
__________________________________________________________________________________________________
name (Inpu

In [23]:
BATCH_SIZE = 30000
epochs = 5
model.fit(X_train, dtrain.target,
          epochs=epochs,
          batch_size=BATCH_SIZE,
          validation_data=(X_valid, dvalid.target),
          verbose=1)

Train on 1467709 samples, validate on 14826 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x19503d17a90>

In [24]:
# Validate model
val_preds = model.predict(X_valid)
val_preds = target_scaler.inverse_transform(val_preds)
val_preds = np.exp(val_preds)+1

In [25]:
# Find the RMSLE
y_true = np.array(dvalid.price.values)
y_pred = val_preds[:, 0]
v_rmsle = find_rmsle(y_true, y_pred)
print("RMSLE: " + str(v_rmsle))

RMSLE: 0.49363815774040193


In [26]:
# Test dataset validation
preds = model.predict(X_test, batch_size=BATCH_SIZE)
preds = target_scaler.inverse_transform(preds)
preds = np.exp(preds)-1

In [27]:
# Result to csv
result = test[["test_id"]]
result["price"] = preds


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [28]:
result

Unnamed: 0,test_id,price
0,0,11.377471
1,1,11.302700
2,2,34.843052
3,3,16.273266
4,4,7.120021
5,5,10.668484
6,6,9.604812
7,7,40.218586
8,8,39.758251
9,9,10.539124


##### Out of all the models we tried, this model has the least RMSLE error and predicts the prices closer to the original training dataset prices.
####  0.49363815774040193