In [1]:
import nltk
import string
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import tensorflow as tf
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dropout, Dense, BatchNormalization, Activation, concatenate, LSTM, GRU, Embedding, Flatten
from keras.models import Model
from keras.optimizers import Adam
from keras import backend as K

Using TensorFlow backend.


### Loading the data

In [2]:
train = pd.read_table('train.tsv')
test = pd.read_table('test_stg2.tsv')

In [3]:
print ('Number of data points in train: ', train.shape[0])
print ('Number of features/variables:', train.shape[1])

print ('Number of data points in test: ', test.shape[0])

train.head()

Number of data points in train:  1482535
Number of features/variables: 8
Number of data points in test:  3460725


Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [4]:
#Removing the products whose price < 3
train =train[train['price']> 3]

print('Number of data points in train data after eliminating price<3.00 :', train.shape[0])

Number of data points in train data after eliminating price<3.00 : 1462958


### Handling the missing values

In [5]:
def handle_missing_inplace(dataset):
    dataset['category_name'].fillna(value='missing', inplace=True)
    dataset['brand_name'].fillna(value='missing', inplace=True)
    dataset['item_description'].fillna(value='missing', inplace=True)

In [6]:
handle_missing_inplace(train)

In [7]:
handle_missing_inplace(test)

### Transforming Text Data

In [8]:
print("Transforming text data to sequences...")
train_text = np.hstack([train.item_description.str.lower(), train.name.str.lower(), train.category_name.str.lower()])
test_text = np.hstack([test.item_description.str.lower(), test.name.str.lower(), test.category_name.str.lower()])
print("   Fitting tokenizer...")
tok_raw = Tokenizer()
tok_raw.fit_on_texts(train_text)
tok_raw.fit_on_texts(test_text)
print("   Transforming text to sequences...")
train['seq_item_description'] = tok_raw.texts_to_sequences(train.item_description.str.lower())
train['seq_name'] = tok_raw.texts_to_sequences(train.name.str.lower())
train['seq_category'] = tok_raw.texts_to_sequences(train.category_name.str.lower())
test['seq_item_description'] = tok_raw.texts_to_sequences(test.item_description.str.lower())
test['seq_name'] = tok_raw.texts_to_sequences(test.name.str.lower())
test['seq_category'] = tok_raw.texts_to_sequences(test.category_name.str.lower())

Transforming text data to sequences...
   Fitting tokenizer...
   Transforming text to sequences...


In [9]:
print(train['seq_name'][:5])

0    [2463, 9693, 7315, 70, 99, 7, 199]
1           [11268, 28928, 16890, 2715]
2                    [7745, 10561, 277]
3                      [227, 2717, 621]
4                [5002, 127, 1174, 340]
Name: seq_name, dtype: object


In [10]:
train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,seq_item_description,seq_name,seq_category
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,missing,10.0,1,No description yet,"[13, 88, 102]","[2463, 9693, 7315, 70, 99, 7, 199]","[77, 41, 70, 72]"
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...,"[33, 2715, 11, 8, 50, 18, 1, 256, 65, 21, 1182...","[11268, 28928, 16890, 2715]","[62, 922, 828, 3318, 1378]"
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,"[692, 74, 10, 5, 5381, 12, 244, 1, 5, 1036, 13...","[7745, 10561, 277]","[2, 41, 75, 277]"
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,missing,35.0,1,New with tags. Leather horses. Retail for [rm]...,"[6, 10, 80, 227, 6615, 283, 4, 22, 211, 1205, ...","[227, 2717, 621]","[37, 37, 197, 37, 197, 501]"
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,missing,44.0,0,Complete with certificate of authenticity,"[892, 10, 7132, 12, 2111]","[5002, 127, 1174, 340]","[2, 109, 353]"


### Encoding the Categorical Features

In [11]:
from sklearn.preprocessing import LabelEncoder

print("Processing categorical data...")
le = LabelEncoder()

le.fit(train.category_name)
train.category_name = le.transform(train.category_name)

le.fit(train.brand_name)
train.brand_name = le.transform(train.brand_name)

Processing categorical data...


In [12]:
print("Processing categorical data for test data...")
le = LabelEncoder()

le.fit(test.category_name)
test.category_name = le.transform(test.category_name)

le.fit(test.brand_name)
test.brand_name = le.transform(test.brand_name)

Processing categorical data for test data...


### Feature Engineering

In [13]:
#Function to get name and description lengths
def wordCount(text):
    try:
        if text == 'No description yet':
            return 0
        else:
            text = text.lower()
            words = [w for w in text.split(" ")]
            return len(words)
    except: 
        return 0



train['desc_len'] = train['item_description'].apply(lambda x: wordCount(x))
test['desc_len'] = test['item_description'].apply(lambda x: wordCount(x))
train['name_len'] = train['name'].apply(lambda x: wordCount(x))
test['name_len'] = test['name'].apply(lambda x: wordCount(x))

train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,seq_item_description,seq_name,seq_category,desc_len,name_len
0,0,MLB Cincinnati Reds T Shirt Size XL,3,808,4767,10.0,1,No description yet,"[13, 88, 102]","[2463, 9693, 7315, 70, 99, 7, 199]","[77, 41, 70, 72]",0,7
1,1,Razer BlackWidow Chroma Keyboard,3,86,3541,52.0,0,This keyboard is in great condition and works ...,"[33, 2715, 11, 8, 50, 18, 1, 256, 65, 21, 1182...","[11268, 28928, 16890, 2715]","[62, 922, 828, 3318, 1378]",36,4
2,2,AVA-VIV Blouse,1,1254,4162,10.0,1,Adorable top with a hint of lace and a key hol...,"[692, 74, 10, 5, 5381, 12, 244, 1, 5, 1036, 13...","[7745, 10561, 277]","[2, 41, 75, 277]",29,2
3,3,Leather Horse Statues,1,485,4767,35.0,1,New with tags. Leather horses. Retail for [rm]...,"[6, 10, 80, 227, 6615, 283, 4, 22, 211, 1205, ...","[227, 2717, 621]","[37, 37, 197, 37, 197, 501]",32,3
4,4,24K GOLD plated rose,1,1181,4767,44.0,0,Complete with certificate of authenticity,"[892, 10, 7132, 12, 2111]","[5002, 127, 1174, 340]","[2, 109, 353]",5,4


### Preparing data for applying RNN

In [14]:
MAX_NAME_SEQ = 50
MAX_ITEM_DESC_SEQ = 200
MAX_CATEGORY_SEQ = 10
MAX_TEXT = np.max([
    np.max(train.seq_name.max()),
    np.max(train.seq_item_description.max()),   
    np.max(train.seq_category.max())]) + 100
MAX_CATEGORY = np.max(train.category_name.max()) + 1
MAX_BRAND = np.max(train.brand_name.max()) + 1
MAX_CONDITION = np.max(train.item_condition_id.max()) + 1

In [15]:
#Function to get data for RNN model
def Keras_data(dataset):
    X = {
        'name': pad_sequences(dataset.seq_name, maxlen=MAX_NAME_SEQ),
        'item_desc': pad_sequences(dataset.seq_item_description, maxlen=MAX_ITEM_DESC_SEQ),
        'category_seq': pad_sequences(dataset.seq_category, maxlen=MAX_CATEGORY_SEQ),
        'brand_name': np.array(dataset.brand_name),
        'category_name': np.array(dataset.category_name),
        'item_condition': np.array(dataset.item_condition_id),
        'num_vars': np.array(dataset[["shipping"]]),
        'desc_len': np.array(dataset[["desc_len"]]),
        'name_len': np.array(dataset[["name_len"]])
    }
    return X

### Splitting the data

In [16]:
#Splitting the data into train and cv

y_true = train['price'].values

x_train, x_cv, y_train, y_cv = train_test_split(train, y_true, test_size=0.2,  random_state=0)

In [17]:
#Padding data for RNN
X_train = Keras_data(x_train)
X_cv = Keras_data(x_cv)
X_test = Keras_data(test)

In [18]:
#Scaling target variable to log.
y_train = np.log1p(y_train)
y_train = y_train.reshape(-1, 1)

In [19]:
y_cv = np.log1p(y_cv)
y_cv = y_cv.reshape(-1, 1)

In [20]:
#Y_true and Y_pred will already be in log scale,hence, there is no need to log them in the function.

def rmsle(Y_true, Y_pred):
    assert Y_true.shape == Y_pred.shape
    return np.sqrt(np.mean(np.square(Y_pred - Y_true)))

## Deep Learning Model

### LSTM - 3 Layers

In [21]:
np.random.seed(42)
def LSTM_model():
    # Inputs
    name = Input(shape=[X_train["name"].shape[1]], name="name")
    item_desc = Input(shape=[X_train["item_desc"].shape[1]], name="item_desc")
    brand_name = Input(shape=[1], name="brand_name")
    category_name = Input(shape=[1], name="category_name")
    item_condition = Input(shape=[1], name="item_condition")
    num_vars = Input(shape=[1], name="num_vars")
    desc_len = Input(shape=[1], name="desc_len")
    name_len = Input(shape=[1], name="name_len")

    # Embeddings layers
    emb_name = Embedding(MAX_TEXT, 50)(name)
    emb_item_desc = Embedding(MAX_TEXT, 50)(item_desc)
    emb_brand_name = Embedding(MAX_BRAND, 10)(brand_name)
    emb_category_name = Embedding(MAX_TEXT, 20)(category_name)
    emb_item_condition = Embedding(MAX_CONDITION, 5)(item_condition)
    emb_shipping = Embedding(2, 5)(num_vars)

    # RNN layers
    rnn_layer1 = LSTM(32, dropout=0.6)(emb_item_desc)
    rnn_layer2 = LSTM(16, dropout=0.6)(emb_name)
    rnn_layer3 = LSTM(8, dropout=0.6) (emb_category_name)

    # Main layer
    main_l = concatenate([
        Flatten()(emb_brand_name),
        Flatten() (emb_shipping),
        Flatten()(emb_item_condition),
        num_vars,
        rnn_layer1,
        rnn_layer2,
        rnn_layer3,
        desc_len,
        name_len])

    main_l = Dropout(0.1)(Dense(512, activation='relu')(main_l))
    main_l = Dropout(0.1)(Dense(256, activation='relu')(main_l))
    
    # Output layer
    output = Dense(1, activation="relu")(main_l)

    # Compile model
    model = Model([name, item_desc, brand_name, category_name, item_condition, num_vars, desc_len, name_len], output)
    optimizer = Adam(lr=0.001, decay=0.0)
    model.compile(loss="mse", optimizer= optimizer)

    return model

In [22]:
model = LSTM_model()
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
brand_name (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
num_vars (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
item_condition (InputLayer)     (None, 1)            0                                            
__________________________________________________________________________________________________
item_desc (InputLayer

In [23]:
BATCH_SIZE = 1000
epochs = 3
model_1 = model.fit(X_train, y_train,
          epochs=epochs,
          batch_size=BATCH_SIZE,
          validation_data=(X_cv, y_cv),
          verbose=1)

Instructions for updating:
Use tf.cast instead.
Train on 1170366 samples, validate on 292592 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [24]:
y_pred = model.predict(X_cv, batch_size=BATCH_SIZE)
print("RMSLE error:", rmsle(y_cv, y_pred))

RMSLE error: 0.46064911587612867


In [None]:
rnn_pred = model.predict(X_test, batch_size=BATCH_SIZE)
rnn_pred = np.exp(rnn_pred)-1

In [None]:
submission = test[["test_id"]]
submission["price"] = rnn_pred

In [None]:
#Storing the predictions in CSV file
submission.to_csv("./rnnsubmission.csv", index=False)