In [13]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import os
import math

#Scikit Learn 
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing

# Interactive componnents. 
from ipywidgets import interact_manual

# Keras
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras import backend as K
from keras.utils import to_categorical,Sequence
from keras.callbacks import LambdaCallback

# Word2Vec format. 
from gensim.models import KeyedVectors

In [3]:
trainDF = pd.read_csv("./input/avito-demand-prediction/train.csv")
testDF = pd.read_csv("./input/avito-demand-prediction/test.csv")
trainDF.head()

Unnamed: 0,item_id,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,description,price,item_seq_number,activation_date,user_type,image,image_top_1,deal_probability
0,b912c3c6a6ad,e00f8ff2eaf9,Свердловская область,Екатеринбург,Личные вещи,Товары для детей и игрушки,Постельные принадлежности,,,Кокоби(кокон для сна),"Кокон для сна малыша,пользовались меньше месяц...",400.0,2,2017-03-28,Private,d10c7e016e03247a3bf2d13348fe959fe6f436c1caf64c...,1008.0,0.12789
1,2dac0150717d,39aeb48f0017,Самарская область,Самара,Для дома и дачи,Мебель и интерьер,Другое,,,Стойка для Одежды,"Стойка для одежды, под вешалки. С бутика.",3000.0,19,2017-03-26,Private,79c9392cc51a9c81c6eb91eceb8e552171db39d7142700...,692.0,0.0
2,ba83aefab5dc,91e2f88dd6e3,Ростовская область,Ростов-на-Дону,Бытовая электроника,Аудио и видео,"Видео, DVD и Blu-ray плееры",,,Philips bluray,"В хорошем состоянии, домашний кинотеатр с blu ...",4000.0,9,2017-03-20,Private,b7f250ee3f39e1fedd77c141f273703f4a9be59db4b48a...,3032.0,0.43177
3,02996f1dd2ea,bf5cccea572d,Татарстан,Набережные Челны,Личные вещи,Товары для детей и игрушки,Автомобильные кресла,,,Автокресло,Продам кресло от0-25кг,2200.0,286,2017-03-25,Company,e6ef97e0725637ea84e3d203e82dadb43ed3cc0a1c8413...,796.0,0.80323
4,7c90be56d2ab,ef50846afc0b,Волгоградская область,Волгоград,Транспорт,Автомобили,С пробегом,ВАЗ (LADA),2110.0,"ВАЗ 2110, 2003",Все вопросы по телефону.,40000.0,3,2017-03-16,Private,54a687a3a0fc1d68aed99bdaaf551c5c70b761b16fd0a2...,2264.0,0.20797


In [4]:
# TrainDF
trainDF["description"] = trainDF["description"].fillna("")
trainDF["title"] = trainDF["title"].fillna("")
trainDF = trainDF.fillna(0)

trainDF["activation_date"] = pd.to_datetime(trainDF["activation_date"])
trainDF["activation_date"] = trainDF["activation_date"].map(lambda x: x.dayofweek)

# TestDF
testDF["description"] = testDF["description"].fillna("")
testDF["title"] = testDF["title"].fillna("")
testDF = testDF.fillna(0)

testDF["activation_date"] = pd.to_datetime(testDF["activation_date"])
testDF["activation_date"] = testDF["activation_date"].map(lambda x: x.dayofweek)


def makeEmbeddingMatrix(ru_model,tokenizer):
    def getEmbedding(x):
        if x in ru_model:
            return ru_model[x]
        else:
            return None

    word_index = tokenizer.word_index
    embedding_matrix = np.zeros((len(word_index) + 1, 300))
    for word, i in word_index.items():
        embedding_vector = getEmbedding(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

# Description Tokenizer Setup 
max_features = 40000
maxlen = 400
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(trainDF["description"].append(testDF["description"]))

# Huge file. Takes up about 5 gigs of ram. 
ru_model = KeyedVectors.load_word2vec_format('./input/fasttext-russian-2m/wiki.ru.vec')
embedding_matrix = makeEmbeddingMatrix(ru_model,tokenizer)
del ru_model

In [5]:
# Create a training and validation set. 
trainDF = trainDF.sample(frac=1)

nItems = trainDF.shape[0]
nValidation = int(nItems * 0.2)

validation = trainDF[0:nValidation]
train      = trainDF[nValidation:]

In [6]:
categoricalFeats = ["region","category_name","parent_category_name","user_type","activation_date","city"]
quantFeats = ["price"]
textFeats = ["description"]
targetFeat = ["deal_probability"]
feats = categoricalFeats + quantFeats + textFeats+targetFeat

def codexLookup(cats,inStr):
    if(inStr in cats):
        return int(np.where(cats==inStr)[0][0])
    else:
        return 0

def toCode(df,category,codex,uList):
    return df[category].map(lambda x: codex[category](uList[category],x))

def getCatSizes(df,catFeats):
    lenDict = {}
    codex = {}
    uniqueLists = {}
    for feat in catFeats:
        cats = df[feat].unique()
        uniqueLists[feat] =  cats 
        lenDict[feat] = cats.shape[0]
        codex[feat] = lambda refDict,inStr : codexLookup(refDict,inStr)
    
    return lenDict,uniqueLists,codex
 
def preprocessDat(df,catSizes,codex,uList,tokenizer):
    # Convert categorical variables to one-hot vectors. 
    catFeats = []
    
    for cat in catSizes:
        catFeats.append(to_categorical(toCode(df,cat,codex,uList),num_classes=catSizes[cat]))
    
    # Description processing
    descs = pad_sequences(tokenizer.texts_to_sequences(df["description"]), maxlen=maxlen)
    
    df = df.drop(categoricalFeats+textFeats+targetFeat,axis=1)
    
    df = df.replace('', 0, regex=True)
    df = df.fillna(0)
    
    
    return  df,catFeats,descs

In [7]:
class DataGenerator(Sequence):
    'Generates data for Keras'
    def __init__(self, df, catSizes, codex, uList, tokenizer, scaler, maxlen,batch_size=32):
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.maxlen = maxlen
        self.catSizes = catSizes
        self.uList = uList
        self.codex = codex
        self.df = df
        self.scaler = scaler
       
    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(self.df.shape[0] / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        batch = self.df[index*self.batch_size:(index+1)*self.batch_size]
        quant,categorical,desc = preprocessDat(batch,self.catSizes,self.codex,self.uList,self.tokenizer)
        quant_scaled = scaler.transform(quant)

        return [quant_scaled,desc]+categorical, batch["deal_probability"]
    
class TestGenerator(Sequence):
    'Generates data for Keras'
    def __init__(self, df, catSizes, codex, uList, tokenizer, scaler, maxlen,batch_size=32):
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.maxlen = maxlen
        self.catSizes = catSizes
        self.uList = uList
        self.codex = codex
        self.df = df
        self.scaler = scaler
       
    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(self.df.shape[0] / self.batch_size))+1

    def __getitem__(self, index):
        'Generate one batch of data'
        batch = self.df[index*self.batch_size:(index+1)*self.batch_size]
        
        def preprocessDat2(df,catSizes,codex,uList,tokenizer):
            # Convert categorical variables to one-hot vectors. 
            catFeats = []

            for cat in catSizes:
                catFeats.append(to_categorical(toCode(df,cat,codex,uList),num_classes=catSizes[cat]))

            # Description processing
            descs = pad_sequences(tokenizer.texts_to_sequences(df["description"]), maxlen=maxlen)

            df = df.drop(categoricalFeats+textFeats,axis=1)

            return  df,catFeats,descs
        
        quant,categorical,desc = preprocessDat2(batch,self.catSizes,self.codex,self.uList,self.tokenizer)

        quant_scaled = scaler.transform(quant)

        return [quant_scaled,desc]+categorical

In [8]:
# Scaler setup. 
scaler = preprocessing.RobustScaler().fit(trainDF[quantFeats].append(testDF[quantFeats]).replace('', 0, regex=True).fillna(0))

catSizes,uList,codex = getCatSizes(trainDF.append(testDF),categoricalFeats)

trainGen = DataGenerator(train[feats],catSizes,codex,uList,tokenizer,scaler,maxlen,300)
validGen = DataGenerator(validation[feats],catSizes,codex,uList,tokenizer,scaler,maxlen,300)

fullGen = DataGenerator(trainDF[feats],catSizes,codex,uList,tokenizer,scaler,maxlen,300)

In [9]:
from keras.layers import Dense,Input,Dropout,Concatenate,Dropout,LSTM,Embedding
from keras.layers import GlobalMaxPool1D,GlobalAveragePooling1D,BatchNormalization
from keras.models import Model
from keras import regularizers
from keras import backend

from keras.optimizers import Adam

layerSize = 600
embed_size = 300

def rmse(y_true, y_pred):
    return backend.sqrt(backend.mean(backend.square(y_pred - y_true)))

def binary_crossentropy(y_true, y_pred):
    return backend.mean(backend.binary_crossentropy(y_true, y_pred), axis=-1)

def mixLoss(y_true, y_pred):
    return backend.sqrt(binary_crossentropy(y_true, y_pred)+rmse(y_true, y_pred))

quants = Input(shape=(len(quantFeats), ))

catInps = []
for catSize in catSizes:
    catInps.append(Input(shape=(catSizes[catSize], )))

descInp = Input(shape=(maxlen, )) #maxlen=200 as defined earlier
descEmb = Embedding(len(embedding_matrix), embed_size, weights=[embedding_matrix], input_length=maxlen, trainable=False)(descInp)
descLayer = LSTM(maxlen,dropout=0.2, recurrent_dropout=0.2,activation="sigmoid")
desc = descLayer(descEmb)

inp = Concatenate()([quants,desc]+catInps)
inp = BatchNormalization()(inp)
x = Dense(layerSize,activation="sigmoid")(inp)
x = Dropout(0.2)(x)

for i in range(6):
    x = Dense(layerSize,activation="sigmoid")(x)
    Dropout(0.3)(x)
    
x = Dense(1,activation="sigmoid")(x)

model = Model([quants,descInp]+catInps,x)

opt = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=True)

model.compile(loss="binary_crossentropy",optimizer=opt,metrics=[rmse])

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            (None, 400)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 400, 300)     248193600   input_8[0][0]                    
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 400)          1121600     embedding_1[0][0]                
__________________________________________________________________________________________________
input_2 (I

In [38]:
def validationTest(batch,loss):
    t = validGen[0]
    res = model.evaluate(t[0],t[1],verbose=0)
    #res["binary_crossentropy"]
    return 1
    
tmp = []
batch_validation_estimate = LambdaCallback(
    on_batch_end=lambda batch,logs: tmp.append(validationTest(batch,logs['loss'])))

hist = model.fit_generator(trainGen,steps_per_epoch=len(fullGen),epochs=20, verbose=1,callbacks=[batch_validation_estimate])


Epoch 1/20
   1/5011 [..............................] - ETA: 1:05:32 - loss: 0.3493 - rmse: 0.2229

  % delta_t_median)


   2/5011 [..............................] - ETA: 2:14:14 - loss: 0.3724 - rmse: 0.2361

  % delta_t_median)


   3/5011 [..............................] - ETA: 2:38:33 - loss: 0.3756 - rmse: 0.2362

  % delta_t_median)


   4/5011 [..............................] - ETA: 2:49:46 - loss: 0.3749 - rmse: 0.2375

  % delta_t_median)


   5/5011 [..............................] - ETA: 2:56:26 - loss: 0.3684 - rmse: 0.2343

  % delta_t_median)


   6/5011 [..............................] - ETA: 3:01:14 - loss: 0.3699 - rmse: 0.2372

  % delta_t_median)


   7/5011 [..............................] - ETA: 3:04:23 - loss: 0.3678 - rmse: 0.2369

  % delta_t_median)


   8/5011 [..............................] - ETA: 3:06:30 - loss: 0.3665 - rmse: 0.2364

  % delta_t_median)


   9/5011 [..............................] - ETA: 3:08:17 - loss: 0.3634 - rmse: 0.2359

KeyboardInterrupt: 

In [None]:
model.fit_generator(fullGen,steps_per_epoch=len(fullGen),epochs=20, verbose=1)

In [None]:
def learningCurves(hist):
    histAcc_train = hist.history['acc']
    histLoss_train = hist.history['loss']
    histAcc_validation = hist.history['val_acc']
    histLoss_validation = hist.history['val_loss']
    maxValAcc = np.max(histAcc_validation)
    minValLoss = np.min(histLoss_validation)

    plt.figure(figsize=(12,12))

    plt.plot(range(epochs),np.full(epochs,meanBaseline(trainDF)),label="Unbiased Estimator", color="red")

    plt.plot(range(epochs),histLoss_train, label="Training Loss", color="#acc6ef")
    plt.plot(range(epochs),histAcc_train, label="Training Accuracy", color = "#005ff9" )

    plt.plot(range(epochs),histLoss_validation, label="Validation Loss", color="#a7e295")
    plt.plot(range(epochs),histAcc_validation, label="Validation Accuracy",color="#3ddd0d")

    plt.scatter(np.argmax(histAcc_validation),maxValAcc,zorder=10,color="green")
    plt.scatter(np.argmin(histLoss_validation),minValLoss,zorder=10,color="green")

    plt.xlabel('Epochs',fontsize=14)
    plt.title("Learning Curves",fontsize=20)

    plt.legend()
    plt.show()

    print("Max validation accuracy: {0}".format(maxValAcc))
    print("Minimum validation loss: {0}".format(minValLoss))

learningCurves(hist)

In [10]:
testGen = TestGenerator(testDF[categoricalFeats + quantFeats + textFeats],catSizes,codex,uList,tokenizer,scaler,maxlen,300)

pred = model.predict_generator(testGen)

data_to_submit = pd.DataFrame.from_items([
    ('item_id',testDF["item_id"]),
    ('deal_probability', pd.Series(np.hstack(pred)))])

data_to_submit.to_csv('5_30_18_submit_4.csv', index = False)

In [14]:
model.save("Omega5_29_2018.h5")

In [13]:
model.fit_generator(trainGen,steps_per_epoch=len(trainGen),epochs=20, verbose=1, 
                    validation_data=validGen, validation_steps=len(validGen))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20

KeyboardInterrupt: 

In [None]:
x = None 

for layerInp in catInps:
    if x == None:
        x = Dense(layerSize,activation="relu")(layerInp)
        x = Dropout(0.2)(x)
    else: 
        x = Concatenate()([x,layerInp])
        x = Dense(layerSize,activation="relu")(layerInp)
        x = Dropout(0.2)(x)

for i in range(10):
    x = Dense(layerSize,activation="relu")(x)
    x = Dropout(0.3)(x)
    
x = Concatenate()([x,desc])

for i in range(10):
    x = Dense(layerSize,activation="relu")(x)
    x = Dropout(0.3)(x)
    
x = Concatenate()([x,quants])

for i in range(10):
    x = Dense(layerSize,activation="relu")(x)
    x = Dropout(0.3)(x)
    
x = Dense(1,activation="sigmoid")(x)

model = Model([quants,descInp]+catInps,x)
model.compile(loss="binary_crossentropy",optimizer='adam',metrics=[rmse])

model.summary()