In [8]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from ipywidgets import interact_manual
import os
from sklearn import linear_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import Sequence, to_categorical
from skimage.io import imread
from skimage.transform import resize
import pickle
from gensim.models import KeyedVectors
import math

## Loading the Data

In [2]:
# Read the competition tables from disk: the training table first, then the test table.
trainDF, testDF = (
    pd.read_csv(csvPath)
    for csvPath in (
        "./input/avito-demand-prediction/train.csv",
        "./input/avito-demand-prediction/test.csv",
    )
)

# Preview the first rows of the training table.
trainDF.head()

Unnamed: 0,item_id,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,description,price,item_seq_number,activation_date,user_type,image,image_top_1,deal_probability
0,b912c3c6a6ad,e00f8ff2eaf9,Свердловская область,Екатеринбург,Личные вещи,Товары для детей и игрушки,Постельные принадлежности,,,Кокоби(кокон для сна),"Кокон для сна малыша,пользовались меньше месяц...",400.0,2,2017-03-28,Private,d10c7e016e03247a3bf2d13348fe959fe6f436c1caf64c...,1008.0,0.12789
1,2dac0150717d,39aeb48f0017,Самарская область,Самара,Для дома и дачи,Мебель и интерьер,Другое,,,Стойка для Одежды,"Стойка для одежды, под вешалки. С бутика.",3000.0,19,2017-03-26,Private,79c9392cc51a9c81c6eb91eceb8e552171db39d7142700...,692.0,0.0
2,ba83aefab5dc,91e2f88dd6e3,Ростовская область,Ростов-на-Дону,Бытовая электроника,Аудио и видео,"Видео, DVD и Blu-ray плееры",,,Philips bluray,"В хорошем состоянии, домашний кинотеатр с blu ...",4000.0,9,2017-03-20,Private,b7f250ee3f39e1fedd77c141f273703f4a9be59db4b48a...,3032.0,0.43177
3,02996f1dd2ea,bf5cccea572d,Татарстан,Набережные Челны,Личные вещи,Товары для детей и игрушки,Автомобильные кресла,,,Автокресло,Продам кресло от0-25кг,2200.0,286,2017-03-25,Company,e6ef97e0725637ea84e3d203e82dadb43ed3cc0a1c8413...,796.0,0.80323
4,7c90be56d2ab,ef50846afc0b,Волгоградская область,Волгоград,Транспорт,Автомобили,С пробегом,ВАЗ (LADA),2110.0,"ВАЗ 2110, 2003",Все вопросы по телефону.,40000.0,3,2017-03-16,Private,54a687a3a0fc1d68aed99bdaaf551c5c70b761b16fd0a2...,2264.0,0.20797


In [3]:
# Word vectors stored in word2vec text format.
# The file is huge: loading it takes up about 5 GB of RAM.
ru_model = KeyedVectors.load_word2vec_format(
    'input/fasttext-russian-2m/wiki.ru.vec'
)

## Preprocessing

In [33]:
def percentileOneHot(dealProb,nBuckets=10):
    """Assign a deal probability to a bucket and one-hot encode that bucket.

    The bucket index is ``floor(dealProb * nBuckets)``. A probability of
    exactly 1 is placed in the last bucket (``nBuckets - 1``) instead of
    falling one past the end.

    Args:
        dealProb: probability value to bucket.
        nBuckets: number of equal-width buckets (also the one-hot length).

    Returns:
        ``(bucketIdx, oneHot)`` where ``oneHot`` is
        ``to_categorical(bucketIdx, num_classes=nBuckets)``.
    """
    if dealProb == 1:
        bucketIdx = nBuckets - 1
    else:
        bucketIdx = math.floor(dealProb * nBuckets)

    return bucketIdx, to_categorical(bucketIdx, num_classes=nBuckets)

# Columns fed to the model; "deal_probability" is the prediction target.
features = ["price","description","image","region","title"]

# Keep only rows where every selected feature and the target are present,
# then keep only descriptions of at most 50 characters.
selectFrame = trainDF[features+["deal_probability"]].dropna(how='any')
selectFrame = selectFrame[selectFrame["description"].map(len) <= 50]

# Replace region names with integer codes. The codes are computed BEFORE the
# deal_probability filter below, so they are numbered in order of first
# appearance in the length-filtered frame.
# .assign() builds a new frame instead of writing through .loc into a filtered
# frame, which avoids SettingWithCopyWarning and gives the column an integer
# dtype rather than leaving integers inside an object column.
regionCodes, mapping_index_region = selectFrame["region"].factorize()
selectFrame = selectFrame.assign(region=regionCodes)

# region name -> integer code; the inverse of mapping_index_region.
# It can be used to encode another frame with the same codes, e.g.
# someFrame["region"].map(reverseRegionMap).
reverseRegionMap = {regionName: code for code, regionName in enumerate(mapping_index_region)}

# Drop low-probability rows first so the one-hot targets are only computed for
# rows that are actually kept (the resulting selectFrame is unchanged).
selectFrame = selectFrame[selectFrame["deal_probability"]>=0.1]

# Each element of percentileRes is a (bucket index, one-hot vector) tuple.
percentileRes = selectFrame["deal_probability"].map(percentileOneHot)
selectFrame = selectFrame.assign(percentileClassID = percentileRes.apply(lambda x: x[0]),
                                 percentileClass = percentileRes.apply(lambda x: x[1]))
selectFrame.head()

Unnamed: 0,price,description,image,region,title,deal_probability,percentileClassID,percentileClass
3,2200.0,Продам кресло от0-25кг,e6ef97e0725637ea84e3d203e82dadb43ed3cc0a1c8413...,1,Автокресло,0.80323,8,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
4,40000.0,Все вопросы по телефону.,54a687a3a0fc1d68aed99bdaaf551c5c70b761b16fd0a2...,2,"ВАЗ 2110, 2003",0.20797,2,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5,1300.0,В хорошем состоянии,eb6ad1231c59d3dc7e4020e724ffe8e4d302023ddcbb99...,1,Авто люлька,0.80323,8,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
7,500.0,Бойфренды в хорошем состоянии.,9bab29a519e81c14f4582024adfebd4f11a4ac71d323a6...,3,Бойфренды colins,0.80323,8,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
31,130.0,продаю бычков на откорм веса от 150 до 220,d507fc90a72ae0a6c5b74e0ab6cd723680450c3af4f466...,8,Бычки,0.78501,7,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."


In [35]:
# --- Image settings -------------------------------------------------------
# An image file name is built as imagePath + <image id> + imageExt.
imagePath = "./input/avito-demand-prediction/train_jpg/data/competition_files/train_jpg/"
imageExt = ".jpg"
# Shape every image is resized to before batching.
inpShape = (299, 299, 3)

# --- Text settings --------------------------------------------------------
# max_features: vocabulary cap passed to the Tokenizer (num_words).
# maxlen:       length token sequences are padded/truncated to.
# embed_size:   number of columns in the embedding matrix.
max_features, maxlen, embed_size = 20000, 50, 300

class DataGenerator(Sequence):
    """Keras ``Sequence`` that builds multi-input training batches from a DataFrame.

    Each batch is ``[[price, descTokens, images, region, titleTokens], targets]``
    where ``targets`` stacks the rows' ``percentileClass`` vectors.

    The frame must provide the columns ``price``, ``description``, ``image``,
    ``region``, ``title`` and ``percentileClass``. Module-level settings
    (``imagePath``, ``imageExt``, ``inpShape``, ``max_features``, ``maxlen``,
    ``embed_size``) and the ``ru_model`` word vectors are read from globals.
    """
    def __init__(self, sFrame ,batch_size=32,shuffle=True):
        """Store the frame and fit the tokenizer on its descriptions.

        Args:
            sFrame: DataFrame with the columns listed in the class docstring.
            batch_size: number of rows per batch.
            shuffle: when True, the frame's rows are reshuffled on
                construction and after every epoch.
        """
        self.sFrame = sFrame
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.descTokenizer = Tokenizer(num_words=max_features)

        # Takes some time upon initialization of the data generator.
        # NOTE(review): the vocabulary is fitted on descriptions only, yet the
        # same tokenizer also encodes titles in __getitem__; title words that
        # never occur in a description are dropped. Confirm this is intended.
        self.descTokenizer.fit_on_texts(sFrame["description"])

        self.on_epoch_end()

    def __len__(self):
        """Number of full batches per epoch.

        Trailing rows that do not fill a whole batch are not served.
        """
        return len(self.sFrame) // self.batch_size

    def makeEmbeddingMatrix(self):
        """Build the embedding matrix for the fitted tokenizer vocabulary.

        Returns:
            Array of shape ``(len(word_index) + 1, embed_size)``. Row ``i``
            holds the ``ru_model`` vector of the word with tokenizer index
            ``i``; rows for words missing from ``ru_model`` stay all-zero.
        """
        word_index = self.descTokenizer.word_index
        embedding_matrix = np.zeros((len(word_index) + 1, embed_size))
        for word, i in word_index.items():
            # Words not found in the embedding model keep their all-zero row.
            if word in ru_model:
                embedding_matrix[i] = ru_model[word]

        return embedding_matrix

    def __getitem__(self, index):
        """Generate batch number ``index``.

        Returns:
            ``[inputs, targets]`` with
            ``inputs = [price, descTokens, images, region, titleTokens]``.
        """
        # Rows belonging to this batch (positional slice of the shuffled frame).
        start = index * self.batch_size
        sampleFrame = self.sFrame.iloc[start:start + self.batch_size]

        # Process Images: read each file and resize it to inpShape.
        resizedImgs = [
            resize(imread(imagePath + imgId + imageExt), inpShape)
            for imgId in sampleFrame["image"]
        ]
        # np.stack adds a new leading batch axis. Joining the images with
        # np.hstack (axis 1) and reshaping afterwards would interleave rows of
        # different images and scramble every batch larger than one.
        outImgs = np.stack(resizedImgs)

        # Process Descriptions and Titles with the shared tokenizer.
        descTokens = pad_sequences(self.descTokenizer.texts_to_sequences(sampleFrame["description"]), maxlen=maxlen)
        titTokens = pad_sequences(self.descTokenizer.texts_to_sequences(sampleFrame["title"]), maxlen=maxlen)

        return [[sampleFrame["price"],descTokens,outImgs,sampleFrame["region"],titTokens],
                np.vstack(sampleFrame["percentileClass"].tolist())]

    def on_epoch_end(self):
        """Reshuffle the rows after each epoch when ``shuffle`` is enabled."""
        if self.shuffle:
            self.sFrame = self.sFrame.sample(frac=1)
            
# Build the batch generator over the preprocessed frame (default batch size
# and shuffling), then derive the embedding matrix from its fitted vocabulary.
dataGen = DataGenerator(sFrame=selectFrame)
embedding_matrix = dataGen.makeEmbeddingMatrix()

## Modeling

In [36]:
from keras.applications.inception_v3 import InceptionV3
from keras.layers import Input, Dense, Dropout
from keras.models import Model
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation,Concatenate
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D,BatchNormalization
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers


# Description Input
# Padded token ids -> frozen pretrained embeddings -> LSTM -> max over time steps.
desInp = Input(shape=(maxlen, ))
emb = Embedding(len(embedding_matrix), embed_size, weights=[embedding_matrix], input_length=maxlen, trainable=False)(desInp)
d = LSTM(maxlen, return_sequences=True,name='lstm_layer')(emb)
d = GlobalMaxPool1D()(d)

# Title Input
# Same pipeline as the description branch, with its own layers.
titInp = Input(shape=(maxlen, ))
emb2 = Embedding(len(embedding_matrix), embed_size, weights=[embedding_matrix], input_length=maxlen, trainable=False)(titInp)
t = LSTM(maxlen, return_sequences=True,name='lstm_layer2_1')(emb2)
t = GlobalMaxPool1D()(t)

# Price Input
# NOTE(review): the raw price is fed straight into a sigmoid layer; large
# unscaled values will likely saturate it. Consider normalizing upstream.
priceInp = Input(shape=(1, ))
y = Dense(10,activation="sigmoid")(priceInp)

# Region Input
# NOTE(review): region arrives as a single integer code, which the Dense layer
# treats as an ordinal number; confirm that is intended.
regionInp = Input(shape=(1, ))
r = Dense(len(mapping_index_region),activation="sigmoid")(regionInp)

# Image Input
imgInp = Input(shape=inpShape)
# Use InceptionV3 as a frozen feature extractor. include_top=False with
# pooling='avg' returns the globally average-pooled feature vector.
# Calling `.layers.pop()` on a model built with include_top=True only edits the
# Python layer list; the model's output stays the 1000-way ImageNet softmax.
# NOTE(review): weights saved from the previous architecture will no longer
# load into this model.
incpetionModel = InceptionV3(weights='imagenet', include_top=False, pooling='avg')
incpetionModel.trainable = False
z = incpetionModel(imgInp)

# Fuse the branches one at a time, with a 1000-unit ReLU layer after each merge.
x = Concatenate(axis=-1)([d,z])
x = Dense(1000, activation="relu")(x)
x = Concatenate(axis=-1)([x,t])
x = Dense(1000, activation="relu")(x)
x = Concatenate(axis=-1)([x,y])
x = Dense(1000, activation="relu")(x)
x = Concatenate(axis=-1)([x,r])
x = Dense(1000, activation="relu")(x)
# 10-way softmax, one unit per deal-probability bucket.
x = Dense(10, activation="softmax")(x)

# The input order must match the list order produced by DataGenerator.__getitem__.
model = Model(inputs=[priceInp,desInp,imgInp,regionInp,titInp], outputs=x)
model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['categorical_accuracy'])

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_19 (InputLayer)           (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 50, 300)      9858000     input_19[0][0]                   
__________________________________________________________________________________________________
lstm_layer (LSTM)               (None, 50, 50)       70200       embedding_7[0][0]                
__________________________________________________________________________________________________
input_23 (InputLayer)           (None, 299, 299, 3)  0                                            
__________________________________________________________________________________________________
input_20 (

## Training 


In [37]:
# Train for a single epoch, consuming every batch the generator offers.
# Batches are prepared by 10 workers and buffered in a queue of up to 50.
hist = model.fit_generator(
    dataGen,
    steps_per_epoch=len(dataGen),
    epochs=1,
    verbose=1,
    workers=10,
    max_queue_size=50,
)

Epoch 1/1


  warn("The default mode, 'constant', will be changed to 'reflect' in "




KeyboardInterrupt: 

In [25]:
# Persist the model's current weights to disk.
model.save_weights(
    "fullModel50desc.h5"
)

In [None]:
# Plot the per-epoch training curves recorded in the fit history:
# first the accuracy metric, then the loss.
history = hist
for _metricKey, _figTitle, _yLabel in (
    ('categorical_accuracy', 'model accuracy', 'accuracy'),  # "Accuracy"
    ('loss', 'model loss', 'loss'),                          # "Loss"
):
    plt.plot(history.history[_metricKey])
    plt.title(_figTitle)
    plt.ylabel(_yLabel)
    plt.xlabel('epoch')
    plt.show()