In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from ipywidgets import interact_manual
import os
from keras import backend as K
from sklearn.metrics import mean_squared_error
import math
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn import preprocessing

In [4]:
trainDF = pd.read_csv("./input/avito-demand-prediction/train.csv")

trainDF.head()

Unnamed: 0,item_id,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,description,price,item_seq_number,activation_date,user_type,image,image_top_1,deal_probability
0,b912c3c6a6ad,e00f8ff2eaf9,Свердловская область,Екатеринбург,Личные вещи,Товары для детей и игрушки,Постельные принадлежности,,,Кокоби(кокон для сна),"Кокон для сна малыша,пользовались меньше месяц...",400.0,2,2017-03-28,Private,d10c7e016e03247a3bf2d13348fe959fe6f436c1caf64c...,1008.0,0.12789
1,2dac0150717d,39aeb48f0017,Самарская область,Самара,Для дома и дачи,Мебель и интерьер,Другое,,,Стойка для Одежды,"Стойка для одежды, под вешалки. С бутика.",3000.0,19,2017-03-26,Private,79c9392cc51a9c81c6eb91eceb8e552171db39d7142700...,692.0,0.0
2,ba83aefab5dc,91e2f88dd6e3,Ростовская область,Ростов-на-Дону,Бытовая электроника,Аудио и видео,"Видео, DVD и Blu-ray плееры",,,Philips bluray,"В хорошем состоянии, домашний кинотеатр с blu ...",4000.0,9,2017-03-20,Private,b7f250ee3f39e1fedd77c141f273703f4a9be59db4b48a...,3032.0,0.43177
3,02996f1dd2ea,bf5cccea572d,Татарстан,Набережные Челны,Личные вещи,Товары для детей и игрушки,Автомобильные кресла,,,Автокресло,Продам кресло от0-25кг,2200.0,286,2017-03-25,Company,e6ef97e0725637ea84e3d203e82dadb43ed3cc0a1c8413...,796.0,0.80323
4,7c90be56d2ab,ef50846afc0b,Волгоградская область,Волгоград,Транспорт,Автомобили,С пробегом,ВАЗ (LADA),2110.0,"ВАЗ 2110, 2003",Все вопросы по телефону.,40000.0,3,2017-03-16,Private,54a687a3a0fc1d68aed99bdaaf551c5c70b761b16fd0a2...,2264.0,0.20797


In [5]:
# Convert categorical variables to one-hot vectors.  
trainDF["category_name"] = pd.Categorical(trainDF["category_name"]).codes
trainDF["city"] = pd.Categorical(trainDF["city"]).codes
trainDF["region"] = pd.Categorical(trainDF["region"]).codes
trainDF["parent_category_name"] = pd.Categorical(trainDF["parent_category_name"]).codes
trainDF["user_type"] = pd.Categorical(trainDF["user_type"]).codes

trainDF.head()

Unnamed: 0,item_id,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,description,price,item_seq_number,activation_date,user_type,image,image_top_1,deal_probability
0,b912c3c6a6ad,e00f8ff2eaf9,19,460,4,42,Постельные принадлежности,,,Кокоби(кокон для сна),"Кокон для сна малыша,пользовались меньше месяц...",400.0,2,2017-03-28,1,d10c7e016e03247a3bf2d13348fe959fe6f436c1caf64c...,1008.0,0.12789
1,2dac0150717d,39aeb48f0017,17,1300,2,22,Другое,,,Стойка для Одежды,"Стойка для одежды, под вешалки. С бутика.",3000.0,19,2017-03-26,1,79c9392cc51a9c81c6eb91eceb8e552171db39d7142700...,692.0,0.0
2,ba83aefab5dc,91e2f88dd6e3,16,1276,0,2,"Видео, DVD и Blu-ray плееры",,,Philips bluray,"В хорошем состоянии, домашний кинотеатр с blu ...",4000.0,9,2017-03-20,1,b7f250ee3f39e1fedd77c141f273703f4a9be59db4b48a...,3032.0,0.43177
3,02996f1dd2ea,bf5cccea572d,21,940,4,42,Автомобильные кресла,,,Автокресло,Продам кресло от0-25кг,2200.0,286,2017-03-25,0,e6ef97e0725637ea84e3d203e82dadb43ed3cc0a1c8413...,796.0,0.80323
4,7c90be56d2ab,ef50846afc0b,4,317,6,0,С пробегом,ВАЗ (LADA),2110.0,"ВАЗ 2110, 2003",Все вопросы по телефону.,40000.0,3,2017-03-16,1,54a687a3a0fc1d68aed99bdaaf551c5c70b761b16fd0a2...,2264.0,0.20797


In [6]:
# Create a training and validation set. 
trainDF = trainDF.sample(frac=1)

nItems = trainDF.shape[0]
nValidation = int(nItems * 0.2)

validation = trainDF[0:nValidation]
train      = trainDF[nValidation:]

In [7]:
print(train.shape)
print(validation.shape)

(1202740, 18)
(300684, 18)


In [8]:
feats = ["price","region","city","category_name","parent_category_name",
         "user_type","title","description","image_top_1","item_seq_number","activation_date","deal_probability"]

max_features = 20000
maxlen = 200
descTokenizer = Tokenizer(num_words=max_features)

def preprocessDat(df):
    select = df.dropna()
    select = select.assign(descLen = select["description"].map(len))
    select = select.assign(titleLen = select["title"].map(len))
    select["activation_date"] = pd.to_datetime(select["activation_date"])
    select["activation_date"] = select["activation_date"].map(lambda x: x.dayofweek)
    
    return select

# Training Preprocessing 
select_train = preprocessDat(train[feats])
descTokenizer.fit_on_texts(select_train["description"]) 
select_train_desc = pad_sequences(descTokenizer.texts_to_sequences(select_train["description"]),maxlen = maxlen )
select_train = select_train.drop(["description","title"],axis=1)

# Validation Preprocesing
select_val = preprocessDat(validation[feats])
select_val_desc = pad_sequences(descTokenizer.texts_to_sequences(select_val["description"]),maxlen = maxlen )
select_val = select_val.drop(["description","title"],axis=1)

scaler = preprocessing.RobustScaler().fit(select_train.drop("deal_probability",axis=1))
select_train_scaled = scaler.transform(select_train.drop("deal_probability",axis=1))
select_val_scaled   = scaler.transform(select_val.drop("deal_probability",axis=1))

select_train.head()

Unnamed: 0,price,region,city,category_name,parent_category_name,user_type,image_top_1,item_seq_number,activation_date,deal_probability,descLen,titleLen
500607,8500.0,19,460,41,0,1,2918.0,19,3,0.03486,209,20
1295999,50100150.0,13,1101,10,4,0,119.0,425,0,0.2067,342,18
801460,950.0,12,1048,10,4,0,52.0,95,6,0.0,364,41
22750,380000.0,9,18,13,5,1,2037.0,6,1,0.0,27,21
46857,300.0,27,1724,10,4,1,49.0,111,5,0.0,88,40


# NN

In [13]:
from keras.layers import Dense, Input,BatchNormalization,Dropout
from keras.models import Model
from keras import backend

def rmse(y_true, y_pred):
    return backend.sqrt(backend.mean(backend.square(y_pred - y_true), axis=-1))

inp1 = Input(shape=(len(feats)-1, ))
x = Dense(100,activation="sigmoid")(inp1)
x = Dense(100,activation="sigmoid")(x)
x = Dense(100,activation="sigmoid")(x)
x = Dense(100,activation="sigmoid")(x)
x = Dense(1,activation="sigmoid")(x)

model = Model(inp1,x)
model.compile(loss="binary_crossentropy",optimizer='RMSProp',metrics=[rmse])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 11)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 100)               1200      
_________________________________________________________________
dense_11 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_12 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 101       
Total params: 21,501
Trainable params: 21,501
Non-trainable params: 0
_________________________________________________________________


In [14]:
model.fit(select_train_scaled,select_train["deal_probability"],epochs=100,batch_size=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
151000/977920 [===>..........................] - ETA: 26s - loss: 0.3554 - rmse: 0.1639

KeyboardInterrupt: 

In [81]:
pred = model.predict(select_val_scaled)
true = select_val["deal_probability"]
math.sqrt(mean_squared_error(true,pred))

0.23772483705098388

In [17]:
train_mean = select_train["deal_probability"].mean()
pred = np.full(select_val["deal_probability"].shape,train_mean )
true = select_val["deal_probability"]
math.sqrt(mean_squared_error(true,pred))

0.2627653092426415

In [16]:
select_val["deal_probability"].shape

(261417,)