In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from keras.preprocessing.text import Tokenizer
import os
import matplotlib.pyplot as plt
import seaborn as sns
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import math

In [None]:
# Read the train and test CSVs from ../input into DataFrames.
trainDF, testDF = (
    pd.read_csv(csvPath)
    for csvPath in ("../input/train.csv", "../input/test.csv")
)

# Preview the first test rows (use trainDF.head() to inspect the training split instead).
testDF.head()

In [None]:
def percentileOneHot(dealProb,nBuckets=10):
    """Assign ``dealProb`` to one of ``nBuckets`` equal-width buckets.

    Args:
        dealProb: value to bucket; the bucket index is floor(dealProb * nBuckets).
        nBuckets: number of buckets / classes (default 10).

    Returns:
        A ``(bucket_index, one_hot)`` tuple, where ``one_hot`` is whatever
        ``to_categorical(bucket_index, num_classes=nBuckets)`` returns.
    """
    # A value of exactly 1 would floor to nBuckets (one past the last class),
    # so it is folded into the final bucket instead.
    if dealProb == 1:
        bucket = nBuckets - 1
    else:
        bucket = math.floor(dealProb * nBuckets)

    oneHot = to_categorical(bucket, num_classes=nBuckets)
    return bucket, oneHot

# Model inputs; "deal_probability" is the training target.
featureColumns = ["description","price","category_name"]

# Drop incomplete training rows. reset_index gives the frame a fresh 0..n-1
# index so label-based lookups line up with the positional rows of train_x,
# and it returns a new frame so the column assignments below do not write
# into a slice of the raw data.
trainDF = (
    trainDF[featureColumns + ["deal_probability"]]
    .dropna(axis=0, how='any')
    .reset_index(drop=True)
)

# Test rows cannot be dropped (every item_id needs a prediction), so fill
# per column: the tokenizer needs strings for "description" (a blanket
# fillna(0) would inject the integer 0), price falls back to 0, and missing
# categories are handled after the mapping below.
testDF = testDF[["item_id"] + featureColumns].fillna({"description": "", "price": 0})

max_features = 20000  # vocabulary size kept by the tokenizer
maxlen = 200          # descriptions are padded/truncated to this many tokens

# The vocabulary is learned from the training descriptions only.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(trainDF["description"])

trainDF["description"] = tokenizer.texts_to_sequences(trainDF["description"])
testDF["description"]  = tokenizer.texts_to_sequences(testDF["description"])

train_x = pad_sequences(trainDF["description"], maxlen=maxlen)
test_x = pad_sequences(testDF["description"], maxlen=maxlen)

# Encode categories as integer codes using the training set's categories.
trainDF['category_name'], mapping_index_category = pd.factorize(trainDF['category_name'])
reverseCatMap = {category: code for code, category in enumerate(mapping_index_category)}
# Categories that are missing or never seen in training map to NaN; encode
# them as -1 (the sentinel factorize itself uses) so no NaN reaches the model.
testDF['category_name'] = testDF['category_name'].map(reverseCatMap).fillna(-1).astype(int)

# Bucket the target: each row gets its bucket index and the matching one-hot vector.
percentileRes = trainDF["deal_probability"].map(percentileOneHot)
trainDF = trainDF.assign(
    percentileClassID=percentileRes.apply(lambda res: res[0]),
    percentileClass=percentileRes.apply(lambda res: res[1]),
)

In [None]:
# Display the first rows of trainDF as left by the cells above.
trainDF.head()

In [None]:
from keras.models import Model
from keras.layers import (
    Input, Dense, Dropout, LSTM, Embedding, Activation, Concatenate,
    Bidirectional, GlobalMaxPool1D,
)
from keras import initializers, regularizers, constraints, optimizers, layers

embed_size = 128

# --- Description branch: token ids -> embedding -> LSTM -> max over time steps
inp = Input(shape=(maxlen, ))  # one padded token sequence of length maxlen per sample
descFeat = Embedding(max_features, embed_size)(inp)
descFeat = LSTM(200, return_sequences=True, name='lstm_layer')(descFeat)
descFeat = GlobalMaxPool1D()(descFeat)
descFeat = Dropout(0.1)(descFeat)

# --- Price branch: a single scalar per sample
inp2 = Input(shape=(1, ))
priceFeat = Dense(50, activation="sigmoid")(inp2)

# --- Category branch: a single scalar (the integer category code) per sample
inp3 = Input(shape=(1, ))
catFeat = Dense(30, activation="sigmoid")(inp3)

# Join the three branches along the feature axis.
merged = Concatenate(axis=-1)([descFeat, priceFeat, catFeat])

# Two Dense+Dropout stages, built in the same order as listed here.
for units, dropRate in ((200, 0.2), (100, 0.1)):
    merged = Dense(units, activation="sigmoid")(merged)
    merged = Dropout(dropRate)(merged)

# Softmax over the 10 output classes.
classProbs = Dense(10, activation="softmax")(merged)

model = Model(inputs=[inp, inp2, inp3], outputs=classProbs)
model.compile(
    loss='categorical_crossentropy',
    optimizer='RMSprop',
    metrics=['categorical_accuracy'],
)

model.summary()

In [None]:
# Stack the per-row one-hot vectors stored in "percentileClass" into a single
# 2-D target matrix (one row per training sample). Series.as_matrix() was
# deprecated and later removed from pandas, so go through a plain list.
train_y = np.vstack(trainDF["percentileClass"].tolist())

# Hand Keras plain numpy arrays, in the order the model's inputs were
# declared: description sequences, price, category code.
hist = model.fit(
    [train_x, trainDF["price"].values, trainDF["category_name"].values],
    train_y,
    epochs=1,
    batch_size=300,
    validation_split=0.1,
)
model.save("desc_precentile_classifier.h5")

In [None]:
history = hist

# One figure per tracked quantity: (training key, validation key, title, y-axis label).
curveSpecs = (
    ('categorical_accuracy', 'val_categorical_accuracy', 'model accuracy', 'accuracy'),
    ('loss', 'val_loss', 'model loss', 'loss'),
)

for trainKey, valKey, figTitle, yLabel in curveSpecs:
    plt.plot(history.history[trainKey])
    plt.plot(history.history[valKey])
    plt.title(figTitle)
    plt.ylabel(yLabel)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

In [None]:
# Per-bucket mean of the numeric columns. numeric_only=True is required on
# recent pandas, where averaging the object columns ("description" holds token
# lists, "percentileClass" holds one-hot arrays) raises instead of being
# silently skipped.
groups = trainDF.groupby("percentileClassID")
groups.mean(numeric_only=True)

In [None]:
# Share of training rows falling in each bucket (row count per bucket / total rows).
groups = trainDF.groupby("percentileClassID")
bucketCounts = groups["percentileClassID"].count()
bucketCounts / bucketCounts.sum()


In [None]:
from keras import metrics
# Import the backend module rather than `from keras.backend import eval`,
# which would shadow Python's builtin eval() for the rest of the notebook.
from keras import backend as K

# Spot-check the model on the first few training rows.
nCheck = 20
modelPred = model.predict([
    train_x[:nCheck],
    trainDF["price"].iloc[:nCheck].values,
    trainDF["category_name"].iloc[:nCheck].values,
])

for i in range(len(modelPred)):
    # Positional lookup (.iloc): train_x rows are positional, while trainDF's
    # index labels may have gaps after dropna, so trainDF[...][i] could pick a
    # different row or raise KeyError.
    trueOneHot = trainDF["percentileClass"].iloc[i]
    # categorical_accuracy compares the argmax of the true and predicted vectors.
    print(K.eval(metrics.categorical_accuracy(trueOneHot, modelPred[i])))
    print(trueOneHot)

In [None]:
# Predict for every test row. The model was built with three inputs
# (description sequences, price, category code), so all three must be passed,
# in that order. NaN category codes (categories not seen in training) are
# replaced with -1 so they do not propagate NaN through the network.
testClassProbs = model.predict(
    [
        test_x,
        testDF["price"].values,
        testDF["category_name"].fillna(-1).values,
    ],
    batch_size=300,
)

# The network outputs one probability per bucket, not a deal probability.
# Collapse each row to a scalar by taking the expected value over the bucket
# midpoints, (k + 0.5) / nClasses for bucket k.
nClasses = testClassProbs.shape[1]
bucketMidpoints = (np.arange(nClasses) + 0.5) / nClasses
modelPred = testClassProbs.dot(bucketMidpoints)

In [None]:
# Build the submission: one deal_probability per test item_id.
submissionPred = np.ravel(modelPred)

# Fail loudly on a length mismatch instead of letting index alignment pad the
# missing predictions with NaN in the output file.
if len(submissionPred) != len(testDF):
    raise ValueError(
        "expected {} predictions (one per test row), got {}".format(
            len(testDF), len(submissionPred)))

# DataFrame.from_items was deprecated and later removed from pandas; a dict
# plus an explicit `columns` list gives the same column order on any version.
data_to_submit = pd.DataFrame(
    {
        'item_id': testDF["item_id"].values,
        'deal_probability': submissionPred,
    },
    columns=['item_id', 'deal_probability'],
)

data_to_submit.to_csv('csv_to_submit.csv', index = False)

In [None]:
# Sanity check: (rows, columns) of the submission frame. The original bare
# name `shape` is not defined anywhere visible in this notebook and raises
# NameError on a fresh kernel.
# NOTE(review): assumed the intent was the submission's shape; confirm.
data_to_submit.shape

#### 