In [1]:
import os
# Enumerate devices in PCI bus order (see issue #152) and expose no CUDA
# devices to this process; both variables are set in one call.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "",
})

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
from keras.preprocessing import sequence, text
from keras.models import Sequential
import keras.layers as layer 
from keras import regularizers
from keras.callbacks import EarlyStopping
from keras import backend as K

from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Read the train and test CSVs and stack them (train rows first) so the text
# preprocessing below sees every row.
df_train = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')
# ignore_index=True gives the stacked frame a fresh 0..n-1 index. Without it
# the index labels of both inputs are kept as-is, so labels shared by the two
# frames would occur twice and label-based alignment becomes ambiguous.
df_all = pd.concat([df_train, df_test], ignore_index=True)
# The separate frames are no longer needed; drop the names to free memory.
del df_train, df_test

In [4]:
# Names of the free-text columns that are merged into the single 'text' column.
text_features = [ 'title', 'description']

In [5]:
# Keep only the text columns and the regression target.
df_all = df_all[[*text_features, 'deal_probability']]

In [6]:
# Merge the text columns into one string per row. Missing values become empty
# strings so the concatenation never yields NaN. Each part is prefixed with a
# single space, so the result starts with a space.
df_all['text'] = ""
for text_col in text_features:
    df_all['text'] += " " + df_all[text_col].fillna("")

# Remove every run of characters that is neither a word character nor
# whitespace, then lowercase. The Unicode flag is passed as a real flag:
# written as "(?u)" inside the brackets it is read as the literal characters
# "(", "?", "u" and ")", which would exempt the brackets and question marks
# from removal. The raw string avoids the invalid "\w" escape warning.
pattern = re.compile(r'[^\w\s]+', re.UNICODE)
df_all['text'] = df_all['text'].apply(lambda x: pattern.sub("", x).lower())

In [7]:
# Load the pre-trained word vectors into a {word: float32 vector} dictionary.
embedding_dim = 300  # vector length expected on every data line of the file
embeddings_index = {}
# Explicit UTF-8 keeps decoding independent of the platform's default
# encoding; the with-block closes the file even if parsing fails part-way.
with open('../data/cc.ru.300.vec', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        # fastText text files start with a "<vocab size> <dimension>" header.
        # That line, and any row that does not carry exactly `embedding_dim`
        # coefficients, is skipped so it cannot be stored as a bogus vector.
        if len(values) != embedding_dim + 1:
            continue
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

print('Found %s word vectors.' % len(embeddings_index))

Found 2000000 word vectors.


In [8]:
# Turn every text into a sequence of word indices and pad/truncate each
# sequence to max_len entries.
max_len = 30
tk = text.Tokenizer(num_words=50000)

# Lowercase once and reuse the result for both fitting and encoding.
lowered_text = df_all['text'].str.lower()
tk.fit_on_texts(lowered_text.tolist())
X = sequence.pad_sequences(
    tk.texts_to_sequences(lowered_text.values),
    maxlen=max_len,
)
del lowered_text

In [9]:
# The raw text columns have been encoded into X; remove them from the frame.
df_all.drop(columns=text_features, inplace=True)

In [10]:
# Word -> integer index mapping from the fitted tokenizer; it is used to build
# the embedding matrix and to size the Embedding layer.
word_index = tk.word_index

In [11]:
# Row i holds the pre-trained vector of the word whose tokenizer index is i.
# The extra row (+ 1) is row 0, which no word from word_index is written to
# here and therefore stays all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    # Words not found in the embedding index keep their all-zero row.
    # A vector of the wrong length is skipped as well: a length-1 array would
    # silently broadcast over the whole row and any other length would raise.
    if embedding_vector is not None and embedding_vector.shape == embedding_matrix.shape[1:]:
        embedding_matrix[i] = embedding_vector

In [12]:
# Rows with a known target form the training set. The same boolean mask
# selects the rows of X, so features and targets stay aligned even if the
# labelled rows are not a contiguous prefix of df_all.
has_target = df_all['deal_probability'].notnull().values
df_train = df_all[has_target]
# Hold out 1% for validation; the fixed seed makes the split reproducible
# across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(
    X[has_target],
    df_train['deal_probability'].values,
    test_size=0.01,
    random_state=42,
)

In [13]:
def rmse(y_true, y_pred):
    """Root-mean-squared error between targets and predictions.

    The mean is taken over all elements before the square root. Reducing over
    the last axis only (``axis=-1``) would, for a model with a single output
    unit, take the square root of each individual squared error and so yield
    the absolute error per sample instead of an RMSE.

    Args:
        y_true: backend tensor of target values.
        y_pred: backend tensor of predictions, broadcastable against y_true.

    Returns:
        A scalar backend tensor holding the RMSE of the given batch.
    """
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

In [14]:
def get_model():
    """Build and compile the text regressor.

    Architecture: frozen Embedding initialised from ``embedding_matrix`` ->
    LSTM(30) -> two identical blocks of Dense(32) / PReLU / Dropout(0.2) /
    BatchNormalization -> Dense(1) with a sigmoid activation.
    The LSTM and the hidden Dense layers carry an L2 kernel regulariser and
    an L1 activity regulariser (both 2e-5). The model is compiled with
    mean-squared-error loss and the 'nadam' optimizer.

    Reads the module-level ``word_index``, ``embedding_matrix`` and
    ``max_len``. Prints progress markers while the layers are added.

    Returns:
        The compiled ``Sequential`` model.
    """
    def regularized(layer_cls, units, **extra):
        # Same weight/activity penalties for every regularised layer.
        return layer_cls(
            units,
            kernel_regularizer=regularizers.l2(2e-5),
            activity_regularizer=regularizers.l1(2e-5),
            **extra
        )

    net = Sequential()

    def add_dense_block():
        # Dense -> PReLU -> Dropout -> BatchNormalization
        net.add(regularized(layer.Dense, 32))
        net.add(layer.PReLU())
        net.add(layer.Dropout(0.2))
        net.add(layer.BatchNormalization())

    print('input')
    net.add(layer.Embedding(
        len(word_index) + 1,
        300,
        weights=[embedding_matrix],
        input_shape=(max_len,),
        trainable=False,
    ))
    net.add(regularized(layer.LSTM, 30, recurrent_dropout=0.2, dropout=0.2))
    print('LSTM')

    add_dense_block()
    # This marker is emitted after the first dense block although it reads 'LSTM'.
    print('LSTM')
    add_dense_block()

    # Single output squashed into (0, 1) by the sigmoid.
    net.add(layer.Dense(1))
    net.add(layer.Activation('sigmoid'))
    net.compile(loss='mean_squared_error', optimizer='nadam')

    return net

In [15]:
# Early-stopping callback watching the validation loss, configured with a
# minimum improvement of 0.001 and a patience of 4 epochs.
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=4, mode='auto')

In [16]:
# Build and compile the network; get_model() prints its progress markers.
model = get_model()

input
LSTM
LSTM


In [None]:
# Sanity check: display the shape of the training feature array.
X_train.shape

In [17]:
# Train until the early-stopping callback fires; epochs=10000 only acts as an
# upper bound. Validation uses the held-out X_val / y_val split.
fit_options = dict(
    validation_data=(X_val, y_val),
    batch_size=8,
    epochs=10000,
    verbose=1,
    shuffle=True,
    callbacks=[early_stopping],
)
model.fit(X_train, y_train, **fit_options)

Train on 1488389 samples, validate on 15035 samples
Epoch 1/10000
 285712/1488389 [====>.........................] - ETA: 33:30 - loss: 0.0658

KeyboardInterrupt: 