In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
from sklearn.preprocessing import LabelEncoder
from keras.layers import Dense,Input,LSTM,Bidirectional,Activation,Conv1D,CuDNNLSTM,CuDNNGRU,GRU
from keras.layers import Dropout,Embedding,GlobalMaxPooling1D, MaxPooling1D, Add, Flatten
from keras import backend as K
from keras.preprocessing import text, sequence
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.models import Model
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

In [None]:
def root_mean_squared_error(y_true, y_pred):
    """Root-mean-squared error expressed with Keras backend ops.

    Used both as the training loss and as the reported metric.  The mean is
    taken over every element of the residual (no axis is passed to
    ``K.mean``).
    """
    residual = y_true - y_pred
    mean_squared = K.mean(K.square(residual))
    return K.sqrt(mean_squared)

In [None]:
# dtype of every feature column shared by the training and the test CSV.
_feature_dtypes = {
    'user_id': 'str',
    'region': 'str',
    'city': 'str',
    'parent_category_name': 'str',
    'category_name': 'str',
    'param_1': 'str',
    'param_2': 'str',
    'param_3': 'str',
    'title': 'str',
    'description': 'str',
    'price': 'float64',
    'item_seq_number': 'int64',
    'activation_date': 'str',
    'user_type': 'str',
    'image': 'str',
    'image_top_1': 'float64',
}

# The training file carries the same features plus the regression target.
train_dtypes = dict(_feature_dtypes, deal_probability='float64')

# The test file has the features only (no target column).
test_dtypes = dict(_feature_dtypes)

In [None]:
# Word-vector file that is read later to build the embedding matrix.
EMBEDDING_FILE = 'cc.ru.300.vec'

# Both CSVs are indexed by item_id and get activation_date parsed as datetimes.
_csv_options = dict(index_col="item_id", parse_dates=["activation_date"])
train = pd.read_csv('train.csv', dtype=train_dtypes, **_csv_options)
test = pd.read_csv('test.csv', dtype=test_dtypes, **_csv_options)

In [None]:
embed_size = 300        # width of each word vector / embedding-matrix column count
maxlen = 100            # length every token sequence is padded or truncated to
max_features = 200000   # vocabulary cap handed to the tokenizer (num_words)

In [None]:
# Number of training rows; used later to slice the combined frame apart again.
train_idx = len(train)

# Separate the regression target from the training features.
y_train = train['deal_probability']
train = train.drop(columns=['deal_probability'])

# Stack train on top of test so both get identical preprocessing.
df = pd.concat([train, test], axis=0)

In [None]:
#Preprocess
# Missing prices are replaced by 0 and the column is stored as float32.
df['price'] = df['price'].fillna(0).astype('float32')

# Convert the category / location columns to pandas categoricals.
for cat_col in ('category_name', 'parent_category_name', 'region', 'city'):
    df[cat_col] = df[cat_col].astype('category')

# Day of the week of the activation date.
df["Weekday"] = df['activation_date'].dt.weekday

In [None]:
# Row labels on either side of the 2017-04-07 / 2017-04-08 activation-date cut.
training_index = df.loc[df.activation_date <= pd.to_datetime('2017-04-07')].index
validation_index = df.loc[df.activation_date >= pd.to_datetime('2017-04-08')].index
df = df.drop(["activation_date", "image"], axis=1)

# Group Param Features: join the three param columns into one text column.
# Vectorised string concatenation replaces the former row-wise
# `df.apply(..., axis=1)`, which calls a Python lambda once per row.
# `astype(str)` renders missing values as 'nan', exactly like `str(value)` did.
param_cols = ["param_1", "param_2", "param_3"]
df['text_feat'] = (
    df[param_cols[0]].astype(str)
    + ' ' + df[param_cols[1]].astype(str)
    + ' ' + df[param_cols[2]].astype(str)
)
df = df.drop(param_cols, axis=1)

In [None]:
# Columns that are turned into integer codes for the embedding layers.
cat_vars = ["region", "city", "parent_category_name", "category_name", "user_type"]

# One encoder object is re-fitted for each column in turn; values are cast to
# str first so that every column is encoded from strings.
lbl = LabelEncoder()
for cat_name in cat_vars:
    as_text = df[cat_name].astype(str)
    df[cat_name] = lbl.fit_transform(as_text)

# Log-transform the price: log(1 + price).
df['price'] = np.log1p(df['price'])

In [None]:
# Create new features
# For every text column, add eight count-based features.  The feature columns
# are created in the same order as before so downstream column positions are
# unaffected.
textfeats = ["description","text_feat", "title"]
for cols in textfeats:
    # FILL NA -- must happen BEFORE the cast to str: once NaN has been turned
    # into the string 'nan' there is nothing left for fillna to replace.
    df[cols] = df[cols].fillna('blank').astype(str)
    df[cols] = df[cols].str.lower() # Lowercase all text, so that capitalized words dont get treated differently

    tokens = df[cols].str.split()  # whitespace-separated words, computed once

    df[cols + '_num_chars'] = df[cols].str.len() # Count number of Characters
    df[cols + '_num_words'] = tokens.str.len() # Count number of Words
    df[cols + '_num_unique_words'] = tokens.apply(lambda words: len(set(words)))
    # Percentage of unique words (NaN when a text contains no words at all).
    df[cols + '_words_vs_unique'] = df[cols+'_num_unique_words'] / df[cols+'_num_words'] * 100
    # NOTE(review): this is the plain string length, i.e. identical to
    # `_num_chars`; kept unchanged so the column set stays the same.
    df[cols + '_num_letters'] = df[cols].str.len()
    # `Series.str.count` treats the pattern as a regular expression.  The
    # previous `comment.count('[a-zA-Z]')` used Python's `str.count`, which
    # looks for the literal substring "[a-zA-Z]" and therefore never matched.
    df[cols + '_num_alphabets'] = df[cols].str.count(r'[a-zA-Z]') # Count number of Alphabets
    df[cols + '_num_alphanumeric'] = df[cols].str.count(r'[A-Za-z0-9]') # Count number of AlphaNumeric
    df[cols + '_num_digits'] = df[cols].str.count('[0-9]') # Count number of Digits

In [None]:
#Fit transform
# One string per row: title, description and grouped params, space separated.
combined_text = (df['title'] + " " + df['description'] + " " + df['text_feat']).astype(str)

print("Start Tokenization.....")
tokenizer = text.Tokenizer(num_words=max_features, lower=True)
tokenizer.fit_on_texts(combined_text)
# Integer token sequences; appended as the LAST column of df.
df['seq_title_description_textfeat'] = tokenizer.texts_to_sequences(combined_text.str.lower())

# The raw text is no longer needed once it has been tokenized -- free it.
df.drop(columns=['description', 'title', 'text_feat'], inplace=True)
del combined_text
gc.collect()
print("End Tokenization.....")

In [None]:
# Parse the word-vector file into {word: float32 vector}.
embeddings_index = {}
with open(EMBEDDING_FILE,encoding='utf8') as f:
    for line in f:
        values = line.rstrip().rsplit(' ')
        # A valid line is the word followed by exactly `embed_size` numbers.
        # Anything else (e.g. a "<count> <dim>" header line, or a truncated
        # line) is skipped: storing it would create an entry whose short
        # vector is later broadcast across a whole embedding-matrix row.
        if len(values) != embed_size + 1:
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
word_index = tokenizer.word_index
#prepare embedding matrix
# One row per token id, capped at max_features rows.
num_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embed_size))

for token, token_id in word_index.items():
    # Ids at or beyond the cap are never looked up; all others are looked up
    # in the pre-trained vectors.
    vector = embeddings_index.get(token) if token_id < max_features else None
    if vector is None:
        # words not found in embedding index will be all-zeros.
        continue
    embedding_matrix[token_id] = vector

print('Embedding created')

In [None]:
# ---- Inputs: one padded token sequence plus five scalar features ----
seq_title_description_textfeat = Input(shape=(maxlen,), name="seq_title_description_textfeat")
region = Input(shape=[1], name="region")
city = Input(shape=[1], name="city")
category_name = Input(shape=[1], name="category_name")
parent_category_name = Input(shape=[1], name="parent_category_name")
price = Input(shape=[1], name="price")

# ---- Embeddings ----
# The frozen word embedding must have exactly as many rows as the pre-trained
# matrix; using max_features here fails with a weight-shape mismatch whenever
# the fitted vocabulary is smaller than the cap.
emb_seq_title_description = Embedding(
    embedding_matrix.shape[0], embedding_matrix.shape[1],
    weights=[embedding_matrix], trainable=False,
)(seq_title_description_textfeat)

# Categorical embeddings are sized by the number of label-encoded levels of
# each column (codes run from 0 to max), not by the text vocabulary size.
def _n_levels(column):
    """Number of distinct integer codes needed to index `column` of df."""
    return int(df[column].max()) + 1

emb_region = Embedding(_n_levels('region'), 10)(region)
emb_city = Embedding(_n_levels('city'), 10)(city)
emb_category_name = Embedding(_n_levels('category_name'), 10)(category_name)
emb_parent_category_name = Embedding(_n_levels('parent_category_name'), 10)(parent_category_name)

# ---- Recurrent text encoder ----
emb_seq_title_description = SpatialDropout1D(0.5)(emb_seq_title_description)
rnn_layer1 = Bidirectional(LSTM(64, return_sequences=True,recurrent_dropout=0.1))(emb_seq_title_description)
# Bidirectional only wraps recurrent layers, and `return_sequences` /
# `recurrent_dropout` are recurrent-layer arguments; Conv1D accepts neither
# (and requires a kernel_size), so the second layer is a GRU with the same
# width and arguments.
rnn_layer1 = Bidirectional(GRU(40, return_sequences=True,recurrent_dropout=0.1))(rnn_layer1)

# Summarise the sequence output by average- and max-pooling over time.
avg_pool = GlobalAveragePooling1D()(rnn_layer1)
max_pool = GlobalMaxPooling1D()(rnn_layer1)
rnn_layer1 = concatenate([avg_pool,max_pool])

# ---- Merge text summary, flattened categorical embeddings and price ----
main_l = concatenate([
          rnn_layer1
        , Flatten() (emb_region)
        , Flatten() (emb_city)
        , Flatten() (emb_category_name)
        , Flatten() (emb_parent_category_name)
        , price
    ])

main_l = Dropout(0.1)(Dense(512,activation='relu') (main_l))
main_l = Dropout(0.1)(Dense(64,activation='relu') (main_l))

#output
# Sigmoid keeps the prediction inside (0, 1).
output = Dense(1,activation="sigmoid") (main_l)
optimizer = Adam(clipnorm=0.8)
#model
model = Model([seq_title_description_textfeat, region, city, category_name, parent_category_name, price], output)
model.compile(optimizer = optimizer,
              loss= root_mean_squared_error,
              metrics = [root_mean_squared_error])

In [None]:
# File that receives the checkpoint with the best validation score.
filepath = "weights_avito.best.hdf5"

# Both callbacks watch the validation RMSE metric and treat lower as better.
_monitored = 'val_root_mean_squared_error'
checkpoint = ModelCheckpoint(filepath, monitor=_monitored, mode='min',
                             save_best_only=True, verbose=1)
early = EarlyStopping(monitor=_monitored, mode="min", patience=6)
callbacks_list = [checkpoint, early]

In [None]:
def get_data_frame(dataset):
    """Extract the six model-input columns into a fresh DataFrame.

    Parameters
    ----------
    dataset : numpy.ndarray or pandas.DataFrame
        Either the preprocessed frame itself (columns are then selected by
        name) or its 2-D array form, in which case columns are taken by
        position: region=1, city=2, parent_category_name=3, category_name=4,
        price=5 and the token-sequence column from the LAST position.

    Returns
    -------
    pandas.DataFrame
        Columns ``category_name, city, parent_category_name, price, region,
        seq_title_description_textfeat`` with a fresh 0..n-1 index.

    Notes
    -----
    The token-sequence column is appended to the frame last by the
    tokenization step, so it is read from position -1.  The previous fixed
    index 22 stopped being valid once the number of count features per text
    column grew (it then points at one of those count features, assuming the
    CSV columns are ordered as in ``train_dtypes``).
    """
    seq_name = 'seq_title_description_textfeat'

    if isinstance(dataset, pd.DataFrame):
        wanted = ['category_name', 'city', 'parent_category_name', 'price', 'region', seq_name]
        return dataset[wanted].reset_index(drop=True)

    dataset = np.asarray(dataset)
    positions = [
        ('category_name', 4),
        ('city', 2),
        ('parent_category_name', 3),
        ('price', 5),
        ('region', 1),
        (seq_name, -1),
    ]
    df1 = pd.DataFrame()
    for name, position in positions:
        df1[name] = np.array(dataset[:, position])

    return df1

def get_keras_data(dataset):
    """Build the dict of named NumPy inputs that is fed to the Keras model.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with the columns ``seq_title_description_textfeat``, ``region``,
        ``city``, ``category_name``, ``parent_category_name`` and ``price``
        (the layout returned by ``get_data_frame``).

    Returns
    -------
    dict
        Keys match the model's Input names.  The sequence entry is padded /
        truncated to ``maxlen``; the categorical entries are 1-D int32 arrays
        and ``price`` is an (n, 1) float32 array.
    """
    X = {
        'seq_title_description_textfeat': sequence.pad_sequences(dataset.seq_title_description_textfeat, maxlen=maxlen)
    }

    # Frames assembled from a mixed-type NumPy array hold object-dtype columns;
    # cast explicitly so the model receives real numeric arrays.
    for name in ('region', 'city', 'category_name', 'parent_category_name'):
        X[name] = np.asarray(dataset[name], dtype='int32')
    X['price'] = np.asarray(dataset[["price"]], dtype='float32')

    print("Data ready for Vectorization")

    return X

In [None]:
# Undo the earlier train/test concatenation using the stored row count.
X_train = df[:train_idx]
X_test = df[train_idx:]

# Hold out 5% of the training rows for validation (fixed seed).
X_tra, X_val, y_tra, y_val = train_test_split(
    X_train, y_train, train_size=0.95, random_state=233
)

# Everything downstream works on plain NumPy arrays.
X_train = np.array(X_tra)
X_val = np.array(X_val)
X_test = np.array(X_test)
y_tra = np.array(y_tra)
y_val = np.array(y_val)

In [None]:
# Reduce each split to the columns the model consumes, in train/val/test order.
X_train_final, X_val_final, X_test_final = (
    get_data_frame(split) for split in (X_train, X_val, X_test)
)

# The wide arrays are not needed any more.
del X_train, X_val, X_test
gc.collect()

In [None]:
# Convert each reduced frame into the model's named-input dict
# (train, validation, test -- in that order).
X_train_f, X_val_f, X_test_f = (
    get_keras_data(frame) for frame in (X_train_final, X_val_final, X_test_final)
)

# Release the intermediate frames.
del X_train_final, X_val_final, X_test_final
gc.collect()

In [None]:
print('Starting model training')
model.fit(X_train_f, y_tra, batch_size=64, epochs=3, validation_data=(X_val_f, y_val),callbacks = callbacks_list,verbose=1)

# The checkpoint callback has already written the best-scoring epoch to
# `filepath`.  Saving the current weights there would overwrite that checkpoint
# with the last epoch, so restore the best weights for prediction instead.
model.load_weights(filepath)
print('Ended model training')

In [None]:
# Score the test inputs and keep the result as a single-column 2-D array.
y_pred = model.predict(X_test_f, batch_size=1024, verbose=1).reshape(-1, 1)

In [None]:
# Fill the sample submission with the predictions and write it out.
# NOTE(review): assumes sample_submission.csv lists items in the same order as
# test.csv -- confirm.
submission = pd.read_csv('sample_submission.csv').assign(deal_probability=y_pred)
submission.to_csv('submission_64.csv', index=False)