In [1]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from pymystem3 import Mystem
import re

Using TensorFlow backend.


In [2]:
# Read the train and test splits. Missing values are replaced with empty
# strings so every cell in the text columns is a str.
df, test = (
    pd.read_csv(csv_path).fillna("")
    for csv_path in ("train_split.csv", "test_split.csv")
)

# Single Mystem instance used by the text-cleaning function.
m = Mystem()

def clean(x):
    """Lower-case, strip non-letters and lemmatise a piece of text.

    Every run of characters that is not a lower-case Cyrillic or Latin
    letter is replaced by a single space, the result is passed through the
    module-level Mystem instance ``m``, and the returned pieces are joined
    back into one string with surrounding whitespace removed.

    Parameters
    ----------
    x : str
        Raw text.

    Returns
    -------
    str
        The cleaned, lemmatised text.
    """
    # 'ё' (U+0451) lies outside the а-я range (U+0430-U+044F), so it has to be
    # listed explicitly; otherwise words such as "ещё" are cut into pieces.
    letters_only = re.sub('[^а-яёa-z]+', ' ', x.lower())
    return ''.join(m.lemmatize(letters_only)).strip()

# Clean every free-text column of both splits with the same function.
# Each column is cleaned from its OWN contents: filling commentNegative /
# commentPositive from `comment` would feed the model the general comment
# three times on the test split and make validation inputs inconsistent
# with the training inputs.
for frame in (df, test):
    for column in ("comment", "commentNegative", "commentPositive"):
        frame[column] = frame[column].apply(clean)

In [3]:
# filters="" turns off the tokenizer's own character filtering.
tkn = Tokenizer(filters="")

# Build the vocabulary from the three training columns stacked as separate
# documents. Adding the Series together (a + b + c) concatenates the strings
# row by row WITHOUT a separator, which fuses the last word of one field with
# the first word of the next and pollutes the vocabulary with bogus tokens.
tkn.fit_on_texts(pd.concat([df.comment, df.commentNegative, df.commentPositive]))
# Convert each text column of the training frame into lists of token ids
# using the fitted tokenizer.
comments, comments_neg, comments_pos = (
    tkn.texts_to_sequences(df[field])
    for field in ("comment", "commentNegative", "commentPositive")
)

# Same conversion for the test frame, with the same tokenizer.
t_comments, t_comments_neg, t_comments_pos = (
    tkn.texts_to_sequences(test[field])
    for field in ("comment", "commentNegative", "commentPositive")
)

# Per-field sequence length: the 95th percentile of the training sequence
# lengths, truncated to an integer.
c_len = int(np.percentile([len(seq) for seq in comments], 95))
cneg_len = int(np.percentile([len(seq) for seq in comments_neg], 95))
cpos_len = int(np.percentile([len(seq) for seq in comments_pos], 95))

# Pad / truncate the training sequences to those lengths.
c_pad = pad_sequences(comments, maxlen=c_len)
cneg_pad = pad_sequences(comments_neg, maxlen=cneg_len)
cpos_pad = pad_sequences(comments_pos, maxlen=cpos_len)

# The test sequences reuse the lengths derived from the training data.
t_c_pad = pad_sequences(t_comments, maxlen=c_len)
t_cneg_pad = pad_sequences(t_comments_neg, maxlen=cneg_len)
t_cpos_pad = pad_sequences(t_comments_pos, maxlen=cpos_len)

# Regression targets: the `reting` column as float32, shifted by 1 and divided
# by 4 (maps 1..5 onto 0..1 - assumes ratings are on a 1-5 scale, TODO confirm).
y = (df["reting"].values.astype(np.float32) - 1) / 4
t_y = (test["reting"].values.astype(np.float32) - 1) / 4

In [12]:
from keras.models import Model
from keras.layers import LSTM, Concatenate, Dense, Input, Embedding, Bidirectional, AlphaDropout, Masking, GRU

# Three token-id sequence inputs, one per text field. Each input width is
# taken from the corresponding padded training array.
comm = Input((c_pad.shape[1],))
cneg = Input((cneg_pad.shape[1],))
cpos = Input((cpos_pad.shape[1],))

# Embedding shared by all three inputs. mask_zero=True makes the layer emit a
# mask for id 0 so the recurrent layers below skip those timesteps (assumes 0
# is the padding id, which is the pad_sequences default). A Masking layer
# placed in front of the Embedding does not achieve this: Embedding discards
# any incoming mask unless mask_zero is set, so padding would be processed as
# ordinary tokens.
emb = Embedding(len(tkn.word_index) + 1, 128, mask_zero=True)

# Two-stage recurrent encoder, shared by all three inputs: a bidirectional
# LSTM that returns the full sequence, followed by an LSTM that reduces it to
# a single vector.
enc_lstm = Bidirectional(LSTM(256, return_sequences=True, dropout=0.2))
enc2_lstm = LSTM(256, dropout=0.2)

comm_emb = emb(comm)
cneg_emb = emb(cneg)
cpos_emb = emb(cpos)

comm_enc = enc_lstm(comm_emb)
cneg_enc = enc_lstm(cneg_emb)
cpos_enc = enc_lstm(cpos_emb)

comm_enc2 = enc2_lstm(comm_enc)
cneg_enc2 = enc2_lstm(cneg_enc)
cpos_enc2 = enc2_lstm(cpos_enc)

# Merge the three field encodings into one feature vector.
conc = Concatenate()([comm_enc2, cneg_enc2, cpos_enc2])

# Dense head: SELU layers paired with AlphaDropout, ending in a single
# sigmoid unit so the prediction lies in (0, 1).
res = Dense(256, activation="selu")(conc)
res = AlphaDropout(0.15)(res)
res = Dense(128, activation="selu")(res)
res = AlphaDropout(0.15)(res)
res = Dense(1, activation="sigmoid")(res)

model = Model([comm, cneg, cpos], res)
model.compile("adam", "mse")

In [None]:
# Train on the padded training arrays; the padded test arrays are passed as
# validation data.
model.fit(
    x=[c_pad, cneg_pad, cpos_pad],
    y=y,
    batch_size=512,
    epochs=50,
    validation_data=([t_c_pad, t_cneg_pad, t_cpos_pad], t_y),
)

Train on 13248 samples, validate on 2339 samples
Epoch 1/50
 3072/13248 [=====>........................] - ETA: 11s - loss: 0.1859

In [None]:
# Predict for the single training example at row 1. Slicing with 1:2 (instead
# of indexing with 1) keeps the leading batch axis on all three inputs and
# avoids hard-coding the padded widths, so it stays valid if they change.
model.predict([c_pad[1:2], cneg_pad[1:2], cpos_pad[1:2]])

In [None]:
# Preview the first rows of the cleaned training frame rather than dumping it whole.
df.head()