In [1]:
%matplotlib inline
from stack_nlp import *
from jupyter_mplsettings import *
matplotlib.rc('font', **font)

Using TensorFlow backend.


In [2]:
cfg = local_import("./laptop.py")
cfg.options["read"] = ["questions", "features"]

In [3]:
PrepareData(cfg)
data = cfg.data
data.keys()
qs = data["meta"]
conn = data["dbconn"]

Shape of question df (1696819, 21)
Shape of merged df (1000000, 34)
Selecting only questions with at least 5 meaningful words.
This removes 8582 questions.
Removing bad values with missing feature information.
This affects 19 questions.
Shape of answer df (2028240, 21)
Information from answer df was merged into question df, but original df is trying to be closed and deleted from memory! Please change the config options to keep it open!
Calculating normalized columns. They are available under usual column name + _norm.


In [4]:
embed_dim = 300
embeddings_path = "/home/alex/data/glove.6B.%id.txt" % embed_dim
word2vec_output_file = "./glove.6B.%id.txt.word2vec" % embed_dim

from gensim.models import KeyedVectors
gensimmodel = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

In [7]:
def GetAnswerTimeQuantiles(df, ncat):
    timecat_bins = np.linspace(-0.5, ncat + 0.5, ncat + 2)

    tmask = np.isfinite(df.dt_accanswer_hour)
    time_categories = mquantiles(df.loc[tmask].dt_accanswer_hour, prob=np.linspace(0, 1, ncat + 1))
    return time_categories

In [9]:
def AddTimeCategories(df, timequants):
    tmask = np.isfinite(df.dt_accanswer_hour)
    df["timecat"] = 0
    df.loc[tmask, "timecat"] = np.digitize(df.loc[tmask].dt_accanswer_hour, timequants) - 1
    df.loc[~tmask, "timecat"] = len(timequants) - 1

In [28]:
qs["goodscore"] = np.asarray(qs.Score > 0, dtype=int)

0    1
1    0
2    1
3    1
4    0
Name: goodscore, dtype: int64

In [5]:
qs["ispython"] = qs.Tags.apply(lambda x: "python" in x)

In [6]:
qssel = SelectUniformlyFromColumn(qs, "ispython", n=150000)

In [7]:
train = 100000
test = 50000
qstrain = qssel.iloc[:train]
qstest = qssel.iloc[-test:]
print "Length of the training set:", len(qstrain)
print "Length of the testing set:", len(qstest)

# label = "dt_accanswer_hour"
# label = "timecat"
# label = "quickanswer"
# label = "goodscore"
label = "ispython"

Length of the training set: 100000
Length of the testing set: 50000


In [8]:
posts_train = GetDBPosts(qstrain.Id.values, conn)
posts_test = GetDBPosts(qstest.Id.values, conn)
print len(posts_train)
print len(posts_test)

100000
50000


In [9]:
titles_train = np.squeeze(qstrain.Title.values)
titles_test = np.squeeze(qstest.Title.values)

In [14]:
conn.close()

In [10]:
# limiting the number of features / words and setting up the tokenizer
max_features = 50000

word_tokenizer = Tokenizer(max_features)
word_tokenizer.fit_on_texts(posts_train)

In [11]:
# actual tokenization using the tokenizer from above
posts_train_tf = word_tokenizer.texts_to_sequences(posts_train)
posts_test_tf = word_tokenizer.texts_to_sequences(posts_test)

# padding to a maximal question length for all questions
maxlen_posts = 1000
posts_train_tf = pad_sequences(posts_train_tf, maxlen=maxlen_posts, padding="post", truncating="post")
posts_test_tf = pad_sequences(posts_test_tf, maxlen=maxlen_posts, padding="post", truncating="post")

print(posts_train_tf[0])

[    1    85    12    24     2    33   904     1     9     2  1060    16
  1508    31    31  1508   142    78   217    25  6807  1156    25    36
    25 11464    25  2820    25 16224  3091 23057    17   914   174     6
     3    31   856    78  5987    25    36    25    36    36    78    25
 11464    25 11464 11464    78    25  2820    25  2820  2820    78    25
 16224    25 16224 16224  5276 23057    17   317   174    16     3    31
   856    78  2709    25    47    25    36    78    25    47    25 11464
    78    25    47    25  2820    78    25    47    25 16224 23057    92
   910   142    33    65   849    14   320   414   856    78   320  3491
   142    56    36  3491   142  2709    56 11464  3491   142    56  2820
  3491   142    56 16224  3491   142   157 23057    16     3   296    21
    36   157 23626    75     6  2930     3  1508   477 19741  1508 38914
  1508  1508  4616   477     6     3    75 23626   279 19741 23626   279
 38914 23626   279  4184   171   477    16     3   

In [27]:
titles_train_tf = word_tokenizer.texts_to_sequences(titles_train)
titles_test_tf = word_tokenizer.texts_to_sequences(titles_test)

# padding to a maximal title length
maxlen_titles = 50
titles_train_tf = pad_sequences(titles_train_tf, maxlen=maxlen_titles, padding="post", truncating="post")
titles_test_tf = pad_sequences(titles_test_tf, maxlen=maxlen_titles, padding="post", truncating="post")

print(titles_train_tf[0])

[   53   296   218    14   378    11 15779    16   398     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0]


In [13]:
# setting up weights matrix for embedding in keras
weights_matrix = np.zeros((max_features + 1, embed_dim))

for word, i in word_tokenizer.word_index.items():

    if i > max_features:
        continue
    try:
#         embedding_vector = embedding_vectors.get(word)
        embedding_vector = gensimmodel.word_vec(word)
        if embedding_vector is not None:
            weights_matrix[i] = embedding_vector
    except:
        weights_matrix[i] = np.zeros(embed_dim)

In [14]:
batch_size = 100
epochs = 20
split = 0.2

In [15]:
# setting up posts branch for modeling
posts_input = Input(shape=(maxlen_posts,), name="posts_input")
posts_embedding = Embedding(max_features + 1, embed_dim, weights=[weights_matrix])(posts_input)
posts_pooling = GlobalAveragePooling1D()(posts_embedding)

aux_output = Dense(1, activation="sigmoid", name="aux_out")(posts_pooling)

In [18]:
# setting up posts branch for modeling
titles_input = Input(shape=(maxlen_titles,), name="titles_input")
titles_embedding = Embedding(max_features + 1, embed_dim, weights=[weights_matrix])(titles_input)
titles_pooling = GlobalAveragePooling1D()(titles_embedding)

aux_output2 = Dense(1, activation="sigmoid", name="aux_out2")(titles_pooling)

In [19]:
# adding embeddings for other features
relcols = ["BodyNCodes", "BodyNQMarks", "BodySize", "titlelen", "nwords", "ordersum", "ordermean", "orderstd", "ratio"]
# todo: extend here to actually add all needed embeddings in dynamic way

meta_embedding_dims = 64

hours_input = Input(shape=(1,), name="hours_input")
hours_embedding = Embedding(24, meta_embedding_dims)(hours_input)
hours_reshape = Reshape((meta_embedding_dims,))(hours_embedding)

dayofweeks_input = Input(shape=(1,), name="dayofweeks_input")
dayofweeks_embedding = Embedding(7, meta_embedding_dims)(dayofweeks_input)
dayofweeks_reshape = Reshape((meta_embedding_dims,))(dayofweeks_embedding)

dayofyears_input = Input(shape=(1,), name="dayofyears_input")
dayofyears_embedding = Embedding(366, meta_embedding_dims)(dayofyears_input)
dayofyears_reshape = Reshape((meta_embedding_dims,))(dayofyears_embedding)

In [20]:
# connecting the different embeddings
merged = concatenate([posts_pooling, titles_pooling, hours_reshape, dayofweeks_reshape, dayofyears_reshape])

hidden_1 = Dense(256, activation="relu")(merged)
hidden_1 = BatchNormalization()(hidden_1)

main_output = Dense(1, activation="sigmoid", name="main_out")(hidden_1)

In [21]:
model = Model(inputs=[posts_input,
                      titles_input,
                      hours_input,
                      dayofweeks_input,
                      dayofyears_input], outputs=[main_output, aux_output, aux_output2])

model.compile(loss="binary_crossentropy",
              optimizer="adam",
              metrics=["accuracy"],
              loss_weights=[1, 0.2, 0.2])

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
posts_input (InputLayer)        (None, 1000)         0                                            
__________________________________________________________________________________________________
titles_input (InputLayer)       (None, 50)           0                                            
__________________________________________________________________________________________________
hours_input (InputLayer)        (None, 1)            0                                            
__________________________________________________________________________________________________
dayofweeks_input (InputLayer)   (None, 1)            0                                            
__________________________________________________________________________________________________
dayofyears

In [56]:
plot_model(model, to_file='./plots/keras_model.png')
plot_model(model, to_file='./plots/model_shapes.png', show_shapes=True)

In [22]:
print np.sum(qstrain[label]), qstrain[label].shape
print 1 - np.sum(qstrain[label]) * 1. / qstrain[label].shape[0]
print 1 - np.sum(qstest[label]) * 1. / qstest[label].shape[0]
print(1 - np.mean(qstrain[label][:(int(posts_train_tf.shape[0] * split))]))
print(1 - np.mean(qstest[label][:(int(posts_test_tf.shape[0] * split))]))

50104 (100000,)
0.49896
0.50208
0.50305
0.5049


In [24]:
csv_logger = CSVLogger('training.csv')

In [25]:
# fitting :)
model.fit([posts_train_tf, titles_train_tf, qstrain.dayhour.values, qstrain.weekday.values, qstrain.day.values],
          [qstrain[label], qstrain[label], qstrain[label]],
          batch_size=batch_size,
          epochs=5,
          validation_split=split, callbacks=[csv_logger])

Train on 80000 samples, validate on 20000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f0396c77b90>

In [28]:
a = model.evaluate(x=[posts_test_tf, titles_test_tf, qstest.dayhour.values, qstest.weekday.values, qstest.day.values],
                   y=[qstest[label], qstest[label], qstest[label]])



In [33]:
model.save("keras_ispython.nnmodel")
model.save_weights("keras_ispython_weights.nnmodel")

In [29]:
print a

[0.35052877676486971, 0.23091761281058193, 0.30452059776306151, 0.29353521305084229, 0.93891999999999998, 0.89156000000000002, 0.87805999999999995]


In [49]:
preds = a[0]

In [60]:
preds_bin = np.around(preds).T[0]

In [64]:
print np.sum(preds_bin == label)
print len(preds_bin)

88057
100000
