In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): star imports. Names used in this notebook without an explicit
# import (`matplotlib`, `font`, `local_import`, `PrepareData`) presumably come
# from these two modules -- confirm, and prefer explicit imports.
from stack_nlp import *
from jupyter_mplsettings import *
# Apply the settings in `font` (unpacked as keyword arguments) to matplotlib's
# 'font' rc group.
matplotlib.rc('font', **font)

In [2]:
# Load ./laptop.py through the project helper `local_import`.
# NOTE(review): presumably returns a module-like config object -- confirm.
cfg = local_import("./laptop.py")
# Set the "read" option to the question and feature tables.
# NOTE(review): presumably tells PrepareData which inputs to load -- confirm.
cfg.options["read"] = ["questions", "features"]

In [3]:
# Run the project's data preparation step on the config object.
# NOTE(review): cfg.data is read right after this call, so PrepareData is
# presumably what populates it -- confirm in stack_nlp.
PrepareData(cfg)
data = cfg.data
# NOTE(review): the result of this call is discarded (it is not the last
# expression of the cell, so nothing is displayed); looks like leftover
# exploration.
data.keys()
# Object stored under the "meta" key; later cells treat it as the question table.
qs = data["meta"]

Shape of question df (1696819, 21)
Shape of merged df (1000000, 33)
Selecting only questions with at least 5 meaningful words.
This removes 8582 questions.
Removing bad values with missing feature information.
This affects 19 questions.
Shape of answer df (2028240, 21)
Information from answer df was merged into question df, but original df is trying to be closed and deleted from memory! Please change the config options to keep it open!
Calculating normalized columns. They are available under usual column name + _norm.


In [4]:
# Preview the first rows of the Tags column.
qs["Tags"].head()

0                            [postgresql, grails, gsp]
1                              [python, shell, expect]
2                        [angularjs, provider, hybrid]
3                                [javascript, node.js]
4    [android, dataset, adapter, android-adapter, c...
Name: Tags, dtype: object

In [5]:
# Show the column index of the question table.
qs.columns

Index([u'AcceptedAnswerId', u'AnswerCount', u'BodyNCodes', u'BodyNQMarks',
       u'BodySize', u'ClosedDate', u'CommentCount', u'CommunityOwnedDate',
       u'CreationDate', u'FavoriteCount', u'Id', u'LastActivityDate',
       u'LastEditDate', u'LastEditorUserId', u'OwnerUserId', u'ParentId',
       u'PostTypeId', u'Score', u'Tags', u'Title', u'ViewCount', u'titlelen',
       u'hasAnswers', u'dt_created', u'Id_r', u'hot_indices', u'nwords',
       u'ordermean', u'orderstd', u'ordersum', u'prob_bern', u'prob_poiss',
       u'ratio', u'ParentId_first', u'CreationDate_first', u'Id_acc',
       u'CreationDate_acc', u'dayhour', u'weekday', u'dt_answer',
       u'dt_accanswer', u'dt_answer_hour', u'dt_accanswer_hour',
       u'BodyNCodes_norm', u'BodyNQMarks_norm', u'BodySize_norm',
       u'titlelen_norm', u'nwords_norm', u'ordersum_norm', u'ordermean_norm',
       u'orderstd_norm', u'ratio_norm'],
      dtype='object')

In [6]:
# Cutoff for token ids: only ids strictly below this value are kept.
vocab_size = 1000
# Split each ';'-separated id string, drop the final piece, convert the rest to
# int and keep the ids below the cutoff.
# NOTE(review): dropping the last piece presumably accounts for a trailing ';'
# in hot_indices -- confirm against the code that writes that column.
encoded_docs = (
    qs["hot_indices"]
    .str.split(";")
    .apply(lambda pieces: [idx for idx in map(int, pieces[:-1]) if idx < vocab_size])
)

In [8]:
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split

In [7]:
# define class labels
labels = qs.hasAnswers
# pad every document to a fixed length of max_length (200) token ids;
# padding='post' appends the zeros after the real tokens
# NOTE(review): no `truncating` argument is passed, so documents longer than
# max_length are cut according to the Keras default (documented as 'pre',
# i.e. leading tokens are dropped) -- confirm this is intended.
max_length = 200
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

Using TensorFlow backend.


[[ 88   2  28 ...,   0   0   0]
 [237 220   5 ...,   0   0   0]
 [ 16  99  76 ...,   0   0   0]
 ..., 
 [213 668  36 ...,   0   0   0]
 [508 534  83 ...,   0   0   0]
 [ 98 402 229 ...,   0   0   0]]


In [10]:
# Sanity check of the padded matrix dimensions: (number of documents, max_length).
padded_docs.shape

(991399, 200)

In [9]:
# Hold out 40% of the rows for evaluation; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    padded_docs,
    labels,
    test_size=0.4,
    random_state=42,
)

In [11]:
# Binary classifier over padded token-id sequences:
# embedding -> LSTM -> single sigmoid unit.
model = Sequential()
# Learn a 100-dimensional vector for each of the vocab_size token ids.
model.add(Embedding(vocab_size, 100, input_length=max_length))
# The LSTM reduces each embedded sequence to one 100-dimensional vector.
model.add(LSTM(100))
# One sigmoid output, matching the binary cross-entropy loss below.
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# only adds a stray "None" line to the output.
model.summary()
# fit the model (`epochs` is the Keras 2 name for the deprecated `nb_epoch`)
model.fit(X_train, y_train, epochs=10, batch_size=100)
# evaluate on the held-out split; with one compiled metric, evaluate() returns
# the loss followed by the accuracy
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print('Accuracy: %f' % (accuracy * 100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 200, 100)          100000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 180,501
Trainable params: 180,501
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
 66900/594839 [==>...........................] - ETA: 35:07 - loss: 0.5443 - acc: 0.7670

KeyboardInterrupt: 

In [1]:
# LSTM for sequence classification in the IMDB dataset
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

Using TensorFlow backend.


In [2]:
# fix the numpy random seed for reproducibility
# NOTE(review): this seeds numpy only; the TensorFlow backend may need its own
# seed for fully repeatable training -- confirm.
numpy.random.seed(7)
# load the dataset, keeping only the top_words most frequent words; rarer words
# are replaced by the loader's out-of-vocabulary index (Keras `oov_char`), not
# by zero. `num_words` is the Keras 2 name for the deprecated `nb_words`.
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)
# truncate and pad input sequences to a common length
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)
# create the model: embedding -> LSTM -> single sigmoid unit
# (the variable name below is misspelled but kept, in case other cells use it)
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# only adds a stray "None" line to the output.
model.summary()
# `epochs` is the Keras 2 name for the deprecated `nb_epoch`
model.fit(X_train, y_train, epochs=3, batch_size=64)
# Final evaluation of the model; scores[0] is the loss, scores[1] the accuracy
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))



Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None




Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 87.52%
