In [14]:
import keras
from keras.datasets import reuters
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer
import pandas as pd
import numpy as np
import math

In [4]:
tagged_questions = pd.read_csv('tagged_questions.csv')

In [22]:
tagged_questions.head()

Unnamed: 0,Id,FullText,Tag
0,6,The Two Cultures: statistics vs. machine learn...,machine-learning
1,21,Forecasting demographic census What are some o...,forecasting
2,22,Bayesian and frequentist reasoning in plain En...,bayesian
3,31,What is the meaning of p values and t values i...,hypothesis-testing
4,36,Examples for teaching: Correlation does not me...,correlation


In [11]:
MAX_WORDS = 1000
BATCH_SIZE = 32
EPOCHS = 5

## Split Test and Training Data

- group questions by tag
- split each array of questions by `TEST_SPLIT` for each tag
- push each split into `test_set` or `train_set` buckets

In [13]:
TEST_SPLIT = 0.2

In [217]:
grouped_questions = {k: g["FullText"].tolist() for k, g in tagged_questions.groupby('Tag')}

In [289]:
test_set = []
train_set = []

for tag, questions in grouped_questions.items():
    # only take tags with more than 3 questions
    if len(questions) < 3:
        continue
    
    offset = math.ceil(len(questions) * TEST_SPLIT)
    
    for test_question in questions[:offset]:
        test_set.append([test_question, tag])
        
    for train_question in questions[offset:]:
        train_set.append([train_question, tag])

In [290]:
x_test = [i[0] for i in test_set]
y_test = [i[1] for i in test_set]

x_train = [i[0] for i in train_set]
y_train = [i[1] for i in train_set]

print("test sequences:", len(x_test))
print("train sequences:", len(x_train))

test sequences: 17164
train sequences: 67542


## Fit text from all questions and tags

We need to map our words to integers, similar to `pandas.factorize`, for our entire wordset.

```javascript
// input
[['The lazy dog jumped'], ['The lazy man walked around the dog']]

// output
[1, 2, 3, 4, 5, 6, 7]
```

Once we have fitted the text we map our test and training data.

```javascript
// input
[['The lazy dog jumped'], ['The lazy man walked around the dog']]

// output
[[1, 2, 3, 4], [1, 2, 5, 6, 7, 1, 3]] 
```

In [291]:
question_tokenizer = Tokenizer(num_words=MAX_WORDS)
question_tokenizer.fit_on_texts(tagged_questions['FullText'])

x_test_sequences = question_tokenizer.texts_to_sequences(x_test)
x_train_sequences = question_tokenizer.texts_to_sequences(x_train)

In [292]:
# filters='' will avoid splitting tags and creating new words
tag_tokenizer = Tokenizer(filters='')
tag_tokenizer.fit_on_texts(tagged_questions['Tag'])

y_test_sequences = tag_tokenizer.texts_to_sequences(y_test)
y_train_sequences = tag_tokenizer.texts_to_sequences(y_train)

## Vectorize sequence data

In [293]:
NUM_CLASSES = np.max(y_train_sequences) + 1
print('num classes:', NUM_CLASSES)

num classes: 578


In [294]:
x_train_matrices = question_tokenizer.sequences_to_matrix(x_train_sequences, mode='tfidf')
x_test_matrices = question_tokenizer.sequences_to_matrix(x_test_sequences, mode='tfidf')
print('x_train shape:', x_train_matrices.shape)
print('x_test_shape:', x_test_matrices.shape)

x_train shape: (67542, 1000)
x_test_shape: (17164, 1000)


In [295]:
y_train_categories = keras.utils.to_categorical(y_train_sequences, NUM_CLASSES)
y_test_categories = keras.utils.to_categorical(y_test_sequences, np.max(y_test_sequences) + 1)
print('y_train_categories shape:', y_train_categories.shape)
print('y_test_categories shape:', y_test_categories.shape)

y_train_categories shape: (67542, 578)
y_test_categories shape: (17164, 578)


## Build model

In [296]:
model = Sequential()
model.add(Dense(512, input_shape=(MAX_WORDS,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(NUM_CLASSES))
model.add(Activation('softmax'))

In [297]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

history = model.fit(x_train_matrices, y_train_categories,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    verbose=1,
                    validation_split=0.1)

Train on 60787 samples, validate on 6755 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [298]:
score = model.evaluate(x_test_matrices, y_test_categories,
                       batch_size=BATCH_SIZE, verbose=1)

print('Test score:', score[0])
print('Test accuracy:', score[1])

Test accuracy: 0.391284082964
