In [23]:
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.feature_extraction.text import CountVectorizer,HashingVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import pandas as pd
from keras.models import Sequential
from __future__ import print_function
import os
import numpy as np
np.random.seed(1337)

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.layers import LSTM, SimpleRNN, GRU
import sys

In [24]:
# --- Training set: lower-cased comments plus the 'quality' column ---
rawdata = pd.read_csv('/home/lich/Workspace/Rate-My-Professor/Data/train.csv')
train_comments = rawdata['comments'].fillna('')  # missing comment -> empty string
rawdata['comments'] = train_comments
quality = rawdata['quality']
comment = [text.lower() for text in train_comments]

# --- Test set: only the comments are read here ---
# NOTE: `rawdata` is re-bound below, so after this cell it refers to the test frame.
rawdata = pd.read_csv('/home/lich/Workspace/Rate-My-Professor/Data/test.csv')
test_comments = rawdata['comments'].fillna('')
rawdata['comments'] = test_comments
rawcomment_test = [text.lower() for text in test_comments]


In [25]:
# ---- Word-level model configuration ----

# Filesystem locations (everything hangs off BASE_DIR).
BASE_DIR = '/home/lich/Workspace/Learning/NLP_Learning/lstm'
GLOVE_DIR = '%s/glove.6B/' % BASE_DIR
TEXT_DATA_DIR = '%s/20_newsgroup/' % BASE_DIR

# Text vectorisation.
MAX_SEQUENCE_LENGTH = 200  # `maxlen` used when padding the integer sequences
MAX_NB_WORDS = 20000       # vocabulary cap handed to the tokenizer
EMBEDDING_DIM = 100        # columns of the embedding matrix (the 100d GloVe file is loaded)

# Training.
VALIDATION_SPLIT = 0.2     # fraction of the training data held out for validation
batch_size = 32


In [26]:
# first, build index mapping words in the embeddings set
# to their embedding vector

print('Indexing word vectors.')

embeddings_index = {}
# Each line of the GloVe file is "<word> <v1> <v2> ...".
# `with` guarantees the handle is closed even if a line fails to parse.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
    for line in f:
        values = line.split()
        if not values:
            # tolerate blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))


Indexing word vectors.
Found 400000 word vectors.


In [27]:

# second, prepare text samples and their labels
# get train and test text
print('Processing text dataset')

texts = comment  # list of training text samples (lower-cased comments)
labels_index = {}  # dictionary mapping label name to numeric id (left empty here)
labels = quality  # per-sample labels, taken from the 'quality' column

test_texts = rawcomment_test  # list of test text samples (lower-cased comments)

print('Found %s texts for training.' % len(texts))
print('Found %s texts for testing.' % len(test_texts))

Processing text dataset
Found 117811 texts.


In [28]:

# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Turn the test text samples into integer sequences as well.
# The test texts must be encoded with the tokenizer fitted on the training
# texts so that a given word id means the same word in both sets (and matches
# the rows of the embedding matrix built from `word_index`). Fitting a second
# tokenizer on the test texts was wasted work and its vocabulary was never
# used, so the test_* names are kept only as aliases of the training objects.
test_tokenizer = tokenizer
test_sequences = test_tokenizer.texts_to_sequences(test_texts)

test_word_index = test_tokenizer.word_index
print('Encoded %s test texts with the training vocabulary.' % len(test_sequences))

Found 53528 unique tokens.


In [29]:
# Quick sanity checks on the encoded training data.
print(sequences[0])        # integer ids of the first training comment
print(word_index["love"])  # id assigned to the token "love"
# Longest comment, measured in whitespace-separated words. The built-in max()
# over a generator is used because np.max() over map() only works where map()
# returns a list (Python 2); on Python 3 map() is a lazy iterator.
print(max(len(x.split()) for x in comment))

[29, 39, 55, 2078, 173, 20, 6, 937, 46, 1, 152, 296, 215, 146, 178, 20, 6, 2, 9, 59, 259, 391, 4, 1114, 24, 584, 5, 1, 70, 10, 1, 5126, 9, 183, 236, 164, 3, 69, 2, 347, 24, 1, 64, 17, 42, 143, 42, 15, 199, 33, 3, 424, 1595, 35, 15, 18, 1, 180, 66, 641, 1, 732, 2, 111, 19, 7, 149, 1, 4]
172
124


In [30]:
# Pad/truncate every integer sequence to MAX_SEQUENCE_LENGTH ids.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# NOTE: `labels` is re-bound to its one-hot encoding here, so this cell is not
# safe to run twice without first re-running the cell that defines `labels`.
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
print('Shape of test data tensor:', test_data.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Slice at an explicit boundary rather than with a negative count: when
# nb_validation_samples is 0, data[:-0] would be empty and data[-0:] would be
# the whole array, i.e. the two sets would be swapped.
split_at = data.shape[0] - nb_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]
x_test = test_data[:]

print('Preparing embedding matrix.')


Shape of data tensor: (117811, 200)
Shape of label tensor: (117811, 11)
Preparing embedding matrix.


In [33]:
# Sanity check: number of samples held out for validation
# (computed as int(VALIDATION_SPLIT * data.shape[0]) in the split cell).
print(nb_validation_samples)

23562


In [32]:
def _build_embedding_matrix(token_ids, vectors, max_words, dim):
    """Build a matrix whose row ``i`` is the pre-trained vector of word id ``i``.

    Parameters
    ----------
    token_ids : dict
        Mapping word -> integer id (e.g. a tokenizer's ``word_index``).
    vectors : dict
        Mapping word -> 1-D array of length ``dim`` (the pre-trained vectors).
    max_words : int
        Upper bound on the vocabulary size; larger ids are ignored.
    dim : int
        Number of columns of the matrix.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(min(max_words, len(token_ids)) + 1, dim)``. Rows for
        words missing from ``vectors`` (and row 0) stay all-zeros.
    """
    n_rows = min(max_words, len(token_ids)) + 1
    matrix = np.zeros((n_rows, dim))
    for word, i in token_ids.items():
        if i >= n_rows:
            # id beyond the vocabulary cap: no row for it
            continue
        vector = vectors.get(word)
        if vector is not None:
            # words not found in embedding index will be all-zeros.
            matrix[i] = vector
    return matrix


# prepare embedding matrix
nb_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = _build_embedding_matrix(
    word_index, embeddings_index, MAX_NB_WORDS, EMBEDDING_DIM)

print(embedding_matrix.shape)

# for test embedding matrix
# (previously this second pass wrote into `embedding_matrix` and printed its
# shape again, leaving `test_embedding_matrix` all-zeros)
test_nb_words = min(MAX_NB_WORDS, len(test_word_index))
test_embedding_matrix = _build_embedding_matrix(
    test_word_index, embeddings_index, MAX_NB_WORDS, EMBEDDING_DIM)

print(test_embedding_matrix.shape)



(20001, 100)


In [None]:
# Frozen embedding layer initialised from the pre-trained embedding matrix.
embedding_layer = Embedding(nb_words + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False,
                            dropout=0.2)

# `batch_size` is taken from the configuration cell rather than redefined here.

print('Build model...')
# Number of output classes follows the width of the one-hot label tensor
# instead of a hard-coded 11.
nb_classes = y_train.shape[1]

model = Sequential()
model.add(embedding_layer)
model.add(LSTM(100, dropout_W=0.2, dropout_U=0.2))  # try using a GRU instead, for fun
# The LSTM output feeds the softmax classifier directly. The former
# Dense(1) + sigmoid pair (a leftover from a binary-classification setup)
# squeezed everything through a single unit before the multi-class layer.
model.add(Dense(nb_classes, activation='softmax'))

# try using different optimizers and different optimizer configs
# One-hot multi-class targets with a softmax output call for categorical
# cross-entropy; binary cross-entropy would treat every class column as an
# independent yes/no problem and inflate the reported accuracy.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
model.fit(x_train, y_train, batch_size=batch_size, nb_epoch=1,
          validation_data=(x_val, y_val))
# These metrics are computed on the validation split, not on the test set.
score, acc = model.evaluate(x_val, y_val,
                            batch_size=batch_size)
print('Validation score:', score)
print('Validation accuracy:', acc)

Build model...
Train...
Train on 94249 samples, validate on 23562 samples
Epoch 1/5

In [None]:


# NOTE(review): `first_model`, `test_X` and `test_y` are not defined in any cell
# visible in this notebook — confirm where they come from; on a fresh kernel
# this cell would raise NameError.
# Predict with `first_model` and score the predictions against `test_y`
# using mean squared error.
y_first_model = first_model.predict(test_X)
mse = mean_squared_error(test_y, y_first_model)