In [6]:
import numpy as np
import data_helpers
from w2v import train_word2vec

from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Input, Merge, Convolution1D, MaxPooling1D

# Seed NumPy's global RNG up front so the random shuffle performed later in
# the notebook is repeatable from run to run.
np.random.seed(2)

# --------------------------------------------------
# Configuration
# --------------------------------------------------
#
# Which flavour of the sentence-classification CNN to build. The options
# follow Yoon Kim, "Convolutional Neural Networks for Sentence
# Classification", Section 3:
#   CNN-rand | CNN-non-static | CNN-static
model_variation = 'CNN-rand'
print('Model variation is %s' % model_variation)

# --- Network shape ---
sequence_length = 125        # length of each input sequence fed to the network
embedding_dim = 20           # size of each embedding vector
num_filters = 150            # filters per convolution branch
filter_sizes = (3, 4)        # one convolution branch per filter length
hidden_dims = 150            # width of the dense layer after the conv graph
dropout_prob = (0.25, 0.5)   # (before the conv graph, after the dense layer)

# --- Training ---
num_epochs = 1
batch_size = 32
val_split = 0.2              # fraction passed as validation_split to fit()

# --- Word2Vec (forwarded to train_word2vec) ---
context = 10                 # context window size
min_word_count = 1           # minimum word count

# Data Preparation
# ==================================================
#
# Load the dataset through the project-local data_helpers module.
# A recorded run further down printed x with shape (117811, 125) and y with
# shape (117811, 11). `vocabulary` is later sized with len() for the
# embedding layer and `vocabulary_inv` is handed to train_word2vec.
# NOTE(review): presumably x holds zero-padded word indices and y one-hot
# label rows -- confirm against data_helpers.load_data.
print("Loading data...")
x, y, vocabulary, vocabulary_inv = data_helpers.load_data()

Model variation is CNN-rand
Loading data...


In [7]:
# Quick sanity check of the loaded arrays. Passing one pre-formatted string
# keeps this line valid on both Python 2 and Python 3 (the bare `print a, b`
# statement is a SyntaxError on Python 3) while emitting the same text.
print("{} {}".format(x.shape, y))

(117811, 125) [[0 0 0 ..., 0 1 0]
 [0 0 0 ..., 1 0 0]
 [0 0 0 ..., 1 0 0]
 ..., 
 [0 0 0 ..., 0 1 0]
 [0 0 0 ..., 0 0 1]
 [0 0 0 ..., 0 0 1]]


In [8]:

# Pick the initial embedding weights for the configured model variation.
if model_variation == 'CNN-rand':
    # No pre-trained weights: the Embedding layer keeps its own initialisation.
    embedding_weights = None
elif model_variation in ('CNN-non-static', 'CNN-static'):
    embedding_weights = train_word2vec(
        x, vocabulary_inv, embedding_dim, min_word_count, context)
    if model_variation == 'CNN-static':
        # Index the first weight array with x, so the network is fed the
        # looked-up rows directly instead of going through an Embedding layer.
        # NOTE(review): this rebinds `x` in place, so re-running the cell
        # would index with the already-converted array -- confirm intended.
        x = embedding_weights[0][x]
else:
    raise ValueError('Unknown model variation')

# Draw a single random ordering of the rows and apply it to both arrays so
# that samples and labels stay aligned.
n_samples = len(y)
shuffle_indices = np.random.permutation(n_samples)
x_shuffled = x[shuffle_indices]
# Reduce each label row to the position of its largest entry.
y_shuffled = np.argmax(y[shuffle_indices], axis=1)

print("Vocabulary Size: {:d}".format(len(vocabulary)))

# Building model
# ==================================================
#
# Functional sub-network with one input and one output: a
# convolution -> max-pool -> flatten branch per filter size, every branch
# reading the same input, with the branch outputs concatenated at the end.
graph_in = Input(shape=(sequence_length, embedding_dim))

# Settings shared by all convolution branches; only the filter length varies.
conv_settings = dict(nb_filter=num_filters,
                     border_mode='valid',
                     activation='relu',
                     subsample_length=1)

convs = []
for fsz in filter_sizes:
    branch = Convolution1D(filter_length=fsz, **conv_settings)(graph_in)
    branch = MaxPooling1D(pool_length=2)(branch)
    convs.append(Flatten()(branch))

# With a single branch there is nothing to concatenate.
out = Merge(mode='concat')(convs) if len(filter_sizes) > 1 else convs[0]

graph = Model(input=graph_in, output=out)

# main sequential model
#
# The label matrix `y` has one column per class (a recorded run shows shape
# (117811, 11)) and `y_shuffled` stores the integer class index of each row.
# The classifier head therefore needs one softmax unit per class, trained
# with the sparse categorical loss that accepts integer targets. A single
# sigmoid unit with binary cross-entropy can only separate two classes and
# cannot fit label values above 1.
num_classes = y.shape[1]

model = Sequential()
if model_variation != 'CNN-static':
    # Map word indices to embedding vectors. `embedding_weights` is None for
    # CNN-rand, which leaves the layer with its default initialisation.
    model.add(Embedding(len(vocabulary), embedding_dim, input_length=sequence_length,
                        weights=embedding_weights))
# input_shape is only needed when this Dropout is the first layer, i.e. when
# the Embedding layer above was skipped (CNN-static).
model.add(Dropout(dropout_prob[0], input_shape=(
    sequence_length, embedding_dim)))
model.add(graph)
model.add(Dense(hidden_dims))
model.add(Dropout(dropout_prob[1]))
model.add(Activation('relu'))
# One output unit per class, normalised into a probability distribution.
model.add(Dense(num_classes))
model.add(Activation('softmax'))
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='rmsprop', metrics=['accuracy'])

# Peek at one shuffled sample and its label. Each print receives a single
# pre-formatted string so the cell runs on Python 2 and Python 3 alike (the
# bare `print a, b` statement is a SyntaxError on Python 3) and the printed
# text is unchanged.
print("{} {}".format(x_shuffled.shape, x_shuffled[1]))
print("{} {}".format(y_shuffled.shape, y_shuffled[1]))
# Training model
# ==================================================
# model.fit(x_shuffled, y_shuffled, batch_size=batch_size,
#           nb_epoch=num_epochs, validation_split=val_split, verbose=1)


Vocabulary Size: 53530
(117811, 125) [  212  1714  1001    71  6677  3798    13    49   269    90     3  2025
    32    15   201 18300   100    16  2544   211   141    24    16  2725
   755  1232   244    10    43    19     7    60    31     1   119   168
     7    74    23     4 11661   244    10    94   488   322   166     2
   285   120   166     9   979   148    10  1592    12   624     2    17
   833  3361   240     9  1184     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0]
(117811,) 10


(69356, 125) (69356, 11)
Vocabulary Size: 41279
[   9  149  271  714   31   21    6    2  135    1 2319  455   10   71    3
   52  138 1719    5    4   38   35   32    9   69  162   56 1589    3  229
   44  268   12  143  798   16   15   56 1044   26    9  141   41    1  130
    2  271  743   18    1  172  349   94  150    1  120  573  149   58   27
    1   79   57   98  158    3   25   95   43   10   16    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0] [35540 62156 49799 ..., 59148 40106  9173]
(69356, 125) [   13    15    59    10     1    72   259    26   118    45    12     1
  5121   895    77     6   133     4    50    79  4842    82    46     2
    13   446     7    51     7   152     3    95    18   145   323   635
   120     1 25538   253   103    13     5

NameError: name 'predict' is not defined

In [9]:
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.feature_extraction.text import CountVectorizer,HashingVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import pandas as pd


In [13]:
# Hold out part of the shuffled data for evaluation, using scikit-learn's
# default split proportions.
X_train, X_test, y_train, y_test = train_test_split(x_shuffled, y_shuffled)

# Single-step pipeline: a random-forest regressor fitted directly on the rows
# of x_shuffled, with the integer labels as regression targets.
# http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
forest = RandomForestRegressor(n_estimators=200, max_depth=25, n_jobs=-1)
pipeline = Pipeline([
    ('regression', forest),
])

# The parameter grid is empty, so the search only evaluates the one
# configuration above. fit() returns the search object itself, so `cv` ends
# up bound to the fitted GridSearchCV.
cv = GridSearchCV(pipeline, {})
cv.fit(X_train, y_train)

# Score the held-out split.
y_pred = cv.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE: {}".format(mse))

MSE: 5.34270836687


In [14]:
# Load the data to score and run it through the fitted search object `cv`.
x_, y_, vocabulary_, vocabulary_inv_ = data_helpers.load_pre_data()
# NOTE(review): this prints the shapes of the training arrays `x` / `y`, not
# of the freshly loaded `x_` / `y_` -- confirm that is what was intended.
# A single pre-formatted argument keeps the line valid on Python 2 and 3.
print("{} {}".format(x.shape, y.shape))

predictions = cv.predict(x_)
# Plain list copy of the predicted values, one entry per input row.
y_quality = list(predictions)

# Write one prediction per line. The `with` block closes the file even if a
# write fails, and text mode ('w') accepts str on Python 2 and Python 3
# (binary mode would reject str on Python 3).
with open('predict.out', 'w') as outfile:
    for value in y_quality:
        outfile.write(str(value) + '\n')

(117811, 125) (117811, 11)
