In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['movie-review-sentiment-analysis-kernels-only', 'glove-global-vectors-for-word-representation', 'fasttext-crawl-300d-2m']


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from nltk.tokenize import TweetTokenizer
import datetime
import lightgbm as lgb
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
# Widen the column display so long phrases are not truncated when frames are shown.
# The fully-qualified option key is used: abbreviated keys rely on pandas'
# pattern matching and can become ambiguous if similarly named options are added.
pd.set_option('display.max_colwidth', 400)

In [3]:
# Competition files: train/test are tab-separated, the sample submission is comma-separated.
train = pd.read_csv(
    '../input/movie-review-sentiment-analysis-kernels-only/train.tsv',
    sep='\t',
    encoding='utf-8',
)
test = pd.read_csv(
    '../input/movie-review-sentiment-analysis-kernels-only/test.tsv',
    sep='\t',
    encoding='utf-8',
)
sub = pd.read_csv(
    '../input/movie-review-sentiment-analysis-kernels-only/sampleSubmission.csv',
    sep=",",
)

In [4]:
# NLTK tweet tokenizer; its .tokenize method is passed to TfidfVectorizer below.
tokenizer = TweetTokenizer()

In [5]:
# Word uni-gram and bi-gram TF-IDF features. The vocabulary and IDF weights are
# fitted on train + test phrases together; `full_text` is reused later when
# fitting the Keras tokenizer.
full_text = train['Phrase'].tolist() + test['Phrase'].tolist()
vectorizer = TfidfVectorizer(tokenizer=tokenizer.tokenize, ngram_range=(1, 2))
vectorizer.fit(full_text)
train_vectorized, test_vectorized = (
    vectorizer.transform(frame['Phrase']) for frame in (train, test)
)

In [6]:
# Target labels: the Sentiment column of the training set.
y = train['Sentiment']

In [7]:
# Baseline: logistic regression with default hyper-parameters, wrapped so that
# one binary classifier is trained per sentiment class.
logreg = LogisticRegression()
ovr = OneVsRestClassifier(logreg)

In [8]:
%%time
# Time a single fit of the one-vs-rest baseline on the full TF-IDF training matrix.
ovr.fit(train_vectorized, y)



CPU times: user 6.88 s, sys: 8 ms, total: 6.88 s
Wall time: 6.9 s


OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None)

#### n_jobs : int or None, optional (default=None)
The number of CPUs to use to do the computation. None means 1 unless in a joblib.parallel_backend context. -1 means using all processors. See Glossary for more details.
#### cv : int, cross-validation generator or an iterable, optional
Determines the cross-validation splitting strategy. Possible inputs for cv are:

None, to use the default 3-fold cross validation,
integer, to specify the number of folds in a (Stratified)KFold,
CV splitter,
An iterable yielding (train, test) splits as arrays of indices.
For integer/None inputs, if the estimator is a classifier and y is either binary or multiclass, StratifiedKFold is used. In all other cases, KFold is used.

Refer to the User Guide for the various cross-validation strategies that can be used here.

In [9]:
# 3-fold cross-validated accuracy of the one-vs-rest logistic regression,
# using all available cores.
scores = cross_val_score(ovr, train_vectorized, y, cv=3, scoring='accuracy', n_jobs=-1)
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(
    100 * scores.mean(),
    100 * scores.std(),
))

Cross-validation mean accuracy 56.55%, std 0.07.


In [10]:
%%time
svc = LinearSVC(dual=False)
scores = cross_val_score(svc, train_vectorized, y, scoring='accuracy', n_jobs=-1, cv=3)
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))

Cross-validation mean accuracy 56.51%, std 0.68.
CPU times: user 60 ms, sys: 28 ms, total: 88 ms
Wall time: 20.1 s


In [11]:
# Fit both baselines on the full training set. cross_val_score only fits clones
# of the estimator it is given, so `svc` itself has not been fitted before this
# point. The trailing `;` suppresses the estimator repr in the cell output.
ovr.fit(train_vectorized, y);
svc.fit(train_vectorized, y);



## Deep learning
And now let's try deep learning (DL). With several stacked layers, a DL model should work better for text classification than the linear baselines above. I use an architecture similar to those used in the toxic comment competition.

In [12]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D, GRU, CuDNNGRU, CuDNNLSTM, BatchNormalization
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Add, Flatten
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras.models import Model, load_model
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras import backend as K
from keras.engine import InputSpec, Layer
from keras.optimizers import Adam

from keras.callbacks import ModelCheckpoint, TensorBoard, Callback, EarlyStopping

Using TensorFlow backend.


#### tk.fit_on_texts()
After fitting, the tokenizer provides:
- word counts
- word documents
- word index
- document count

#### lower: boolean. Whether to convert the texts to lowercase.
#### filters: a string where each element is a character that will be filtered from the texts. The default is all punctuation, plus tabs and line breaks, minus the ' character.

In [13]:
# lower=True lowercases the text; filters='' overrides the default filter set
# (punctuation, tabs, line breaks), so no characters are stripped.
tk = Tokenizer(lower = True, filters='')
# The vocabulary is built from train + test phrases (full_text).
tk.fit_on_texts(full_text)

#### texts_to_sequences()
Only the top `num_words` most frequent words are taken into account, and only words known to the tokenizer are converted.

In [14]:
# Convert every phrase into a list of integer word indices from tk's vocabulary.
train_tokenized = tk.texts_to_sequences(train['Phrase'])
test_tokenized = tk.texts_to_sequences(test['Phrase'])

In [15]:
# Fixed sequence length fed to the network; also used as the model's Input shape.
max_len = 50
# Bring every index sequence to exactly max_len entries (shorter ones are
# zero-padded, longer ones are truncated).
X_train = pad_sequences(train_tokenized, maxlen = max_len)
X_test = pad_sequences(test_tokenized, maxlen = max_len)

In [16]:
# Pre-trained GloVe vectors (6B, 100-dimensional) in plain-text format.
embedding_path = "../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt"

In [17]:
# Width of each embedding vector; must match the dimensionality of the file at embedding_path.
embed_size = 100
# Upper bound on the number of vocabulary rows kept in the embedding matrix.
max_features = 30000

In [18]:
def get_coefs(word, *arr):
    """Pair an embedding token with its coefficients as a float32 vector.

    Parameters
    ----------
    word : str
        The token (first field of an embedding-file line).
    *arr
        The remaining fields of the line; each must be convertible to float.

    Returns
    -------
    tuple
        ``(word, vector)`` where ``vector`` is a 1-D float32 NumPy array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Parse the GloVe text file into {token: float32 vector}. The context manager
# closes the file handle as soon as parsing finishes (it was previously left
# open), and the explicit encoding makes the parse independent of the platform
# default.
with open(embedding_path, encoding='utf-8') as embedding_file:
    embedding_index = dict(get_coefs(*line.strip().split(" ")) for line in embedding_file)

word_index = tk.word_index
nb_words = min(max_features, len(word_index))
# One extra row because Keras Tokenizer indices start at 1; row 0 (the padding
# index) stays all-zero.
embedding_matrix = np.zeros((nb_words + 1, embed_size))

for word, i in word_index.items():
    if i >= max_features:
        continue
    embedding_vector = embedding_index.get(word)
    # Words without a pre-trained vector keep their all-zero row.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print(embedding_matrix[1])

[-0.038194   -0.24487001  0.72812003 -0.39961001  0.083172    0.043953
 -0.39140999  0.3344     -0.57545     0.087459    0.28786999 -0.06731
  0.30906001 -0.26383999 -0.13231    -0.20757     0.33395001 -0.33848
 -0.31742999 -0.48335999  0.1464     -0.37303999  0.34577     0.052041
  0.44946    -0.46970999  0.02628    -0.54154998 -0.15518001 -0.14106999
 -0.039722    0.28277001  0.14393     0.23464    -0.31020999  0.086173
  0.20397     0.52623999  0.17163999 -0.082378   -0.71787    -0.41531
  0.20334999 -0.12763     0.41367     0.55186999  0.57907999 -0.33476999
 -0.36559001 -0.54856998 -0.062892    0.26583999  0.30204999  0.99774998
 -0.80480999 -3.0243001   0.01254    -0.36941999  2.21670008  0.72201002
 -0.24978     0.92136002  0.034514    0.46744999  1.10790002 -0.19358
 -0.074575    0.23353    -0.052062   -0.22044     0.057162   -0.15806
 -0.30798    -0.41624999  0.37972     0.15006    -0.53211999 -0.20550001
 -1.25259995  0.071624    0.70564997  0.49744001 -0.42063001  0.26148
 -

In [19]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the sentiment labels; sparse=False makes fit_transform return
# a dense array, which is what the Keras model is trained on.
ohe = OneHotEncoder(sparse=False)
# reshape(-1, 1): the encoder expects a 2-D (n_samples, 1) column.
y_ohe = ohe.fit_transform(y.values.reshape(-1, 1))

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


#### ModelCheckpoint:
Save the model after every epoch.
#### lr: float >= 0. Learning rate.
#### decay: float >= 0. Learning rate decay over each update.
#### units: Positive integer, dimensionality of the output space.
#### filters/conv_size: Integer, the dimensionality of the output space (i.e. the number of output filters in the convolution).

In [20]:
def build_model3(lr=0.0, lr_d=0.0, units=0, spatial_dr=0.0, kernel_size1=3, kernel_size2=2, dense_units=128, dr=0.1, conv_size=32):
    """Build, train and return the recurrent/convolutional sentiment classifier.

    Architecture: frozen pre-trained embedding -> spatial dropout ->
    bidirectional LSTM -> two Conv1D branches (kernel_size1, kernel_size2),
    then a bidirectional GRU on the kernel_size1 convolution output followed
    by two more Conv1D branches. Global average and max pooling of all four
    convolution outputs are concatenated and passed through two dense layers
    to a 5-unit output.

    Relies on the notebook globals ``max_len``, ``embed_size``,
    ``embedding_matrix``, ``X_train`` and ``y_ohe``.

    Side effect: the best checkpoint (lowest validation loss) is written to
    ``best_model.hdf5`` in the working directory, overwriting any previous
    file of that name.

    Parameters
    ----------
    lr : float
        Adam learning rate.
    lr_d : float
        Adam learning-rate decay.
    units : int
        Units per direction in each recurrent layer.
    spatial_dr : float
        SpatialDropout1D rate applied to the embedded sequence.
    kernel_size1, kernel_size2 : int
        Kernel widths of the two Conv1D branches after each recurrent layer.
    dense_units : int
        Width of the first dense layer; the second uses ``int(dense_units / 2)``.
    dr : float
        Dropout rate applied after each dense layer.
    conv_size : int
        Number of filters in every Conv1D layer.

    Returns
    -------
    keras.models.Model
        The model reloaded from the checkpoint with the lowest validation loss.
    """
    file_path = "best_model.hdf5"
    check_point = ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1,
                                  save_best_only = True, mode = "min")
    early_stop = EarlyStopping(monitor = "val_loss", mode = "min", patience = 3)

    inp = Input(shape = (max_len,))
    # Take the vocabulary size from the embedding matrix rather than a
    # hard-coded row count, so the layer always matches the supplied weights.
    vocab_size = embedding_matrix.shape[0]
    embedded = Embedding(vocab_size, embed_size, weights = [embedding_matrix], trainable = False)(inp)
    embedded = SpatialDropout1D(spatial_dr)(embedded)

    # --- First recurrent stage: bidirectional LSTM over the embeddings. ---
    rnn_first = Bidirectional(CuDNNLSTM(units, return_sequences = True))(embedded)
    conv_first_k1 = Conv1D(conv_size, kernel_size=kernel_size1, padding='valid', kernel_initializer='he_uniform')(rnn_first)
    avg_first_k1 = GlobalAveragePooling1D()(conv_first_k1)
    max_first_k1 = GlobalMaxPooling1D()(conv_first_k1)

    conv_first_k2 = Conv1D(conv_size, kernel_size=kernel_size2, padding='valid', kernel_initializer='he_uniform')(rnn_first)
    avg_first_k2 = GlobalAveragePooling1D()(conv_first_k2)
    max_first_k2 = GlobalMaxPooling1D()(conv_first_k2)

    # --- Second recurrent stage: bidirectional GRU. ---
    # NOTE(review): this layer consumes the kernel_size1 convolution output of
    # the first stage, not the embedded sequence (the input name was reused in
    # the earlier version of this code). Behaviour is kept as-is; confirm the
    # stacking is intended.
    rnn_second = Bidirectional(CuDNNGRU(units, return_sequences = True))(conv_first_k1)
    conv_second_k1 = Conv1D(conv_size, kernel_size=kernel_size1, padding='valid', kernel_initializer='he_uniform')(rnn_second)
    avg_second_k1 = GlobalAveragePooling1D()(conv_second_k1)
    max_second_k1 = GlobalMaxPooling1D()(conv_second_k1)

    conv_second_k2 = Conv1D(conv_size, kernel_size=kernel_size2, padding='valid', kernel_initializer='he_uniform')(rnn_second)
    avg_second_k2 = GlobalAveragePooling1D()(conv_second_k2)
    max_second_k2 = GlobalMaxPooling1D()(conv_second_k2)

    # --- Classification head. ---
    x = concatenate([avg_first_k1, max_first_k1, avg_first_k2, max_first_k2,
                     avg_second_k1, max_second_k1, avg_second_k2, max_second_k2])
    x = BatchNormalization()(x)
    x = Dropout(dr)(Dense(dense_units, activation='relu') (x))
    x = BatchNormalization()(x)
    x = Dropout(dr)(Dense(int(dense_units / 2), activation='relu') (x))
    # NOTE(review): sigmoid outputs with binary cross-entropy treat the five
    # classes as independent labels, although each phrase has exactly one
    # sentiment; softmax + categorical cross-entropy would be the conventional
    # choice. Left unchanged so results stay comparable -- confirm intent.
    x = Dense(5, activation = "sigmoid")(x)
    model = Model(inputs = inp, outputs = x)
    model.compile(loss = "binary_crossentropy", optimizer = Adam(lr = lr, decay = lr_d), metrics = ["accuracy"])
    # The last 10% of the training data is held out for validation; early
    # stopping and checkpointing both monitor val_loss.
    model.fit(X_train, y_ohe, batch_size = 128, epochs = 20, validation_split=0.1,
              verbose = 1, callbacks = [check_point, early_stop])
    # Return the best checkpoint rather than the weights of the final epoch.
    model = load_model(file_path)
    return model

In [21]:
# First training run of the network; it is later blended with model9.
model8 = build_model3(lr = 1e-3, lr_d = 1e-10, units = 64, spatial_dr = 0.3, kernel_size1=3, kernel_size2=2, dense_units=32, dr=0.1, conv_size=32)

Train on 140454 samples, validate on 15606 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.34793, saving model to best_model.hdf5
Epoch 2/20

Epoch 00002: val_loss improved from 0.34793 to 0.33104, saving model to best_model.hdf5
Epoch 3/20

Epoch 00003: val_loss improved from 0.33104 to 0.32683, saving model to best_model.hdf5
Epoch 4/20

Epoch 00004: val_loss did not improve from 0.32683
Epoch 5/20

Epoch 00005: val_loss improved from 0.32683 to 0.32286, saving model to best_model.hdf5
Epoch 6/20

Epoch 00006: val_loss did not improve from 0.32286
Epoch 7/20

Epoch 00007: val_loss improved from 0.32286 to 0.31814, saving model to best_model.hdf5
Epoch 8/20

Epoch 00008: val_loss did not improve from 0.31814
Epoch 9/20

Epoch 00009: val_loss did not improve from 0.31814
Epoch 10/20

Epoch 00010: val_loss improved from 0.31814 to 0.31683, saving model to best_model.hdf5
Epoch 11/20

Epoch 00011: val_loss improved from 0.31683 to 0.31598, saving model to best_model.hdf5

In [22]:
# Second training run with the same hyper-parameters as model8. No random seed
# is set, so the two runs differ. build_model3 reloads its best checkpoint
# before returning, so reusing best_model.hdf5 does not affect model8.
model9 = build_model3(lr = 1e-3, lr_d = 1e-10, units = 64, spatial_dr = 0.3, kernel_size1=3, kernel_size2=2, dense_units=32, dr=0.1, conv_size=32)

Train on 140454 samples, validate on 15606 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.34950, saving model to best_model.hdf5
Epoch 2/20

Epoch 00002: val_loss improved from 0.34950 to 0.33244, saving model to best_model.hdf5
Epoch 3/20

Epoch 00003: val_loss improved from 0.33244 to 0.32600, saving model to best_model.hdf5
Epoch 4/20

Epoch 00004: val_loss improved from 0.32600 to 0.31957, saving model to best_model.hdf5
Epoch 5/20

Epoch 00005: val_loss improved from 0.31957 to 0.31928, saving model to best_model.hdf5
Epoch 6/20

Epoch 00006: val_loss did not improve from 0.31928
Epoch 7/20

Epoch 00007: val_loss did not improve from 0.31928
Epoch 8/20

Epoch 00008: val_loss improved from 0.31928 to 0.31579, saving model to best_model.hdf5
Epoch 9/20

Epoch 00009: val_loss did not improve from 0.31579
Epoch 10/20

Epoch 00010: val_loss did not improve from 0.31579
Epoch 11/20

Epoch 00011: val_loss did not improve from 0.31579


In [23]:
# Per-class scores from each trained model on the padded test sequences.
pred8 = model8.predict(X_test, batch_size = 1024, verbose = 1)
pred9 = model9.predict(X_test, batch_size = 1024, verbose = 1)
# Blend by summing into a NEW array. Binding `pred` to `pred8` and then adding
# in place would silently overwrite pred8 with the blended scores.
pred = pred8 + pred9



In [24]:
# The predicted sentiment is the class index with the highest blended score.
predictions = pred.argmax(axis=1).astype(int)
sub['Sentiment'] = predictions
# Write the submission file without the DataFrame index column.
sub.to_csv("blend.csv", index=False)