# LSTM classifier for abusive/sarcastic language

## Import libraries

In [31]:
import pandas as pd
import numpy as np
import math

from pymongo import MongoClient
from Preprocessing import config
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import StratifiedKFold
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical
from keras.wrappers.scikit_learn import KerasClassifier



## MongoDB connection

In [137]:
def connectToMongoDB(collection_name):
    client = MongoClient(config.MONGODB['hostname'], config.MONGODB['port'])
    db = client[config.MONGODB['db']]
    collection = db[config.MONGODB[collection_name]]

## Get datasets

In [138]:
def getDatasetsFromMongoDB(collection_name):
	''' mongodb to pandas dataframe, export to csv and return'''
    connectToMongoDB(collection_name)
	results=collection.find()
	#strip and reshuflle
	df =  pd.DataFrame(list(results))
	df=df[['label','text']]
	df=df.reindex(np.random.permutation(df.index))
    filename = collection_name +'.csv'
	df.to_csv(filename,encoding='utf-8-sig')
	return df

In [4]:
def getDatasetsFromCsv():
	'''import csv, reshuffle and return it'''
	df=pd.read_csv('sarcasm_and_news_dataset.csv')
	df=df.reindex(np.random.permutation(df.index))
	return df[['label','text']]


In [5]:
data=getDatasetsFromCsv()
train,test = train_test_split(data,test_size=0.33,random_state=42)

## Embedding parameters

In [6]:
max_features = 1000
embedding_dim = 128
lstm_out_dim = 196
batch_size = 32
epochs = 10

## Tokenize

In [59]:
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
print(X)
X = pad_sequences(X)
print(X)

[[8, 2, 538, 237, 29, 17, 3454, 3, 2542, 19, 3, 9, 17, 10, 1, 7, 25, 1721, 6], [13, 1460, 1606, 2, 303, 8, 3455, 5, 12, 87, 645, 3, 363, 1, 2, 395, 25, 2835, 2428, 1460, 4], [39, 2429, 1360, 9, 16, 221, 7, 39, 624, 12, 3221, 1, 2, 761, 6], [2193, 2, 248, 79, 1, 194, 21, 662, 1461, 14, 6], [23, 14, 1122, 31, 195, 8, 2430, 9, 3030, 30, 3222, 288, 5, 2, 167, 9, 20, 12, 14, 29, 1122], [13, 616, 9, 2836, 17, 2, 1, 3031, 3031, 64, 147, 80, 7, 249, 32, 687, 4], [1, 8, 2, 264, 1361, 109, 25, 1360, 9, 2302, 45, 455, 241, 162, 63, 55, 23, 241], [14, 85, 1, 2, 30, 14, 714, 19, 1, 1935, 6], [18, 354, 9, 15, 1241, 1936, 20, 67, 2543, 4, 1310, 539, 11], [10, 2303, 1, 16, 2, 688, 3, 270, 1462, 150, 1, 4], [307, 377, 137, 1, 9, 3, 307, 4], [13, 945, 1083, 178, 1, 2194, 2679, 19, 3, 904, 1, 945, 9, 4], [13, 1557, 1, 5, 313, 1, 108, 21, 762, 2, 715, 52, 689, 8, 2, 946, 4], [11, 11, 14, 39, 409, 1362, 281, 30, 2837, 358, 1, 27, 1722, 75, 61, 21, 23, 2105], [8, 177, 271, 7, 3223, 76, 2, 2014, 991, 14, 64,

[[   0    0    0 ...   25 1721    6]
 [   0    0    0 ... 2428 1460    4]
 [   0    0    0 ...    2  761    6]
 ...
 [   0    0    0 ...    5    2    4]
 [   0    0    0 ...   15   73    6]
 [   0    0    0 ...   10    7    6]]


## Get labels, split dataset

In [48]:
Y = pd.get_dummies(data['label']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.33, 
    random_state=42)

validation_size = math.ceil(X_test.shape[0]/2)

X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]

#Get shapes
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)
print(X_validate.shape,Y_validate.shape)

label ['sarcasm' 'fact']
text ['A la  Liga casi lo sancionan por parar el balón con el trasero y por los pases de taquito que incitaron al rival <hashtags>'
 'EP Bin Laden declarará la guerra a Musharraf en un nuevo vídeo: El líder de la red Al Qaeda, Osama bin .. <link>'
 'Le dije "Hola" y se desconectó. Creo que le dio un infarto de la emoción. <hashtags>'
 ...
 'El derrumbe de una fachada deja cuatro heridos en Alcalá de Henares: Un "derrumbe espontáneo" en la facha.. <link>'
 'Ilianise,Julieta y yo tenemos la suerte mas cabrona del mundo. <hashtags>'
 'FC Barcelona tenemos competencia, ya no somos los unicos que compramos arbitros. <hashtags>']
(17592, 38) (17592, 2)
(4332, 38) (4332, 2)
(4333, 38) (4333, 2)


## Build model

In [12]:
model=None
model = Sequential()
model.add(Embedding(max_features, embedding_dim,input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out_dim, dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])

# Train

In [13]:
model.fit(X_train, Y_train, epochs = epochs, batch_size=batch_size, verbose=2)

Epoch 1/10
 - 56s - loss: 0.1207 - acc: 0.9528
Epoch 2/10
 - 55s - loss: 0.0561 - acc: 0.9795
Epoch 3/10
 - 52s - loss: 0.0444 - acc: 0.9833
Epoch 4/10
 - 49s - loss: 0.0363 - acc: 0.9868
Epoch 5/10
 - 52s - loss: 0.0313 - acc: 0.9895
Epoch 6/10
 - 58s - loss: 0.0283 - acc: 0.9907
Epoch 7/10
 - 49s - loss: 0.0243 - acc: 0.9912
Epoch 8/10
 - 48s - loss: 0.0208 - acc: 0.9928
Epoch 9/10
 - 47s - loss: 0.0222 - acc: 0.9921
Epoch 10/10
 - 46s - loss: 0.0186 - acc: 0.9939


<keras.callbacks.History at 0x1ae38e6e6d8>

# Validate

In [14]:
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)

print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

score: 0.07
acc: 0.98


## Predict

In [15]:
positive_count = 0
negative_count = 0
positive_correct = 0
negative_correct = 0

for x in range(len(X_validate)):

    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]

    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            negative_correct += 1
        else:
            positive_correct += 1

    if np.argmax(Y_validate[x]) == 0:
        negative_count += 1
    else:
        positive_count += 1

print("Positive Accuracy", positive_correct/positive_count*100, "%")
print("Negative Accuracy", negative_correct/negative_count*100, "%")
print(positive_correct)
print(positive_count)
print(negative_correct)
print(negative_count)

Positive Accuracy 98.02904564315352 %
Negative Accuracy 97.83783783783784 %
1890
1928
2353
2405


## Train and get results, different embedding parameters

In [1]:
#1000 features
max_features = 1000
embedding_dim = 128
lstm_out_dim = 196
batch_size = 32
epochs = 10

model = Sequential()
model.add(Embedding(max_features, embedding_dim,input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out_dim, dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])

model.fit(X_train, Y_train, epochs = epochs, batch_size=batch_size, verbose=2)

score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)

print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

positive_count = 0
negative_count = 0
positive_correct = 0
negative_correct = 0

for x in range(len(X_validate)):

    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]

    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            negative_correct += 1
        else:
            positive_correct += 1

    if np.argmax(Y_validate[x]) == 0:
        negative_count += 1
    else:
        positive_count += 1

print("Positive Accuracy", positive_correct/positive_count*100, "%")
print("Negative Accuracy", negative_correct/negative_count*100, "%")
print(positive_correct)
print(positive_count)
print(negative_correct)
print(negative_count)

NameError: name 'Sequential' is not defined

In [22]:
#2000 features
max_features = 2000
embedding_dim = 128
lstm_out_dim = 196
batch_size = 32
epochs = 10

model = Sequential()
model.add(Embedding(max_features, embedding_dim,input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out_dim, dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])

model.fit(X_train, Y_train, epochs = epochs, batch_size=batch_size, verbose=2)

score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)

print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

positive_count = 0
negative_count = 0
positive_correct = 0
negative_correct = 0

for x in range(len(X_validate)):

    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]

    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            negative_correct += 1
        else:
            positive_correct += 1

    if np.argmax(Y_validate[x]) == 0:
        negative_count += 1
    else:
        positive_count += 1

print("Positive Accuracy", positive_correct/positive_count*100, "%")
print("Negative Accuracy", negative_correct/negative_count*100, "%")
print(positive_correct)
print(positive_count)
print(negative_correct)
print(negative_count)

Epoch 1/10
 - 78s - loss: 0.1174 - acc: 0.9549
Epoch 2/10
 - 74s - loss: 0.0572 - acc: 0.9796
Epoch 3/10
 - 72s - loss: 0.0440 - acc: 0.9840
Epoch 4/10
 - 72s - loss: 0.0383 - acc: 0.9868
Epoch 5/10
 - 73s - loss: 0.0320 - acc: 0.9889
Epoch 6/10
 - 74s - loss: 0.0295 - acc: 0.9901
Epoch 7/10
 - 66s - loss: 0.0248 - acc: 0.9910
Epoch 8/10
 - 65s - loss: 0.0234 - acc: 0.9917
Epoch 9/10
 - 69s - loss: 0.0184 - acc: 0.9937
Epoch 10/10
 - 69s - loss: 0.0171 - acc: 0.9940
score: 0.08
acc: 0.98
Positive Accuracy 97.82157676348547 %
Negative Accuracy 98.17047817047818 %
1886
1928
2361
2405


In [21]:
#4000 features
max_features = 4000
embedding_dim = 128
lstm_out_dim = 196
batch_size = 32
epochs = 10

model = Sequential()
model.add(Embedding(max_features, embedding_dim,input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out_dim, dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])

model.fit(X_train, Y_train, epochs = epochs, batch_size=batch_size, verbose=2)

score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)

print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

positive_count = 0
negative_count = 0
positive_correct = 0
negative_correct = 0

for x in range(len(X_validate)):

    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]

    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            negative_correct += 1
        else:
            positive_correct += 1

    if np.argmax(Y_validate[x]) == 0:
        negative_count += 1
    else:
        positive_count += 1

print("Positive Accuracy", positive_correct/positive_count*100, "%")
print("Negative Accuracy", negative_correct/negative_count*100, "%")
print(positive_correct)
print(positive_count)
print(negative_correct)
print(negative_count)

Epoch 1/10
 - 79s - loss: 0.1280 - acc: 0.9508
Epoch 2/10
 - 77s - loss: 0.0580 - acc: 0.9781
Epoch 3/10
 - 78s - loss: 0.0440 - acc: 0.9847
Epoch 4/10
 - 75s - loss: 0.0352 - acc: 0.9874
Epoch 5/10
 - 72s - loss: 0.0318 - acc: 0.9884
Epoch 6/10
 - 78s - loss: 0.0290 - acc: 0.9901
Epoch 7/10
 - 69s - loss: 0.0255 - acc: 0.9907
Epoch 8/10
 - 77s - loss: 0.0226 - acc: 0.9910
Epoch 9/10
 - 77s - loss: 0.0206 - acc: 0.9931
Epoch 10/10
 - 77s - loss: 0.0174 - acc: 0.9940
score: 0.08
acc: 0.98
Positive Accuracy 98.39211618257261 %
Negative Accuracy 97.92099792099792 %
1897
1928
2355
2405


In [16]:
#8000 features
max_features = 8000
embedding_dim = 128
lstm_out_dim = 196
batch_size = 32
epochs = 10

model = Sequential()
model.add(Embedding(max_features, embedding_dim,input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out_dim, dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])

model.fit(X_train, Y_train, epochs = epochs, batch_size=batch_size, verbose=2)

score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)

print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

positive_count = 0
negative_count = 0
positive_correct = 0
negative_correct = 0

for x in range(len(X_validate)):

    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]

    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            negative_correct += 1
        else:
            positive_correct += 1

    if np.argmax(Y_validate[x]) == 0:
        negative_count += 1
    else:
        positive_count += 1

print("Positive Accuracy", positive_correct/positive_count*100, "%")
print("Negative Accuracy", negative_correct/negative_count*100, "%")
print(positive_correct)
print(positive_count)
print(negative_correct)
print(negative_count)

Epoch 1/10
 - 62s - loss: 0.1251 - acc: 0.9512
Epoch 2/10
 - 61s - loss: 0.0572 - acc: 0.9791
Epoch 3/10
 - 62s - loss: 0.0438 - acc: 0.9849
Epoch 4/10
 - 60s - loss: 0.0372 - acc: 0.9862
Epoch 5/10
 - 61s - loss: 0.0307 - acc: 0.9893
Epoch 6/10
 - 61s - loss: 0.0289 - acc: 0.9897
Epoch 7/10
 - 63s - loss: 0.0247 - acc: 0.9908
Epoch 8/10
 - 63s - loss: 0.0235 - acc: 0.9917
Epoch 9/10
 - 63s - loss: 0.0195 - acc: 0.9928
Epoch 10/10
 - 62s - loss: 0.0184 - acc: 0.9931
score: 0.08
acc: 0.98
Positive Accuracy 97.82157676348547 %
Negative Accuracy 97.83783783783784 %
1886
1928
2353
2405


In [None]:
#16000 features
max_features = 16000
embedding_dim = 128
lstm_out_dim = 196
batch_size = 32
epochs = 10

model = Sequential()
model.add(Embedding(max_features, embedding_dim,input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out_dim, dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])

model.fit(X_train, Y_train, epochs = epochs, batch_size=batch_size, verbose=2)

score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)

print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

positive_count = 0
negative_count = 0
positive_correct = 0
negative_correct = 0

for x in range(len(X_validate)):

    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]

    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            negative_correct += 1
        else:
            positive_correct += 1

    if np.argmax(Y_validate[x]) == 0:
        negative_count += 1
    else:
        positive_count += 1

print("Positive Accuracy", positive_correct/positive_count*100, "%")
print("Negative Accuracy", negative_correct/negative_count*100, "%")
print(positive_correct)
print(positive_count)
print(negative_correct)
print(negative_count)

In [18]:
#2000 features, embedding dim 256
max_features = 2000
embedding_dim = 256
lstm_out_dim = 196
batch_size = 32
epochs = 15

model = Sequential()
model.add(Embedding(max_features, embedding_dim,input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out_dim, dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])

model.fit(X_train, Y_train, epochs = epochs, batch_size=batch_size, verbose=2)

score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)

print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

positive_count = 0
negative_count = 0
positive_correct = 0
negative_correct = 0

for x in range(len(X_validate)):

    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]

    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            negative_correct += 1
        else:
            positive_correct += 1

    if np.argmax(Y_validate[x]) == 0:
        negative_count += 1
    else:
        positive_count += 1

print("Positive Accuracy", positive_correct/positive_count*100, "%")
print("Negative Accuracy", negative_correct/negative_count*100, "%")
print(positive_correct)
print(positive_count)
print(negative_correct)
print(negative_count)

Epoch 1/15
 - 88s - loss: 0.1100 - acc: 0.9608
Epoch 2/15
 - 85s - loss: 0.0511 - acc: 0.9809
Epoch 3/15
 - 84s - loss: 0.0421 - acc: 0.9857
Epoch 4/15
 - 84s - loss: 0.0337 - acc: 0.9883
Epoch 5/15
 - 81s - loss: 0.0294 - acc: 0.9898
Epoch 6/15
 - 86s - loss: 0.0252 - acc: 0.9908
Epoch 7/15
 - 101s - loss: 0.0218 - acc: 0.9932
Epoch 8/15
 - 97s - loss: 0.0188 - acc: 0.9930
Epoch 9/15
 - 103s - loss: 0.0168 - acc: 0.9944
Epoch 10/15
 - 103s - loss: 0.0137 - acc: 0.9955
Epoch 11/15


KeyboardInterrupt: 

In [19]:
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)

print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

positive_count = 0
negative_count = 0
positive_correct = 0
negative_correct = 0

for x in range(len(X_validate)):

    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]

    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            negative_correct += 1
        else:
            positive_correct += 1

    if np.argmax(Y_validate[x]) == 0:
        negative_count += 1
    else:
        positive_count += 1

print("Positive Accuracy", positive_correct/positive_count*100, "%")
print("Negative Accuracy", negative_correct/negative_count*100, "%")
print(positive_correct)
print(positive_count)
print(negative_correct)
print(negative_count)

score: 0.09
acc: 0.98
Positive Accuracy 96.99170124481327 %
Negative Accuracy 98.46153846153847 %
1870
1928
2368
2405


In [20]:
#4000 features, embedding dim 64
max_features = 4000
embedding_dim = 64
lstm_out_dim = 196
batch_size = 32
epochs = 10

model = Sequential()
model.add(Embedding(max_features, embedding_dim,input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out_dim, dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])

model.fit(X_train, Y_train, epochs = epochs, batch_size=batch_size, verbose=2)

score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)

print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

positive_count = 0
negative_count = 0
positive_correct = 0
negative_correct = 0

for x in range(len(X_validate)):

    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]

    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            negative_correct += 1
        else:
            positive_correct += 1

    if np.argmax(Y_validate[x]) == 0:
        negative_count += 1
    else:
        positive_count += 1

print("Positive Accuracy", positive_correct/positive_count*100, "%")
print("Negative Accuracy", negative_correct/negative_count*100, "%")
print(positive_correct)
print(positive_count)
print(negative_correct)
print(negative_count)

Epoch 1/10
 - 59s - loss: 0.1355 - acc: 0.9440
Epoch 2/10
 - 61s - loss: 0.0646 - acc: 0.9760
Epoch 3/10
 - 60s - loss: 0.0489 - acc: 0.9827
Epoch 4/10
 - 59s - loss: 0.0405 - acc: 0.9852
Epoch 5/10
 - 63s - loss: 0.0375 - acc: 0.9867
Epoch 6/10
 - 60s - loss: 0.0353 - acc: 0.9875
Epoch 7/10
 - 62s - loss: 0.0293 - acc: 0.9897
Epoch 8/10
 - 56s - loss: 0.0278 - acc: 0.9906
Epoch 9/10
 - 60s - loss: 0.0250 - acc: 0.9915
Epoch 10/10
 - 57s - loss: 0.0227 - acc: 0.9922
score: 0.09
acc: 0.97
Positive Accuracy 95.33195020746888 %
Negative Accuracy 98.54469854469855 %
1838
1928
2370
2405


## 10 fold Cross-validation

In [58]:
max_features = 4000
embedding_dim = 64
lstm_out_dim = 196
batch_size = 32
epochs = 10

model = Sequential()
model.add(Embedding(max_features, embedding_dim,input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out_dim, dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(2,activa tion='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])
n_folds = 10
#print(Y.shape)
#print(Y)
Y_reshaped=[]
for item in Y:
    Y_reshaped.append(item[0])
    #print(item[0])
skf = StratifiedKFold(Y_reshaped, n_folds=n_folds, shuffle=True)


for i, (train, test) in enumerate(skf):
        print ("Running Fold", i+1, "/10",)
        model=None
        model = Sequential()
        model.add(Embedding(max_features, embedding_dim,input_length=X.shape[1]))
        model.add(SpatialDropout1D(0.4))
        model.add(LSTM(lstm_out_dim, dropout=0.2,recurrent_dropout=0.2))
        model.add(Dense(2,activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='adam',
                      metrics=['accuracy'])
        print(model)
        model.fit(X_train, Y_train, epochs = epochs, batch_size=batch_size, verbose=2)
        score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
        print("score: %.2f" % (score))
        print("acc: %.2f" % (acc))
print(skf)

Running Fold 1 /10
Epoch 1/10
 - 103s - loss: 0.1408 - acc: 0.9432
Epoch 2/10
 - 97s - loss: 0.0616 - acc: 0.9773
Epoch 3/10
 - 97s - loss: 0.0478 - acc: 0.9835
Epoch 4/10
 - 97s - loss: 0.0419 - acc: 0.9847
Epoch 5/10
 - 97s - loss: 0.0366 - acc: 0.9869
Epoch 6/10
 - 87s - loss: 0.0318 - acc: 0.9891
Epoch 7/10
 - 86s - loss: 0.0314 - acc: 0.9893
Epoch 8/10
 - 89s - loss: 0.0272 - acc: 0.9907
Epoch 9/10
 - 94s - loss: 0.0249 - acc: 0.9914
Epoch 10/10
 - 99s - loss: 0.0232 - acc: 0.9919
score: 0.08
acc: 0.98
Running Fold 2 /10
Epoch 1/10
 - 111s - loss: 0.1367 - acc: 0.9449
Epoch 2/10
 - 105s - loss: 0.0601 - acc: 0.9780
Epoch 3/10
 - 105s - loss: 0.0487 - acc: 0.9822
Epoch 4/10
 - 104s - loss: 0.0398 - acc: 0.9868
Epoch 5/10
 - 104s - loss: 0.0361 - acc: 0.9875
Epoch 6/10
 - 105s - loss: 0.0326 - acc: 0.9887
Epoch 7/10
 - 105s - loss: 0.0290 - acc: 0.9889
Epoch 8/10
 - 106s - loss: 0.0281 - acc: 0.9891
Epoch 9/10
 - 107s - loss: 0.0247 - acc: 0.9914
Epoch 10/10
 - 111s - loss: 0.0225 -

In [None]:
Running Fold 1 /10
Epoch 1/10
 - 103s - loss: 0.1408 - acc: 0.9432
Epoch 2/10
 - 97s - loss: 0.0616 - acc: 0.9773
Epoch 3/10
 - 97s - loss: 0.0478 - acc: 0.9835
Epoch 4/10
 - 97s - loss: 0.0419 - acc: 0.9847
Epoch 5/10
 - 97s - loss: 0.0366 - acc: 0.9869
Epoch 6/10
 - 87s - loss: 0.0318 - acc: 0.9891
Epoch 7/10
 - 86s - loss: 0.0314 - acc: 0.9893
Epoch 8/10
 - 89s - loss: 0.0272 - acc: 0.9907
Epoch 9/10
 - 94s - loss: 0.0249 - acc: 0.9914
Epoch 10/10
 - 99s - loss: 0.0232 - acc: 0.9919
score: 0.08
acc: 0.98
Running Fold 2 /10
Epoch 1/10
 - 111s - loss: 0.1367 - acc: 0.9449
Epoch 2/10
 - 105s - loss: 0.0601 - acc: 0.9780
Epoch 3/10
 - 105s - loss: 0.0487 - acc: 0.9822
Epoch 4/10
 - 104s - loss: 0.0398 - acc: 0.9868
Epoch 5/10
 - 104s - loss: 0.0361 - acc: 0.9875
Epoch 6/10
 - 105s - loss: 0.0326 - acc: 0.9887
Epoch 7/10
 - 105s - loss: 0.0290 - acc: 0.9889
Epoch 8/10
 - 106s - loss: 0.0281 - acc: 0.9891
Epoch 9/10
 - 107s - loss: 0.0247 - acc: 0.9914
Epoch 10/10
 - 111s - loss: 0.0225 - acc: 0.9924
score: 0.07
acc: 0.98
Running Fold 3 /10
Epoch 1/10
 - 133s - loss: 0.1343 - acc: 0.9467
Epoch 2/10
 - 123s - loss: 0.0616 - acc: 0.9781
Epoch 3/10
 - 113s - loss: 0.0475 - acc: 0.9829
Epoch 4/10
 - 115s - loss: 0.0387 - acc: 0.9861
Epoch 5/10
 - 115s - loss: 0.0348 - acc: 0.9876
Epoch 6/10
 - 109s - loss: 0.0318 - acc: 0.9893
Epoch 7/10
 - 113s - loss: 0.0299 - acc: 0.9887
Epoch 8/10
 - 100s - loss: 0.0255 - acc: 0.9915
Epoch 9/10
 - 114s - loss: 0.0246 - acc: 0.9910
Epoch 10/10
 - 109s - loss: 0.0225 - acc: 0.9922
score: 0.07
acc: 0.98
Running Fold 4 /10
Epoch 1/10
 - 130s - loss: 0.1349 - acc: 0.9447
Epoch 2/10
 - 113s - loss: 0.0608 - acc: 0.9785
Epoch 3/10
 - 117s - loss: 0.0493 - acc: 0.9819
Epoch 4/10
 - 114s - loss: 0.0414 - acc: 0.9860
Epoch 5/10
 - 123s - loss: 0.0383 - acc: 0.9876
Epoch 6/10
 - 116s - loss: 0.0334 - acc: 0.9881
Epoch 7/10
 - 118s - loss: 0.0317 - acc: 0.9882
Epoch 8/10
 - 117s - loss: 0.0278 - acc: 0.9905
Epoch 9/10
 - 117s - loss: 0.0258 - acc: 0.9910
Epoch 10/10
 - 128s - loss: 0.0229 - acc: 0.9925
score: 0.08
acc: 0.98
Running Fold 5 /10
Epoch 1/10
 - 132s - loss: 0.1385 - acc: 0.9422
Epoch 2/10
 - 124s - loss: 0.0603 - acc: 0.9785
Epoch 3/10
 - 132s - loss: 0.0471 - acc: 0.9824
Epoch 4/10
 - 135s - loss: 0.0420 - acc: 0.9843
Epoch 5/10
 - 134s - loss: 0.0383 - acc: 0.9867
Epoch 6/10
 - 137s - loss: 0.0315 - acc: 0.9895
Epoch 7/10
 - 122s - loss: 0.0298 - acc: 0.9891
Epoch 8/10
 - 118s - loss: 0.0261 - acc: 0.9915
Epoch 9/10
 - 118s - loss: 0.0241 - acc: 0.9914
Epoch 10/10
 - 122s - loss: 0.0238 - acc: 0.9913
score: 0.07
acc: 0.98
Running Fold 6 /10
Epoch 1/10
 - 143s - loss: 0.1376 - acc: 0.9515
Epoch 2/10
 - 140s - loss: 0.0627 - acc: 0.9775
Epoch 3/10
 - 130s - loss: 0.0496 - acc: 0.9815
Epoch 4/10
 - 138s - loss: 0.0406 - acc: 0.9854
Epoch 5/10
 - 136s - loss: 0.0373 - acc: 0.9869
Epoch 6/10
 - 131s - loss: 0.0344 - acc: 0.9877
Epoch 7/10
 - 130s - loss: 0.0293 - acc: 0.9898
Epoch 8/10
 - 137s - loss: 0.0287 - acc: 0.9891
Epoch 9/10
 - 126s - loss: 0.0257 - acc: 0.9906
Epoch 10/10
 - 131s - loss: 0.0227 - acc: 0.9919
score: 0.07
acc: 0.98
Running Fold 7 /10
Epoch 1/10
 - 165s - loss: 0.1376 - acc: 0.9443
Epoch 2/10
 - 148s - loss: 0.0633 - acc: 0.9775
Epoch 3/10
 - 155s - loss: 0.0488 - acc: 0.9820
Epoch 4/10
 - 151s - loss: 0.0410 - acc: 0.9847
Epoch 5/10
 - 166s - loss: 0.0349 - acc: 0.9872
Epoch 6/10
 - 154s - loss: 0.0321 - acc: 0.9887
Epoch 7/10
 - 151s - loss: 0.0289 - acc: 0.9898
Epoch 8/10
 - 147s - loss: 0.0267 - acc: 0.9910
Epoch 9/10
 - 149s - loss: 0.0265 - acc: 0.9914
Epoch 10/10
 - 154s - loss: 0.0235 - acc: 0.9916
score: 0.06
acc: 0.98
Running Fold 8 /10
Epoch 1/10
 - 154s - loss: 0.1368 - acc: 0.9451
Epoch 2/10
 - 153s - loss: 0.0596 - acc: 0.9787
Epoch 3/10
 - 146s - loss: 0.0520 - acc: 0.9812
Epoch 4/10
 - 145s - loss: 0.0443 - acc: 0.9841
Epoch 5/10
 - 151s - loss: 0.0362 - acc: 0.9869
Epoch 6/10
 - 166s - loss: 0.0335 - acc: 0.9882
Epoch 7/10
 - 158s - loss: 0.0298 - acc: 0.9890
Epoch 8/10
 - 168s - loss: 0.0282 - acc: 0.9897
Epoch 9/10
 - 165s - loss: 0.0268 - acc: 0.9906
Epoch 10/10
 - 163s - loss: 0.0234 - acc: 0.9918
score: 0.08
acc: 0.98
Running Fold 9 /10
Epoch 1/10
 - 167s - loss: 0.1353 - acc: 0.9442
Epoch 2/10
 - 167s - loss: 0.0628 - acc: 0.9778
Epoch 3/10
 - 169s - loss: 0.0483 - acc: 0.9822
Epoch 4/10
 - 195s - loss: 0.0416 - acc: 0.9853
Epoch 5/10
 - 183s - loss: 0.0391 - acc: 0.9870
Epoch 6/10
 - 163s - loss: 0.0323 - acc: 0.9885
Epoch 7/10
 - 160s - loss: 0.0321 - acc: 0.9892
Epoch 8/10
 - 161s - loss: 0.0291 - acc: 0.9892
Epoch 9/10
 - 162s - loss: 0.0247 - acc: 0.9911
Epoch 10/10
 - 164s - loss: 0.0241 - acc: 0.9921
score: 0.07
acc: 0.98
Running Fold 10 /10
Epoch 1/10
 - 177s - loss: 0.1364 - acc: 0.9459
Epoch 2/10
 - 172s - loss: 0.0611 - acc: 0.9790
Epoch 3/10
 - 192s - loss: 0.0491 - acc: 0.9831
Epoch 4/10
 - 176s - loss: 0.0415 - acc: 0.9861
Epoch 5/10
 - 169s - loss: 0.0369 - acc: 0.9864
Epoch 6/10
 - 174s - loss: 0.0328 - acc: 0.9890
Epoch 7/10
 - 172s - loss: 0.0291 - acc: 0.9904
Epoch 8/10
 - 170s - loss: 0.0295 - acc: 0.9892
Epoch 9/10
 - 168s - loss: 0.0254 - acc: 0.9904
Epoch 10/10
 - 172s - loss: 0.0239 - acc: 0.9919
score: 0.07
acc: 0.98
sklearn.cross_validation.StratifiedKFold(labels=[0 1 0 ... 1 0 0], n_folds=10, shuffle=True, random_state=None)