# LSTM classifier for abusive/sarcastic language

## Import libraries

In [66]:
import pandas as pd
import numpy as np
import math
import keras_metrics

from pymongo import MongoClient
from Preprocessing import config
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import StratifiedKFold
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical
from keras.wrappers.scikit_learn import KerasClassifier

## MongoDB connection

In [13]:
collection_name="news_sarcasm_abusive_normal"
#collection_name="sarcasm_abusive_normal"
#collection_name="abusive_normal"
#collection_name="news_abusive"
#collection_name="sarcasm_normal"

In [19]:
def connectToMongoDB(collection_name):
    client = MongoClient(config.MONGODB['hostname'], config.MONGODB['port'])
    db = client[config.MONGODB['db']]
    collection = db[config.MONGODB[collection_name]]

## Get datasets

In [29]:
def getDatasetsFromMongoDB(collection_name):
    ''' mongodb to pandas dataframe, export to csv and return'''
    connectToMongoDB(collection_name)
    results=collection.find()
    #strip and reshuflle
    df =  pd.DataFrame(list(results))
    df=df[['label','text']]
    df=df.reindex(np.random.permutation(df.index))
    filename = collection_name +'.csv'
    df.to_csv(filename,encoding='utf-8-sig')
    return df

In [30]:
def getDatasetsFromCsv():
	'''import csv, reshuffle and return it'''
	df=pd.read_csv('sarcasm_and_news_dataset.csv')
	df=df.reindex(np.random.permutation(df.index))
	return df[['label','text']]


In [67]:
#first time, extract from MongoDB
#data=getDatasetsFromMongoDB('news_sarcasm_abusive_normal')
data=getDatasetsFromCsv()
print(data)
train,test = train_test_split(data,test_size=0.33,random_state=42)

         label                                               text
20272   normal  No importa si no les gusta él, o Topp Dogg, o ...
38747  sarcasm  a veces no entiendo los twees de joel si quier...
30377  sarcasm  Métete tu sarcasmo por el centro del orto y ve...
25977     fact  José Tomás indulta un toro en Barcelona: Barce...
46386   normal  Poco a poco vemos como las cosas van tomando f...
54655     fact   ÚLTIMAHORA Un ataque aéreo de EEUU mata al lí...
27654   normal  Mi mamá es de esas personas que se come los ga...
14992  abusive  Yo, aquí como pendejo pensando en ti, y tu bie...
54610   normal  La esencia del Guadalajara se ha forjado graci...
15882  sarcasm  Esos que ponen en sus bios "sarcasmo es mi seg...
36282   normal  Voy a hacer una saga de libros de fantasía con...
59063   normal  No tiene una poquita de vergüenza, enserio?tan...
63472  sarcasm  Prefiero que me manden a comer mierda y me put...
12258   normal  Marcha por Brandon y Glen En el SPN de Tipitap...
34083  sar

## Embedding parameters

In [68]:
max_features = 1000
embedding_dim = 128
lstm_out_dim = 196
batch_size = 32
epochs = 10

## Tokenize

In [69]:
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
print(X)
X = pad_sequences(X)
print(X)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[[  0   0   0 ... 577   6  16]
 [  0   0   0 ...  47 247 580]
 [  0   0   0 ... 302 497  11]
 ...
 [  0   0   0 ...   2  17 445]
 [  0   0   0 ... 170 226  16]
 [  0   0   0 ...  49  19   3]]


## Get labels, split dataset

In [70]:
Y = pd.get_dummies(data['label']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.33, 
    random_state=42)

validation_size = math.ceil(X_test.shape[0]/2)

X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]

#Get shapes
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)
print(X_validate.shape,Y_validate.shape)

(51729, 63) (51729, 4)
(12739, 63) (12739, 4)
(12740, 63) (12740, 4)


## Build model

In [71]:
model=None
model = Sequential()
model.add(Embedding(max_features, embedding_dim,input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out_dim, dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(4,activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy',keras_metrics.precision(), keras_metrics.recall()])

# Train

In [72]:
model.fit(X_train, Y_train, epochs = epochs, batch_size=batch_size, verbose=2)

Epoch 1/10
 - 290s - loss: 0.3012 - acc: 0.8977 - precision: 0.9064 - recall: 0.8657
Epoch 2/10
 - 282s - loss: 0.2093 - acc: 0.9295 - precision: 0.9387 - recall: 0.9217
Epoch 3/10
 - 279s - loss: 0.1884 - acc: 0.9370 - precision: 0.9447 - recall: 0.9302
Epoch 4/10
 - 277s - loss: 0.1776 - acc: 0.9399 - precision: 0.9474 - recall: 0.9334
Epoch 5/10
 - 285s - loss: 0.1665 - acc: 0.9432 - precision: 0.9504 - recall: 0.9372
Epoch 6/10
 - 285s - loss: 0.1579 - acc: 0.9454 - precision: 0.9525 - recall: 0.9396
Epoch 7/10
 - 300s - loss: 0.1506 - acc: 0.9480 - precision: 0.9538 - recall: 0.9420
Epoch 8/10
 - 320s - loss: 0.1427 - acc: 0.9507 - precision: 0.9568 - recall: 0.9458
Epoch 9/10
 - 290s - loss: 0.1358 - acc: 0.9523 - precision: 0.9581 - recall: 0.9473
Epoch 10/10
 - 303s - loss: 0.1279 - acc: 0.9544 - precision: 0.9599 - recall: 0.9501


<keras.callbacks.History at 0x23839a77940>

# Validate

In [55]:
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)

print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

score: 0.20
acc: 0.93


## Predict

In [56]:
print(pd.get_dummies(data['label']).values)
l1_count = 0
l2_count = 0
l3_count = 0
l4_count = 0
l1_correct = 0
l2_correct = 0
l3_correct = 0
l4_correct = 0

for x in range(len(X_validate)):

    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]

    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            l1_correct += 1
        elif np.argmax(Y_validate[x]) == 1:
            l2_correct += 1
        elif np.argmax(Y_validate[x]) == 2:
            l3_correct += 1
        else:
            l4_correct += 1
            

    if np.argmax(Y_validate[x]) == 0:
        l1_count += 1
    elif np.argmax(Y_validate[x]) == 1:
        l2_count += 1
    elif np.argmax(Y_validate[x]) == 2:
        l3_count += 1
    else:
        l4_count += 1

print("1 Accuracy", l1_correct/l1_count*100, "%")
print("2 Accuracy", l2_correct/l2_count*100, "%")
print("3 Accuracy", l3_correct/l3_count*100, "%")
print("4 Accuracy", l4_correct/l4_count*100, "%")
print(l1_correct)
print(l1_count)
print(l2_correct)
print(l2_count)
print(l3_correct)
print(l3_count)
print(l4_correct)
print(l4_count)

[[1 0 0 0]
 [0 0 1 0]
 [0 0 0 1]
 ...
 [0 0 0 1]
 [0 0 1 0]
 [0 0 1 0]]
1 Accuracy 93.83033419023135 %
2 Accuracy 93.06405548755609 %
3 Accuracy 94.03935185185185 %
4 Accuracy 94.24316514781063 %
2190
2334
2281
2451
3250
3456
4240
4499


In [None]:
print(model)

## Train and get results, different embedding parameters

In [62]:
#250 features, embedding dim 64
max_features = 2000
embedding_dim = 128
lstm_out_dim = 50
batch_size = 30
epochs = 6

train,test = train_test_split(data,test_size=0.33,random_state=42)
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
print(X)
X = pad_sequences(X)
print(X)

Y = pd.get_dummies(data['label']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.33, 
    random_state=42)

validation_size = math.ceil(X_test.shape[0]/2)

X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]

#Get shapes
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)
print(X_validate.shape,Y_validate.shape)

model=None
model = Sequential()
model.add(Embedding(max_features, embedding_dim,input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out_dim, dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(4,activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])

model.fit(X_train, Y_train, epochs = epochs, batch_size=batch_size, verbose=2)

score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)

print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

positive_count = 0
negative_count = 0
positive_correct = 0
negative_correct = 0

for x in range(len(X_validate)):

    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]

    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            negative_correct += 1
        else:
            positive_correct += 1

    if np.argmax(Y_validate[x]) == 0:
        negative_count += 1
    else:
        positive_count += 1

print("Positive Accuracy", positive_correct/positive_count*100, "%")
print("Negative Accuracy", negative_correct/negative_count*100, "%")
print(positive_correct)
print(positive_count)
print(negative_correct)
print(negative_count)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[[  0   0   0 ...   5  21  20]
 [  0   0   0 ... 273  11  16]
 [  0   0   0 ... 741 629  16]
 ...
 [  0   0   0 ... 695  13  16]
 [  0   0   0 ...   4 969  16]
 [  0   0   0 ... 141   1  12]]
(51729, 63) (51729, 4)
(12739, 63) (12739, 4)
(12740, 63) (12740, 4)
Epoch 1/6
 - 142s - loss: 0.3041 - acc: 0.8957
Epoch 2/6
 - 146s - loss: 0.1902 - acc: 0.9361
Epoch 3/6
 - 131s - loss: 0.1689 - acc: 0.9429
Epoch 4/6
 - 119s - loss: 0.1538 - acc: 0.9483
Epoch 5/6
 - 136s - loss: 0.1409 - acc: 0.9522
Epoch 6/6
 - 133s - loss: 0.1322 - acc: 0.9559
score: 0.18
acc: 0.94
Positive Accuracy 94.33019411877763 %
Negative Accuracy 94.17309340188518 %
9816
10406
2198
2334


In [64]:
print(pd.get_dummies(data['label']))

       abusive  fact  normal  sarcasm
47531        1     0       0        0
29525        0     0       1        0
20997        0     0       0        1
37791        0     0       0        1
5511         0     0       0        1
35350        0     0       0        1
25814        0     0       1        0
73592        0     0       0        1
52798        1     0       0        0
70399        0     0       0        1
9777         0     0       0        1
26321        0     0       1        0
73848        0     0       1        0
41459        0     1       0        0
37179        0     0       1        0
2654         0     0       0        1
65348        0     0       1        0
27881        0     0       0        1
70677        0     1       0        0
32769        0     0       0        1
14535        0     0       1        0
1774         0     0       1        0
62431        0     0       0        1
40329        0     0       0        1
16008        0     1       0        0
63165       

In [22]:
#2000 features
max_features = 2000
embedding_dim = 128
lstm_out_dim = 196
batch_size = 32
epochs = 10

model = Sequential()
model.add(Embedding(max_features, embedding_dim,input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out_dim, dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])

model.fit(X_train, Y_train, epochs = epochs, batch_size=batch_size, verbose=2)

score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)

print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

positive_count = 0
negative_count = 0
positive_correct = 0
negative_correct = 0

for x in range(len(X_validate)):

    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]

    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            negative_correct += 1
        else:
            positive_correct += 1

    if np.argmax(Y_validate[x]) == 0:
        negative_count += 1
    else:
        positive_count += 1

print("Positive Accuracy", positive_correct/positive_count*100, "%")
print("Negative Accuracy", negative_correct/negative_count*100, "%")
print(positive_correct)
print(positive_count)
print(negative_correct)
print(negative_count)

Epoch 1/10
 - 78s - loss: 0.1174 - acc: 0.9549
Epoch 2/10
 - 74s - loss: 0.0572 - acc: 0.9796
Epoch 3/10
 - 72s - loss: 0.0440 - acc: 0.9840
Epoch 4/10
 - 72s - loss: 0.0383 - acc: 0.9868
Epoch 5/10
 - 73s - loss: 0.0320 - acc: 0.9889
Epoch 6/10
 - 74s - loss: 0.0295 - acc: 0.9901
Epoch 7/10
 - 66s - loss: 0.0248 - acc: 0.9910
Epoch 8/10
 - 65s - loss: 0.0234 - acc: 0.9917
Epoch 9/10
 - 69s - loss: 0.0184 - acc: 0.9937
Epoch 10/10
 - 69s - loss: 0.0171 - acc: 0.9940
score: 0.08
acc: 0.98
Positive Accuracy 97.82157676348547 %
Negative Accuracy 98.17047817047818 %
1886
1928
2361
2405


In [21]:
#4000 features
max_features = 4000
embedding_dim = 128
lstm_out_dim = 196
batch_size = 32
epochs = 10

model = Sequential()
model.add(Embedding(max_features, embedding_dim,input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out_dim, dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])

model.fit(X_train, Y_train, epochs = epochs, batch_size=batch_size, verbose=2)

score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)

print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

positive_count = 0
negative_count = 0
positive_correct = 0
negative_correct = 0

for x in range(len(X_validate)):

    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]

    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            negative_correct += 1
        else:
            positive_correct += 1

    if np.argmax(Y_validate[x]) == 0:
        negative_count += 1
    else:
        positive_count += 1

print("Positive Accuracy", positive_correct/positive_count*100, "%")
print("Negative Accuracy", negative_correct/negative_count*100, "%")
print(positive_correct)
print(positive_count)
print(negative_correct)
print(negative_count)

Epoch 1/10
 - 79s - loss: 0.1280 - acc: 0.9508
Epoch 2/10
 - 77s - loss: 0.0580 - acc: 0.9781
Epoch 3/10
 - 78s - loss: 0.0440 - acc: 0.9847
Epoch 4/10
 - 75s - loss: 0.0352 - acc: 0.9874
Epoch 5/10
 - 72s - loss: 0.0318 - acc: 0.9884
Epoch 6/10
 - 78s - loss: 0.0290 - acc: 0.9901
Epoch 7/10
 - 69s - loss: 0.0255 - acc: 0.9907
Epoch 8/10
 - 77s - loss: 0.0226 - acc: 0.9910
Epoch 9/10
 - 77s - loss: 0.0206 - acc: 0.9931
Epoch 10/10
 - 77s - loss: 0.0174 - acc: 0.9940
score: 0.08
acc: 0.98
Positive Accuracy 98.39211618257261 %
Negative Accuracy 97.92099792099792 %
1897
1928
2355
2405


In [16]:
#8000 features
max_features = 8000
embedding_dim = 128
lstm_out_dim = 196
batch_size = 32
epochs = 10

model = Sequential()
model.add(Embedding(max_features, embedding_dim,input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out_dim, dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])

model.fit(X_train, Y_train, epochs = epochs, batch_size=batch_size, verbose=2)

score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)

print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

positive_count = 0
negative_count = 0
positive_correct = 0
negative_correct = 0

for x in range(len(X_validate)):

    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]

    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            negative_correct += 1
        else:
            positive_correct += 1

    if np.argmax(Y_validate[x]) == 0:
        negative_count += 1
    else:
        positive_count += 1

print("Positive Accuracy", positive_correct/positive_count*100, "%")
print("Negative Accuracy", negative_correct/negative_count*100, "%")
print(positive_correct)
print(positive_count)
print(negative_correct)
print(negative_count)

Epoch 1/10
 - 62s - loss: 0.1251 - acc: 0.9512
Epoch 2/10
 - 61s - loss: 0.0572 - acc: 0.9791
Epoch 3/10
 - 62s - loss: 0.0438 - acc: 0.9849
Epoch 4/10
 - 60s - loss: 0.0372 - acc: 0.9862
Epoch 5/10
 - 61s - loss: 0.0307 - acc: 0.9893
Epoch 6/10
 - 61s - loss: 0.0289 - acc: 0.9897
Epoch 7/10
 - 63s - loss: 0.0247 - acc: 0.9908
Epoch 8/10
 - 63s - loss: 0.0235 - acc: 0.9917
Epoch 9/10
 - 63s - loss: 0.0195 - acc: 0.9928
Epoch 10/10
 - 62s - loss: 0.0184 - acc: 0.9931
score: 0.08
acc: 0.98
Positive Accuracy 97.82157676348547 %
Negative Accuracy 97.83783783783784 %
1886
1928
2353
2405


In [None]:
#16000 features
max_features = 16000
embedding_dim = 128
lstm_out_dim = 196
batch_size = 32
epochs = 10

model = Sequential()
model.add(Embedding(max_features, embedding_dim,input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out_dim, dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])

model.fit(X_train, Y_train, epochs = epochs, batch_size=batch_size, verbose=2)

score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)

print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

positive_count = 0
negative_count = 0
positive_correct = 0
negative_correct = 0

for x in range(len(X_validate)):

    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]

    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            negative_correct += 1
        else:
            positive_correct += 1

    if np.argmax(Y_validate[x]) == 0:
        negative_count += 1
    else:
        positive_count += 1

print("Positive Accuracy", positive_correct/positive_count*100, "%")
print("Negative Accuracy", negative_correct/negative_count*100, "%")
print(positive_correct)
print(positive_count)
print(negative_correct)
print(negative_count)

In [18]:
#2000 features, embedding dim 256
max_features = 2000
embedding_dim = 256
lstm_out_dim = 196
batch_size = 32
epochs = 15

model = Sequential()
model.add(Embedding(max_features, embedding_dim,input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out_dim, dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])

model.fit(X_train, Y_train, epochs = epochs, batch_size=batch_size, verbose=2)

score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)

print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

positive_count = 0
negative_count = 0
positive_correct = 0
negative_correct = 0

for x in range(len(X_validate)):

    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]

    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            negative_correct += 1
        else:
            positive_correct += 1

    if np.argmax(Y_validate[x]) == 0:
        negative_count += 1
    else:
        positive_count += 1

print("Positive Accuracy", positive_correct/positive_count*100, "%")
print("Negative Accuracy", negative_correct/negative_count*100, "%")
print(positive_correct)
print(positive_count)
print(negative_correct)
print(negative_count)

Epoch 1/15
 - 88s - loss: 0.1100 - acc: 0.9608
Epoch 2/15
 - 85s - loss: 0.0511 - acc: 0.9809
Epoch 3/15
 - 84s - loss: 0.0421 - acc: 0.9857
Epoch 4/15
 - 84s - loss: 0.0337 - acc: 0.9883
Epoch 5/15
 - 81s - loss: 0.0294 - acc: 0.9898
Epoch 6/15
 - 86s - loss: 0.0252 - acc: 0.9908
Epoch 7/15
 - 101s - loss: 0.0218 - acc: 0.9932
Epoch 8/15
 - 97s - loss: 0.0188 - acc: 0.9930
Epoch 9/15
 - 103s - loss: 0.0168 - acc: 0.9944
Epoch 10/15
 - 103s - loss: 0.0137 - acc: 0.9955
Epoch 11/15


KeyboardInterrupt: 

In [19]:
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)

print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

positive_count = 0
negative_count = 0
positive_correct = 0
negative_correct = 0

for x in range(len(X_validate)):

    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]

    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            negative_correct += 1
        else:
            positive_correct += 1

    if np.argmax(Y_validate[x]) == 0:
        negative_count += 1
    else:
        positive_count += 1

print("Positive Accuracy", positive_correct/positive_count*100, "%")
print("Negative Accuracy", negative_correct/negative_count*100, "%")
print(positive_correct)
print(positive_count)
print(negative_correct)
print(negative_count)

score: 0.09
acc: 0.98
Positive Accuracy 96.99170124481327 %
Negative Accuracy 98.46153846153847 %
1870
1928
2368
2405


In [20]:
#4000 features, embedding dim 64
max_features = 4000
embedding_dim = 64
lstm_out_dim = 196
batch_size = 32
epochs = 10

model = Sequential()
model.add(Embedding(max_features, embedding_dim,input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out_dim, dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])

model.fit(X_train, Y_train, epochs = epochs, batch_size=batch_size, verbose=2)

score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)

print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

positive_count = 0
negative_count = 0
positive_correct = 0
negative_correct = 0

for x in range(len(X_validate)):

    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]

    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            negative_correct += 1
        else:
            positive_correct += 1

    if np.argmax(Y_validate[x]) == 0:
        negative_count += 1
    else:
        positive_count += 1

print("Positive Accuracy", positive_correct/positive_count*100, "%")
print("Negative Accuracy", negative_correct/negative_count*100, "%")
print(positive_correct)
print(positive_count)
print(negative_correct)
print(negative_count)

Epoch 1/10
 - 59s - loss: 0.1355 - acc: 0.9440
Epoch 2/10
 - 61s - loss: 0.0646 - acc: 0.9760
Epoch 3/10
 - 60s - loss: 0.0489 - acc: 0.9827
Epoch 4/10
 - 59s - loss: 0.0405 - acc: 0.9852
Epoch 5/10
 - 63s - loss: 0.0375 - acc: 0.9867
Epoch 6/10
 - 60s - loss: 0.0353 - acc: 0.9875
Epoch 7/10
 - 62s - loss: 0.0293 - acc: 0.9897
Epoch 8/10
 - 56s - loss: 0.0278 - acc: 0.9906
Epoch 9/10
 - 60s - loss: 0.0250 - acc: 0.9915
Epoch 10/10
 - 57s - loss: 0.0227 - acc: 0.9922
score: 0.09
acc: 0.97
Positive Accuracy 95.33195020746888 %
Negative Accuracy 98.54469854469855 %
1838
1928
2370
2405


## 10 fold Cross-validation

In [58]:
max_features = 4000
embedding_dim = 64
lstm_out_dim = 196
batch_size = 32
epochs = 10

model = Sequential()
model.add(Embedding(max_features, embedding_dim,input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out_dim, dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(2,activa tion='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])
n_folds = 10
#print(Y.shape)
#print(Y)
Y_reshaped=[]
for item in Y:
    Y_reshaped.append(item[0])
    #print(item[0])
skf = StratifiedKFold(Y_reshaped, n_folds=n_folds, shuffle=True)


for i, (train, test) in enumerate(skf):
        print ("Running Fold", i+1, "/10",)
        model=None
        model = Sequential()
        model.add(Embedding(max_features, embedding_dim,input_length=X.shape[1]))
        model.add(SpatialDropout1D(0.4))
        model.add(LSTM(lstm_out_dim, dropout=0.2,recurrent_dropout=0.2))
        model.add(Dense(2,activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='adam',
                      metrics=['accuracy'])
        print(model)
        model.fit(X_train, Y_train, epochs = epochs, batch_size=batch_size, verbose=2)
        score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
        print("score: %.2f" % (score))
        print("acc: %.2f" % (acc))
print(skf)

Running Fold 1 /10
Epoch 1/10
 - 103s - loss: 0.1408 - acc: 0.9432
Epoch 2/10
 - 97s - loss: 0.0616 - acc: 0.9773
Epoch 3/10
 - 97s - loss: 0.0478 - acc: 0.9835
Epoch 4/10
 - 97s - loss: 0.0419 - acc: 0.9847
Epoch 5/10
 - 97s - loss: 0.0366 - acc: 0.9869
Epoch 6/10
 - 87s - loss: 0.0318 - acc: 0.9891
Epoch 7/10
 - 86s - loss: 0.0314 - acc: 0.9893
Epoch 8/10
 - 89s - loss: 0.0272 - acc: 0.9907
Epoch 9/10
 - 94s - loss: 0.0249 - acc: 0.9914
Epoch 10/10
 - 99s - loss: 0.0232 - acc: 0.9919
score: 0.08
acc: 0.98
Running Fold 2 /10
Epoch 1/10
 - 111s - loss: 0.1367 - acc: 0.9449
Epoch 2/10
 - 105s - loss: 0.0601 - acc: 0.9780
Epoch 3/10
 - 105s - loss: 0.0487 - acc: 0.9822
Epoch 4/10
 - 104s - loss: 0.0398 - acc: 0.9868
Epoch 5/10
 - 104s - loss: 0.0361 - acc: 0.9875
Epoch 6/10
 - 105s - loss: 0.0326 - acc: 0.9887
Epoch 7/10
 - 105s - loss: 0.0290 - acc: 0.9889
Epoch 8/10
 - 106s - loss: 0.0281 - acc: 0.9891
Epoch 9/10
 - 107s - loss: 0.0247 - acc: 0.9914
Epoch 10/10
 - 111s - loss: 0.0225 -

In [None]:
Running Fold 1 /10
Epoch 1/10
 - 103s - loss: 0.1408 - acc: 0.9432
Epoch 2/10
 - 97s - loss: 0.0616 - acc: 0.9773
Epoch 3/10
 - 97s - loss: 0.0478 - acc: 0.9835
Epoch 4/10
 - 97s - loss: 0.0419 - acc: 0.9847
Epoch 5/10
 - 97s - loss: 0.0366 - acc: 0.9869
Epoch 6/10
 - 87s - loss: 0.0318 - acc: 0.9891
Epoch 7/10
 - 86s - loss: 0.0314 - acc: 0.9893
Epoch 8/10
 - 89s - loss: 0.0272 - acc: 0.9907
Epoch 9/10
 - 94s - loss: 0.0249 - acc: 0.9914
Epoch 10/10
 - 99s - loss: 0.0232 - acc: 0.9919
score: 0.08
acc: 0.98
Running Fold 2 /10
Epoch 1/10
 - 111s - loss: 0.1367 - acc: 0.9449
Epoch 2/10
 - 105s - loss: 0.0601 - acc: 0.9780
Epoch 3/10
 - 105s - loss: 0.0487 - acc: 0.9822
Epoch 4/10
 - 104s - loss: 0.0398 - acc: 0.9868
Epoch 5/10
 - 104s - loss: 0.0361 - acc: 0.9875
Epoch 6/10
 - 105s - loss: 0.0326 - acc: 0.9887
Epoch 7/10
 - 105s - loss: 0.0290 - acc: 0.9889
Epoch 8/10
 - 106s - loss: 0.0281 - acc: 0.9891
Epoch 9/10
 - 107s - loss: 0.0247 - acc: 0.9914
Epoch 10/10
 - 111s - loss: 0.0225 - acc: 0.9924
score: 0.07
acc: 0.98
Running Fold 3 /10
Epoch 1/10
 - 133s - loss: 0.1343 - acc: 0.9467
Epoch 2/10
 - 123s - loss: 0.0616 - acc: 0.9781
Epoch 3/10
 - 113s - loss: 0.0475 - acc: 0.9829
Epoch 4/10
 - 115s - loss: 0.0387 - acc: 0.9861
Epoch 5/10
 - 115s - loss: 0.0348 - acc: 0.9876
Epoch 6/10
 - 109s - loss: 0.0318 - acc: 0.9893
Epoch 7/10
 - 113s - loss: 0.0299 - acc: 0.9887
Epoch 8/10
 - 100s - loss: 0.0255 - acc: 0.9915
Epoch 9/10
 - 114s - loss: 0.0246 - acc: 0.9910
Epoch 10/10
 - 109s - loss: 0.0225 - acc: 0.9922
score: 0.07
acc: 0.98
Running Fold 4 /10
Epoch 1/10
 - 130s - loss: 0.1349 - acc: 0.9447
Epoch 2/10
 - 113s - loss: 0.0608 - acc: 0.9785
Epoch 3/10
 - 117s - loss: 0.0493 - acc: 0.9819
Epoch 4/10
 - 114s - loss: 0.0414 - acc: 0.9860
Epoch 5/10
 - 123s - loss: 0.0383 - acc: 0.9876
Epoch 6/10
 - 116s - loss: 0.0334 - acc: 0.9881
Epoch 7/10
 - 118s - loss: 0.0317 - acc: 0.9882
Epoch 8/10
 - 117s - loss: 0.0278 - acc: 0.9905
Epoch 9/10
 - 117s - loss: 0.0258 - acc: 0.9910
Epoch 10/10
 - 128s - loss: 0.0229 - acc: 0.9925
score: 0.08
acc: 0.98
Running Fold 5 /10
Epoch 1/10
 - 132s - loss: 0.1385 - acc: 0.9422
Epoch 2/10
 - 124s - loss: 0.0603 - acc: 0.9785
Epoch 3/10
 - 132s - loss: 0.0471 - acc: 0.9824
Epoch 4/10
 - 135s - loss: 0.0420 - acc: 0.9843
Epoch 5/10
 - 134s - loss: 0.0383 - acc: 0.9867
Epoch 6/10
 - 137s - loss: 0.0315 - acc: 0.9895
Epoch 7/10
 - 122s - loss: 0.0298 - acc: 0.9891
Epoch 8/10
 - 118s - loss: 0.0261 - acc: 0.9915
Epoch 9/10
 - 118s - loss: 0.0241 - acc: 0.9914
Epoch 10/10
 - 122s - loss: 0.0238 - acc: 0.9913
score: 0.07
acc: 0.98
Running Fold 6 /10
Epoch 1/10
 - 143s - loss: 0.1376 - acc: 0.9515
Epoch 2/10
 - 140s - loss: 0.0627 - acc: 0.9775
Epoch 3/10
 - 130s - loss: 0.0496 - acc: 0.9815
Epoch 4/10
 - 138s - loss: 0.0406 - acc: 0.9854
Epoch 5/10
 - 136s - loss: 0.0373 - acc: 0.9869
Epoch 6/10
 - 131s - loss: 0.0344 - acc: 0.9877
Epoch 7/10
 - 130s - loss: 0.0293 - acc: 0.9898
Epoch 8/10
 - 137s - loss: 0.0287 - acc: 0.9891
Epoch 9/10
 - 126s - loss: 0.0257 - acc: 0.9906
Epoch 10/10
 - 131s - loss: 0.0227 - acc: 0.9919
score: 0.07
acc: 0.98
Running Fold 7 /10
Epoch 1/10
 - 165s - loss: 0.1376 - acc: 0.9443
Epoch 2/10
 - 148s - loss: 0.0633 - acc: 0.9775
Epoch 3/10
 - 155s - loss: 0.0488 - acc: 0.9820
Epoch 4/10
 - 151s - loss: 0.0410 - acc: 0.9847
Epoch 5/10
 - 166s - loss: 0.0349 - acc: 0.9872
Epoch 6/10
 - 154s - loss: 0.0321 - acc: 0.9887
Epoch 7/10
 - 151s - loss: 0.0289 - acc: 0.9898
Epoch 8/10
 - 147s - loss: 0.0267 - acc: 0.9910
Epoch 9/10
 - 149s - loss: 0.0265 - acc: 0.9914
Epoch 10/10
 - 154s - loss: 0.0235 - acc: 0.9916
score: 0.06
acc: 0.98
Running Fold 8 /10
Epoch 1/10
 - 154s - loss: 0.1368 - acc: 0.9451
Epoch 2/10
 - 153s - loss: 0.0596 - acc: 0.9787
Epoch 3/10
 - 146s - loss: 0.0520 - acc: 0.9812
Epoch 4/10
 - 145s - loss: 0.0443 - acc: 0.9841
Epoch 5/10
 - 151s - loss: 0.0362 - acc: 0.9869
Epoch 6/10
 - 166s - loss: 0.0335 - acc: 0.9882
Epoch 7/10
 - 158s - loss: 0.0298 - acc: 0.9890
Epoch 8/10
 - 168s - loss: 0.0282 - acc: 0.9897
Epoch 9/10
 - 165s - loss: 0.0268 - acc: 0.9906
Epoch 10/10
 - 163s - loss: 0.0234 - acc: 0.9918
score: 0.08
acc: 0.98
Running Fold 9 /10
Epoch 1/10
 - 167s - loss: 0.1353 - acc: 0.9442
Epoch 2/10
 - 167s - loss: 0.0628 - acc: 0.9778
Epoch 3/10
 - 169s - loss: 0.0483 - acc: 0.9822
Epoch 4/10
 - 195s - loss: 0.0416 - acc: 0.9853
Epoch 5/10
 - 183s - loss: 0.0391 - acc: 0.9870
Epoch 6/10
 - 163s - loss: 0.0323 - acc: 0.9885
Epoch 7/10
 - 160s - loss: 0.0321 - acc: 0.9892
Epoch 8/10
 - 161s - loss: 0.0291 - acc: 0.9892
Epoch 9/10
 - 162s - loss: 0.0247 - acc: 0.9911
Epoch 10/10
 - 164s - loss: 0.0241 - acc: 0.9921
score: 0.07
acc: 0.98
Running Fold 10 /10
Epoch 1/10
 - 177s - loss: 0.1364 - acc: 0.9459
Epoch 2/10
 - 172s - loss: 0.0611 - acc: 0.9790
Epoch 3/10
 - 192s - loss: 0.0491 - acc: 0.9831
Epoch 4/10
 - 176s - loss: 0.0415 - acc: 0.9861
Epoch 5/10
 - 169s - loss: 0.0369 - acc: 0.9864
Epoch 6/10
 - 174s - loss: 0.0328 - acc: 0.9890
Epoch 7/10
 - 172s - loss: 0.0291 - acc: 0.9904
Epoch 8/10
 - 170s - loss: 0.0295 - acc: 0.9892
Epoch 9/10
 - 168s - loss: 0.0254 - acc: 0.9904
Epoch 10/10
 - 172s - loss: 0.0239 - acc: 0.9919
score: 0.07
acc: 0.98
sklearn.cross_validation.StratifiedKFold(labels=[0 1 0 ... 1 0 0], n_folds=10, shuffle=True, random_state=None)