# LSTM classifier for abusive/sarcastic language

## Import libraries

In [136]:
import pandas as pd
import numpy as np
import math

from pymongo import MongoClient
from Preprocessing import config
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical
from keras.wrappers.scikit_learn import KerasClassifier

## MongoDB connection

In [137]:
client = MongoClient(config.MONGODB['hostname'], config.MONGODB['port'])
db = client[config.MONGODB['db']]
collection = db[config.MONGODB['collection_news_and_sarcasm']]

## Get datasets

In [138]:
def getDatasetsFromMongoDB():
	''' mongodb to pandas dataframe, export to csv and return'''
	results=collection.find()
	#strip and reshuflle
	df =  pd.DataFrame(list(results))
	df=df[['label','text']]
	df=df.reindex(np.random.permutation(df.index))
	df.to_csv('sarcasm_and_news_dataset.csv',encoding='utf-8-sig')
	return df

In [139]:
def getDatasetsFromCsv():
	'''import csv, reshuffle and return it'''
	df=pd.read_csv('sarcasm_and_news_dataset.csv')
	df=df.reindex(np.random.permutation(df.index))
	return df[['label','text']]


In [141]:
data=getDatasetsFromCsv()
train,test = train_test_split(data,test_size=0.33,random_state=42)

## Tokenize

In [142]:
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)

## Get labels, split dataset

In [143]:
Y = pd.get_dummies(data['label']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.33, 
    random_state=42)

validation_size = math.ceil(X_test.shape[0]/2)

X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]

#Get shapes
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)
print(X_validate.shape,Y_validate.shape)

(17592, 38) (17592, 2)
(4332, 38) (4332, 2)
(4333, 38) (4333, 2)


## Build model

In [144]:
def createModel(max_features, embedding_dim, lstm_out_dim, batch_size, epochs):
    '''creates a model for 2 labels'''
    model = Sequential()
    model.add(Embedding(max_features, embedding_dim,input_length=X.shape[1]))
    model.add(SpatialDropout1D(0.4))
    model.add(LSTM(lstm_out_dim, dropout=0.2,recurrent_dropout=0.2))
    model.add(Dense(2,activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model

# Train

In [145]:
def train(model,epochs,batch_size):
    '''train a model'''
    model.fit(X_train, Y_train, epochs = epochs, batch_size=batch_size, verbose=2)

# Validate

In [146]:
def validate(model, batch_size):
    '''validate a model'''
    score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)

    print("score: %.2f" % (score))
    print("acc: %.2f" % (acc))

## Predict

In [147]:
def predict(model):
    '''predict with a model'''
    positive_count = 0
    negative_count = 0
    positive_correct = 0
    negative_correct = 0

    for x in range(len(X_validate)):

        result = model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]

        if np.argmax(result) == np.argmax(Y_validate[x]):
            if np.argmax(Y_validate[x]) == 0:
                negative_correct += 1
            else:
                positive_correct += 1

        if np.argmax(Y_validate[x]) == 0:
            negative_count += 1
        else:
            positive_count += 1

    print("Positive Accuracy", positive_correct/positive_count*100, "%")
    print("Negative Accuracy", negative_correct/negative_count*100, "%")
    print(positive_correct)
    print(positive_count)
    print(negative_correct)
    print(negative_count)

## Train and get results, different embedding parameters

In [None]:
#1000 features
max_features = 1000
embedding_dim = 128
lstm_out_dim = 196
batch_size = 32
epochs = 15

model=None
model=createModel(max_features=max_features, embedding_dim=embedding_dim, lstm_out_dim=lstm_out_dim, 
                batch_size=batch_size, epochs=epochs)
print(model.summary())
train(model,epochs=epochs,batch_size=1)
validate(model,batch_size)
predict(model)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_22 (Embedding)     (None, 38, 128)           128000    
_________________________________________________________________
spatial_dropout1d_22 (Spatia (None, 38, 128)           0         
_________________________________________________________________
lstm_22 (LSTM)               (None, 196)               254800    
_________________________________________________________________
dense_22 (Dense)             (None, 2)                 394       
Total params: 383,194
Trainable params: 383,194
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/15
 - 1232s - loss: 0.0949 - acc: 0.9656
Epoch 2/15
 - 1083s - loss: 0.0582 - acc: 0.9804
Epoch 3/15
 - 1001s - loss: 0.0451 - acc: 0.9850
Epoch 4/15


In [None]:
#2000 features
max_features = 2000
embedding_dim = 128
lstm_out_dim = 196
batch_size = 32
epochs = 15

model=None
model=createModel(max_features=max_features, embedding_dim=embedding_dim, lstm_out_dim=lstm_out_dim, 
                batch_size=batch_size, epochs=epochs)
print(model.summary())
train(model,epochs=epochs,batch_size=1)
validate(model,batch_size)
predict(model)

In [None]:
#4000 features
max_features = 4000
embedding_dim = 128
lstm_out_dim = 196
batch_size = 32
epochs = 15

model=None
model=createModel(max_features=max_features, embedding_dim=embedding_dim, lstm_out_dim=lstm_out_dim, 
                batch_size=batch_size, epochs=epochs)
print(model.summary())
train(model,epochs=epochs,batch_size=1)
validate(model,batch_size)
predict(model)

In [None]:
#8000 features
max_features = 8000
embedding_dim = 128
lstm_out_dim = 196
batch_size = 32
epochs = 15

model=None
model=createModel(max_features=max_features, embedding_dim=embedding_dim, lstm_out_dim=lstm_out_dim, 
                batch_size=batch_size, epochs=epochs)
print(model.summary())
train(model,epochs=epochs,batch_size=1)
validate(model,batch_size)
predict(model)

In [None]:
#16000 features
max_features = 16000
embedding_dim = 128
lstm_out_dim = 196
batch_size = 32
epochs = 15

model=None
model=createModel(max_features=max_features, embedding_dim=embedding_dim, lstm_out_dim=lstm_out_dim, 
                batch_size=batch_size, epochs=epochs)
print(model.summary())
train(model,epochs=epochs,batch_size=1)
validate(model,batch_size)
predict(model)

In [None]:
#4000 features, embedding dim 256
max_features = 4000
embedding_dim = 128
lstm_out_dim = 196
batch_size = 32
epochs = 15

model=None
model=createModel(max_features=max_features, embedding_dim=embedding_dim, lstm_out_dim=lstm_out_dim, 
                batch_size=batch_size, epochs=epochs)
print(model.summary())
train(model,epochs=epochs,batch_size=1)
validate(model,batch_size)
predict(model)

## 10 fold Cross-validation

In [40]:
n_folds = 10
skf = StratifiedKFold(Y, n_folds=n_folds, shuffle=True)

for i, (train, test) in enumerate(skf):
        print ()"Running Fold", i+1, "/", n_folds)
        X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.33, 
    random_state=42)
        model = None # Clearing the NN.
        model = create_model()
        model.fit(X_train, Y_train, epochs = epochs, batch_size=batch_size, verbose=2)
        score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
        print("score: %.2f" % (score))
        print("acc: %.2f" % (acc))
print(skf)

ValueError: Supported target types are: ('binary', 'multiclass'). Got 'multilabel-indicator' instead.