## Setup

### Imports

In [1]:
import re
import numpy as np
import pandas as pd
from keras import backend as K
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import StratifiedKFold

Using TensorFlow backend.


### Read in and preprocess data

In [2]:
df = pd.read_csv('data.csv')
def clean_text(txt: str) -> str:
    clean = re.sub('[^a-zA-Z0-9\s]','',txt.lower())
    clean = clean.replace('rt', ' ')
    return clean
df['text'] = df['text'].apply(clean_text)

print(f'shape: {df.shape}')
entries = df.shape[0]
positives = df[df['sentiment'] == 0].shape[0]
print(f'+ ex: {positives}\n- ex: {entries - positives}')
df.head()

shape: (1343, 2)
+ ex: 272
- ex: 1071


Unnamed: 0,text,sentiment
0,we have got a huge giveaway for everybody toda...,0
1,that we have tunnel bear is the simple vpn app...,0
2,squarespace featuring squarespace domains a ne...,0
3,thanks for watching guys if this video sucked ...,0
4,so thanks for watching guys if you just liked ...,0


In [3]:
MAX_FEATURES = 2000
tokenizer = Tokenizer(num_words=MAX_FEATURES, split=' ')
texts_vals = df['text'].values
tokenizer.fit_on_texts(texts_vals)
X = tokenizer.texts_to_sequences(texts_vals)
X = pad_sequences(X)
Y = pd.get_dummies(df['sentiment']).values

## F1 score code
This needs to be defined before the model in order to use F1 score as a metric

In [4]:
def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

## LSTM Model

In [5]:
EMBED_DIM = 128
LSTM_OUT = 196
seq_len = X.shape[1]

In [10]:
model = Sequential()
model.add(Embedding(MAX_FEATURES, EMBED_DIM, input_length=seq_len))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(LSTM_OUT, dropout=0.2, recurrent_dropout=0.2, activation='relu'))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', f1])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 3835, 128)         256000    
_________________________________________________________________
spatial_dropout1d_3 (Spatial (None, 3835, 128)         0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


## 5-fold cross validation

In [None]:
def 