# NLP - LASER Embeddings + Keras

This approach encodes the tweets using [LASER](https://github.com/yannvgn/laserembeddings) multilingual sentence embeddings,
followed by a [TF Keras](https://www.tensorflow.org/api_docs/python/tf/keras) dense neural network.

In [1]:
!pip install -q laserembeddings laserembeddings[zh] laserembeddings[ja]
!pip install -q ftfy

You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [2]:
# import fasttext
import ftfy
import html
import laserembeddings
import numpy as np
import pandas as pd
import re
import tensorflow as tf
import sys

from fastcache import clru_cache
from laserembeddings import Laser
from typing import List, Union
from urllib.parse import unquote
from sklearn.model_selection import train_test_split

In [3]:
df_train = pd.read_csv('../input/nlp-getting-started/train.csv', index_col=0).fillna('')
df_test  = pd.read_csv('../input/nlp-getting-started/test.csv',  index_col=0).fillna('')
df_train

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...
10869,,,Two giant cranes holding a bridge collapse int...,1
10870,,,@aria_ahrary @TheTawniest The out of control w...,1
10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
10872,,,Police investigating after an e-bike collided ...,1


# Preprocess Text

Keyword, location and text fields into a single string. 

Simple preprocessing is performed to remove HTML and encoded elements, @usernames, hashtag prefixes and urls.

In [4]:
def preprocess_text(df):
    texts = df[['keyword', 'location', 'text']].agg(' '.join, axis=1)
    texts = texts.apply(ftfy.fix_text)   # fix \x89
    texts = texts.apply(html.unescape)  
    texts = texts.apply(unquote)         # remove %20
    texts = texts.apply(lambda s: re.sub('@\w+', ' ', s))            # remove @usernames
    texts = texts.apply(lambda s: re.sub('#',    ' ', s))            # remove hashtag prefixes    
    texts = texts.apply(lambda s: re.sub('\n',   ' ', s))            # remove newlines
    texts = texts.apply(lambda s: re.sub('\w+://\S+',  '<URL>', s))  # remove urls    
    texts = texts.apply(lambda s: re.sub('\s+',  ' ', s))            # remove multiple spaces    
    return list(texts)
    
preprocess_text(df_train)[:10]
preprocess_text(df_test)[:10]

[' Just happened a terrible car crash',
 ' Heard about earthquake is different cities, stay safe everyone.',
 ' there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all',
 ' Apocalypse lighting. Spokane wildfires',
 ' Typhoon Soudelor kills 28 in China and Taiwan',
 " We're shaking...It's an earthquake",
 " They'd probably still show more life than Arsenal did yesterday, eh? EH?",
 ' Hey! How are you?',
 ' What a nice hat?',
 ' Fuck off!']

# LASER Embeddings

This encodes each of the strings as a LASER embedding (1024 dimentional vector)

In [5]:
%%bash
# DOCS: https://github.com/facebookresearch/LASER/blob/master/install_models.sh

mkdir -p models/laser/
# for FILE in bilstm.eparl21.2018-11-19.pt eparl21.fcodes eparl21.fvocab bilstm.93langs.2018-12-26.pt 93langs.fcodes 93langs.fvocab; do
for FILE in bilstm.93langs.2018-12-26.pt 93langs.fcodes 93langs.fvocab; do
    wget -cq https://dl.fbaipublicfiles.com/laser/models/$FILE -O models/laser/$FILE
done

In [6]:
# from config import config
# from src.utils.fasttest_model import language_detect
# from src.utils.punkt_tokenizer import punkt_tokenize_sentences

config = {
    "laser": {
        "base_dir":  "./models/laser",
        "bpe_codes": "./models/laser/93langs.fcodes",
        "bpe_vocab": "./models/laser/93langs.fvocab",
        "encoder":   "./models/laser/bilstm.93langs.2018-12-26.pt",
    }
}

# Instantiate encoder
# BUG: CUDA GPU memory is exceeded if both laser and labse are loaded together 
# @clru_cache(None)
def get_laser_model():
    laser_model = Laser(
        bpe_codes = config['laser']['bpe_codes'],
        bpe_vocab = config['laser']['bpe_vocab'],
        encoder   = config['laser']['encoder'],
        tokenizer_options = None,
        embedding_options = None
    )
    return laser_model


def laser_encode(text: Union[str, List[str]], lang='en', normalize=True) -> np.ndarray:
    """
    Encodes a corpus of text using LASER
    :param text: Large block of text (will be tokenized), or list of pre-tokenized sentences
    :param lang: 2 digit language code (optional autodetect)
    :return:     embedding matrix
    """
    laser_model = get_laser_model()
    
    # lang = lang or language_detect(text, threshold=0.0)
    if isinstance(text, str):
        # sentences = punkt_tokenize_sentences(text, lang=lang)
        sentences = [ text ]
    else:
        sentences = list(text)

    embedding = laser_model.embed_sentences(sentences, lang=lang)
    
    if normalize:
        embedding = embedding / np.sqrt(np.sum(embedding**2, axis=1)).reshape(-1,1)
        
    return embedding

In [7]:
%%time 

X_train = laser_encode(preprocess_text(df_train))
Y_train = df_train['target']

print('X_train.shape', X_train.shape)
print('Y_train.shape', Y_train.shape)

X_train.shape (7613, 1024)
Y_train.shape (7613,)
CPU times: user 4min 34s, sys: 7.28 s, total: 4min 41s
Wall time: 2min 26s


In [8]:
df_train

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...
10869,,,Two giant cranes holding a bridge collapse int...,1
10870,,,@aria_ahrary @TheTawniest The out of control w...,1
10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
10872,,,Police investigating after an e-bike collided ...,1


In [9]:
X_train

array([[ 3.0764327e-03,  2.1392587e-04,  2.8938532e-03, ...,
         1.2719455e-02,  2.9375229e-02,  1.3058826e-02],
       [ 7.4967649e-03, -1.0227719e-04,  2.0214453e-02, ...,
         2.0243460e-02,  1.0186631e-02,  5.6830747e-03],
       [ 4.1807637e-02, -1.1875321e-03, -5.3810547e-03, ...,
        -3.9238170e-05,  3.8756292e-02,  1.2007614e-02],
       ...,
       [ 4.7112587e-03, -7.2170304e-05,  9.9966973e-02, ...,
         1.2328808e-02,  1.9540163e-02,  4.4700809e-02],
       [ 2.0398971e-02,  1.1204826e-03,  5.1898682e-03, ...,
         4.5956508e-03,  1.5736414e-02,  4.8153952e-02],
       [ 2.3192553e-02, -7.7466539e-06,  3.8043603e-03, ...,
         1.4387380e-02,  2.6999321e-02,  5.1070951e-02]], dtype=float32)

In [10]:
Y_train

id
1        1
4        1
5        1
6        1
7        1
        ..
10869    1
10870    1
10871    1
10872    1
10873    1
Name: target, Length: 7613, dtype: int64

# Neural Network - TF Keras

Define and train a dense neural network. 

This inputs a 1024 LASER embedding and outputs a 1 bit classification prediction.

A triangular shaped architecture is used, including Dropout and BatchNorm.

In [11]:
# DOCS: https://keras.io/examples/keras_recipes/antirectifier/

# Build the model
model = tf.keras.Sequential([
    tf.keras.Input(shape=(1024,)),
    tf.keras.layers.Dense(512, activation=tf.keras.layers.LeakyReLU(alpha=0.1)),
        tf.keras.layers.Dropout(0.25),
        tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(128, activation=tf.keras.layers.LeakyReLU(alpha=0.1)),
        tf.keras.layers.Dropout(0.25),
        tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(32, activation=tf.keras.layers.LeakyReLU(alpha=0.1)),
        tf.keras.layers.Dropout(0.25),
        tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(8, activation=tf.keras.layers.LeakyReLU(alpha=0.1)),
        tf.keras.layers.Dropout(0.25),
        tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(1, activation=tf.keras.activations.sigmoid),
])
"""
model = tf.keras.Sequential([
    tf.keras.Input(shape=(1024,)),
    tf.keras.layers.Dense(512, activation=tf.keras.layers.LeakyReLU(alpha=0.1)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Dense(128, activation=tf.keras.layers.LeakyReLU(alpha=0.1)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Dense(32, activation=tf.keras.layers.LeakyReLU(alpha=0.1)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Dense(8, activation=tf.keras.layers.LeakyReLU(alpha=0.1)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Dense(1, activation=tf.keras.activations.sigmoid),
])
"""

def model_compile_fit(model, X, Y):
    model.summary()
    
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1)
        
    # Compile the model
    model.compile(
        loss      = tf.keras.losses.BinaryCrossentropy(from_logits=True),
        # optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001),
        optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5),
        metrics   = [ tf.keras.metrics.BinaryAccuracy() ],
    )
    
    # Train the model
    model.fit(
        X_train, Y_train, 
        batch_size = 32, 
        epochs     = 1000, 
        # validation_split = 0.2,
        callbacks = [
            # tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=10),
            # tf.keras.callbacks.ModelCheckpoint('model.h5',  monitor='binary_accuracy', mode='max', verbose=0, save_best_only=True)
        ],
        verbose=2
    )
    model.save('model.h5')
    
    print()
    print('Train Accuracy')
    model.evaluate(X_train, Y_train)

    print('Test Accuracy')
    model.evaluate(X_test, Y_test)

    
model_compile_fit(model, X_train, Y_train)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               524800    
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
batch_normalization (BatchNo (None, 512)               2048      
_________________________________________________________________
dense_1 (Dense)              (None, 128)               65664     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 128)               512       
_________________________________________________________________
dense_2 (Dense)              (None, 32)                4

# Submission

In [12]:
preprocess_text(df_test)

[' Just happened a terrible car crash',
 ' Heard about earthquake is different cities, stay safe everyone.',
 ' there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all',
 ' Apocalypse lighting. Spokane wildfires',
 ' Typhoon Soudelor kills 28 in China and Taiwan',
 " We're shaking...It's an earthquake",
 " They'd probably still show more life than Arsenal did yesterday, eh? EH?",
 ' Hey! How are you?',
 ' What a nice hat?',
 ' Fuck off!',
 " No I don't like cold!",
 " NOOOOOOOOO! Don't do that!",
 " No don't tell me that!",
 ' What if?!',
 ' Awesome!',
 "ablaze London Birmingham Wholesale Market is ablaze BBC News - Fire breaks out at Birmingham's Wholesale Market <URL>",
 "ablaze Niall's place | SAF 12 SQUAD | will you wear shorts for race ablaze ?",
 'ablaze NIGERIA PreviouslyOnDoyinTv: Toke Makinwa‰Ûªs marriage crisis sets Nigerian Twitter ablaze... <URL>',
 'ablaze Live On Webcam Check these out: <URL> <URL> <URL> <URL> nsfw',
 'ablaze Los A

In [13]:
%%time 

X_test = laser_encode(preprocess_text(df_test))
Y_test = tf.math.round( model.predict(X_test) ).numpy().astype(np.int32)

CPU times: user 2min 3s, sys: 4.84 s, total: 2min 8s
Wall time: 1min 6s


In [14]:
pd.read_csv('../input/nlp-getting-started/sample_submission.csv', index_col=0)

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,0
2,0
3,0
9,0
11,0
...,...
10861,0
10865,0
10868,0
10874,0


In [15]:
df_submission = pd.read_csv('../input/nlp-getting-started/sample_submission.csv', index_col=0).fillna('')
df_submission['target'] = Y_test
df_submission.to_csv('submission.csv')
!head submission.csv

id,target
0,1
2,1
3,0
9,1
11,1
12,1
21,1
22,0
27,0


In [16]:
df_submission

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,1
2,1
3,0
9,1
11,1
...,...
10861,0
10865,1
10868,1
10874,1


# Further Reading

This notebook is part of a series exploring Natural Language Processing
- 0.74164 - [NLP Logistic Regression](https://www.kaggle.com/jamesmcguigan/disaster-tweets-logistic-regression)
- 0.76677 - [NLP LASER Embeddings + Keras](https://www.kaggle.com/jamesmcguigan/nlp-laser-embeddings-keras)
- 0.77536 - [NLP TF-IDF Classifier](https://www.kaggle.com/jamesmcguigan/disaster-tweets-tf-idf-classifier)
- 0.79742 - [NLP Naive Bayes](https://www.kaggle.com/jamesmcguigan/nlp-naive-bayes)