# Introduction
This notebook proposes a model to predict binary labels for tweets to indicate whether the tweet refers to a disaster or not.

In [1]:
import os
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd 

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

import string
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.preprocessing import sequence
from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping

input_path = Path('/kaggle/input')
for dirname, _, filenames in os.walk(input_path):
    for filename in filenames:
        print(Path(dirname) / filename)

Using TensorFlow backend.


/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv
/kaggle/input/nlp-getting-started/sample_submission.csv


# Load data
Read data from csvs provided into pandas.DataFrame.

In [2]:
data_path = input_path / 'nlp-getting-started'
train_df = pd.read_csv(data_path / 'train.csv')
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
test_df = pd.read_csv(data_path / 'test.csv')
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


Take a quick look at missing values

In [4]:
print(f'Rows missing keyword: {train_df.keyword.isnull().mean() * 100:.2f}%')
print(f'Rows missing keyword with target = 1: {train_df.loc[train_df.target==1].keyword.isnull().mean() * 100:.2f}%')
print(f'Rows missing keyword with target = 0: {train_df.loc[train_df.target==0].keyword.isnull().mean() * 100:.2f}%')

print('')
print(f'Rows missing location: {train_df.location.isnull().mean() * 100:.2f}%')
print(f'Rows missing location with target = 1: {train_df.loc[train_df.target==1].location.isnull().mean() * 100:.2f}%')
print(f'Rows missing location with target = 0: {train_df.loc[train_df.target==0].location.isnull().mean() * 100:.2f}%')

Rows missing keyword: 0.80%
Rows missing keyword with target = 1: 1.28%
Rows missing keyword with target = 0: 0.44%

Rows missing location: 33.27%
Rows missing location with target = 1: 32.86%
Rows missing location with target = 0: 33.58%


__Notes__:
* A small fraction of rows are missing a keyword
* A third of rows are missing a location. Also from above location data is quite dirty.
* There aren't particularly noticeable differences in either case between target = 0 and 1

# Preprocessing
Clean tweets and transform to sequences of integers for modelling.

In [5]:
def is_number(word): 
        try:
            float(word.replace(',', ''))
            return True
        except ValueError:
            return False

def preprocess_text(raw_df):
    df = raw_df.copy()
    
    # Replace mentions & links
    df['cleaned_text'] = df.text.str.replace('@\S+', 'mention')
    df.cleaned_text = df.cleaned_text.str.replace('http\S+', 'http')
    
    # Remove hash from hashtag
    df.cleaned_text = df.cleaned_text.str.replace('#', '')
    
    # Tokenize
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True)
    df['words'] = df.cleaned_text.apply(tokenizer.tokenize)

    # Remove punctuation
    df.words = df.words.apply(lambda word_list: [w for w in word_list if w not in string.punctuation])

    # Stemming
    ps = PorterStemmer()
    df.words = df.words.apply(lambda word_list: [ps.stem(w) for w in word_list])

    # Split stop words and rest
    df['stop_words'] = df.words.apply(lambda word_list: [w for w in word_list if w in set(stopwords.words('english'))])
    df.words = df.words.apply(lambda word_list: [w for w in word_list if w not in set(stopwords.words('english'))])

    # Remove numbers
    df.words = df.words.apply(lambda word_list: [w for w in word_list if not is_number(w)])
    
    # Get cleaned text
    df.cleaned_text = df.words.apply(lambda x: ' '.join(x))
    
    return df

train_df = preprocess_text(train_df)
test_df = preprocess_text(test_df)
train_df.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text,words,stop_words
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deed reason thi earthquak may allah forgiv us,"[deed, reason, thi, earthquak, may, allah, for...","[our, are, the, of, all]"
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la rong sask canada,"[forest, fire, near, la, rong, sask, canada]",[]
2,5,,,All residents asked to 'shelter in place' are ...,1,resid ask shelter place notifi offic evacu she...,"[resid, ask, shelter, place, notifi, offic, ev...","[all, to, in, are, be, by, no, other, or, in, ..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,peopl receiv wildfir evacu order california,"[peopl, receiv, wildfir, evacu, order, califor...",[in]
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,got sent thi photo rubi alaska smoke wildfir p...,"[got, sent, thi, photo, rubi, alaska, smoke, w...","[just, from, as, from, into, a]"


In [6]:
def stem_keyword(keyword):
    ps = PorterStemmer()
    # Handle split keywords
    words = keyword.split('%20')
    return '%20'.join([ps.stem(w) for w in words])

def preprocess_keywords(raw_df):
    df = raw_df.copy()
    df['cleaned_keyword'] = df.keyword.fillna('no%20keyword')
    df.cleaned_keyword = df.cleaned_keyword.apply(stem_keyword)
    return df

train_df = preprocess_keywords(train_df)
test_df = preprocess_keywords(test_df)
train_df.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text,words,stop_words,cleaned_keyword
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deed reason thi earthquak may allah forgiv us,"[deed, reason, thi, earthquak, may, allah, for...","[our, are, the, of, all]",no%20keyword
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la rong sask canada,"[forest, fire, near, la, rong, sask, canada]",[],no%20keyword
2,5,,,All residents asked to 'shelter in place' are ...,1,resid ask shelter place notifi offic evacu she...,"[resid, ask, shelter, place, notifi, offic, ev...","[all, to, in, are, be, by, no, other, or, in, ...",no%20keyword
3,6,,,"13,000 people receive #wildfires evacuation or...",1,peopl receiv wildfir evacu order california,"[peopl, receiv, wildfir, evacu, order, califor...",[in],no%20keyword
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,got sent thi photo rubi alaska smoke wildfir p...,"[got, sent, thi, photo, rubi, alaska, smoke, w...","[just, from, as, from, into, a]",no%20keyword


Use prepocessed text and keywords to get inputs.

In [7]:
# Get columns of training set to form inputs and outputs.
train_text = train_df.cleaned_text.values
train_keywords = train_df.cleaned_keyword.values
train_targets = train_df.target.values

# Get columns of test set to form inputs and outputs.
test_text = test_df.cleaned_text.values
test_keywords = test_df.cleaned_keyword.values

# Training & validation
Train and validate a model.

In [8]:
# Vectorise text
word_vec = TfidfVectorizer(
    ngram_range=(1, 1),
    max_df=0.99,
    min_df=2,
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False,
    norm='l2'
)
word_vec.fit(np.hstack([train_text, test_text]))
train_text_transf = word_vec.transform(train_text).toarray()
test_text_transf = word_vec.transform(test_text).toarray()

# Vectorise keywords
keyword_vec = TfidfVectorizer(
    ngram_range=(1, 1),
    max_df=0.99,
    min_df=2,
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False,
    norm='l1'
)
keyword_vec.fit(np.hstack([train_keywords, test_keywords]))
train_keywords_transf = word_vec.transform(train_keywords).toarray()
test_keywords_transf = word_vec.transform(test_keywords).toarray()

# Model inputs
train_inputs = np.hstack([train_keywords_transf, train_text_transf])
test_inputs = np.hstack([test_keywords_transf, test_text_transf])

# Split training and validataion sets
X_train, X_val, y_train, y_val = train_test_split(train_inputs, train_targets, test_size=0.2, stratify=train_targets)
X_test = test_inputs

In [9]:
# Stop early and save the best model
model_path = f'{datetime.utcnow():%Y%m%d%H%M%S}_best_model.hdf5'
check_point = ModelCheckpoint(model_path, monitor = "val_loss", verbose=1, save_best_only=True, mode="min")
early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=3)

# Build sequential model
model = Sequential()
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.9))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Fitting & validation on training set.
model.fit(X_train, y_train,
          validation_data=(X_val, y_val),
          batch_size=100,
          epochs=100,
          callbacks=[early_stop, check_point])

Train on 6090 samples, validate on 1523 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 0.65881, saving model to 20200127215220_best_model.hdf5
Epoch 2/100

Epoch 00002: val_loss improved from 0.65881 to 0.60720, saving model to 20200127215220_best_model.hdf5
Epoch 3/100

Epoch 00003: val_loss improved from 0.60720 to 0.54731, saving model to 20200127215220_best_model.hdf5
Epoch 4/100

Epoch 00004: val_loss improved from 0.54731 to 0.50823, saving model to 20200127215220_best_model.hdf5
Epoch 5/100

Epoch 00005: val_loss improved from 0.50823 to 0.49323, saving model to 20200127215220_best_model.hdf5
Epoch 6/100

Epoch 00006: val_loss improved from 0.49323 to 0.48634, saving model to 20200127215220_best_model.hdf5
Epoch 7/100

Epoch 00007: val_loss improved from 0.48634 to 0.47860, saving model to 20200127215220_best_model.hdf5
Epoch 8/100

Epoch 00008: val_loss improved from 0.47860 to 0.47461, saving model to 20200127215220_best_model.hdf5
Epoch 9/100

Epoch 00009: va

<keras.callbacks.callbacks.History at 0x7f548a5e1c88>

# Generate predictions
Retrain model, generate predictions and submit.

In [10]:
def predict(X, model):
    return (model.predict(X) > 0.5).ravel().astype(int)

y_pred = predict(X_test, model)

In [11]:
sample_sub_df = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')

sub_df = pd.DataFrame({
    'id': sample_sub_df['id'].values.tolist(),
    'target': y_pred.ravel()
})
sub_df.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,1
4,11,1


In [12]:
sub_df.to_csv('submission.csv', index=False)