<a href="https://colab.research.google.com/github/darasliwinski/nlp_disaster/blob/update_lower/nlpDisaster.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import libraries and load data

In [3]:
import pandas as pd
import numpy as np

# For cleaning the text
import spacy
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import regex as re
import string

# For visualization
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_colwidth', None)

# For building our model
import tensorflow.keras
import sklearn
from tensorflow.keras import layers
from tensorflow.keras import Model
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense, Input, Dropout, GlobalAveragePooling1D

In [4]:
import tensorflow as tf
import tensorflow_hub as hub

In [5]:
from google.colab import drive
# Mount Google Drive into the Colab runtime; the CSV files read below live under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Load the labelled training tweets (columns: id, keyword, location, text, target) and preview them.
train = pd.read_csv('/content/drive/MyDrive/data/train.csv') 
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


In [7]:
# Load the test tweets (same columns as the training file, minus `target`) and preview them.
test = pd.read_csv('/content/drive/MyDrive/data/test.csv')
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, stay safe everyone."
2,3,,,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all"
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [8]:
# Keep a handle on the test-set ids.
ids = test['id']

# Report how many rows each split has, and the overall total.
n_train_rows, n_test_rows = len(train), len(test)
print('Total length of the dataset: ', n_train_rows + n_test_rows)
print('shape of training set: ', train.shape)
print('shape of testing set: ', test.shape)

Total length of the dataset:  10876
shape of training set:  (7613, 5)
shape of testing set:  (3263, 4)


## Clean Data

In [9]:
# Fetch the NLTK resources needed below: the stop-word corpus and the 'punkt' data.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# spaCy English pipeline; `clean` calls it when lemmatization is requested.
nlp = spacy.load('en_core_web_sm')

# Two alternative English stop-word collections that `clean` can filter with.
# (Sizes noted by the original author: spaCy 362 words, NLTK 179 words.)
spacy_st = nlp.Defaults.stop_words
nltk_st = stopwords.words('english')

def clean(tweet, http = True, punc = True, lem = True, stop_w = True):
    """Normalize a raw tweet string.

    The steps run in this fixed order: short-link removal, stop-word
    filtering, lemmatization, punctuation stripping, whitespace collapsing.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    http : bool
        When True, remove ``http(s)://t.co/...`` short links.
    punc : bool
        When True, delete every character listed in ``string.punctuation``.
    lem : bool
        When True, replace each token with its spaCy lemma (uses the
        module-level ``nlp`` pipeline).
    stop_w : str or bool
        ``'nltk'`` or ``'spacy'`` selects which stop-word collection to filter
        with (module-level ``nltk_st`` / ``spacy_st``). Any other value,
        including the default ``True``, leaves stop words untouched.

    Returns
    -------
    str
        The cleaned tweet with every whitespace run collapsed to one space.
    """

    if http is True:
        # Raw string avoids invalid-escape warnings; the dot is escaped so that
        # only the literal host "t.co" matches (a bare "." matched any character).
        tweet = re.sub(r"https?://t\.co/[A-Za-z0-9]*", '', tweet)

    # stop words (matching is case-insensitive, original casing is kept)
    if stop_w == 'nltk':
        tweet = ' '.join(word for word in word_tokenize(tweet) if word.lower() not in nltk_st)

    elif stop_w == 'spacy':
        tweet = ' '.join(word for word in word_tokenize(tweet) if word.lower() not in spacy_st)

    # lemmatizing
    if lem == True:
        tweet = ' '.join(token.lemma_ for token in nlp(tweet))

    # punctuation removal
    if punc is True:
        tweet = tweet.translate(str.maketrans('', '', string.punctuation))

    # collapse runs of whitespace left behind by the removals above
    tweet = re.sub(r"\s+", ' ', tweet)

    return tweet

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
# apply function to clean data
# Clean both splits with identical settings: strip t.co links and punctuation,
# drop NLTK stop words, and skip lemmatization.
for frame in (train, test):
    frame['cleaned_text'] = frame['text'].apply(
        lambda tweet: clean(tweet, http = True, punc = True, lem = False, stop_w = 'nltk')
    )

In [11]:
# convert to lower
# Lower-case the cleaned text in both splits.
train['cleaned_text'] = train['cleaned_text'].map(str.lower)
test['cleaned_text'] = test['cleaned_text'].map(str.lower)


In [12]:
# Keep only the columns used for modelling. Selecting by name rather than by
# position (previously iloc[:, 3:6]) stays correct if the column order changes
# and makes explicit that the test split has no `target` column.
cleaned_train = train[['text', 'target', 'cleaned_text']].copy()
cleaned_test = test[['text', 'cleaned_text']].copy()

In [13]:
# Compare the raw `text` with its `cleaned_text` counterpart on the first rows.
cleaned_train.head()

Unnamed: 0,text,target,cleaned_text
0,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,deeds reason earthquake may allah forgive us
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1,residents asked shelter place notified officers evacuation shelter place orders expected
3,"13,000 people receive #wildfires evacuation orders in California",1,13000 people receive wildfires evacuation orders california
4,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1,got sent photo ruby alaska smoke wildfires pours school


### Tokenize

In [14]:
from bert import bert_tokenization
# Short alias for the tokenizer class that is instantiated once the BERT layer is loaded.
BertTokenizer = bert_tokenization.FullTokenizer

In [15]:
# Pre-trained uncased BERT encoder from TF Hub; trainable=False keeps its weights frozen.
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2",
                            trainable=False)

# Build the tokenizer from the vocabulary file and casing flag exposed by the
# loaded hub module. `BertTokenizer` is the alias defined in the previous cell;
# it is not re-assigned here.
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

In [16]:
# Test tokenizer
sentence = 'Terrorist will crush the Tower'
print('Tokenized version of {} is : \n {} '.format(sentence, tokenizer.tokenize(sentence)))

Tokenized version of Terrorist will crush the Tower is : 
 ['terrorist', 'will', 'crush', 'the', 'tower'] 


In [17]:
def tokenize_tweets(text_):
    """Tokenize `text_` and return its vocabulary ids, wrapped in [CLS] ... [SEP].

    Uses the module-level `tokenizer`.
    """
    bracketed_tokens = ['[CLS]', *tokenizer.tokenize(text_), '[SEP]']
    return tokenizer.convert_tokens_to_ids(bracketed_tokens)

In [18]:
# Convert every cleaned tweet into its list of BERT vocabulary ids.
train['tokenized_tweets'] = train['cleaned_text'].map(tokenize_tweets)
train.head(2)

Unnamed: 0,id,keyword,location,text,target,cleaned_text,tokenized_tweets
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,deeds reason earthquake may allah forgive us,"[101, 15616, 3114, 8372, 2089, 16455, 9641, 2149, 102]"
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,"[101, 3224, 2543, 2379, 2474, 6902, 3351, 21871, 2243, 2710, 102]"


In [19]:
# what is the maximum length of our tokenized text?
max_len = len(max(train.tokenized_tweets, key = len))
print('The maximum length of each sequence besed on tokenized tweets is:', max_len)

The maximum length of each sequence besed on tokenized tweets is: 53


### Padded Tweets

In [20]:
# Right-pad every id list with zeros so that all sequences have length `max_len`.
train['padded_tweets'] = train['tokenized_tweets'].map(
    lambda token_ids: token_ids + [0] * (max_len - len(token_ids))
)
train.head(2)

Unnamed: 0,id,keyword,location,text,target,cleaned_text,tokenized_tweets,padded_tweets
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,deeds reason earthquake may allah forgive us,"[101, 15616, 3114, 8372, 2089, 16455, 9641, 2149, 102]","[101, 15616, 3114, 8372, 2089, 16455, 9641, 2149, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,"[101, 3224, 2543, 2379, 2474, 6902, 3351, 21871, 2243, 2710, 102]","[101, 3224, 2543, 2379, 2474, 6902, 3351, 21871, 2243, 2710, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [32]:
# create validation set
train = cleaned_train.iloc[:-1000] # train
valid = cleaned_train.iloc[6613:] # valid


## Constructing the Model

In [33]:
class TweetClassifier:
    """Tweet classifier: a single Dense unit on top of a BERT hub layer.

    Parameters
    ----------
    tokenizer : object
        Must provide ``tokenize(text)`` and ``convert_tokens_to_ids(tokens)``.
    bert_layer : callable
        Layer called with ``[input_ids, input_mask, segment_ids]`` and returning
        ``(pooled_output, sequence_output)``.
    max_len : int
        Fixed sequence length (including [CLS] and [SEP]); must be at least 2.
    lr : float
        Learning rate passed to the optimizer.
    epochs, batch_size : int
        Forwarded to ``model.fit``.
    activation : str
        Activation of the output Dense unit.
    optimizer : str or optimizer instance
        ``'SGD'`` or ``'Adam'`` (case-insensitive) builds that optimizer with
        ``lr`` (and the beta/epsilon values for Adam). Anything else is handed
        to ``model.compile`` unchanged, in which case ``lr`` is not applied.
    beta_1, beta_2, epsilon : float
        Adam hyper-parameters.
    metrics : str
        Single metric name passed to ``model.compile``.
    loss : str
        Loss name passed to ``model.compile``.
    checkpoint_path : str
        File the best model (lowest ``val_loss``) is saved to during
        ``train`` and loaded from in ``predict``.
    """

    def __init__(self, tokenizer, bert_layer, max_len, lr = 0.0001,
                 epochs = 15, batch_size = 32,
                 activation = 'sigmoid', optimizer = 'SGD',
                 beta_1=0.9, beta_2=0.999, epsilon=1e-07,
                 metrics = 'accuracy', loss = 'binary_crossentropy',
                 checkpoint_path = 'model.h5'):

        # Every encoded sequence needs room for at least [CLS] and [SEP].
        if max_len < 2:
            raise ValueError('max_len must be at least 2, got {}'.format(max_len))

        self.lr = lr
        self.epochs = epochs
        self.max_len = max_len
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.bert_layer = bert_layer

        self.activation = activation
        self.optimizer = optimizer

        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon

        self.metrics = metrics
        self.loss = loss
        self.checkpoint_path = checkpoint_path

    def encode(self, texts):
        """Turn an iterable of strings into the three BERT input arrays.

        Each text becomes ``[CLS] tokens... [SEP]``, truncated so that it never
        exceeds ``max_len`` and right-padded with zeros up to ``max_len``.

        Returns
        -------
        tuple of np.ndarray
            ``(token_ids, masks, segments)``, each of shape
            ``(len(texts), max_len)``. The mask is 1 on real tokens and 0 on
            padding; segments are all zeros.
        """
        all_tokens = []
        masks = []
        segments = []

        # Room left for real tokens once [CLS] and [SEP] are accounted for.
        # Without truncation, a text longer than max_len produced an over-long
        # row and a ragged array.
        max_body = self.max_len - 2

        for text in texts:
            body = self.tokenizer.tokenize(text)[:max_body]
            tokenized = self.tokenizer.convert_tokens_to_ids(['[CLS]'] + body + ['[SEP]'])

            len_zeros = self.max_len - len(tokenized)

            all_tokens.append(tokenized + [0] * len_zeros)
            masks.append([1] * len(tokenized) + [0] * len_zeros)
            segments.append([0] * self.max_len)

        return np.array(all_tokens), np.array(masks), np.array(segments)

    def _build_optimizer(self):
        """Return the optimizer object (or pass-through value) for ``compile``."""
        name = self.optimizer.lower() if isinstance(self.optimizer, str) else None

        if name == 'sgd':
            return SGD(learning_rate = self.lr)

        if name == 'adam':
            return Adam(learning_rate = self.lr, beta_1=self.beta_1,
                        beta_2=self.beta_2, epsilon=self.epsilon)

        # Unknown names / ready-made optimizer instances are left for Keras to resolve.
        return self.optimizer

    def make_model(self):
        """Build and compile the Keras model; returns the compiled model."""

        # Shaping the inputs to our model
        input_ids = Input(shape = (self.max_len, ), dtype = tf.int32, name = 'input_ids')
        input_mask = Input(shape = (self.max_len, ), dtype = tf.int32, name = 'input_mask')
        segment_ids = Input(shape = (self.max_len, ), dtype = tf.int32,  name = 'segment_ids')

        # Use the layer given to the constructor, not whatever global happens
        # to be named `bert_layer`.
        pooled_output, sequence_output = self.bert_layer([input_ids, input_mask, segment_ids])

        # Representation at position 0, i.e. the [CLS] token that `encode` puts first.
        clf_output = sequence_output[:, 0, :]

        out = tf.keras.layers.Dense(1, activation = self.activation)(clf_output)

        model = Model(inputs = [input_ids, input_mask, segment_ids], outputs = out)

        # Pass the configured optimizer object so lr / betas / epsilon actually
        # take effect (passing only the name string silently used Keras defaults).
        model.compile(loss = self.loss, optimizer = self._build_optimizer(),
                      metrics = [self.metrics])

        print('Model is compiled with {} optimizer'.format(self.optimizer))

        return model

    def train(self, x):
        """Fit a fresh model on ``x['cleaned_text']`` / ``x['target']``.

        20% of the rows are used by Keras as validation data; the model with
        the lowest ``val_loss`` is saved to ``self.checkpoint_path``.
        """
        checkpoint = ModelCheckpoint(self.checkpoint_path, monitor='val_loss',
                                     save_best_only=True)

        model = self.make_model()

        X = self.encode(x['cleaned_text'])
        Y = x['target']

        model.fit(X, Y, shuffle = True, validation_split = 0.2,
                  batch_size=self.batch_size, epochs = self.epochs,
                  callbacks=[checkpoint])

        print('Model is fit!')

    def predict(self, x):
        """Load the checkpointed model and return its predictions for ``x['cleaned_text']``."""
        X_test_encoded = self.encode(x['cleaned_text'])
        best_model = tf.keras.models.load_model(self.checkpoint_path,
                                                custom_objects={'KerasLayer':hub.KerasLayer})
        y_pred = best_model.predict(X_test_encoded)

        return y_pred

## Models

In [None]:
# classifier.train(cleaned_train)  #lr = 0.001, act = sigmoid, opt

Model is compiled with SGD optimizer
Epoch 1/3
Epoch 2/3
Epoch 3/3
Model is fit!


In [None]:
# need to run
classifier = TweetClassifier(tokenizer = tokenizer, bert_layer = bert_layer,
                              max_len = max_len, lr = 0.001,
                              epochs = 3,  activation = 'relu',
                              batch_size = 32,optimizer = 'ADAM',
                              beta_1=0.9, beta_2=0.999, epsilon=1e-07)