In [1]:
import io
import os
import re
import shutil
import string
import tensorflow as tf
import pandas as pd

from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [2]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [3]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [5]:
df = pd.concat([train_df,test_df])
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1.0
1,4,,,Forest fire near La Ronge Sask. Canada,1.0
2,5,,,All residents asked to 'shelter in place' are ...,1.0
3,6,,,"13,000 people receive #wildfires evacuation or...",1.0
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1.0


**1.Text cleaning**

In [6]:
import spacy
from spacy.lang.en.examples import sentences 
from nltk.stem.wordnet import WordNetLemmatizer

In [7]:
# Import Spacy and english initialisation
import en_core_web_sm
sp = spacy.load("en_core_web_sm")

In [8]:
# Import Stop words 
all_stopwords = sp.Defaults.stop_words
print(len(all_stopwords))

326


In [9]:
from nltk.tokenize import word_tokenize
def do_cleaning(x, correct_spelling=True, remove_stop_words=True):
  #lowercase the words
  x = x.lower().strip()
  #remove urls
  url = re.compile(r'https?://\S+|www\.\S+')
  x = url.sub(r'',x)
  #remove html tags
  html = re.compile(r'<.*?>')
  x = html.sub(r'',x)
  #remove punctuation
  punct = str.maketrans('','',string.punctuation)
  x = x.translate(punct)
  #remove stop_words
  words = [word for word in x.split() if word.lower() not in all_stopwords]
  x = " ".join(words)
  #lemmatization
  lemmat = sp(x)
  x = " ".join([token.lemma_ for token in lemmat])
  return x

In [10]:
df['clean_text'] = df['text'].apply(do_cleaning)

In [11]:
df

Unnamed: 0,id,keyword,location,text,target,clean_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1.0,deed reason earthquake allah forgive
1,4,,,Forest fire near La Ronge Sask. Canada,1.0,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1.0,resident ask shelter place notify officer evac...
3,6,,,"13,000 people receive #wildfires evacuation or...",1.0,13000 people receive wildfire evacuation order...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1.0,got send photo ruby alaska smoke wildfire pour...
...,...,...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,,earthquake safety los angeles ûò safety faste...
3259,10865,,,Storm in RI worse than last hurricane. My city...,,storm ri bad hurricane cityamp3other hard hit ...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...,,green line derailment chicago
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...,,meg issue hazardous weather outlook hwo


In [12]:
train = df[~df['target'].isna()]
train['target'] = train['target'].astype(int)
test = df[df['target'].isna()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [13]:
train.head()

Unnamed: 0,id,keyword,location,text,target,clean_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquake allah forgive
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,resident ask shelter place notify officer evac...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfire evacuation order...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,got send photo ruby alaska smoke wildfire pour...


In [14]:
test.head()

Unnamed: 0,id,keyword,location,text,target,clean_text
0,0,,,Just happened a terrible car crash,,happen terrible car crash
1,2,,,"Heard about #earthquake is different cities, s...",,hear earthquake different city stay safe
2,3,,,"there is a forest fire at spot pond, geese are...",,forest fire spot pond geese flee street save
3,9,,,Apocalypse lighting. #Spokane #wildfires,,apocalypse lighting spokane wildfire
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,,typhoon soudelor kill 28 china taiwan


In [15]:
X_train, X_val, y_train, y_val = train_test_split(train, train['target'], test_size=0.2, random_state=42)
print('Shape of train',X_train.shape)
print("Shape of Validation ",X_val.shape)

Shape of train (6090, 6)
Shape of Validation  (1523, 6)


**2. Text vectorization** 



The word embeddings of our dataset can be learned while training a neural network on the classification problem. Before it can be presented to the network, the text data is first encoded so that each word is represented by a unique integer.

In [16]:
from tensorflow.keras.preprocessing import sequence
from keras.preprocessing import text

# Vectorization parameters
# Limit on the number of features. We use the top 20K features.
TOP_K = 20000

# Limit on the length of text sequences. 
# Sequences longer than this will be truncated.
# and less than it will be padded
MAX_SEQUENCE_LENGTH = 50

class CustomTokenizer:
    def __init__(self, train_texts):
        self.train_texts = train_texts
        self.tokenizer = Tokenizer(num_words=TOP_K)
        
    def train_tokenize(self):
        # Get max sequence length.
        max_length = len(max(self.train_texts , key=len))
        self.max_length = min(max_length, MAX_SEQUENCE_LENGTH)
    
        # Create vocabulary with training texts.
        self.tokenizer.fit_on_texts(self.train_texts)
        
    def vectorize_input(self, tweets):
        # Vectorize training and validation texts.
        
        tweets = self.tokenizer.texts_to_sequences(tweets)
        # Fix sequence length to max value. Sequences shorter than the length are
        # padded in the beginning and sequences longer are truncated
        # at the beginning.
        tweets = sequence.pad_sequences(tweets, maxlen=self.max_length, truncating='post',padding='post')
        return tweets
    
tokenizer = CustomTokenizer(train_texts = X_train['clean_text'])
# fit o the train
tokenizer.train_tokenize()
tokenized_train = tokenizer.vectorize_input(X_train['clean_text'])
tokenized_val = tokenizer.vectorize_input(X_val['clean_text'])
tokenized_test = tokenizer.vectorize_input(test['clean_text'])

**3. Construct an embedding Matrix**

We will use pre-trained GloVe vectors from Stanford to create an index of words mapped to known embeddings.

In [17]:
import requests
import zipfile
import tqdm

# we can use a Glove pre-trained embedding
URL = "http://nlp.stanford.edu/data/glove.42B.300d.zip"

url=URL
target_file='glove.zip'
delete_zip=False

def fetch_data(url=url, target_file=target_file, delete_zip=False):
    #if the dataset already exists exit
    if os.path.isfile(target_file):
        print("datasets already downloded :) ")
        return

response = requests.get(url, stream=True)
handle = open(target_file, "wb")
for chunk in tqdm.tqdm(response.iter_content(chunk_size=512)):
  if chunk:  
    handle.write(chunk)
handle.close()  
print("  Download completed ;) :") 
#extract zip_file
zf = zipfile.ZipFile(target_file)
print("1. Extracting {} file".format(target_file))
zf.extractall()
if delete_zip:
  print("2. Deleting {} file".format(trained+".zip"))
  os.remove(path=zip_file)

fetch_data()

3667580it [05:52, 10413.96it/s]


  Download completed ;) :
1. Extracting glove.zip file
datasets already downloded :) 


Then we will create an embedding matrix we will map each word index to its corresponding embedding vector:

In [18]:
import numpy as np

glove_file = 'glove.42B.300d.txt'

EMBEDDING_VECTOR_LENGTH = 50 # <=200
def construct_embedding_matrix(glove_file, word_index):
    embedding_dict = {}
    with open(glove_file,'r') as f:
        for line in f:
            values=line.split()
            # get the word
            word=values[0]
            if word in word_index.keys():
                # get the vector
                vector = np.asarray(values[1:], 'float32')
                embedding_dict[word] = vector
    ###  oov words (out of vacabulary words) will be mapped to 0 vectors

    num_words=len(word_index)+1
    #initialize it to 0
    embedding_matrix=np.zeros((num_words, EMBEDDING_VECTOR_LENGTH))

    for word,i in tqdm.tqdm(word_index.items()):
        if i < num_words:
            vect=embedding_dict.get(word, [])
            if len(vect)>0:
                embedding_matrix[i] = vect[:EMBEDDING_VECTOR_LENGTH]
    return embedding_matrix

embedding_matrix =  construct_embedding_matrix(glove_file, tokenizer.tokenizer.word_index)
print(embedding_matrix.shape)

100%|██████████| 13649/13649 [00:00<00:00, 481019.50it/s]

(13650, 50)





In [19]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.022975  ,  0.087888  , -0.24247999, ..., -0.23213001,
        -0.072726  , -0.18074   ],
       [-0.014949  ,  0.17910001, -0.28883001, ...,  0.018862  ,
         0.19157   , -0.11791   ],
       ...,
       [ 0.70998001, -0.05809   , -0.032732  , ...,  0.13149001,
        -0.35714999,  0.091179  ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.62963998, -0.44130999, -0.089173  , ...,  0.33790001,
        -0.44161999,  0.28178   ]])

**4. Model**

In [20]:
from tensorflow.keras.optimizers import Adam
from keras.initializers import Constant
from keras.layers import Embedding,LSTM,Dense,Dropout

model=Sequential()
optimzer=Adam(clipvalue=0.5)

embedding=Embedding(len(tokenizer.tokenizer.word_index)+1, EMBEDDING_VECTOR_LENGTH, embeddings_initializer=Constant(embedding_matrix),
                   input_length=MAX_SEQUENCE_LENGTH, trainable=False)

model.add(embedding)
model.add(Dropout(0.2))
#model.add(Dense(30, activation='relu'))#, kernel_constraint=maxnorm(3)))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.5))
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer=optimzer, loss='binary_crossentropy', metrics=['acc'])

In [21]:
history=model.fit(tokenized_train,y_train, 
                  batch_size=35, epochs=30, 
                  validation_data=(tokenized_val,y_val), 
                  verbose=2)

Epoch 1/30
174/174 - 12s - loss: 0.5874 - acc: 0.6962 - val_loss: 0.5350 - val_acc: 0.7433 - 12s/epoch - 69ms/step
Epoch 2/30
174/174 - 9s - loss: 0.5369 - acc: 0.7468 - val_loss: 0.5187 - val_acc: 0.7590 - 9s/epoch - 52ms/step
Epoch 3/30
174/174 - 9s - loss: 0.5327 - acc: 0.7524 - val_loss: 0.5285 - val_acc: 0.7590 - 9s/epoch - 51ms/step
Epoch 4/30
174/174 - 9s - loss: 0.5272 - acc: 0.7553 - val_loss: 0.5091 - val_acc: 0.7669 - 9s/epoch - 50ms/step
Epoch 5/30
174/174 - 9s - loss: 0.5274 - acc: 0.7522 - val_loss: 0.5147 - val_acc: 0.7663 - 9s/epoch - 49ms/step
Epoch 6/30
174/174 - 8s - loss: 0.5229 - acc: 0.7540 - val_loss: 0.5041 - val_acc: 0.7557 - 8s/epoch - 47ms/step
Epoch 7/30
174/174 - 9s - loss: 0.5171 - acc: 0.7573 - val_loss: 0.5023 - val_acc: 0.7663 - 9s/epoch - 49ms/step
Epoch 8/30
174/174 - 8s - loss: 0.5184 - acc: 0.7570 - val_loss: 0.5345 - val_acc: 0.7597 - 8s/epoch - 45ms/step
Epoch 9/30
174/174 - 8s - loss: 0.5087 - acc: 0.7634 - val_loss: 0.5009 - val_acc: 0.7741 - 8s

In [22]:
loss, accuracy = model.evaluate(tokenized_val, y_val, verbose=0)

In [23]:
print(f'Accuracy : {"%.3f"%accuracy}')
print(f'Loss : {"%.3f"%loss}')

Accuracy : 0.798
Loss : 0.456


In [24]:
# generate scores
test['scores'] = model.predict(tokenized_test)
# generate deisions
test['prediction'] = np.round(test['scores']).astype(int)
test = test.drop('target', axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [25]:
test

Unnamed: 0,id,keyword,location,text,clean_text,scores,prediction
0,0,,,Just happened a terrible car crash,happen terrible car crash,0.909281,1
1,2,,,"Heard about #earthquake is different cities, s...",hear earthquake different city stay safe,0.923783,1
2,3,,,"there is a forest fire at spot pond, geese are...",forest fire spot pond geese flee street save,0.882429,1
3,9,,,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfire,0.119556,0
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kill 28 china taiwan,0.969014,1
...,...,...,...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,earthquake safety los angeles ûò safety faste...,0.955814,1
3259,10865,,,Storm in RI worse than last hurricane. My city...,storm ri bad hurricane cityamp3other hard hit ...,0.955623,1
3260,10868,,,Green Line derailment in Chicago http://t.co/U...,green line derailment chicago,0.968409,1
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...,meg issue hazardous weather outlook hwo,0.851888,1
