Import the required libraries

In [32]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

Visualize a few rows of the training data

In [33]:
# Load the training split from the working directory and report how many rows it has.
train_data = pd.read_csv('train.csv')
n_train_rows = len(train_data)
print(f"The training data has {n_train_rows} rows")
# Last expression of the cell: rendered as a table of the first ten rows.
train_data.head(10)

The training data has 7613 rows


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


Visualize a few rows of the testing data

In [34]:
# Load the test split from the working directory and report how many rows it has.
test_data = pd.read_csv('test.csv')
n_test_rows = test_data.shape[0]
print(f"The testing data has {n_test_rows} rows")
# Last expression of the cell: rendered as a table of the first five rows.
test_data.head()

The testing data has 3263 rows


Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


Our inputs are the tweet text, the keyword and the location. The keyword and location may be NaN, so we need to handle those missing values first.

In [35]:
# Replace missing keyword/location entries with empty strings in both splits,
# so every value in these columns is a string.
for split_frame in (train_data, test_data):
    for text_column in ('keyword', 'location'):
        split_frame[text_column] = split_frame[text_column].fillna('')

# Print the first rows of each split to confirm the NaNs are gone.
print(train_data.head())
print(test_data.head())

   id keyword location                                               text  \
0   1                   Our Deeds are the Reason of this #earthquake M...   
1   4                              Forest fire near La Ronge Sask. Canada   
2   5                   All residents asked to 'shelter in place' are ...   
3   6                   13,000 people receive #wildfires evacuation or...   
4   7                   Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  
   id keyword location                                               text
0   0                                  Just happened a terrible car crash
1   2                   Heard about #earthquake is different cities, s...
2   3                   there is a forest fire at spot pond, geese are...
3   9                            Apocalypse lighting. #Spokane #wildfires
4  11                       Typhoon Soudelor kills 28 in China and Taiwan


Firstly, make a list of all the tweets, keywords and locations that are in the training set. 

Then, make a list of all the tweets, keywords and locations that are in the test set.

In [36]:
# Pull every model input out of the frames as a plain Python list.
# Columns are selected by NAME rather than by position: positional indexing
# (e.g. iloc[:, -2] for the train text but iloc[:, -1] for the test text) silently
# picks the wrong column if the CSV ever gains, loses or reorders a column.
training_tweets = train_data['text'].tolist()
training_keywords = train_data['keyword'].tolist()
training_locations = train_data['location'].tolist()
training_target = train_data['target'].tolist()
# The test split has no 'target' column.
testing_tweets = test_data['text'].tolist()
testing_keywords = test_data['keyword'].tolist()
testing_locations = test_data['location'].tolist()

# Show the first five entries of each list as a quick sanity check.
for sample_title, sample_values in (
    ("Sample Training Tweets", training_tweets),
    ("Sample Training Keywords", training_keywords),
    ("Sample Training Locations", training_locations),
    ("Sample Training Target", training_target),
    ("Sample Testing Tweets", testing_tweets),
    ("Sample Testing Keywords", testing_keywords),
    ("Sample Testing Locations", testing_locations),
):
    print(sample_title)
    print(sample_values[:5])



Sample Training Tweets
['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all', 'Forest fire near La Ronge Sask. Canada', "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected", '13,000 people receive #wildfires evacuation orders in California ', 'Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school ']
Sample Training Keywords
['', '', '', '', '']
Sample Training Locations
['', '', '', '', '']
Sample Training Target
[1, 1, 1, 1, 1]
Sample Testing Tweets
['Just happened a terrible car crash', 'Heard about #earthquake is different cities, stay safe everyone.', 'there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all', 'Apocalypse lighting. #Spokane #wildfires', 'Typhoon Soudelor kills 28 in China and Taiwan']
Sample Testing Keywords
['', '', '', '', '']
Sample Testing Locations
['', '', '', '', '']


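Tokenize the tweets, keywords and locations separately. Tweets and keywords share one tokenizer (`tokenizer_1`); locations get their own (`tokenizer_2`).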

In [46]:
# tokenizer_1 is shared by tweets and keywords. Its vocabulary is learned from the
# training tweets only; 'OOV' is the placeholder token for words outside that vocabulary.
tokenizer_1 = Tokenizer(oov_token='OOV')
tokenizer_1.fit_on_texts(training_tweets)
word_index_1 = tokenizer_1.word_index

# Encode every training tweet as a list of vocabulary indices, then pad at the end
# ('post'). No maxlen is passed, so the width is taken from the data itself.
tweet_sequences = tokenizer_1.texts_to_sequences(training_tweets)
padded_tweet_sequences = pad_sequences(tweet_sequences, padding='post')

# Inspect the result: a small sample, the container type and both dimensions.
n_tweet_rows, tweet_row_width = padded_tweet_sequences.shape
print(f"small part of the tweet sequences for visualization:\n{padded_tweet_sequences[:2]}")
print(type(padded_tweet_sequences))
print(f"Number of sequences:{n_tweet_rows}")
print(f"Size of each padded sequence:{tweet_row_width}")


small part of the tweet sequences for visualization:
[[ 120 4634   25    5  869    9   22  264  139 1620 4635   90   41    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0]
 [ 190   46  230  800 6955 6956 1405    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0]]
<class 'numpy.ndarray'>
Number of sequences:7613
Size of each padded sequence:33


In [47]:
# Encode the keywords with tokenizer_1; it is not re-fitted here, so the keywords
# are mapped through the vocabulary it already holds.
keyword_sequences = tokenizer_1.texts_to_sequences(training_keywords)
# Pad at the end ('post'); no maxlen is passed, so the width comes from the data.
padded_keyword_sequences = pad_sequences(keyword_sequences, padding='post')

# Inspect the result: a small sample, the container type and both dimensions.
n_keyword_rows, keyword_row_width = padded_keyword_sequences.shape
print(f"small part of keyword sequences for visualization \n{padded_keyword_sequences[:2]}")
print(f"datatype of padded_keyword_sequences: {type(padded_keyword_sequences)}")
print(f"Number of keyword sequences: {n_keyword_rows}")
print(f"Number of keywords in each sequence: {keyword_row_width}")

small part of keyword sequences for visualization 
[[0 0 0]
 [0 0 0]]
datatype of padded_keyword_sequences: <class 'numpy.ndarray'>
Number of keyword sequences: 7613
Number of keywords in each sequence: 3


In [48]:
# Locations get their own tokenizer (tokenizer_2), independent of tokenizer_1.
# Its vocabulary is learned from the training locations only; 'OOV' is the
# placeholder token for words outside that vocabulary.
tokenizer_2 = Tokenizer(oov_token='OOV')
tokenizer_2.fit_on_texts(training_locations)
word_index_2 = tokenizer_2.word_index

# Encode every training location with the vocabulary learned from the training
# locations, then pad at the end ('post'). No maxlen is passed, so the width is
# taken from the data itself.
location_sequences = tokenizer_2.texts_to_sequences(training_locations)
padded_location_sequences = pad_sequences(location_sequences, padding='post')

# Inspect the result: a small sample, the container type and both dimensions.
n_location_rows, location_row_width = padded_location_sequences.shape
print(f"small part of location sequences for visualization \n {padded_location_sequences[:2]}")
print(type(padded_location_sequences))
print(f"Number of sequences:{n_location_rows}")
print(f"Size of each padded sequence:{location_row_width}")


small part of location sequences for visualization 
 [[0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]]
<class 'numpy.ndarray'>
Number of sequences:7613
Size of each padded sequence:12


Now we have to embed the tweets, keywords and locations into a vector space. I will use the same vector embedding for tweets and keywords, and a separate one for locations.