# Setup and Import

In [74]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import re
sns.set
import spacy
import nltk
import Tweet_Normalizer as tn
import gensim
import gensim.downloader
from gensim.models.fasttext import FastText
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
import pickle
import prepare_embeddings as pe

# Load the Training and Test Data

In [75]:
tweets_train = pd.read_csv("data/train.csv")
tweets_test = pd.read_csv("data/test.csv")

# Clean the Training Data

In [3]:
%%time
#USe tweet scrubber function to clean the data
tweets_train = tn.tweet_scrubber(tweets_train, verbose = True)

Running tweet scrubber...

Dropping unnecessary columns
Successfully dropped columns!

Normalizing the tweets
Successfully normalized tweets!

Removing invalid and mispelled words
Successfully removed invalid and mispelled words!

Successfully scrubbed tweets!

Wall time: 2min 15s


In [4]:
#Check for blank rows after cleaning. We expect 5
tweets_train = tweets_train.replace(r'^(\s)+$', np.nan, regex = True)
#Drop the empty rows
tweets_train.dropna(subset=["Clean Tweets"], inplace = True)
#Reset the index in place
tweets_train.reset_index(drop = True, inplace = True)

# Clean the Test Data

In [3]:
tweets_test.shape

(3263, 4)

In [8]:
pd.set_option('display.max_colwidth', 2)
tweets_test.head()

Unnamed: 0,text,Clean Tweets
0,Just happened a terrible car crash,happen terrible car crash
1,"Heard about #earthquake is different cities, stay safe everyone.",hear earthquake different city stay safe
2,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all",forest fire spot pond geese flee across street I not save
3,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfire
4,Typhoon Soudelor kills 28 in China and Taiwan,typhoon kill china taiwan


In [76]:
tweets_test = tn.tweet_scrubber(tweets_test, verbose = True, train = False)

Running tweet scrubber...

Dropping unnecessary columns
Successfully dropped columns!

Normalizing the tweets
Successfully normalized tweets!

Removing invalid and mispelled words
Successfully removed invalid and mispelled words!

Successfully scrubbed tweets!



In [77]:
tweets_test.tail()

Unnamed: 0,text,Clean Tweets
3258,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTENERS XrWn,earthquake safety safety fastener
3259,Storm in RI worse than last hurricane. My city&amp;3others hardest hit. My yard looks like it was bombed. Around 20000K still without power,storm ri bad last hurricane hard hit yard look like bomb around k still power
3260,Green Line derailment in Chicago http://t.co/UtbXLcBIuY,green line derailment chicago
3261,MEG issues Hazardous Weather Outlook (HWO) http://t.co/3X6RBQJHn3,meg issue hazardous weather outlook
3262,#CityofCalgary has activated its Municipal Emergency Plan. #yycstorm,activate municipal emergency plan


In [78]:
#Replace empty tweets with NaN
tweets_test = tweets_test.replace(r'^(\s)+$', np.nan, regex = True)
#Count the number missing
tweets_test["Clean Tweets"].isnull().sum()

2

In [79]:
#Take a look at the missing tweets
tweets_test.loc[tweets_test["Clean Tweets"].isnull()]

Unnamed: 0,text,Clean Tweets
748,Vamos Newells,
759,@edsheeran tf is innit,


In [82]:
#Set them to a value
tweets_test.loc[tweets_test["Clean Tweets"].isnull(), "Clean Tweets"] = "no"

# Tokenize the Test Data

In [11]:
#Tokenize the training and validation set
tokenizer = ToktokTokenizer()
tokenized_train = [tokenizer.tokenize(text) for text in tweets_train["Clean Tweets"]]
tokenized_test = [tokenizer.tokenize(text) for text in tweets_test["Clean Tweets"]]

# Word2Vec Model

### Generate dense embeddings based on training data

In [12]:
#Number of features to use
w2v_num_features = 300
#Create the Word2Vec model
w2v_model = gensim.models.Word2Vec(tokenized_train, vector_size=w2v_num_features,
                                   window = 250, epochs=100, min_count=0, sample=1e-3,
                                   sg=1, workers=10)
#Creat the training data
X_train = pe.document_vectorizer(corpus=tokenized_train, model=w2v_model, num_features=w2v_num_features)

#Create the test data
X_test = pe.document_vectorizer(corpus=tokenized_test, model=w2v_model, num_features=w2v_num_features)


In [13]:
#Check test data shape
print(X_test.shape)

(3263, 300)


### Load in Word2Vec Model

In [17]:
word2vec_model = pickle.load(open("best_w2v_model.sav", 'rb'))

In [18]:
word2vec_predictions = word2vec_model.predict(X_test)

# FastText Model

### Generate dense embeddings based on training data

In [19]:
%%time
#Number of Features
ft_num_features = 300


ft_model = FastText(tokenized_train, vector_size = ft_num_features, window = 250, min_count = 0, 
                    sample=1e-3, sg=1, epochs=100, workers=10)


#Create the training set
X_train = pe.document_vectorizer(corpus=tokenized_train, model=ft_model, num_features=ft_num_features)

#Create the test data
X_test = pe.document_vectorizer(corpus=tokenized_test, model=ft_model, num_features=ft_num_features)

Wall time: 1min 37s


In [20]:
#Check test data shape
print(X_test.shape)

(3263, 300)


### Load in FastText Model

In [21]:
fasttext_model = pickle.load(open("best_ft_model.sav", "rb"))

In [22]:
fasttext_predictions = fasttext_model.predict(X_test)

# GloVe Model

### Load in pre-generated dense embeddings

In [23]:
%%time
glove_vectors = gensim.downloader.load('glove-twitter-50')
gv_num_features = glove_vectors.vector_size

Wall time: 5min 16s


In [24]:
#Create training data
X_train = pe.document_vectorizer_glove(corpus=tokenized_train, model=glove_vectors, num_features=gv_num_features)


#Create test data
X_test = pe.document_vectorizer_glove(corpus=tokenized_test, model=glove_vectors, num_features=gv_num_features)

In [25]:
#Check test data shape
print(X_test.shape)

(3263, 50)


### Load in GloVe Model

In [26]:
glove_model = pickle.load(open("best_glove_model.sav", "rb"))

In [27]:
glove_predictions = glove_model.predict(X_test)

# Combine the Results

In [53]:
test_df = pd.DataFrame({"text": tweets_test["text"], "Clean Tweets": tweets_test["Clean Tweets"], "Word2Vec": word2vec_predictions, "FastText": fasttext_predictions,
                        "GloVe": glove_predictions})

In [55]:
pd.set_option('display.max_colwidth', 2)
test_df.tail(n=15)

Unnamed: 0,text,Clean Tweets,Word2Vec,FastText,GloVe
3248,Smackdown tyme this should put me in a good mood again since it got wrecked smh,put I good mood got wreck,0,0,0
3249,@thrillhho jsyk I haven't stopped thinking abt remus slumped against the bathroom door all day I was wrecked ??????????,I not stop think remus slump bathroom door day I wreck,0,0,0
3250,@stighefootball Begovic has been garbage. He got wrecked by a Red Bull reserve team and everyone else this preseason,garbage got wreck red bull reserve team preseason,0,0,1
3251,Wrecked today got my hattrick ????,wreck today get,0,0,0
3252,#Ebola #EbolaOutbreak Ebola Virus: Birmingham Ala. Firefighters Quarantined After Possible Exposure Officials Say http://t.co/tjpYlU9fOX,ebola ebola virus birmingham ala firefighter quarantine possible exposure official say,1,1,1
3253,Malaysian PM confirms debris is from missing flight MH370 http://t.co/pfAvW5QyqE,malaysian pm confirm debris miss flight,1,1,1
3254,Officials: Alabama home quarantined over possible Ebola case - Washington Times,official alabama home quarantine possible ebola case washington time,1,1,1
3255,See the 16yr old PKK suicide bomber who detonated bomb in Turkey Army trench released: Harun Ìàekdar ... http://t.co/hKuT5mSdtP @MsOreo_,see yr old suicide bomber detonate bomb turkey army trench release,1,1,1
3256,To conference attendees! The blue line from the airport has DERAILED - please look into taking a taxi to the hotel! See you soon!,conference attendee blue line airport derail please look take taxi hotel see soon,1,1,0
3257,The death toll in a #IS-suicide car bombing on a #YPG position in the Village of Rajman in the eastern province of Hasaka has risen to 9,death toll suicide car bombing position village eastern province rise,1,1,1


### Check which predictions Word2Vec and FastText

In [56]:
mask = (test_df["Word2Vec"] == 1) & (test_df["FastText"] == 1)
test_df.loc[mask]

Unnamed: 0,text,Clean Tweets,Word2Vec,FastText,GloVe
0,Just happened a terrible car crash,happen terrible car crash,1,1,1
1,"Heard about #earthquake is different cities, stay safe everyone.",hear earthquake different city stay safe,1,1,1
2,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all",forest fire spot pond geese flee across street I not save,1,1,1
3,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfire,1,1,1
4,Typhoon Soudelor kills 28 in China and Taiwan,typhoon kill china taiwan,1,1,1
...,...,...,...,...,...
3257,The death toll in a #IS-suicide car bombing on a #YPG position in the Village of Rajman in the eastern province of Hasaka has risen to 9,death toll suicide car bombing position village eastern province rise,1,1,1
3259,Storm in RI worse than last hurricane. My city&amp;3others hardest hit. My yard looks like it was bombed. Around 20000K still without power,storm ri bad last hurricane hard hit yard look like bomb around k still power,1,1,1
3260,Green Line derailment in Chicago http://t.co/UtbXLcBIuY,green line derailment chicago,1,1,1
3261,MEG issues Hazardous Weather Outlook (HWO) http://t.co/3X6RBQJHn3,meg issue hazardous weather outlook,1,1,1


### Check which predictions Word2Vec and GloVe agree

In [57]:
mask = (test_df["Word2Vec"] == 1) & (test_df["GloVe"] == 1)
test_df.loc[mask]

Unnamed: 0,text,Clean Tweets,Word2Vec,FastText,GloVe
0,Just happened a terrible car crash,happen terrible car crash,1,1,1
1,"Heard about #earthquake is different cities, stay safe everyone.",hear earthquake different city stay safe,1,1,1
2,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all",forest fire spot pond geese flee across street I not save,1,1,1
3,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfire,1,1,1
4,Typhoon Soudelor kills 28 in China and Taiwan,typhoon kill china taiwan,1,1,1
...,...,...,...,...,...
3255,See the 16yr old PKK suicide bomber who detonated bomb in Turkey Army trench released: Harun Ìàekdar ... http://t.co/hKuT5mSdtP @MsOreo_,see yr old suicide bomber detonate bomb turkey army trench release,1,1,1
3257,The death toll in a #IS-suicide car bombing on a #YPG position in the Village of Rajman in the eastern province of Hasaka has risen to 9,death toll suicide car bombing position village eastern province rise,1,1,1
3259,Storm in RI worse than last hurricane. My city&amp;3others hardest hit. My yard looks like it was bombed. Around 20000K still without power,storm ri bad last hurricane hard hit yard look like bomb around k still power,1,1,1
3260,Green Line derailment in Chicago http://t.co/UtbXLcBIuY,green line derailment chicago,1,1,1


### Check which predictions FastText and GloVe agree

In [58]:
mask = (test_df["FastText"] == 1) & (test_df["GloVe"] == 1)
test_df.loc[mask]

Unnamed: 0,text,Clean Tweets,Word2Vec,FastText,GloVe
0,Just happened a terrible car crash,happen terrible car crash,1,1,1
1,"Heard about #earthquake is different cities, stay safe everyone.",hear earthquake different city stay safe,1,1,1
2,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all",forest fire spot pond geese flee across street I not save,1,1,1
3,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfire,1,1,1
4,Typhoon Soudelor kills 28 in China and Taiwan,typhoon kill china taiwan,1,1,1
...,...,...,...,...,...
3257,The death toll in a #IS-suicide car bombing on a #YPG position in the Village of Rajman in the eastern province of Hasaka has risen to 9,death toll suicide car bombing position village eastern province rise,1,1,1
3258,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTENERS XrWn,earthquake safety safety fastener,0,1,1
3259,Storm in RI worse than last hurricane. My city&amp;3others hardest hit. My yard looks like it was bombed. Around 20000K still without power,storm ri bad last hurricane hard hit yard look like bomb around k still power,1,1,1
3260,Green Line derailment in Chicago http://t.co/UtbXLcBIuY,green line derailment chicago,1,1,1


Overall, it appears that the Word2Vec and FastText models agree on the most on which tweets should be predicted as real disasters. 

### Vote on Test Prediction

In [59]:
test_df["Vote"] = np.round((test_df["Word2Vec"] + test_df["FastText"] + test_df["GloVe"]) / 3)
test_df["Vote"] = test_df["Vote"].astype(np.int)

### Look at which tweets were predicted as a real disaster based on the vote

In [63]:
cols = ["text", "Clean Tweets", "Vote"]
mask = test_df["Vote"] == 1
test_df.loc[mask, cols].head(n = 20)

Unnamed: 0,text,Clean Tweets,Vote
0,Just happened a terrible car crash,happen terrible car crash,1
1,"Heard about #earthquake is different cities, stay safe everyone.",hear earthquake different city stay safe,1
2,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all",forest fire spot pond geese flee across street I not save,1
3,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfire,1
4,Typhoon Soudelor kills 28 in China and Taiwan,typhoon kill china taiwan,1
15,Birmingham Wholesale Market is ablaze BBC News - Fire breaks out at Birmingham's Wholesale Market http://t.co/irWqCEZWEU,birmingham wholesale market ablaze news fire break birmingham wholesale market,1
17,#PreviouslyOnDoyinTv: Toke MakinwaÛªs marriage crisis sets Nigerian Twitter ablaze... http://t.co/CMghxBa2XI,toke marriage crisis set nigerian twitter ablaze,1
23,Rape victim dies as she sets herself ablaze: A 16-year-old girl died of burn injuries as she set herself ablazeÛ_ http://t.co/UK8hNrbOob,rape victim die set ablaze year old girl die burn injury set,1
27,'Burning Rahm': Let's hope City Hall builds a giant wooden mayoral effigy 100 feet tall &amp; sets it ablaze. http://t.co/kFo2mksn6Y @John_Kass,burn let hope city hall build giant wooden mayoral effigy foot tall set ablaze,1
29,Accident cleared in #PaTurnpike on PATP EB between PA-18 and Cranberry slow back to #traffic http://t.co/SL0Oqn0Vyr,accident clear eb pa cranberry slow back traffic,1


### Look at which tweets were predicted as a safe based on the vote

In [64]:
cols = ["text", "Clean Tweets", "Vote"]
mask = test_df["Vote"] == 0
test_df.loc[mask, cols].head(n = 20)

Unnamed: 0,text,Clean Tweets,Vote
5,We're shaking...It's an earthquake,shake earthquake,0
6,"They'd probably still show more life than Arsenal did yesterday, eh? EH?",probably still show life arsenal yesterday,0
7,Hey! How are you?,,0
8,What a nice hat?,nice hat,0
9,Fuck off!,fuck,0
10,No I don't like cold!,no I not like cold,0
11,NOOOOOOOOO! Don't do that!,not,0
12,No don't tell me that!,no not tell I,0
13,What if?!,,0
14,Awesome!,awesome,0


We can see that for the most part, the tweets labeled as safe appear to be correct except for the first. 

# Check the Competition Submission format

In [65]:
sub = pd.read_csv("data/sample_submission.csv")

In [66]:
sub.head(n=10)

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0
5,12,0
6,21,0
7,22,0
8,27,0
9,29,0


In [69]:
raw_test = pd.read_csv("data/test.csv")

In [70]:
raw_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, stay safe everyone."
2,3,,,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all"
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


### Create submission based on test id and output from the vote

In [71]:
test_submission = pd.DataFrame({"id": raw_test["id"], "target": test_df["Vote"]})

In [72]:
test_submission.head(n = 15)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
5,12,0
6,21,0
7,22,0
8,27,0
9,29,0


### Write the test predictions to a csv file to be uploaded to Kaggle

In [73]:
test_df.to_csv("data/test_submission")