# Setup and Import

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import re
# Apply seaborn's default plotting theme. The bare attribute reference `sns.set`
# only looked the function up without calling it, so it had no effect.
sns.set()
import spacy
import nltk
import Tweet_Normalizer as tn
import gensim
import gensim.downloader
from gensim.models.fasttext import FastText
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
import pickle
import prepare_embeddings as pe



# Load the Training and Test Data

In [2]:
# Read the raw training and test splits from the local data directory.
tweets_train, tweets_test = (
    pd.read_csv("data/train.csv"),
    pd.read_csv("data/test.csv"),
)

# Clean the Training Data

In [3]:
%%time
# Use the project's tweet scrubber to clean the training data.
# NOTE(review): this overwrites tweets_train with the scrubbed frame, so the
# cell is not safe to re-run without reloading the raw CSV first.
tweets_train = tn.tweet_scrubber(
    tweets_train,
    verbose=True,
)

Running tweet scrubber...

Dropping unnecessary columns
Successfully dropped columns!

Normalizing the tweets
Successfully normalized tweets!

Removing invalid and mispelled words
Successfully removed invalid and mispelled words!

Successfully scrubbed tweets!

Wall time: 2min 39s


In [4]:
# Remove tweets that are blank after cleaning (the original note expected 5).
# `^\s*$` matches whitespace-only cells AND the empty string; the previous
# `^(\s)+$` pattern required at least one whitespace character and therefore
# let completely empty strings through.
# The steps are chained (no inplace mutation) so the cell reads as one pipeline:
# blank -> NaN, drop rows with no clean text, then renumber the index.
tweets_train = (
    tweets_train
    .replace(r'^\s*$', np.nan, regex=True)
    .dropna(subset=["Clean Tweets"])
    .reset_index(drop=True)
)

# Clean the Test Data

In [3]:
# Inspect the (rows, columns) shape of the test frame.
tweets_test.shape

(3263, 4)

In [8]:
# Show full tweet text when displaying frames. `None` is the documented value
# for "no column-width limit"; the previous value of 2 asks pandas to truncate
# every cell to two characters, which contradicts the untruncated output below.
pd.set_option('display.max_colwidth', None)
tweets_test.head()

Unnamed: 0,text,Clean Tweets
0,Just happened a terrible car crash,happen terrible car crash
1,"Heard about #earthquake is different cities, stay safe everyone.",hear earthquake different city stay safe
2,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all",forest fire spot pond geese flee across street I not save
3,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfire
4,Typhoon Soudelor kills 28 in China and Taiwan,typhoon kill china taiwan


In [14]:
# Add a "Normalized Tweets" column by running the project's normalize_corpus
# helper over the raw tweet text.
# NOTE(review): the tokenization cell further down reads "Clean Tweets", not
# this column, so this step looks exploratory — confirm it is still needed.
tweets_test["Normalized Tweets"] = tn.normalize_corpus(tweets_test["text"])

In [17]:
# Preview the first rows of the test frame.
tweets_test.head()

Unnamed: 0,id,keyword,location,text,Normalized Tweets
0,0,,,Just happened a terrible car crash,happen terrible car crash
1,2,,,"Heard about #earthquake is different cities, stay safe everyone.",hear earthquake different city stay safe everyone
2,3,,,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all",forest fire spot pond geese flee across street I not save
3,9,,,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfire
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kill china taiwan


In [15]:
# Convert blank cells to NaN so they can be counted with a null check.
# `^\s*$` also matches the empty string, which the previous `^(\s)+$`
# pattern (one-or-more whitespace) did not.
tweets_test = tweets_test.replace(r'^\s*$', np.nan, regex=True)

In [18]:
# Count how many normalized tweets are missing.
tweets_test["Normalized Tweets"].isna().sum()

0

In [5]:
# Scrub the test tweets with the same project helper used for the training
# data, passing train=False.
tweets_test = tn.tweet_scrubber(
    tweets_test,
    verbose=True,
    train=False,
)

Running tweet scrubber...

Dropping unnecessary columns
Successfully dropped columns!

Normalizing the tweets
Successfully normalized tweets!

Removing invalid and mispelled words
Successfully removed invalid and mispelled words!

Successfully scrubbed tweets!



In [9]:
# Preview the last rows of the scrubbed test frame.
tweets_test.tail()

Unnamed: 0,text,Clean Tweets
3258,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTENERS XrWn,earthquake safety safety fastener
3259,Storm in RI worse than last hurricane. My city&amp;3others hardest hit. My yard looks like it was bombed. Around 20000K still without power,storm ri bad last hurricane hard hit yard look like bomb around k still power
3260,Green Line derailment in Chicago http://t.co/UtbXLcBIuY,green line derailment chicago
3261,MEG issues Hazardous Weather Outlook (HWO) http://t.co/3X6RBQJHn3,meg issue hazardous weather outlook
3262,#CityofCalgary has activated its Municipal Emergency Plan. #yycstorm,activate municipal emergency plan


In [10]:
# Convert blank cleaned tweets to NaN so they can be found and back-filled.
# `^\s*$` also matches the empty string, which the previous `^(\s)+$`
# pattern (one-or-more whitespace) did not; an empty tweet would otherwise
# slip through and tokenize to an empty list.
tweets_test = tweets_test.replace(r'^\s*$', np.nan, regex=True)

In [11]:
# Count how many cleaned tweets are missing.
tweets_test["Clean Tweets"].isna().sum()

2

In [12]:
# Display the rows whose cleaned tweet is missing.
tweets_test[tweets_test["Clean Tweets"].isna()]

Unnamed: 0,text,Clean Tweets
748,Vamos Newells,
759,@edsheeran tf is innit,


In [13]:
# Test rows cannot be dropped, so fill missing cleaned tweets with the
# placeholder token "no" before tokenization.
tweets_test["Clean Tweets"] = tweets_test["Clean Tweets"].fillna("no")

# Tokenize the Training and Test Data

In [15]:
# Tokenize the cleaned training and test tweets into lists of tokens.
tokenizer = ToktokTokenizer()
tokenized_train = list(map(tokenizer.tokenize, tweets_train["Clean Tweets"]))
tokenized_test = list(map(tokenizer.tokenize, tweets_test["Clean Tweets"]))

# Word2Vec Model

### Generate dense embeddings based on training data

In [19]:
# Dimensionality of each word / document vector
w2v_num_features = 300

# Fit a Word2Vec model (sg=1) on the tokenized training tweets.
# NOTE(review): the embeddings are re-fit on every run, while the estimator
# applied to X_test later is unpickled from disk — confirm that estimator was
# fitted on vectors from an equivalent embedding model.
w2v_model = gensim.models.Word2Vec(
    tokenized_train,
    vector_size=w2v_num_features,
    window=250,
    min_count=0,
    sample=1e-3,
    sg=1,
    epochs=100,
    workers=10,
)

# Document vectors for the training tweets
X_train = pe.document_vectorizer(
    corpus=tokenized_train,
    model=w2v_model,
    num_features=w2v_num_features,
)

# Document vectors for the test tweets, built with the same embedding model
X_test = pe.document_vectorizer(
    corpus=tokenized_test,
    model=w2v_model,
    num_features=w2v_num_features,
)


In [21]:
# Sanity check: print the shape of the Word2Vec test matrix.
print(X_test.shape)

(3263, 300)


### Load in Word2Vec Model

In [18]:
# Load the pickled estimator that is applied to the Word2Vec document vectors.
# A context manager closes the file handle, which the bare open(...) leaked.
# NOTE(review): pickle.load can execute arbitrary code — only load files that
# were produced by a trusted run of this project.
with open("best_w2v_model.sav", "rb") as model_file:
    word2vec_model = pickle.load(model_file)

In [None]:
# Load the pickled "best_model.sav" artifact, closing the file handle afterwards.
# NOTE(review): best_model is not referenced again in the visible notebook and
# this cell has no execution count — confirm whether it is still needed.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open("best_model.sav", "rb") as model_file:
    best_model = pickle.load(model_file)

In [23]:
# Predict on the Word2Vec test vectors. X_test is reassigned by the FastText
# and GloVe sections below, so this must run before those cells.
word2vec_predictions = word2vec_model.predict(X_test)

In [25]:
# Sum of the predictions; with 0/1 labels this is the number of tweets predicted as class 1.
np.sum(word2vec_predictions)

1359

# FastText Model

### Generate dense embeddings based on training data

In [26]:
%%time
# Dimensionality of each word / document vector
ft_num_features = 300

# Fit a FastText model (sg=1) on the tokenized training tweets.
# NOTE(review): as with Word2Vec, the embeddings are re-fit here while the
# estimator used for prediction is unpickled — confirm the two are compatible.
ft_model = FastText(
    tokenized_train,
    vector_size=ft_num_features,
    window=250,
    min_count=0,
    sample=1e-3,
    sg=1,
    epochs=100,
    workers=10,
)

# Document vectors for the training tweets
X_train = pe.document_vectorizer(
    corpus=tokenized_train,
    model=ft_model,
    num_features=ft_num_features,
)

# Document vectors for the test tweets (overwrites the Word2Vec X_test)
X_test = pe.document_vectorizer(
    corpus=tokenized_test,
    model=ft_model,
    num_features=ft_num_features,
)

Wall time: 1min 36s


In [27]:
# Sanity check: print the shape of the FastText test matrix.
print(X_test.shape)

(3263, 300)


### Load in FastText Model

In [28]:
# Load the pickled estimator that is applied to the FastText document vectors.
# A context manager closes the file handle, which the bare open(...) leaked.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open("best_ft_model.sav", "rb") as model_file:
    fasttext_model = pickle.load(model_file)

In [30]:
# Predict on the FastText test vectors. X_test is reassigned by the GloVe
# section below, so this must run before that cell.
fasttext_predictions = fasttext_model.predict(X_test)

In [31]:
# Sum of the predictions; with 0/1 labels this is the number of tweets predicted as class 1.
np.sum(fasttext_predictions)

1179

# GloVe Model

### Load in pre-generated dense embeddings

In [32]:
%%time
# Fetch the pre-trained 'glove-twitter-50' vectors through gensim's downloader.
glove_vectors = gensim.downloader.load('glove-twitter-50')
# Take the feature count from the loaded vectors rather than hard-coding it.
gv_num_features = glove_vectors.vector_size

Wall time: 2min 23s


In [33]:
# Document vectors for the training tweets from the pre-trained GloVe vectors
X_train = pe.document_vectorizer_glove(
    corpus=tokenized_train,
    model=glove_vectors,
    num_features=gv_num_features,
)

# Document vectors for the test tweets (overwrites the FastText X_test)
X_test = pe.document_vectorizer_glove(
    corpus=tokenized_test,
    model=glove_vectors,
    num_features=gv_num_features,
)

### Load in GloVe Model

In [34]:
# Load the pickled estimator that is applied to the GloVe document vectors.
# A context manager closes the file handle, which the bare open(...) leaked.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open("best_glove_model.sav", "rb") as model_file:
    glove_model = pickle.load(model_file)

In [35]:
# Predict on the GloVe test vectors currently held in X_test.
glove_predictions = glove_model.predict(X_test)

In [36]:
# Sum of the predictions; with 0/1 labels this is the number of tweets predicted as class 1.
np.sum(glove_predictions)

1228

# Combine the Results

In [37]:
# Collect the three models' predictions side by side, one column per model.
test_df = pd.DataFrame(
    {
        "Word2Vec": word2vec_predictions,
        "FastText": fasttext_predictions,
        "GloVe": glove_predictions,
    }
)

In [41]:
# Preview the first 15 rows of per-model predictions.
test_df.head(n = 15)

Unnamed: 0,Word2Vec,FastText,GloVe
0,1,1,1
1,1,1,1
2,1,1,1
3,1,1,1
4,1,1,1
5,1,0,1
6,0,0,0
7,0,0,0
8,0,0,0
9,0,0,0


In [55]:
# Majority vote across the three models: average the predictions and round to
# the nearest integer. With three 0/1 votes the mean is 0, 1/3, 2/3 or 1, so
# rounding never hits an exact .5 tie.
vote_share = (test_df["Word2Vec"] + test_df["FastText"] + test_df["GloVe"]) / 3
# Cast with the builtin `int`: the `np.int` alias was deprecated in NumPy 1.20
# and removed in 1.24, where it raises AttributeError.
test_df["Vote"] = np.round(vote_share).astype(int)

In [58]:
# Preview the per-model predictions together with the combined vote.
test_df.head(n=15)

Unnamed: 0,Word2Vec,FastText,GloVe,Vote
0,1,1,1,1
1,1,1,1,1
2,1,1,1,1
3,1,1,1,1
4,1,1,1,1
5,1,0,1,1
6,0,0,0,0
7,0,0,0,0
8,0,0,0,0
9,0,0,0,0


In [49]:
# Load the sample submission file to check the expected output layout.
sub = pd.read_csv("data/sample_submission.csv")

In [50]:
# Preview the first 10 rows of the sample submission.
sub.head(n=10)

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0
5,12,0
6,21,0
7,22,0
8,27,0
9,29,0


In [64]:
# Re-read the untouched test CSV; its "id" column is used to build the submission.
raw_test = pd.read_csv("data/test.csv")

In [65]:
# Preview the first rows of the raw test file.
raw_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, stay safe everyone."
2,3,,,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all"
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [66]:
# Pair each raw test id with the combined vote as its target.
# The two Series are aligned on their index when the frame is built.
test_submission = pd.DataFrame(
    {
        "id": raw_test["id"],
        "target": test_df["Vote"],
    }
)

In [67]:
# Preview the first 15 rows of the submission frame.
test_submission.head(n = 15)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
5,12,1
6,21,0
7,22,0
8,27,0
9,29,0


In [71]:
# Write the id/target submission frame. The previous call wrote test_df (the
# per-model vote table), which does not have the id/target layout shown by the
# sample submission. index=False keeps the row index out of the file.
# NOTE(review): the output path has no ".csv" extension — confirm whether the
# consumer of this file requires one.
test_submission.to_csv("data/test_submission", index=False)