# Setup and Imports

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import re
sns.set()  # must be called: a bare `sns.set` only names the function and changes nothing
import spacy
import nltk
from nltk.corpus import brown
from nltk.corpus import wordnet
from collections import Counter
import Tweet_Normalizer as tn
from bs4 import BeautifulSoup
import requests
import spacy
import scipy
import gensim
import gensim.downloader
from sklearn.model_selection import train_test_split
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
import csv
import math
from scipy.stats import uniform
from scipy.stats import reciprocal
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import ParameterSampler
import tensorflow as tf
from tensorflow import keras
import pickle
import prepare_embeddings as pe



# Load the Data

In [2]:
# Read the training CSV (path is relative to the notebook's working directory).
tweets = pd.read_csv(filepath_or_buffer="data/train.csv")

# Clean the Data

In [3]:
%%time
#USe tweet scrubber function to clean the data
tweets = tn.tweet_scrubber(tweets, verbose = True)

Running tweet scrubber...

Dropping unnecessary columns
Successfully dropped columns!

Normalizing the tweets
Successfully normalized tweets!

Removing invalid and mispelled words
Successfully removed invalid and mispelled words!

Successfully scrubbed tweets!

Wall time: 2min 42s


In [4]:
# Turn cells that are empty or whitespace-only into NaN so they count as missing.
# `\s*` also matches the empty string "", which the previous `(\s)+` pattern let
# through. NOTE(review): a fully scrubbed tweet may come back as "" rather than
# " " -- confirm against tn.tweet_scrubber. We expect 5 blank rows after cleaning.
tweets = tweets.replace(r'^\s*$', np.nan, regex=True)
# Drop the rows with no cleaned text and renumber the index 0..n-1.
# Reassigning the chained result is equivalent to the inplace=True calls.
tweets = tweets.dropna(subset=["Clean Tweets"]).reset_index(drop=True)

In [5]:
# Preview the end of the cleaned frame.
# NOTE(review): a max_colwidth of 2 looks like a workaround to stop pandas from
# truncating the tweet text -- confirm; this option is set globally for the session.
pd.set_option("display.max_colwidth", 2)
tweets.tail(15)

Unnamed: 0,text,target,Clean Tweets
7593,Father-of-three Lost Control of Car After Overtaking and Collided #BathAndNorthEastSomerset http://t.co/fa3FcnlN86,1,father three lose control car overtake collide
7594,1.3 #Earthquake in 9Km Ssw Of Anza California #iPhone users download the Earthquake app for more information http://t.co/V3aZWOAmzK,1,earthquake km ssw california user download earthquake information
7595,Evacuation order lifted for town of Roosevelt: http://t.co/EDyfo6E2PU http://t.co/M5KxLPKFA1,1,evacuation order lift town roosevelt
7596,#breaking #LA Refugio oil spill may have been costlier bigger than projected http://t.co/5ueCmcv2Pk,1,break la oil spill may costlier big project
7597,a siren just went off and it wasn't the Forney tornado warning ??,1,siren go not tornado warning
7598,Officials say a quarantine is in place at an Alabama home over a possible Ebola case after developing symptoms... http://t.co/rqKK15uhEY,1,official say quarantine place alabama home possible ebola case develop symptom
7599,#WorldNews Fallen powerlines on G:link tram: UPDATE: FIRE crews have evacuated up to 30 passengers who were tr... http://t.co/EYSVvzA7Qm,1,fall g link tram update fire crew evacuate passenger
7600,on the flip side I'm at Walmart and there is a bomb and everyone had to evacuate so stay tuned if I blow up or not,1,flip side I bomb evacuate stay tune I blow not
7601,Suicide bomber kills 15 in Saudi security site mosque - Reuters via World - Google News - Wall ... http://t.co/nF4IculOje,1,suicide bomber kill saudi security site mosque world google news wall
7602,#stormchase Violent Record Breaking EF-5 El Reno Oklahoma Tornado Nearly Runs Over ... - http://t.co/3SICroAaNz http://t.co/I27Oa0HISp,1,violent record break el reno oklahoma tornado nearly run


# Split the Data into Training and Validation Sets

In [6]:
# Hold out 15% of the tweets for validation, stratified on the target label so
# both splits keep the same class balance; the fixed seed makes the split repeatable.
train_corpus, val_corpus, y_train, y_val = train_test_split(
    tweets["Clean Tweets"],
    np.array(tweets["target"]),
    test_size=0.15,
    random_state=42,
    stratify=np.array(tweets["target"]),
)

# Prepare Dense Word Embeddings

In [7]:
# Split every cleaned tweet into a list of tokens, for both the training and
# the validation corpus, using one shared Toktok tokenizer.
tokenizer = ToktokTokenizer()
tokenized_train = list(map(tokenizer.tokenize, train_corpus))
tokenized_val = list(map(tokenizer.tokenize, val_corpus))

In [8]:
%%time
glove_vectors = gensim.downloader.load('glove-twitter-50')
gv_num_features = glove_vectors.vector_size


Wall time: 2min 36s


In [15]:
# (Removed a leftover `gensim.downloader.load?` docstring lookup: it was executed
# out of order, is IPython-only syntax, and is not part of the analysis.)

In [10]:
# Build the training and validation feature matrices with the same vectorizer
# settings; the generator is consumed in order, so training is processed first.
X_train, X_val = (
    pe.document_vectorizer_glove(
        corpus=token_lists,
        model=glove_vectors,
        num_features=gv_num_features,
    )
    for token_lists in (tokenized_train, tokenized_val)
)

In [11]:
# Sanity-check that features and labels line up: one shape per line, in the
# order X_train, X_val, y_train, y_val.
print(*(arr.shape for arr in (X_train, X_val, y_train, y_val)), sep="\n")

(6466, 50)
(1142, 50)
(6466,)
(1142,)


# Baseline Scores

### Logistic Regression

In [12]:
# Baseline classifier: default logistic regression with a raised iteration cap.
lr_clf = LogisticRegression(max_iter=10_000)

In [13]:
%%time
y_train_pred = cross_val_predict(lr_clf, X_train, y_train, cv = 5)
lr_base_acc = accuracy_score(y_train, y_train_pred) * 100
lr_base_f1 = f1_score(y_train, y_train_pred) * 100
print(f"Logistic Regression Baseline Accuracy: {lr_base_acc:.2f}")
print(f"Logistic Regression Baseline F1-Score: {lr_base_f1:.2f}")

Logistic Regression Baseline Accuracy: 77.06
Logistic Regression Baseline F1-Score: 71.57
Wall time: 711 ms


In [14]:
# Confusion matrix of the cross-validated predictions (rows: true label, columns: predicted).
confusion_matrix(y_true=y_train, y_pred=y_train_pred)

array([[3116,  570],
       [ 913, 1867]], dtype=int64)