In [2]:
import pandas as pd
import spacy
import string
import re
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
import spacy.gold
from sklearn.base import TransformerMixin,BaseEstimator
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import string
from spacy.matcher import Matcher


### Spacy to preprocess text into lemmatised tokens
### Sklearn pipeline models:
    1) Countvectoriser
    2) Tfidf Vectoriser
    3) Random Forest
    4) Naive Bayes

In [5]:
# %pip install spacy && python -m spacy download en

In [6]:
df1= pd.read_csv('train.csv')
df2 = pd.read_csv('test.csv')

boot = False # resample the data to 10000 samples

In [7]:
# put all the words into the same columns
def fill_words(data):
    keyword_filled = data.keyword.fillna('')
    location_filled = data.location.fillna('')
    data['all_words'] = data.text + ' ' + keyword_filled + ' ' + location_filled
    return data

In [8]:
df_train = fill_words(df1)
df_test = fill_words(df2)


In [9]:
train, test = train_test_split(df_train, test_size=0.33, random_state=42)
if boot == True: train = train.sample(n=10000, replace = True)

In [10]:
print(len(df1), len(train), len(test))

7613 5100 2513


### Create tokenizer to clean words. This will be used in pipeline

In [28]:
# initiate nlp object and load all the things we want to get rid of
def init_parser():
    # Load English tokenizer, tagger, parser, NER and word vectors
    nlp = spacy.load("en_core_web_sm")
    # Create our list of punctuation marks
    punctuations = string.punctuation
    SYMBOLS = " ".join(string.punctuation).split(" ") + ["-", "...", "”", "”", '']
    # Create our list of stopwords
    stop_words = spacy.lang.en.stop_words.STOP_WORDS
    matcher = Matcher(nlp.vocab)
    pattern = [{"TEXT": {"REGEX": "^https?:\/\/.*[\r\n]*"}}]
    matcher.add("URL", [pattern])

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = nlp(sentence)
    matches = matcher(mytokens)
    
    
    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in SYMBOLS ]

    # return preprocessed list of tokens
    return mytokens

In [9]:
spacy_tokenizer(df_train['all_words'][0])

['deed', 'reason', 'earthquake', 'allah', 'forgive']

### These functions are for the pipeline for ML models

In [10]:
# create dense transformer
class ToDenseTransformer(BaseEstimator,TransformerMixin):
    # define the transform operation
    def transform(self, X, y=None, **fit_params):
        return X.todense()

    # no paramter to learn this case
    # fit just returns an unchanged object
    def fit(self, X, y=None, **fit_params):
        return self


In [11]:
def printNMostInformative(vectorizer, clf, N):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    topClass1 = coefs_with_fns[:N]
    topClass2 = coefs_with_fns[:-(N + 1):-1]
    print("Not Disaster Best Words: ")
    for feat in topClass1:
        print(feat)
    print("Disaster Best words: ")
    for feat in topClass2:
        print(feat)

In [12]:
# Prepare Training and Verification Data
X_train = train['all_words'].tolist()
Y_train = train['target'].tolist()
X_test = test['all_words'].tolist()
Y_test = test['target'].tolist()

In [13]:
# Bag of words vectorizer
vectorizer = CountVectorizer(tokenizer=spacy_tokenizer)
clf = LogisticRegression(solver='liblinear')
pipe = Pipeline(
    [("vect", vectorizer),
    ('normal', ToDenseTransformer()),
    ("clf", clf)
    ])

In [14]:
# train
pipe.fit(X_train, Y_train)
# test
preds = pipe.predict(X_test)
accu = preds == Y_test
print("accuracy: for countvectorizer model is:", accu.mean())
printNMostInformative(vectorizer, clf, 10)

accuracy: for countvectorizer model is: 0.7906884202148826
Not Disaster Best Words: 
(-1.2151818683151665, 'wrong')
(-1.0494782685218167, 'blight')
(-1.0325147227084714, 'bitch')
(-1.0168009728034149, 'attempt')
(-0.9982631376880118, 'nowplaye')
(-0.9872856067944742, 'aftershock')
(-0.9686608518945959, 'ebay')
(-0.9675184921458493, 'career')
(-0.9604457407355692, 'orlando')
(-0.9543022626632774, 'wedding')
Disaster Best words: 
(2.4350178931890007, 'hiroshima')
(1.9223857571433098, 'wildfire')
(1.876228846486541, 'earthquake')
(1.6231019375287181, 'suicide')
(1.5046531711992408, 'typhoon')
(1.488764651119117, 'debris')
(1.4853987377086897, 'migrant')
(1.4822323604861207, 'derailment')
(1.456701812650832, 'spill')
(1.4394601376531029, 'village')


In [15]:
# Tfidf vectorizer
pipe_Tfidf = Pipeline(
    [("vect", vectorizer),
    ('tfid', TfidfTransformer()),
    ("clf", clf)
    ])
pipe_Tfidf.fit(X_train, Y_train)
preds = pipe_Tfidf.predict(X_test)
accu = preds == Y_test
print("accuracy: for Tfidf model is:", accu.mean())
# print most informative words with highest coeff for Not Diaster and Diaster
printNMostInformative(vectorizer, clf, 10)

accuracy: for Tfidf model is: 0.7811380819737366
Not Disaster Best Words: 
(-2.019729482871739, 'scream')
(-2.0067750517357483, 'panic')
(-1.7876223097711863, 'love')
(-1.7220919293122998, 'ruin')
(-1.684725447747696, 'wreck')
(-1.6298055933778022, 'let')
(-1.609355192088619, 'blaze')
(-1.5697244619467763, 'aftershock')
(-1.559562139568242, 'twister')
(-1.5490903985222768, 'new')
Disaster Best words: 
(3.1825948404479796, 'hiroshima')
(2.7287800772859554, 'kill')
(2.6459608240137342, 'wildfire')
(2.528786274557909, 'earthquake')
(2.4277111843578085, 'suicide')
(2.3556583493855383, 'bombing')
(2.3363114607398563, 'typhoon')
(2.189955219273723, 'train')
(2.1766169842646947, 'debris')
(2.091640961388825, 'fire')


### Other non-vectorised Classifiers
1) Random Forest

2) Naive Bayes

In [16]:
# Random Forest Classifier
RF_clf = RandomForestClassifier(n_estimators=10)
pipe_RF = Pipeline(
    [("vect", vectorizer),
    ("clf", RF_clf)
    ])
pipe_RF.fit(X_train, Y_train)
preds = pipe_RF.predict(X_test)
accu = preds == Y_test
print("accuracy: for Random Forest model is:", accu.mean())

accuracy: for Random Forest model is: 0.7695980899323518


In [17]:
# Naive Bayes Classifier
pipe_bayes = Pipeline(
    [("vect", vectorizer),
    ('bayes', MultinomialNB())
    ])
pipe_bayes.fit(X_train, Y_train)
preds = pipe_bayes.predict(X_test)
accu = preds == Y_test
print("accuracy: for Bayes model is:", accu.mean())


accuracy: for Bayes model is: 0.7871070433744528


### Spacy CNN

In [25]:
a = list(range(0,100,2))
print(type(*l) for l in a)

<generator object <genexpr> at 0x7fbf43cda450>
