In [1]:
import pandas as pd
import spacy
import string
import re
import numpy as np
import matplotlib.pyplot as plt

from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# transformers
from sklearn.base import TransformerMixin,BaseEstimator
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.cluster import KMeans

# statistical models
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

from sklearn.pipeline import Pipeline

ModuleNotFoundError: No module named 'matplotlib'

### Spacy to preprocess text into lemmatised tokens
### Sklearn pipeline models:
    1) Countvectoriser
    2) Tfidf Vectoriser
    3) Random Forest
    4) Naive Bayes

In [2]:
# %pip install spacy && python -m spacy download en

In [3]:
df1= pd.read_csv('train.csv')
df2 = pd.read_csv('test.csv')

boot = False # resample the data to 10000 samples

In [4]:
# concat all the words into the same columns
# not used here
def concat_words(data):
    keyword_filled = data.keyword.fillna('')
    location_filled = data.location.fillna('')
    data['all_words'] = data.text + ' ' + keyword_filled + ' ' + location_filled
    return data

In [5]:
# make a copy
df_train = concat_words(df1)
df_test = concat_words(df2)

In [6]:
train, test = train_test_split(df_train, test_size=0.1, random_state=42)
if boot == True: train = train.sample(n=10000, replace = True)

In [7]:
print(len(df1), len(train), len(test))

7613 6851 762


### function to clean words

In [8]:
# initiate nlp object
nlp = spacy.load('en_core_web_sm')
# define which pos to filter out
noisy_pos_tags = ['PROP','NUM', 'SYM']
disable_list = ["ner", "parser"]

# functions to filter out noises and clean words
# so fucking ugly please improve your coding skill!
def isNoise(token):     
    is_noise = False
    if token.pos_ in noisy_pos_tags:
        is_noise = True 
        #print('pos')
    elif token.is_stop == True:
        is_noise = True
        #print('stop')
    elif token.is_punct == True:
        is_noise = True
        #print('punct')
    elif token.lemma_ == '...':
        is_noise = True
        #print('...')
    elif token.like_url == True:
        is_noise = True
        #print('url')
    else:
        is_noise = False
    return is_noise 

# function in the tokenizer
def cleanup(token, lower = True):
    if lower:
        token = token.lemma_.lower()
    return token.strip()

# tokentize words
def lemma_tokenizer(sentence):
    doc = nlp(sentence, disable_list)
    tokens = [cleanup(word) for word in doc if isNoise(word) != True]
    # return preprocessed list of tokens
    return tokens

### These functions are for the pipeline for logistic ML models

In [9]:
# create dense transformer
class ToDenseTransformer(BaseEstimator,TransformerMixin):
    # define the transform operation
    def transform(self, X, y=None, **fit_params):
        return X.todense()

    # no paramter to learn this case
    # fit just returns an unchanged object
    def fit(self, X, y=None, **fit_params):
        return self


In [10]:
def printNMostInformative(vectorizer, clf, N):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    topClass1 = coefs_with_fns[:N]
    topClass2 = coefs_with_fns[:-(N + 1):-1] # reverse the order
    print("Not Disaster Best Words: ")
    for feat in topClass1:
        print(feat)
    print("Disaster Best words: ")
    for feat in topClass2:
        print(feat)

In [11]:
# Prepare Training and Verification Data
X_train = train['all_words'].tolist()
Y_train = train['target'].tolist()
X_test = test['all_words'].tolist()
Y_test = test['target'].tolist()

In [12]:
# Initiate vectorizer and pipeline object
# Bag of words vectorizer
vectorizer = CountVectorizer(tokenizer=lemma_tokenizer)
clf = LogisticRegression(solver='liblinear', C=1.0) 

# all objects in the pipeline should be transformers
pipe = Pipeline(
    [("vect", vectorizer),
    ('normal', ToDenseTransformer()),
    ("clf", clf)
    ])


In [13]:
# Specify logistic regression
# Ridge Regression
# C is the inverse of 'strength' lambda parameter of regularization (ie penalise large parameters), 
# in here smaller positive number greater strength)
# Scikitlearn logistic regression apparently cannot set custom decision threshold (like WTF why)

# train
pipe.fit(X_train, Y_train)
# test
preds = pipe.predict(X_test)
accu = preds == Y_test
print("accuracy: for countvectorizer model on dataset is:", accu.mean())
printNMostInformative(vectorizer, clf, 10)

accuracy: for countvectorizer model on dataset is: 0.7913385826771654
Not Disaster Best Words: 
(-1.2090964196530114, 'nowplaye')
(-1.2015768048192368, 'bitch')
(-1.1769614375324942, 'drink')
(-1.1745596507699239, 'love')
(-1.1689844530177425, 'cake')
(-1.1517056529340135, 'ebay')
(-1.146022153235248, 'blight')
(-1.1310463862256472, 'soul')
(-1.1226434941261827, 'likely')
(-1.1146303870108922, 'write')
Disaster Best words: 
(2.5137432106463407, 'hiroshima')
(2.0484842524913622, 'wildfire')
(1.8219031570221906, 'debris')
(1.8102836651714251, 'typhoon')
(1.7581133959435058, 'derailment')
(1.647875729228661, 'suicide')
(1.6410680942361067, 'earthquake')
(1.5716588680653525, 'bombing')
(1.5039706826270332, 'migrant')
(1.495766802922914, 'spill')


In [14]:
# run cross-validation analyses on training set
scores = cross_val_score(pipe, X_train,Y_train,cv=3)
print(scores)

[0.7854641  0.79640981 0.78580815]


In [15]:
# Tfidf vectorizer
pipe_Tfidf = Pipeline(
    [("vect", vectorizer),
    ('tfid', TfidfTransformer()),
    ("clf", clf)
    ])
scores = cross_val_score(pipe, X_train,Y_train,cv=3)

pipe_Tfidf.fit(X_train, Y_train)
preds = pipe_Tfidf.predict(X_test)
accu = preds == Y_test
print("accuracy: for Tfidf model is:", accu.mean())
# print most informative words with highest coeff for Not Diaster and Diaster
printNMostInformative(vectorizer, clf, 10)

accuracy: for Tfidf model is: 0.7874015748031497
Not Disaster Best Words: 
(-2.3557055833766563, 'panic')
(-2.3088845649647327, 'love')
(-1.9242884729066396, 'scream')
(-1.8667548786696497, 'blaze')
(-1.7937751556735289, 'want')
(-1.779170249434287, 'blight')
(-1.7710914546222467, 'let')
(-1.7541406528320656, 'aftershock')
(-1.686378810936804, 'ruin')
(-1.5804894152340216, 'armageddon')
Disaster Best words: 
(3.4583022753835087, 'hiroshima')
(3.207333168541642, 'kill')
(3.0035460292894247, 'wildfire')
(2.737594751283122, 'suicide')
(2.6942904411593784, 'typhoon')
(2.6268413034107376, 'bombing')
(2.5875632553075345, 'debris')
(2.551554724896847, 'derailment')
(2.5360104046771497, 'building')
(2.511480770222869, 'train')


### Other Classifiers
1) Random Forest

2) Naive Bayes

In [16]:
# Random Forest Classifier
RF_clf = RandomForestClassifier(n_estimators=10)
pipe_RF = Pipeline(
    [("vect", vectorizer),
    ("clf", RF_clf)
    ])
pipe_RF.fit(X_train, Y_train)
preds = pipe_RF.predict(X_test)
accu = preds == Y_test
print("accuracy: for Random Forest model is:", accu.mean())

accuracy: for Random Forest model is: 0.7611548556430446


In [17]:
# Naive Bayes Classifier
pipe_bayes = Pipeline(
    [("vect", vectorizer),
    ('bayes', MultinomialNB())
    ])
pipe_bayes.fit(X_train, Y_train)
preds = pipe_bayes.predict(X_test)
accu = preds == Y_test
print("accuracy: for Bayes model is:", accu.mean())


accuracy: for Bayes model is: 0.7795275590551181


### Just for fun to see the sparse NLP feature can be effective at clustering 
### Unsupervised k-means with cluster = 2 (as there are only 2 known categiries).

In [18]:
kmean_clf = KMeans(n_clusters=2, random_state=0)
pipe_kmean = Pipeline(
    [("vect", vectorizer),
    ("clf", kmean_clf),    
    ])

pipe_kmean.fit(X_train)
pred = pipe_kmean.predict(X_train) # Output cluster labels
accu = np.equal(pred, np.array(Y_train))
print("accuracy: for clustering model is {0} or {1} \
      (unsure since the kmean label was generated by the model)".format( accu.mean(), 1- accu.mean()) )

accuracy: for clustering model is 0.4343891402714932 or 0.5656108597285068       (unsure since the kmean label was generated by the model)


In [19]:
dist = pipe_kmean.transform(X_train)

plt.scatter(dist[:,0], dist[:,1], c=pred, cmap='rainbow')

NameError: name 'plt' is not defined