# Building tweet vectorizer using spacy tokenizer

In [1]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import spacy
import pickle

from nltk.stem.porter import PorterStemmer

import tools

In [2]:
upstream = []
random_seed = None

In [3]:
# Parameters
random_seed = 42
product = {
    "nb": "/Users/mboussarov/_umsi/Capstone/umads_697_data_medics/pipeline/output/vectorizer_spacy.ipynb",
    "vectorizer": "/Users/mboussarov/_umsi/Capstone/umads_697_data_medics/pipeline/output/vectorizer_spacy.pkl",
    "vocab": "/Users/mboussarov/_umsi/Capstone/umads_697_data_medics/pipeline/output/vocab_spacy.pkl",
    "stopwords": "/Users/mboussarov/_umsi/Capstone/umads_697_data_medics/pipeline/output/stopwords_spacy.csv",
}


## Read the train and test data

In [4]:
df_train = pd.read_csv('../data/HumAID_data_v1.0/all_combined/all_train.tsv', sep='\t')
df_test = pd.read_csv('../data/HumAID_data_v1.0/all_combined/all_test.tsv', sep='\t')

df_train.dropna(inplace=True)
df_test.dropna(inplace=True)

df_train.sample(5)

Unnamed: 0,tweet_id,tweet_text,class_label
43668,768924405397352448,RT @TheNatWorld: Asylum seekers donate daily a...,rescue_volunteering_or_donation_effort
4943,1037707590447366144,Thank you so much @airvistara for helping us i...,sympathy_and_support
11781,1040604820925767683,The winds today are here from #Florence they a...,caution_and_advice
46455,729799049688190976,RT @FVine: Calgary Construction Association #C...,rescue_volunteering_or_donation_effort
19533,911711780505866241,The death toll in Puerto Rico continues to cli...,injured_or_dead_people


## Identify the most used locations in the training set and remove from the vocabulary - this may reduce the F1 score, but will also remove any data leakage

In [5]:
#!python -m spacy download xx_ent_wiki_sm
# Fast and more accurate for LOC
nlp = spacy.load('xx_ent_wiki_sm')

In [6]:
%%time
df_locs_train, locations_set_train = tools.get_locations(nlp, df_train)

CPU times: user 3min 18s, sys: 1.89 s, total: 3min 20s
Wall time: 3min 24s


In [7]:
%%time
df_locs_test, locations_set_test = tools.get_locations(nlp, df_test)

CPU times: user 47 s, sys: 289 ms, total: 47.3 s
Wall time: 47.4 s


In [8]:
# Visually inspect if the train and test locations are about the same - they look very close, remove from the vocabulary
print('Train: ', sorted(locations_set_train))
print()
print('Test: ', sorted(locations_set_test))

Train:  ['abaco', 'abaco islands', 'alberta', 'america', 'athens', 'atlantic', 'attica', 'azad kashmir', 'bahamas', 'bangladesh', 'beira', 'butte county', 'california', 'californias', 'california’s', 'camp fire', 'canada', 'carolinas', 'centre', 'chimanimani', 'china', 'congress', 'cuba', 'cyclone idai', 'delhi', 'dominica', 'east coast', 'ecuador', 'ellicott city', 'florence', 'florida', 'florida keys', 'fort mcmurray', 'georgia', 'greece', 'haiti', 'houston', 'hurricane florence', 'hurricane maria', 'india', 'iowa', 'irma', 'irmas', 'israel', 'italy', 'jacksonville', 'japan', 'kaikoura', 'kashmir', 'kerala', 'keralafloods2018', 'kochi', 'louisiana', 'malawi', 'malibu', 'manicaland', 'mar-a-lago', 'maryland', 'mexico', 'mexico city', 'mexicos', 'miami', 'mirpur', 'mozambique', 'naples', 'nebraska', 'nepal', 'new york', 'new zealand', 'north carolina', 'northern california', 'pakistan', 'paradise', 'port arthur', 'puerto ricans', 'puerto rico', 'puerto ricos', 'red cross', 'san juan', 

In [9]:
## Dump the stopwords
stopwords = locations_set_train | {"attach", "ahead", "rt"}
df_stopwords = pd.DataFrame(data=stopwords, columns=['stopword'])
df_stopwords.to_csv(product['stopwords'], index=False)
df_stopwords.sample(5)

Unnamed: 0,stopword
14,nepal
62,miami
92,jacksonville
52,new zealand
71,ahead


## Build the tokenizer

In [10]:
# Follow https://machinelearningknowledge.ai/complete-guide-to-spacy-tokenizer-with-examples/

# Run the first time
#!python -m spacy download en_core_web_sm

nlp = spacy.load("en_core_web_sm")
stemmer = PorterStemmer()

# Add more stop words if needed - https://machinelearningknowledge.ai/tutorial-for-stopwords-in-spacy/
nlp.Defaults.stop_words |= set(df_stopwords['stopword'])

tokenizer = tools.Tokenizer(nlp, stopwords)

tokenizer.tokenize(
    'RT @HotshotWake: Good morning from Stewart Crossing Yukon up in Canada. Big fire day ahead. #canada #yukon #wildfire https://t.co/cSoymOMwJO')

['@hotshotwake',
 'good',
 'morning',
 'stewart',
 'cross',
 'yukon',
 'big',
 'fire',
 'day',
 'yukon',
 'wildfire',
 'https://t.co/csoymomwjo']

In [11]:
vectorizer = TfidfVectorizer(
    tokenizer=tokenizer.tokenize,
    strip_accents='unicode',
    min_df=10,
    ngram_range=(1, 2),
    max_features=10000,
    use_idf=True
)

## Test/validate with logistic regression

In [12]:
%%time
X_train = vectorizer.fit_transform(df_train['tweet_text'])
y_train = list(df_train['class_label'])

X_test = vectorizer.transform(df_test['tweet_text'])
y_test = list(df_test['class_label'])

X_train.shape

CPU times: user 10min 4s, sys: 3.97 s, total: 10min 8s
Wall time: 10min 9s


(53516, 10000)

In [13]:
# Prepate the logistic regression classifier
clf = LogisticRegression(solver='lbfgs', multi_class='auto', random_state=random_seed, max_iter=1000)
clf.fit(X_train, y_train)

In [14]:
%%time
# Predict on test, must be about the same
lr_test_preds = clf.predict(X_test)

# Score on the test data
lr_f1 = f1_score(y_test, lr_test_preds, average='macro')
print(lr_f1)

0.7019700010544871
CPU times: user 295 ms, sys: 52.2 ms, total: 347 ms
Wall time: 317 ms


## Dump the vectorizer and also the vocabulary

In [15]:
with open(str(product['vectorizer']), 'wb') as f:
    pickle.dump(vectorizer, f)

with open(str(product['vocab']), 'wb') as f:
    pickle.dump(vectorizer.vocabulary_, f)