# Building a tweet vectorizer using a standard TweetTokenizer, followed by a dimensionality reduction step

In [1]:
import pandas as pd
from sklearn.metrics import f1_score
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import TweetTokenizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [2]:
# Task parameters for this notebook.
# NOTE(review): `upstream = []` looks like the pipeline-runner convention for
# declaring that this task has no upstream dependencies — confirm.
upstream = []
# Seed reused by the stochastic steps below (TruncatedSVD and LogisticRegression).
random_seed = 42

In [3]:
# Parameters
# NOTE(review): this cell re-assigns `random_seed` and defines `product`; it looks
# like it was injected by the pipeline runner at execution time — confirm. The
# absolute, machine-specific paths are presumably execution artifacts rather than
# hand-written configuration.
# `product` maps each output name to its destination path; the 'vectorizer' and
# 'vocab' entries are the pickle files written by the last cell of the notebook.
random_seed = 42
product = {
    "nb": "/Users/mboussarov/_umsi/Capstone/umads_697_data_medics/pipeline/output/vectorizer_svd.ipynb",
    "vectorizer": "/Users/mboussarov/_umsi/Capstone/umads_697_data_medics/pipeline/output/vectorizer_svd.pkl",
    "vocab": "/Users/mboussarov/_umsi/Capstone/umads_697_data_medics/pipeline/output/vocab_svd.pkl",
}


## Read the train and test data

In [4]:
# Load the combined HumAID train and test splits (tab-separated files).
# Rows containing any missing value are dropped as part of the load expression
# instead of mutating the frames in place afterwards.
df_train = pd.read_csv('../data/HumAID_data_v1.0/all_combined/all_train.tsv', sep='\t').dropna()
df_test = pd.read_csv('../data/HumAID_data_v1.0/all_combined/all_test.tsv', sep='\t').dropna()

# Seed the preview so the same five rows are displayed on every run.
df_train.sample(5, random_state=random_seed)

Unnamed: 0,tweet_id,tweet_text,class_label
10242,1039497600565764096,#Florence potentially the strongest storm to h...,caution_and_advice
43549,768860989525725185,"If you want to support, this is the way to don...",rescue_volunteering_or_donation_effort
31556,1111195521010552832,The destruction in Beira fell #CycloneIdai is ...,infrastructure_and_utility_damage
24319,908885698991845378,Victims of Hurricane #Irma that hit the #USVir...,other_relevant_information
8154,1041702423998259201,Join the The Childrens Village team! See our l...,not_humanitarian


## Build and train the vectorizer

In [5]:
# Tweet-aware tokenizer: case is not preserved, over-long character runs are
# shortened (reduce_len) and @handles are stripped.
tokenizer = TweetTokenizer(
    strip_handles=True,
    reduce_len=True,
    preserve_case=False,
)

# TF-IDF over the unigrams and bigrams produced by the tweet tokenizer. Terms that
# appear in more than 90% of the documents are ignored and the vocabulary is
# capped at 10,000 features.
vectorizer = TfidfVectorizer(
    tokenizer=tokenizer.tokenize,
    strip_accents='unicode',
    ngram_range=(1, 2),
    min_df=1,
    max_df=0.90,
    max_features=10000,
    use_idf=True,
)

# Reduce the TF-IDF matrix to 500 components; seeded with the notebook-wide seed.
svd = TruncatedSVD(
    n_components=500,
    n_iter=30,
    random_state=random_seed,
)

# Chain both steps so a single fit/transform call yields the reduced features.
pipeline = Pipeline(steps=[
    ('vectorizer', vectorizer),
    ('svd', svd),
])

In [6]:
%%time
X_train = pipeline.fit_transform(df_train['tweet_text'])
y_train = list(df_train['class_label'])

X_test = pipeline.transform(df_test['tweet_text'])
y_test = list(df_test['class_label'])

print('Categories: ', list(set(y_train)))
print('Vectorizer rows and columns: ', X_train.shape)
print()

Categories:  ['injured_or_dead_people', 'not_humanitarian', 'caution_and_advice', 'missing_or_found_people', 'displaced_people_and_evacuations', 'rescue_volunteering_or_donation_effort', 'other_relevant_information', 'sympathy_and_support', 'requests_or_urgent_needs', 'infrastructure_and_utility_damage']
Vectorizer rows and columns:  (53516, 500)

CPU times: user 3min 35s, sys: 34.5 s, total: 4min 9s
Wall time: 2min 8s


## Test/validate with logistic regression

In [7]:
%%time
# Prepate the logistic regression classifier
clf = LogisticRegression(solver='lbfgs', multi_class='auto', random_state=random_seed, max_iter=1000)
clf.fit(X_train, y_train)

CPU times: user 3min 14s, sys: 27 s, total: 3min 41s
Wall time: 1min 19s


In [8]:
%%time
# Predict on test
lr_test_preds = clf.predict(X_test)
# Score on the test data
lr_f1 = f1_score(y_test, lr_test_preds, average='macro')
print('F1 score on the test data: ', lr_f1)

F1 score on the test data:  0.6809571308948223
CPU times: user 948 ms, sys: 313 ms, total: 1.26 s
Wall time: 256 ms


## Persist the vectorizer to be used downstream

In [9]:
# Write the fitted TF-IDF vectorizer and its vocabulary to the paths given in
# `product`.
# NOTE(review): only `vectorizer` is pickled here; the SVD step of `pipeline` is
# not saved, although the output file is named vectorizer_svd.pkl — confirm
# whether downstream consumers need the reduced representation.
artifacts = (
    ('vectorizer', vectorizer),
    ('vocab', vectorizer.vocabulary_),
)
for product_key, artifact in artifacts:
    with open(str(product[product_key]), 'wb') as out_file:
        pickle.dump(artifact, out_file)