# Building a tweet vectorizer using a standard TweetTokenizer

In [1]:
import pandas as pd
from sklearn.metrics import f1_score
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import TweetTokenizer
from sklearn.linear_model import LogisticRegression

In [2]:
# Default values for the notebook's tunable inputs.
# NOTE(review): this looks like the pipeline "parameters" cell (Ploomber/papermill
# style), which tooling may parse statically — keep the values as plain literals; confirm.
# `upstream` is empty here; presumably it lists the tasks this notebook depends on — verify.
upstream = []
# Seed handed to the classifier as `random_state` further down in this notebook.
random_seed = 42

In [3]:
# Parameters
# NOTE(review): this cell re-assigns `random_seed` (same value as the previous cell) and
# defines `product`; it looks like a cell injected by the pipeline runner at execution
# time rather than hand-written code — confirm before editing it manually.
# The paths below are absolute and point into one user's home directory, so the
# notebook can only write its outputs on a machine where that directory exists.
random_seed = 42
# Output locations. "vectorizer" and "vocab" are the pickle files written by the last
# cell of this notebook; "nb" is presumably the executed copy of the notebook — verify.
product = {
    "nb": "/Users/mboussarov/_umsi/Capstone/umads_697_data_medics/pipeline/output/vectorizer.ipynb",
    "vectorizer": "/Users/mboussarov/_umsi/Capstone/umads_697_data_medics/pipeline/output/vectorizer.pkl",
    "vocab": "/Users/mboussarov/_umsi/Capstone/umads_697_data_medics/pipeline/output/vocab.pkl",
}


## Read the train and test data

In [4]:
# Load the HumAID train/test splits (tab-separated files) and drop every row that has
# a missing value in any column. `dropna()` is chained onto the read so each frame is
# built in a single expression instead of being mutated in place afterwards.
df_train = pd.read_csv('../data/HumAID_data_v1.0/all_combined/all_train.tsv', sep='\t').dropna()
df_test = pd.read_csv('../data/HumAID_data_v1.0/all_combined/all_test.tsv', sep='\t').dropna()

# Preview a handful of training rows. The sample is seeded with the notebook-wide
# `random_seed` so the displayed rows are the same on every "Restart & Run All".
df_train.sample(5, random_state=random_seed)

Unnamed: 0,tweet_id,tweet_text,class_label
25080,907966734241591298,Currently getting hit by whats left of #Irma i...,other_relevant_information
22524,914919978595487746,US military members delivered supplies to #Vie...,rescue_volunteering_or_donation_effort
37583,1168292887295078400,@JonahNRO This is a bit petty. Dorian is the s...,infrastructure_and_utility_damage
940,1031165669604315136,"Items required at SMV school trivandrum, the c...",requests_or_urgent_needs
32951,1111856063274500097,The UK stands shoulder to shoulder with #Malaw...,rescue_volunteering_or_donation_effort


## Build and train the vectorizer

In [5]:
# NLTK tweet tokenizer: lower-cases tokens, shortens long runs of a repeated
# character and removes @handles.
tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)

# TF-IDF features over the unigrams and bigrams produced by the tweet tokenizer.
vectorizer = TfidfVectorizer(
    tokenizer=tokenizer.tokenize,
    # The built-in regex `token_pattern` is ignored whenever a custom tokenizer is
    # supplied. Setting it to None states that explicitly and avoids scikit-learn's
    # "token_pattern will not be used" warning; the extracted features are unchanged.
    token_pattern=None,
    strip_accents='unicode',
    ngram_range=(1, 2),
    max_df=0.90,         # ignore terms that occur in more than 90% of the tweets
    min_df=1,            # keep terms even if they occur in a single tweet
    max_features=10000,  # cap the vocabulary at the 10,000 most frequent terms
    use_idf=True,
)

In [6]:
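# Disabled experiment: dimensionality reduction of the TF-IDF matrix with TruncatedSVD.
# Kept for reference only; nothing in this notebook currently uses `svd`.
# from sklearn.decomposition import TruncatedSVD

# svd = TruncatedSVD(n_components=500, n_iter=30, random_state=42)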

In [7]:
%%time
# Learn the vocabulary and IDF weights from the training tweets only, then apply the
# already-fitted vectorizer to the test tweets so both matrices share the same columns.
X_train = vectorizer.fit_transform(df_train['tweet_text'])
y_train = df_train['class_label'].tolist()

X_test = vectorizer.transform(df_test['tweet_text'])
y_test = df_test['class_label'].tolist()

# Sort the distinct labels before printing: iterating a bare set of strings yields an
# arbitrary order that can differ between kernel sessions, which made this output
# impossible to reproduce exactly.
print('Categories: ', sorted(set(y_train)))
print('Vectorizer rows and columns: ', X_train.shape)
print()

Categories:  ['sympathy_and_support', 'injured_or_dead_people', 'requests_or_urgent_needs', 'not_humanitarian', 'caution_and_advice', 'rescue_volunteering_or_donation_effort', 'other_relevant_information', 'displaced_people_and_evacuations', 'infrastructure_and_utility_damage', 'missing_or_found_people']
Vectorizer rows and columns:  (53516, 10000)

CPU times: user 14.2 s, sys: 134 ms, total: 14.3 s
Wall time: 14.3 s


## Test/validate with logistic regression

In [8]:
%%time
# Prepare the logistic regression classifier.
# `multi_class='auto'` is intentionally not passed any more: 'auto' is already the
# default, and recent scikit-learn releases deprecate the parameter and emit a
# FutureWarning when it is set explicitly. The fitted model is the same.
# `max_iter` is raised well above the library default, presumably so that lbfgs
# converges on this feature matrix — confirm.
clf = LogisticRegression(solver='lbfgs', random_state=random_seed, max_iter=1000)
clf.fit(X_train, y_train)

CPU times: user 2min 46s, sys: 2min 3s, total: 4min 49s
Wall time: 1min 11s


In [9]:
%%time
# Label the held-out tweets with the fitted classifier.
lr_test_preds = clf.predict(X_test)

# Macro-averaged F1 on the test split: the per-class F1 scores are averaged with
# equal weight, regardless of how many tweets each class has.
lr_f1 = f1_score(
    y_true=y_test,
    y_pred=lr_test_preds,
    average='macro',
)
print('F1 score on the test data: ', lr_f1)

F1 score on the test data:  0.7137300362376781
CPU times: user 431 ms, sys: 21.3 ms, total: 453 ms
Wall time: 1.19 s


## Persist the vectorizer to be used downstream

In [10]:
# Write both artifacts to the locations declared in `product`: the fitted vectorizer
# under the 'vectorizer' key and its learned vocabulary under the 'vocab' key.
# The files are written in this order, one at a time, each closed before the next opens.
for product_key, artifact in (
    ('vectorizer', vectorizer),
    ('vocab', vectorizer.vocabulary_),
):
    with open(str(product[product_key]), 'wb') as f:
        pickle.dump(artifact, f)