In [1]:
from pathlib import Path
import fastText
import sklearn
import sklearn.metrics
import numpy as np
import re
import mpld3
import matplotlib.pyplot as plt

In [2]:
# Project layout, relative to the directory the notebook is run from.
root_dir = Path("..")
data_dir = root_dir / "data"
notebook_dir = root_dir / "notebooks"
model_dir = root_dir / "model"

# Make sure the model output directory exists. `exist_ok=True` makes the cell
# safe to re-run (no check-then-create race), and `parents=True` covers a
# missing parent directory.
model_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Location of the tweet corpus inside the data directory.
data_path = data_dir / "twitter_las_vegas_shooting"
# Plain-string versions of the paths: `input_filename` is handed to the fastText
# training call, `model_filename` to the model save/load calls.
input_filename = str(data_path)
model_filename = str(model_dir / "twitter_hashtag.bin")

# Training Embedding Model

In [4]:
# Preprocessing config: a flag set to True REMOVES that kind of token.
# With "hashtag" set to False, hashtags are kept in the text.
preprocess_config = {
    "hashtag": False,
    "mentioned": True,
    "punctuation": False,
    "url": True,
}

# Patterns. Raw strings keep regex escapes such as \w and \( from being
# interpreted (and warned about) as string escapes.
hashtag_pattern = r"#\w+"
mentioned_pattern = r"@\w+"
url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

# Punctuation characters that are replaced by a space when "punctuation" is on.
# "#" and "@" are not in the list, so hashtag/mention markers survive this step.
trans_str = "!\"$%&\'()*+,-./:;<=>?[\\]^_`{|}~" + "…"
translate_table = str.maketrans(trans_str, " " * len(trans_str))

def preprocess(s, config=None):
    """
    Lowercase a tweet and strip the token kinds enabled in the config.

    s: raw tweet text
    config: (optional) dict with the boolean keys "hashtag", "mentioned", "url"
            and "punctuation"; defaults to the module-level `preprocess_config`
    returns: the cleaned string. Whitespace is only collapsed when the
             "punctuation" step runs, so removed tokens may leave extra spaces.
    """
    if config is None:
        config = preprocess_config

    s = s.lower()
    # Token removal runs in a fixed order: hashtags, mentions, then URLs.
    for key, pattern in (
        ("hashtag", hashtag_pattern),
        ("mentioned", mentioned_pattern),
        ("url", url_pattern),
    ):
        if config[key]:
            s = re.sub(pattern, "", s)
    if config["punctuation"]:
        # Punctuation becomes spaces, then runs of whitespace are collapsed.
        s = " ".join(s.translate(translate_table).split())
    return s


In [5]:
# Example of preprocessing: the text is lowercased, the @mention and both URLs
# are removed, while the hashtag and the punctuation are left in place.
example_twitter = "RT @TheLeadCNN: Remembering Keri Lynn Galvan, from Thousand Oaks, California. #LasVegasLost https://t.co/QuvXa6WvlE https://t.co/hDF2d3Owgn"
preprocess(example_twitter)

'rt : remembering keri lynn galvan, from thousand oaks, california. #lasvegaslost  '

# Training

In [6]:
# fastText Config
# Every value below is forwarded by keyword to fastText.train_unsupervised.
# The meanings in the comments follow the fastText documentation.
embedding_model = "skipgram"  # unsupervised model type
lr = 0.05                     # learning rate
dim = 100                     # size of the word vectors
ws = 5                        # size of the context window
epoch = 5                     # number of passes over the corpus
minCount = 5                  # minimal number of word occurrences
minCountLabel = 0             # minimal number of label occurrences
minn = 3                      # min length of character n-grams
maxn = 6                      # max length of character n-grams
neg = 5                       # number of negatives sampled
wordNgrams = 1                # max length of word n-grams
loss = "ns"                   # loss function ("ns" = negative sampling)
bucket = 2000000              # number of hash buckets
thread = 12                   # number of training threads
lrUpdateRate = 100            # rate of updates for the learning rate
t = 1e-4                      # sampling threshold
verbose = 2                   # verbosity level

In [7]:
# Train unsupervised embeddings on the tweet corpus, using the hyper-parameters
# from the config cell. NOTE(review): this retrains on every run of the cell;
# a trained model can instead be restored with fastText.load_model.
model = fastText.train_unsupervised(
    input = input_filename,
    model=embedding_model,
    lr=lr,
    dim=dim,
    ws=ws,
    epoch=epoch,
    minCount=minCount,
    minCountLabel=minCountLabel,
    minn=minn,
    maxn=maxn,
    neg=neg,
    wordNgrams=wordNgrams,
    loss=loss,
    bucket=bucket,
    thread=thread,
    lrUpdateRate=lrUpdateRate,
    t=t,
    verbose=verbose,
)

# Output model to disk if needed
model.save_model(model_filename)

In [8]:
# Load saved model if needed
# Rebinds `model` to the model stored at `model_filename` (the path written by
# model.save_model).
model = fastText.load_model(model_filename)

# Low Dim Embedding using T-SNE

In [9]:
# TSNE Config
N_COMPONENTS = 2  # should be 2 for 2D plot
n_components = N_COMPONENTS
perplexity = 30.0
n_iter = 5000

def sklearn_tsne(embedding, random_state=None):
    """
    Project an embedding matrix to `n_components` dimensions with t-SNE.

    embedding: 2-D array-like, one row per word vector
    random_state: (optional) seed forwarded to TSNE; pass an int to get the
                  same layout on every run. None keeps scikit-learn's default.
    returns: whatever TSNE.fit_transform returns for `embedding`

    `perplexity`, `n_components` and `n_iter` are read from the notebook-level
    TSNE config at call time; distances between vectors use the cosine metric.
    """
    # Local import: the notebook's import cell only loads sklearn.metrics.
    from sklearn.manifold import TSNE
    # NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter`;
    # adjust the keyword if the environment is upgraded.
    tsne = TSNE(perplexity=perplexity, n_components=n_components,
                n_iter=n_iter, metric="cosine", random_state=random_state)
    return tsne.fit_transform(embedding)

In [10]:
# TSNE
# generate 2D representation of embedding
# Note: TSNE is time-consuming

# labels = np.array(model.get_words())
# embedding = np.array([model.get_word_vector(w) for w in labels])
# low_dim_embedding = sklearn_tsne(embedding)
# label_vector = {}
# for i, label in enumerate(labels):
#     label_vector[label] = (low_dim_embedding[i, :], embedding[i, :])

In [11]:
# TSNE is time-consuming; as an alternative, load our precomputed labels and embeddings instead
def load_text(filename):
    """
    Read a text file and return its lines with surrounding whitespace stripped.

    filename: path of the file to read
    returns: list of str, one entry per line, in file order

    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the platform's default encoding.
    """
    with open(filename, encoding="utf-8") as f:
        return [line.strip() for line in f]

# Precomputed artifacts: one label per line, plus two numeric tables whose rows
# are matched to the labels by position.
labels = load_text(str(data_dir / "twitter_las_vegas_shooting.labels"))
embedding = np.loadtxt(data_dir / "twitter_las_vegas_shooting.embedding")
low_dim_embedding = np.loadtxt(data_dir / "twitter_las_vegas_shooting.low_dim_embedding")

# label -> (low-dimensional row, full embedding row)
label_vector = {
    label: (low_dim_embedding[i, :], embedding[i, :])
    for i, label in enumerate(labels)
}

# Interactive Plot

In [12]:
def calc_n_cosine_neighbor(inX, X, N):
    """
    Rank the rows of X by cosine distance to a single query vector.

    inX: query vector, either 1-D or already shaped as a single-row 2-D array
    X: 2-D array of candidate vectors, one per row
    N: number of neighbours to keep
    returns: (indices of the N closest rows of X, nearest first;
              the full distance array as computed by pairwise_distances)
    """
    # pairwise_distances needs a 2-D query, so wrap a bare vector in a list.
    query = [inX] if inX.ndim == 1 else inX
    distances = sklearn.metrics.pairwise.pairwise_distances(
        X, query, metric="cosine")
    # Flatten the single distance column before sorting by increasing distance.
    n_rows = distances.shape[0]
    order = distances.reshape((n_rows,)).argsort()
    return order[:N], distances

def nn(query, words=labels, word_vectors=embedding, k=10):
    """
    Return the k words whose embeddings are closest (cosine) to `query`.

    query: word to look up; its vector comes from the notebook-level fastText `model`
    words: list or numpy array of words, row-aligned with word_vectors
    word_vectors: array of embeddings, one row per entry in words
    k: (optional, 10 by default) top k labels
    returns: numpy array holding the k nearest words, nearest first
    """
    v = model.get_word_vector(query)
    idx, _ = calc_n_cosine_neighbor(v, word_vectors, k)
    # `idx` is an index array. The default `words` is a plain list, which cannot
    # be indexed that way, so convert before selecting.
    return np.asarray(words)[idx]

def plot_interactive_scatter(low_dim_embedding, labels, inx, q, info):
    """
    Draw a labelled 2-D scatter of words with the query point highlighted.

    low_dim_embedding: 2-D array of points to plot, one (x, y) row per label
    labels: sequence of labels, row-aligned with low_dim_embedding (not modified)
    inx: 2-D array with a single (x, y) row, the position of the query
    q: label shown for the query point
    info: figure title
    returns: the matplotlib figure, with an mpld3 point-label tooltip attached
    """
    from matplotlib.patches import Circle

    fig, ax = plt.subplots()
    fig.set_size_inches(10, 10)
    ax.set_title(info)

    # The query is plotted as one extra point after the neighbours. Work on a
    # copy of the labels so the caller's list is left untouched.
    low_dim_embedding = np.concatenate([low_dim_embedding, inx])
    labels = list(labels) + [q]

    # mark query
    c_x, c_y = inx[0]
    circle = Circle((c_x, c_y), 10, facecolor='none',
                    edgecolor='red', linewidth=3, alpha=0.5)
    ax.add_patch(circle)

    scatter = ax.scatter(
        low_dim_embedding[:, 0],
        low_dim_embedding[:, 1],
    )

    tooltip = mpld3.plugins.PointLabelTooltip(scatter, labels=labels)
    # Also print every label next to its point, faded to limit clutter.
    for i, label in enumerate(labels):
        x, y = low_dim_embedding[i, :]
        ax.text(x, y, label, alpha=0.4)
    mpld3.plugins.connect(fig, tooltip)
    return fig

def process_nonexistent_word(w):
    """Return the vector the notebook-level fastText `model` produces for `w`."""
    vector = model.get_word_vector(w)
    return vector

def process_query(q):
    """
    Resolve a user query to a label and its vectors.

    q: query text, with or without a leading "#"
    returns: (label, vectors). When the hashtag form of the query is a key of
             `label_vector`, `label` is that key and `vectors` is its stored
             tuple. Otherwise `label` is the stripped query and `vectors` is
             (None, vector from process_nonexistent_word).
    """
    q = q.strip()

    # Known labels are looked up in hashtag form, so add the "#" when it is missing.
    candidate = q if q.startswith("#") else "#" + q
    if candidate in label_vector:
        return candidate, label_vector[candidate]

    # Unknown word: there is no stored low-dimensional position for it.
    return q, (None, process_nonexistent_word(q))

def query(q):
    """
    Plot a query word together with its nearest neighbours in the 2-D embedding.

    q: query text, with or without a leading "#"
    returns: the figure produced by plot_interactive_scatter

    Up to N_NEIGHBOR labels closest to the query (cosine distance in the full
    embedding space) are plotted. A word with no precomputed 2-D position is
    drawn at the position of its closest neighbour, and the title says so.
    """
    # Positions inside the (low-dimensional, full) tuple from process_query.
    LOW_DIM_EMBEDDING = 0
    EMBEDDING = 1
    N_NEIGHBOR = 400

    q, vs = process_query(q)

    inx_embedding = vs[EMBEDDING]
    inx_low_dim_embedding = vs[LOW_DIM_EMBEDDING]

    idx, _ = calc_n_cosine_neighbor(inx_embedding[np.newaxis, :], embedding, N_NEIGHBOR)

    plot_labels = [labels[i] for i in idx]

    # For a nonexistent word, use its closest neighbor to approximate its
    # position in the 2D plot. `idx` is sorted nearest first, so the closest
    # neighbor is idx[0].
    info = q
    if inx_low_dim_embedding is None:
        inx_low_dim_embedding = low_dim_embedding[idx[0], :]
        info = "Nonexistent word: " + q

    return plot_interactive_scatter(low_dim_embedding[idx, :], plot_labels, inx_low_dim_embedding[np.newaxis, :], q, info)


In [13]:
# Render the interactive neighbourhood plot for a sample query. The leading "#"
# may be omitted: process_query also tries the "#"-prefixed form of the word.
q = "lasvegas"
mpld3.display(query(q))