In [1]:
import json
import numpy as np
import pandas as pd
import pickle
import torch
import torch.nn as nn

from BertForSequenceClassificationOutputPooled import *
from BertTM import *
from sentence_transformers import SentenceTransformer
from sentence_transformers import models, losses
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

### Load pretrained

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassificationOutputPooled.from_pretrained('bert-base-uncased', 
                                                              output_attentions=True, 
                                                              output_hidden_states=True)
labels = torch.tensor([1]).unsqueeze(0)
input_list = []
token_list = []
cls_ = '[CLS]'
sep_ = '[SEP]'
sentences = ['Hello, my dog is cute and cutest.', 'I am too']
for i, sent in enumerate(sentences):
    inputs = tokenizer.encode_plus(sentences[i], add_special_tokens=True)
    tokens = [cls_] + tokenizer.tokenize(sentences[i]) + [sep_]
    input_ids = torch.tensor(inputs['input_ids']).unsqueeze(0)
    input_list.append(input_ids)
    token_list.append(tokens)

### Load fine-tuned

In [109]:
output_dir = "../bert-classifier-pytorch/model_save_attention_1epoch"

# Load a trained model and vocabulary that you have fine-tuned
model = BertForSequenceClassificationOutputPooled.from_pretrained(output_dir,
                                                      output_attentions = True, 
                                                      output_hidden_states = True)
tokenizer = BertTokenizer.from_pretrained(output_dir)
labels = torch.tensor([1]).unsqueeze(0)
input_list = []
token_list = []
cls_ = '[CLS]'
sep_ = '[SEP]'
sentences = ['Hello, my dog is cute and cutest.', 'I am too']
for i, sent in enumerate(sentences):
    inputs = tokenizer.encode_plus(sentences[i], add_special_tokens=True)
    tokens = [cls_] + tokenizer.tokenize(sentences[i]) + [sep_]
    input_ids = torch.tensor(inputs['input_ids']).unsqueeze(0)
    input_list.append(input_ids)
    token_list.append(tokens)

In [110]:
word_embedding_model = models.BERT(output_dir, max_seq_length = 240,)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

st_model = SentenceTransformer(modules=[word_embedding_model, pooling_model],
                               #device=torch.device("cuda")
                              )

### Test that attention and vectorization work

In [111]:
attentions = get_attention(sentences, model, tokenizer, method = 'first')
np.sum([tpl[1] for tpl in attentions[1]])

vectorized = vectorize(sentences, model, tokenizer)
torch.stack(vectorized).detach().numpy().shape

(2, 768)

In [None]:
url_re = '(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})'

test = ["this movie was the cutest. read more at http://worstever.com"]

In [117]:
import re

data = data.apply(lambda row: ' '.join([word for word in row.split() 
                                        if (not word in stopwords) 
                                        and (not re.match(url_re, word))]))


In [118]:
data[:10]

0                                             vibing💀😳
1                                   Progression storm.
2    Snow band starting pivot Avalon. Heavy snow co...
3                                                     
4    Anytime speculated yesterday severity storm I ...
5    @EddieSheerr Eddie... TWN forecast wind starti...
6    These pictures crazyy, I complain ottawa winte...
7    Visibility reduced ares @towngfw hour. It’s mo...
8    #Hiphops #Lit #dope flow #GotDrip #BING 💯 @tik...
9                        This bonkers! Snow windows. 😮
Name: text, dtype: object

In [120]:
get_attention(["this movie was the cutest. read more at http://worstever.com"], model, tokenizer, method = 'first')

[[('this', 0.0613682),
  ('movie', 0.030429687),
  ('was', 0.035641603),
  ('the', 0.21911351),
  ('cutest', 0.06270852),
  ('.', 0.09440403),
  ('read', 0.0423442),
  ('more', 0.06480052),
  ('at', 0.06262489),
  ('http', 0.027938599),
  (':', 0.043154325),
  ('/', 0.038011733),
  ('/', 0.041409045),
  ('worstever', 0.04187157),
  ('.', 0.08471608),
  ('com', 0.049463503)]]

## Topic Model

In [174]:
df = pd.read_csv("nlwx_2020_hashtags_no_rt_predictions.csv")
df = preprocess(df)
data = df['preprocessed_text']

In [175]:
batch_size = 20
ngram = (1, 3)
n_topics = 10

#### Pooled

In [7]:
rows, attentions = [], []
counter = 0
for i in range(0, len(data), batch_size):
    index = min(i + batch_size, len(data))
    rows.append(vectorize(data[i:index], model, tokenizer))
    attentions.extend(get_attention(data[i:index], model, tokenizer))
    if counter % 50 == 0:
        print(f"Processed {counter} rows out of {len(data)}.")
    counter += 1

Processed 0 rows out of 100.


#### Sentence embeddings

In [177]:
rows, attentions = [], []
counter = 0
for i in range(0, len(data)):
    rows.extend(st_model.encode([data[i]]))
    attentions.extend(get_attention([data[i]], model, tokenizer))
    counter += 1
    if counter % 500 == 0:
        print(f"Processed {counter} rows out of {len(data)}.")

Processed 500 rows out of 20682.
Processed 1000 rows out of 20682.
Processed 1500 rows out of 20682.
Processed 2000 rows out of 20682.
Processed 2500 rows out of 20682.
Processed 3000 rows out of 20682.
Processed 3500 rows out of 20682.
Processed 4000 rows out of 20682.
Processed 4500 rows out of 20682.
Processed 5000 rows out of 20682.
Processed 5500 rows out of 20682.
Processed 6000 rows out of 20682.
Processed 6500 rows out of 20682.
Processed 7000 rows out of 20682.
Processed 7500 rows out of 20682.
Processed 8000 rows out of 20682.
Processed 8500 rows out of 20682.
Processed 9000 rows out of 20682.
Processed 9500 rows out of 20682.
Processed 10000 rows out of 20682.
Processed 10500 rows out of 20682.
Processed 11000 rows out of 20682.
Processed 11500 rows out of 20682.
Processed 12000 rows out of 20682.
Processed 12500 rows out of 20682.
Processed 13000 rows out of 20682.
Processed 13500 rows out of 20682.
Processed 14000 rows out of 20682.
Processed 14500 rows out of 20682.
Proce

In [192]:
with open('stopwords-en.json') as fopen:
    stopwords = json.load(fopen)

stopwords.extend(['#', '@', '…', "'", "’", "[UNK]", "\"", ";", "*", "_", "amp", "&"])
    
print(len(stopwords))
print(stopwords[:5])

1310
["'ll", "'tis", "'twas", "'ve", '10']


In [193]:
#concat = np.concatenate(rows, axis = 0)
#concat = [item.detach().numpy() for item in concat]
#concat = np.asarray(concat, dtype=np.float32)

In [179]:
all_model_data = []

for i in range(len(rows)):
    all_model_data.append((data[i], df.prediction[i], attentions[i], rows[i]))
    
#pickle.dump(all_model_data, open(f"attentions_sent_embeddings.pkl", "wb" ))

In [208]:
all_model_data = pickle.load(open("attentions_sent_embeddings.pkl", "rb"))
texts, _, attentions, rows = zip(*all_model_data)

In [209]:
%%time

print("Fitting kmeans model.")
#rows = rows[:1000]
#attentions = attentions[:1000]
kmeans = KMeans(n_clusters = n_topics, random_state = 0).fit(rows)
labels = kmeans.labels_

Fitting kmeans model.
CPU times: user 35.7 s, sys: 2.06 s, total: 37.7 s
Wall time: 31.8 s


In [210]:
%%time

overall, filtered_a = [], []
url_re = '(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})'
print("Filtering attentions.")
for a in attentions:
    f = [i for i in a if i[0] not in stopwords and i[0] not in url_re]
    overall.extend(f)
    filtered_a.append(f)

Filtering attentions.
CPU times: user 7.42 s, sys: 4 ms, total: 7.42 s
Wall time: 7.42 s


In [211]:
%%time

print("Generating ngrams.")
o_ngram = generate_ngram(overall, ngram)
features = []
for i in o_ngram:
    features.append(' '.join([w[0] for w in i]))
features = list(set(features))


Generating ngrams.
CPU times: user 524 ms, sys: 16 ms, total: 540 ms
Wall time: 539 ms


In [212]:
%%time

import time

print(
"""
Determining cluster components. This will take awhile. 
Progress will be printed for every 500th processed property.
""")

components = {}
words_label = {}
start_time = time.time()
for idx, label in enumerate(labels):
    if label not in components:
        components[label] = {}
        words_label[label] = []
    else:
        f = generate_ngram(filtered_a[idx], ngram)
        for w in f:
            word = ' '.join([r[0] for r in w])
            score = np.mean([r[1] for r in w])
            if word in features:
                if word in components[label]:
                    components[label][word] += score
                else:
                    components[label][word] = score
                words_label[label].append(word)
    if (idx + 1) % 500 == 0:
        print(f'Processed {(idx + 1)} texts in {round(time.time() - start_time, 2)} seconds.')
            
print(f"Finished determining cluster components. Total time {round(time.time() - start_time, 2)} seconds.")


Determining cluster components. This will take awhile. 
Progress will be printed for every 500th processed property.

Processed 500 texts in 34.58 seconds.
Processed 1000 texts in 69.98 seconds.
Processed 1500 texts in 107.74 seconds.
Processed 2000 texts in 144.56 seconds.
Processed 2500 texts in 184.06 seconds.
Processed 3000 texts in 221.92 seconds.
Processed 3500 texts in 261.15 seconds.
Processed 4500 texts in 334.02 seconds.
Processed 5000 texts in 375.48 seconds.
Processed 5500 texts in 416.82 seconds.
Processed 6000 texts in 457.95 seconds.
Processed 6500 texts in 494.66 seconds.
Processed 7000 texts in 531.15 seconds.
Processed 7500 texts in 565.59 seconds.
Processed 8000 texts in 602.03 seconds.
Processed 8500 texts in 636.78 seconds.
Processed 9000 texts in 670.91 seconds.
Processed 9500 texts in 705.01 seconds.
Processed 10000 texts in 738.05 seconds.
Processed 10500 texts in 774.64 seconds.
Processed 11000 texts in 813.0 seconds.
Processed 11500 texts in 850.06 seconds.
P

In [213]:
def dummy_fun(doc):
    return doc

tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None) 

tf_idf_corpus = [[item for item in words_label[key]] for key in range(0,10)]
transformed = tfidf_vectorizer.fit_transform(tf_idf_corpus)

In [214]:
index_value={i[1]:i[0] for i in tfidf_vectorizer.vocabulary_.items()}
fully_indexed = []
for row in transformed:
    fully_indexed.append({index_value[column]:value for (column,value) in zip(row.indices,row.data)})

In [215]:
#pickle.dump(components, open("components.pickle", "wb"))

In [216]:
components_tfidf_attn = {}
components_tfidf = {}
for k1 in components:
    components_tfidf_attn[k1] = {}
    components_tfidf[k1] = {}
    for k2 in components[k1]:
        components_tfidf_attn[k1][k2] = fully_indexed[k1][k2] * components[k1][k2]
        components_tfidf[k1][k2] = fully_indexed[k1][k2]

In [217]:
topics_attn = topics_df(
    10,
    components,
    n_words = 10)

pickle.dump(topics_attn, open("topics_sent_embed.pickle", "wb"))

In [218]:
topics_attn

Unnamed: 0,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7,topic 8,topic 9
0,assistance,snow,snow,hope,newfoundland,snow,stay,emergency,closed,snow
1,support,newfoundland,storm,stay,nlstorm,buried,stay safe,snow,power,storm
2,helping,storm,blizzard,safe,nlstorm2020,missing,safe,power,emergency,newfoundland
3,snow,day,weather,newfoundland,nltraffic,digging,warning,closed,snow,snowstorm
4,people,shovel,emergency,thinking,snow,storm,newfoundland,storm,outages,blizzard
5,food,people,newfoundland,stay safe,snowmageddon2020,stuck,advisory,city,storm,snowmageddon2020
6,emergency,love,snowfall,hoping,nlsnowstorm2020,trapped,envcanada warning,john,outage,blizzard2020
7,military,car,winds,friends,snowstorm,car,stay safe newfoundland,people,lost,weather
8,supplies,winter,statement weather,god,canada,people,storm,stay,roads,nlstorm
9,assist,time,statement,prayers,love,blizzard,safe newfoundland,newfoundland,schools,winter


In [219]:
topics_tfidf = topics_df(
    10,
    components_tfidf,
    n_words = 10)

topics_tfidf

Unnamed: 0,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7,topic 8,topic 9
0,assistance,snow,snow,safe,newfoundland,snow,safe,nltraffic,power,snow
1,people,newfoundland,newfoundland,newfoundland,nltraffic,buried,stay,john,john,newfoundland
2,newfoundland,day,storm,hope,nlstorm2020,car,stay safe,emergency,emergency,storm
3,snow,time,john,stay,nlstorm,house,envcanada,road,closed,snowmageddon2020
4,food,people,blizzard,stay safe,eminem,newfoundland,warning,power,snow,snowstorm
5,emergency,love,emergency,prayers,snowmageddon2020,john,newfoundland,cityofstjohns,nltraffic,blizzard
6,john,dog,winds,friends,michelleobama,street,envcanada warning,snow,outages,nlstorm2020
7,support,storm,weather,thinking,saveng,door,safe newfoundland,tomorrow,schools,nlstorm
8,military,morning,nlstorm,warm,ken,people,emergency,city,roads,day
9,helping,door,30,safe warm,starr,storm,stay safe newfoundland,people,power outages,blizzard2020


In [220]:
topics_tfidf_attn = topics_df(
    10,
    components_tfidf_attn,
    n_words = 10)

topics_tfidf_attn

Unnamed: 0,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7,topic 8,topic 9
0,assistance,snow,snow,safe,newfoundland,snow,stay,emergency,closed,snow
1,people,newfoundland,storm,hope,nltraffic,buried,safe,power,power,newfoundland
2,snow,day,newfoundland,newfoundland,nlstorm,car,stay safe,snow,emergency,storm
3,support,storm,blizzard,stay,nlstorm2020,newfoundland,warning,john,snow,snowstorm
4,food,people,emergency,stay safe,snowmageddon2020,storm,newfoundland,closed,outages,blizzard
5,helping,love,weather,thinking,snow,digging,envcanada warning,city,john,snowmageddon2020
6,emergency,time,winds,friends,nlsnowstorm2020,house,safe newfoundland,road,storm,nlstorm
7,military,shovel,john,hoping,eminem,missing,stay safe newfoundland,people,schools,blizzard2020
8,newfoundland,dog,snowfall,prayers,canada,stuck,storm,nltraffic,roads,nlstorm2020
9,supplies,car,wind,warm,love,trapped,emergency,storm,remain,weather
