In [169]:
import numpy as np

import pandas as pd

import re
from nltk.corpus import stopwords

import gensim
import gensim.models.word2vec as w2v

In [3]:
wa_tweets = pd.read_csv('../ethan/wa_tweets.csv')
outage_tweets = pd.read_csv('../ethan/outage_tweets.csv')
outages = pd.read_csv('../ethan/washington_outages.csv')

In [4]:
outage_tweets.drop_duplicates(subset='text', inplace=True)
wa_tweets.drop_duplicates(subset='text', inplace=True)

## Preprocessing
These steps are adapted from the work of Noah Christiansen, Jen Hill, and Vonn Napoleon Johnson, available [here](https://github.com/jenrhill/Power_Outage_Identification).

In [5]:
# set up stopwords, but include certain relevant but common words

stops = [w for w in stopwords.words('english') 
         if w not in ['out','on','off']] + ['']

In [6]:
# this function adapted from christiansen et al's
# tweets_to_tweetlist
# cleans, reformats, tokenizes, and removes stopwords from tweets

def tweets_cleaner(raw):
    
    clean = str(raw).replace('-',' ').replace('#',' ')
    clean = re.sub(r"([@][\w_-]+)", "", clean) # remove handels
    clean = re.sub(r"[^a-zA-Z ]", "", clean)
    clean = re.sub(r"http.*?\b","",clean)
    clean = re.sub(r"pictwitter.*?\b","",clean)
    clean = re.sub(r"www.*?\b","",clean)
    clean = clean.lower().split()
    clean = [w for w in clean if w not in stops]
    return clean

In [141]:
# clean tweets and compound in list

tweet_list = [tweets_cleaner(t)
              for t in wa_tweets.loc[wa_tweets['outage']==1, 'text']]
#    all tweets expected to maybe be outage-related ^

In [142]:
# visualize tweets against the words they're reduced down to

pd.DataFrame([(a,b) for a,b in zip(list(outage_tweets['text']), 
                                   tweet_list)],
             columns=['Tweet','Words'])

Unnamed: 0,Tweet,Words
0,@Number12ForLife No power in Issaquah Highland...,"[power, issaquah, highlands, understand, issaq..."
1,@KING5Seattle @KIRO7Seattle tiger mountain rd ...,"[tiger, mountain, rd, se, block, way, blocking..."
2,@KING5Seattle @KIRO7Seattle SE may valley road...,"[se, may, valley, road, closed, moment, tree, ..."
3,Just lost power. They power better be back on ...,"[lost, power, power, better, back, on]"
4,Update: Power Outages Affecting Renton's Benso...,"[update, power, outages, affecting, rentons, b..."
...,...,...
179,Power is restored! We’ll be open regular hours...,"[power, restored, well, open, regular, hours, ..."
180,"Soooo, the power is out. @psetalk says they e...","[soooo, power, out, says, expect, us, running,..."
181,Tree hanging across road on power line. @ Bain...,"[tree, hanging, across, road, on, power, line,..."
182,"Power's out, but we're still open! Feel free t...","[powers, out, still, open, feel, free, come, v..."


# EVERYTHING BELOW THIS IS UGLY

In [131]:
len(tweet_list)

10544

In [143]:
model = gensim.models.KeyedVectors.load_word2vec_format(
    'lexvec.enwiki+newscrawl.300d.W.pos.vectors')

In [158]:
model.most_similar('blackout')

[('outage', 0.6274550557136536),
 ('blackouts', 0.6006176471710205),
 ('outages', 0.484832227230072),
 ('shutdown', 0.4596997797489166),
 ('curfew', 0.41762691736221313),
 ('lockdown', 0.41292476654052734),
 ('blacked', 0.4122679829597473),
 ('shutdowns', 0.41034194827079773),
 ('cancellation', 0.40724366903305054),
 ('glitch', 0.3966738283634186)]

In [159]:
len(model.vocab)

368999

In [160]:
model.most_similar('cold', topn=10)

[('warm', 0.6232660412788391),
 ('chilly', 0.6049166321754456),
 ('frigid', 0.5835704207420349),
 ('dry', 0.565641462802887),
 ('colder', 0.555574357509613),
 ('freezing', 0.5500966310501099),
 ('wet', 0.5406850576400757),
 ('cool', 0.5254527926445007),
 ('rainy', 0.5215438604354858),
 ('winters', 0.5155574083328247)]

In [161]:
model.most_similar(positive=['blackout','power','outage'],
                           negative=['game','cod','bts'],
                           topn=10)

[('outages', 0.5678640007972717),
 ('blackouts', 0.517982006072998),
 ('shutdowns', 0.4220408797264099),
 ('electricity', 0.407345712184906),
 ('brownouts', 0.3772614300251007),
 ('shutdown', 0.3731269836425781),
 ('generators', 0.3564743101596832),
 ('disruptions', 0.3401876986026764),
 ('substations', 0.32458698749542236),
 ('meltdowns', 0.32287344336509705)]

In [248]:
# code from Boom Devahastin Na Ayudhya edited by Christiansen et al
def vectorize_corpus(keyword_list,model=model):    
    
    # Instantiate counter for number of words in keyword_list that exists
    n_words = 0
    
    # Create template for cumulative corpus vector sum
    corpus_vec_sum = np.zeros((1,300))                 
    
    # Scan through each word in list
    for word in keyword_list:
        if word in model.vocab:                    
            word_vec = model.word_vec(word)        
            n_words +=1                                
            corpus_vec_sum = corpus_vec_sum + word_vec 

    # Compute average vector by taking cumulative vector sum 
    # and dividing it by number of words traced
    corpus_avg_vec = corpus_vec_sum/n_words
    
    # Squeeze this N-dimensional nested array object into a 
    # 1-D array to streamline future processing
    corpus_avg_vec = np.squeeze(corpus_avg_vec)
    
    return(corpus_avg_vec)

In [246]:
def cos_sim(v1, v2): #adapted from christiansen et al
    return np.dot(v1,v2)/np.sqrt(np.dot(v1,v1)*np.dot(v2,v2))

def cos_sim_words(w1,w2,model=model):
    try:
        return cos_sim(model.word_vec(w1),model.word_vec(w2))
    except:
        print(f'{w1} or {w2} not in vocabulary.')
        return np.nan
    
def cos_sim_wc(word,corpus_vector,model=model):
    try:
        return cos_sim(model.word_vec(word),corpus_vector)
    except:
        print(f'{word} not in vocabulary.')
        return np.nan
    
def cos_sim_tc(tweet, corpus_vector, model=model):
    tweet_vector = vectorize_corpus(tweet)
    return cos_sim(tweet_vector, corpus_vector)

In [247]:
cos_sim_words('blackout','cod')

0.07461891

In [249]:
power_out = ['outage','blackout','brownout','power','line','transformer',
             'storm','pseg','out','damage','powerout','backup','cable',
             'circuit','cutoff','overload','fire','grid','energy',
             'surge','eia','lights','light','outages','cut','failure',
             'severe','weather','substation','lines','voltage','short']

power_out_vector = vectorize_corpus(power_out)

In [254]:
cos_sim_list = [cos_sim_tc(t,power_out_vector) for t in tweet_list]

In [255]:
# visualize tweets against the words they're reduced down to

pd.DataFrame([(a,b) for a,b in zip(list(outage_tweets['text']), 
                                   cos_sim_list)],
             columns=['Tweet','Cosine Similarity'])

Unnamed: 0,Tweet,Cosine Similarity
0,@Number12ForLife No power in Issaquah Highland...,0.297100
1,@KING5Seattle @KIRO7Seattle tiger mountain rd ...,0.477564
2,@KING5Seattle @KIRO7Seattle SE may valley road...,0.445732
3,Just lost power. They power better be back on ...,0.501879
4,Update: Power Outages Affecting Renton's Benso...,0.649435
...,...,...
179,Power is restored! We’ll be open regular hours...,0.414388
180,"Soooo, the power is out. @psetalk says they e...",0.425628
181,Tree hanging across road on power line. @ Bain...,0.459658
182,"Power's out, but we're still open! Feel free t...",0.232142


In [266]:
[ for t in list(wa_tweets['text'])][-1]

['oh', 'lord']

In [267]:
more_cs = [cos_sim_tc(tweets_cleaner(t), power_out_vector) for t in list(wa_tweets['text'])]



Now that we have an unsupervised model that outputs some metric, we'll use a decision tree stump to create a cutoff point in cos similarity for good tweets and bad.

In [272]:
wa_tweets['outage'].value_counts()[0]

5350

In [320]:
wa_tweets['Cosine Similarity'] = [cos_sim_tc(tweets_cleaner(t), power_out_vector) for t in list(wa_tweets['text'])]
wa_tweets = wa_tweets.fillna(0)



In [322]:
wa_tweets.groupby('outage').mean()

Unnamed: 0_level_0,Cosine Similarity
outage,Unnamed: 1_level_1
0,0.202299
1,0.432248


A decision tree won't work well if the data is so lopsided, so let's create an evenly split dataframe with bootstrapping.

In [280]:
# num of extra positives to add
inflate_num = wa_tweets['outage'].value_counts()[0] -\
    wa_tweets['outage'].value_counts()[1]

inflate = wa_tweets.loc[
    np.random.choice(wa_tweets[wa_tweets['outage'] == 1].index,
                     size=inflate_num)]

wa_bootstrapped = pd.concat([inflate, wa_tweets])

In [304]:
X = pd.DataFrame([cos_sim_tc(tweets_cleaner(t), power_out_vector)
                  for t in list(wa_bootstrapped['text'])]).fillna(0)
y = wa_bootstrapped['outage'].reset_index(drop=True)



In [325]:
# train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    random_state=1007)

In [327]:
# use decision tree to 

from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier(max_depth=1)
dtc.fit(X_train,y_train)
dtc.score(X_test,y_test)

0.8964485981308411